package main
import (
"context"
"fmt"
"log"
"os/signal"
"syscall"
"time"
"github.com/tychoish/fun/srv"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"go.uber.org/zap/zapio"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/klog/v2"
"k8s.io/kubernetes/cmd/kube-scheduler/app"
"k8s.io/kubernetes/pkg/scheduler/framework"
"github.com/neondatabase/autoscaling/pkg/plugin"
"github.com/neondatabase/autoscaling/pkg/util"
)
// all of the juicy bits are defined in pkg/plugin/
func main() {
logConfig := zap.NewProductionConfig()
logConfig.Sampling = nil // Disable sampling, which the production config enables by default.
logConfig.DisableStacktrace = true // No stack traces; reconcile failures spam the logs otherwise
logger := zap.Must(logConfig.Build()).Named("autoscale-scheduler")
if err := runProgram(logger); err != nil {
log.Fatal(err)
}
}
// runProgram is the "real" main, but returning an error means that
// the shutdown handling code doesn't have to call os.Exit, even indirectly.
func runProgram(logger *zap.Logger) (err error) {
conf, err := plugin.ReadConfig(plugin.DefaultConfigPath)
if err != nil {
return fmt.Errorf("Error reading config at %q: %w", plugin.DefaultConfigPath, err)
}
// this: listens for sigterm; when we catch that signal, the
// context gets canceled, a goroutine waits for half a second, and
// then closes the signal channel, which we block on in a
// defer. Because defers execute in LIFO order, this just
// pauses for a *very* short period of time before exiting.
//
// eventually, the constructed application will track its
// services and be able to more coherently wait for shutdown
// without needing a sleep.
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGTERM)
defer cancel()
ctx = srv.SetShutdownSignal(ctx)
ctx = srv.WithOrchestrator(ctx)
ctx = srv.SetBaseContext(ctx)
orca := srv.GetOrchestrator(ctx)
defer func() { err = orca.Service().Wait() }()
if err := orca.Add(srv.HTTP("scheduler-pprof", time.Second, util.MakePPROF("0.0.0.0:7777"))); err != nil {
return err
}
// The normal scheduler outputs to klog, and there isn't *really* a way to stop that. So to make
// everything fit nicely, we'll redirect it to zap as well.
redirectKlog(logger.Named("klog"))
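// Note: the constructor below ignores the context passed in by the scheduler framework
// (hence the _ctx name) and closes over the outer ctx instead, so the plugin sees the
// same shutdown signal and orchestrator as the rest of this program.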
constructor := func(_ctx context.Context, obj runtime.Object, h framework.Handle) (framework.Plugin, error) {
return plugin.NewAutoscaleEnforcerPlugin(ctx, logger, h, conf)
}
command := app.NewSchedulerCommand(app.WithPlugin(plugin.PluginName, constructor))
// Don't output the full usage whenever any error occurs (otherwise, startup errors get drowned
// out by many pages of scheduler command flags)
command.SilenceUsage = true
if err := command.ExecuteContext(ctx); err != nil {
return err
}
return
}
func redirectKlog(to *zap.Logger) {
severityPairs := []struct {
klogLevel string
zapLevel zapcore.Level
}{
{"info", zapcore.InfoLevel},
{"warning", zapcore.WarnLevel},
{"error", zapcore.ErrorLevel},
{"fatal", zapcore.FatalLevel},
}
for _, pair := range severityPairs {
klog.SetOutputBySeverity(pair.klogLevel, &zapio.Writer{
Log: to,
Level: pair.zapLevel,
})
}
// By default, we'll get LogToStderr(true), which completely bypasses any redirecting with
// SetOutput or SetOutputBySeverity. So... we'd like to avoid that, which thankfully we can do.
klog.LogToStderr(false)
}
package main
import (
"context"
"os/signal"
"syscall"
"time"
"github.com/tychoish/fun/srv"
"go.uber.org/zap"
"k8s.io/client-go/kubernetes"
scheme "k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
vmclient "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
"github.com/neondatabase/autoscaling/pkg/agent"
"github.com/neondatabase/autoscaling/pkg/util"
)
func main() {
logConfig := zap.NewProductionConfig()
logConfig.Sampling = nil // Disable sampling, which the production config enables by default.
logConfig.Level.SetLevel(zap.InfoLevel) // Only "info" level and above (i.e. not debug logs)
logger := zap.Must(logConfig.Build()).Named("autoscaler-agent")
defer logger.Sync() //nolint:errcheck // what are we gonna do, log something about it?
envArgs, err := agent.ArgsFromEnv()
if err != nil {
logger.Panic("Failed to get args from environment", zap.Error(err))
}
logger.Info("Got environment args", zap.Any("args", envArgs))
config, err := agent.ReadConfig(envArgs.ConfigPath)
if err != nil {
logger.Panic("Failed to read config", zap.Error(err))
}
logger.Info("Got config", zap.Any("config", config))
kubeConfig, err := rest.InClusterConfig()
if err != nil {
logger.Panic("Failed to get in-cluster K8s config", zap.Error(err))
}
kubeClient, err := kubernetes.NewForConfig(kubeConfig)
if err != nil {
logger.Panic("Failed to make K8S client", zap.Error(err))
}
if err = vmv1.AddToScheme(scheme.Scheme); err != nil {
logger.Panic("Failed to add NeonVM scheme", zap.Error(err))
}
vmClient, err := vmclient.NewForConfig(kubeConfig)
if err != nil {
logger.Panic("Failed to make VM client", zap.Error(err))
}
runner := agent.MainRunner{
EnvArgs: envArgs,
Config: config,
KubeClient: kubeClient,
VMClient: vmClient,
}
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGTERM)
defer cancel()
ctx = srv.SetShutdownSignal(ctx)
ctx = srv.SetBaseContext(ctx)
ctx = srv.WithOrchestrator(ctx)
defer func() {
if err := srv.GetOrchestrator(ctx).Wait(); err != nil {
logger.Panic("Failed to shut down orchestrator", zap.Error(err))
}
logger.Info("Main loop returned without issue. Exiting.")
}()
if err := srv.GetOrchestrator(ctx).Add(srv.HTTP("agent-pprof", time.Second, util.MakePPROF("0.0.0.0:7777"))); err != nil {
logger.Panic("Failed to add pprof service", zap.Error(err))
}
if err = runner.Run(logger, ctx); err != nil {
logger.Panic("Main loop failed", zap.Error(err))
}
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"net/http"
"os"
"os/signal"
"strings"
"syscall"
"time"
certv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1"
"github.com/go-logr/zapr"
"github.com/tychoish/fun/srv"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/manager"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
// to ensure that exec-entrypoint and run can make use of them.
_ "k8s.io/client-go/plugin/pkg/client/auth"
"k8s.io/klog/v2"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/neonvm/controllers"
"github.com/neondatabase/autoscaling/pkg/neonvm/ipam"
"github.com/neondatabase/autoscaling/pkg/util"
)
var (
scheme = runtime.NewScheme()
setupLog = ctrl.Log.WithName("setup")
)
func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
utilruntime.Must(vmv1.AddToScheme(scheme))
utilruntime.Must(certv1.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme
}
func run(mgr manager.Manager) error {
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
defer cancel()
ctx = srv.SetShutdownSignal(ctx)
ctx = srv.SetBaseContext(ctx)
ctx = srv.WithOrchestrator(ctx)
orca := srv.GetOrchestrator(ctx)
defer func() {
setupLog.Info("main loop returned, exiting")
}()
if err := orca.Add(srv.HTTP("pprof", time.Second, util.MakePPROF("0.0.0.0:7777"))); err != nil {
return fmt.Errorf("failed to add pprof service: %w", err)
}
setupLog.Info("starting manager")
if err := mgr.Start(ctx); err != nil {
return fmt.Errorf("problem running manager: %w", err)
}
return nil
}
func main() {
var metricsAddr string
var enableLeaderElection bool
var probeAddr string
var concurrencyLimit int
var skipUpdateValidationFor map[types.NamespacedName]struct{}
var disableRunnerCgroup bool
var defaultCpuScalingMode vmv1.CpuScalingMode
var qemuDiskCacheSettings string
var memhpAutoMovableRatio string
var failurePendingPeriod time.Duration
var failingRefreshInterval time.Duration
var atMostOnePod bool
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
"Enable leader election for controller manager. "+
"Enabling this will ensure there is only one active controller manager.")
flag.IntVar(&concurrencyLimit, "concurrency-limit", 1, "Maximum number of concurrent reconcile operations")
flag.Func(
"skip-update-validation-for",
"Comma-separated list of object names to skip webhook validation, like 'foo' or 'default/bar'",
func(value string) error {
objSet := make(map[types.NamespacedName]struct{})
if value != "" {
for _, name := range strings.Split(value, ",") {
if name == "" {
return errors.New("name must not be empty")
}
var namespacedName types.NamespacedName
splitBySlash := strings.SplitN(name, "/", 2)
if len(splitBySlash) == 1 {
namespacedName = types.NamespacedName{Namespace: "default", Name: splitBySlash[0]}
} else {
namespacedName = types.NamespacedName{Namespace: splitBySlash[0], Name: splitBySlash[1]}
}
objSet[namespacedName] = struct{}{}
}
}
skipUpdateValidationFor = objSet
return nil
},
)
flag.Func("default-cpu-scaling-mode", "Set default cpu scaling mode to use for new VMs", defaultCpuScalingMode.FlagFunc)
flag.BoolVar(&disableRunnerCgroup, "disable-runner-cgroup", false, "Disable creation of a cgroup in neonvm-runner for fractional CPU limiting")
flag.StringVar(&qemuDiskCacheSettings, "qemu-disk-cache-settings", "cache=none", "Set neonvm-runner's QEMU disk cache settings")
flag.StringVar(&memhpAutoMovableRatio, "memhp-auto-movable-ratio", "301", "For virtio-mem, set VM kernel's memory_hotplug.auto_movable_ratio")
flag.DurationVar(&failurePendingPeriod, "failure-pending-period", 1*time.Minute,
"the period for the propagation of reconciliation failures to the observability instruments")
flag.DurationVar(&failingRefreshInterval, "failing-refresh-interval", 1*time.Minute,
"the interval between consecutive updates of metrics and logs, related to failing reconciliations")
flag.BoolVar(&atMostOnePod, "at-most-one-pod", false,
"If true, the controller will ensure that at most one pod is running at a time. "+
"Otherwise, the outdated pod might be left to terminate, while the new one is already running.")
flag.Parse()
logConfig := zap.NewProductionConfig()
logConfig.Sampling = nil // Disabling sampling; it's enabled by default for zap's production configs.
logConfig.Level.SetLevel(zap.InfoLevel)
logConfig.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder
logger := zapr.NewLogger(zap.Must(logConfig.Build(zap.AddStacktrace(zapcore.PanicLevel))))
ctrl.SetLogger(logger)
// define klog settings (used in LeaderElector)
klog.SetLogger(logger.V(2))
// tune k8s client for manager
cfg := ctrl.GetConfigOrDie()
cfg.QPS = 1000
cfg.Burst = 2000
mgr, err := ctrl.NewManager(cfg, ctrl.Options{
Scheme: scheme,
Metrics: metricsserver.Options{
BindAddress: metricsAddr,
},
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: "a3b22509.neon.tech",
// LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily
// when the Manager ends. This requires the binary to immediately end when the
// Manager is stopped, otherwise, this setting is unsafe. Setting this significantly
// speeds up voluntary leader transitions as the new leader doesn't have to wait
// LeaseDuration time first.
//
// This option is only safe as long as the program immediately exits after the manager
// stops.
LeaderElectionReleaseOnCancel: true,
})
if err != nil {
setupLog.Error(err, "unable to start manager")
panic(err)
}
reconcilerMetrics := controllers.MakeReconcilerMetrics()
rc := &controllers.ReconcilerConfig{
DisableRunnerCgroup: disableRunnerCgroup,
MaxConcurrentReconciles: concurrencyLimit,
SkipUpdateValidationFor: skipUpdateValidationFor,
QEMUDiskCacheSettings: qemuDiskCacheSettings,
MemhpAutoMovableRatio: memhpAutoMovableRatio,
FailurePendingPeriod: failurePendingPeriod,
FailingRefreshInterval: failingRefreshInterval,
AtMostOnePod: atMostOnePod,
DefaultCPUScalingMode: defaultCpuScalingMode,
NADConfig: controllers.GetNADConfig(),
}
// Let's not have more than a quarter of the reconciliation workers stuck
// on the IPAM mutex.
ipam, err := ipam.New(rc.NADConfig.IPAMName, rc.NADConfig.IPAMNamespace, max(1, concurrencyLimit/4))
if err != nil {
setupLog.Error(err, "unable to create ipam")
panic(err)
}
defer ipam.Close()
vmReconciler := &controllers.VMReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("virtualmachine-controller"),
Config: rc,
Metrics: reconcilerMetrics,
IPAM: ipam,
}
vmReconcilerMetrics, err := vmReconciler.SetupWithManager(mgr)
if err != nil {
setupLog.Error(err, "unable to create controller", "controller", "VirtualMachine")
panic(err)
}
vmWebhook := &controllers.VMWebhook{
Recorder: mgr.GetEventRecorderFor("virtualmachine-webhook"),
Config: rc,
}
if err := vmWebhook.SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create webhook", "webhook", "VirtualMachine")
panic(err)
}
migrationReconciler := &controllers.VirtualMachineMigrationReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("virtualmachinemigration-controller"),
Config: rc,
Metrics: reconcilerMetrics,
}
migrationReconcilerMetrics, err := migrationReconciler.SetupWithManager(mgr)
if err != nil {
setupLog.Error(err, "unable to create controller", "controller", "VirtualMachineMigration")
panic(err)
}
migrationWebhook := &controllers.VMMigrationWebhook{
Recorder: mgr.GetEventRecorderFor("virtualmachinemigration-webhook"),
Config: rc,
}
if err := migrationWebhook.SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create webhook", "webhook", "VirtualMachine")
panic(err)
}
//+kubebuilder:scaffold:builder
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up health check")
panic(err)
}
if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up ready check")
panic(err)
}
dbgSrv := debugServerFunc(vmReconcilerMetrics, migrationReconcilerMetrics)
if err := mgr.Add(dbgSrv); err != nil {
setupLog.Error(err, "unable to set up debug server")
panic(err)
}
if err := mgr.Add(vmReconcilerMetrics.FailingRefresher()); err != nil {
setupLog.Error(err, "unable to set up failing refresher")
panic(err)
}
// NOTE: THE CONTROLLER MUST IMMEDIATELY EXIT AFTER RUNNING THE MANAGER.
if err := run(mgr); err != nil {
setupLog.Error(err, "run manager error")
panic(err)
}
}
func debugServerFunc(reconcilers ...controllers.ReconcilerWithMetrics) manager.RunnableFunc {
return manager.RunnableFunc(func(ctx context.Context) error {
mux := http.NewServeMux()
mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
defer r.Body.Close()
if r.Method != http.MethodGet {
w.WriteHeader(http.StatusMethodNotAllowed)
_, _ = w.Write([]byte(fmt.Sprintf("request method must be %s", http.MethodGet)))
return
}
response := make([]controllers.ReconcileSnapshot, 0, len(reconcilers))
for _, r := range reconcilers {
response = append(response, r.Snapshot())
}
responseBody, err := json.Marshal(&response)
if err != nil {
w.WriteHeader(http.StatusInternalServerError)
_, _ = w.Write([]byte(fmt.Sprintf("failed to marshal JSON response: %s", err)))
return
}
w.WriteHeader(http.StatusOK)
_, _ = w.Write(responseBody)
})
server := &http.Server{
Addr: "0.0.0.0:7778",
Handler: mux,
}
ctx, cancel := context.WithCancel(ctx)
defer cancel()
go func() {
<-ctx.Done()
_ = server.Shutdown(context.TODO())
}()
return server.ListenAndServe()
})
}
package main
import (
"encoding/base64"
"encoding/json"
"flag"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"strconv"
"sync"
"time"
"go.uber.org/zap"
k8sutil "k8s.io/kubernetes/pkg/volume/util"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/neonvm/cpuscaling"
"github.com/neondatabase/autoscaling/pkg/util"
)
func main() {
addr := flag.String("addr", "", `address to bind for HTTP requests`)
flag.Parse()
if *addr == "" {
fmt.Println("neonvm-daemon missing -addr flag")
os.Exit(1)
}
logConfig := zap.NewProductionConfig()
logConfig.Sampling = nil // Disable sampling, which the production config enables by default.
logConfig.Level.SetLevel(zap.InfoLevel) // Only "info" level and above (i.e. not debug logs)
logger := zap.Must(logConfig.Build()).Named("neonvm-daemon")
defer logger.Sync() //nolint:errcheck // what are we gonna do, log something about it?
logger.Info("Starting neonvm-daemon", zap.String("addr", *addr))
srv := cpuServer{
cpuOperationsMutex: &sync.Mutex{},
cpuScaler: cpuscaling.NewCPUScaler(),
fileOperationsMutex: &sync.Mutex{},
logger: logger.Named("cpu-srv"),
}
srv.run(*addr)
}
type cpuServer struct {
// Protects CPU operations from concurrent access, so that concurrent scaling requests
// can't reconcile online CPUs at the same time and the status response always reflects
// the current state.
cpuOperationsMutex *sync.Mutex
cpuScaler *cpuscaling.CPUScaler
fileOperationsMutex *sync.Mutex
logger *zap.Logger
}
func (s *cpuServer) handleGetCPUStatus(w http.ResponseWriter) {
s.cpuOperationsMutex.Lock()
defer s.cpuOperationsMutex.Unlock()
activeCPUs, err := s.cpuScaler.ActiveCPUsCount()
if err != nil {
s.logger.Error("could not get active CPUs count", zap.Error(err))
w.WriteHeader(http.StatusInternalServerError)
return
}
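// The response body is the plain-text number of active CPUs expressed in milliCPU;
// e.g. 2 online CPUs are reported as "2000".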
w.WriteHeader(http.StatusOK)
if _, err := w.Write([]byte(fmt.Sprintf("%d", activeCPUs*1000))); err != nil {
s.logger.Error("could not write response", zap.Error(err))
}
}
func (s *cpuServer) handleSetCPUStatus(w http.ResponseWriter, r *http.Request) {
s.cpuOperationsMutex.Lock()
defer s.cpuOperationsMutex.Unlock()
body, err := io.ReadAll(r.Body)
if err != nil {
s.logger.Error("could not read request body", zap.Error(err))
w.WriteHeader(http.StatusBadRequest)
return
}
defer r.Body.Close()
updateInt, err := strconv.Atoi(string(body))
if err != nil {
s.logger.Error("could not unmarshal request body", zap.Error(err))
w.WriteHeader(http.StatusBadRequest)
return
}
s.logger.Debug("Setting CPU status", zap.String("body", string(body)))
update := vmv1.MilliCPU(updateInt)
if err := s.cpuScaler.ReconcileOnlineCPU(int(update.RoundedUp())); err != nil {
s.logger.Error("could not ensure online CPUs", zap.Error(err))
w.WriteHeader(http.StatusInternalServerError)
return
}
w.WriteHeader(http.StatusOK)
}
func (s *cpuServer) handleGetFileChecksum(w http.ResponseWriter, r *http.Request, path string) {
s.fileOperationsMutex.Lock()
defer s.fileOperationsMutex.Unlock()
if err := r.Context().Err(); err != nil {
w.WriteHeader(http.StatusRequestTimeout)
return
}
dir := filepath.Join(path, "..data")
checksum, err := util.ChecksumFlatDir(dir)
if err != nil {
s.logger.Error("could not checksum dir", zap.Error(err))
w.WriteHeader(http.StatusBadRequest)
return
}
w.WriteHeader(http.StatusOK)
if _, err := w.Write([]byte(checksum)); err != nil {
s.logger.Error("could not write response", zap.Error(err))
}
}
type File struct {
// base64 encoded file contents
Data string `json:"data"`
}
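// For illustration, handleUploadFile below expects a JSON body shaped like
//
//	{"example.txt": {"data": "aGVsbG8="}}
//
// where the key is the target file name (hypothetical here) and "data" is the base64
// encoding of its contents ("hello" in this sketch).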
func (s *cpuServer) handleUploadFile(w http.ResponseWriter, r *http.Request, path string) {
s.fileOperationsMutex.Lock()
defer s.fileOperationsMutex.Unlock()
if err := r.Context().Err(); err != nil {
w.WriteHeader(http.StatusRequestTimeout)
return
}
if r.Body == nil {
s.logger.Error("no body")
w.WriteHeader(http.StatusBadRequest)
return
}
defer r.Body.Close()
body, err := io.ReadAll(r.Body)
if err != nil {
s.logger.Error("could not ready body", zap.Error(err))
w.WriteHeader(http.StatusBadRequest)
return
}
var files map[string]File
if err := json.Unmarshal(body, &files); err != nil {
s.logger.Error("could not ready body", zap.Error(err))
w.WriteHeader(http.StatusBadRequest)
return
}
payload := make(map[string]k8sutil.FileProjection)
for k, v := range files {
data, err := base64.StdEncoding.DecodeString(v.Data)
if err != nil {
s.logger.Error("could not ready body", zap.Error(err))
w.WriteHeader(http.StatusBadRequest)
return
}
payload[k] = k8sutil.FileProjection{
Data: data,
// read-write by root
// read-only otherwise
Mode: 0o644,
FsUser: nil,
}
}
aw, err := k8sutil.NewAtomicWriter(path, "neonvm-daemon")
if err != nil {
s.logger.Error("could not create writer", zap.Error(err))
w.WriteHeader(http.StatusBadRequest)
return
}
if err := aw.Write(payload, nil); err != nil {
s.logger.Error("could not create files", zap.Error(err))
w.WriteHeader(http.StatusInternalServerError)
return
}
w.WriteHeader(http.StatusOK)
}
func (s *cpuServer) run(addr string) {
mux := http.NewServeMux()
mux.HandleFunc("/cpu", func(w http.ResponseWriter, r *http.Request) {
if r.Method == http.MethodGet {
s.handleGetCPUStatus(w)
return
} else if r.Method == http.MethodPut {
s.handleSetCPUStatus(w, r)
return
} else {
// unknown method
w.WriteHeader(http.StatusNotFound)
}
})
mux.HandleFunc("/files/{path...}", func(w http.ResponseWriter, r *http.Request) {
path := fmt.Sprintf("/%s", r.PathValue("path"))
if r.Method == http.MethodGet {
s.handleGetFileChecksum(w, r, path)
return
} else if r.Method == http.MethodPut {
s.handleUploadFile(w, r, path)
return
} else {
// unknown method
w.WriteHeader(http.StatusNotFound)
}
})
timeout := 5 * time.Second
server := http.Server{
Addr: addr,
Handler: mux,
ReadTimeout: timeout,
ReadHeaderTimeout: timeout,
WriteTimeout: timeout,
}
err := server.ListenAndServe()
if err != nil {
s.logger.Fatal("CPU server exited with error", zap.Error(err))
}
s.logger.Info("CPU server exited without error")
}
package main
import (
"errors"
"fmt"
"os"
"strings"
"github.com/containerd/cgroups/v3"
"github.com/containerd/cgroups/v3/cgroup1"
"github.com/containerd/cgroups/v3/cgroup2"
"github.com/opencontainers/runtime-spec/specs-go"
"go.uber.org/zap"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
const (
// cgroupPeriod is the period for evaluating cgroup quota
// in microseconds. Min 1000 microseconds, max 1 second
cgroupPeriod = uint64(100000)
cgroupMountPoint = "/sys/fs/cgroup"
// cpuLimitOvercommitFactor sets the amount above the VM's spec.guest.cpus.use that we set the
// QEMU cgroup's CPU limit to. e.g. if cpuLimitOvercommitFactor = 3 and the VM is using 0.5
// CPUs, we set the cgroup to limit QEMU+VM to 1.5 CPUs.
//
// This exists because setting the cgroup exactly equal to the VM's CPU value is overly
// pessimistic, results in a lot of unused capacity on the host, and particularly impacts
// operations that parallelize between the VM and QEMU, like heavy disk access.
//
// See also: https://neondb.slack.com/archives/C03TN5G758R/p1693462680623239
cpuLimitOvercommitFactor = 4
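// As a worked example with the factor of 4 above: a VM using 0.25 CPUs gets a 1 CPU
// cgroup limit, and one using 2 CPUs gets an 8 CPU limit.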
)
// setupQEMUCgroup sets up a cgroup for us to run QEMU in, returning the path of that cgroup
func setupQEMUCgroup(logger *zap.Logger, selfPodName string, initialCPU vmv1.MilliCPU) (string, error) {
selfCgroupPath, err := getSelfCgroupPath(logger)
if err != nil {
return "", fmt.Errorf("Failed to get self cgroup path: %w", err)
}
// Sometimes we'll get just '/' as our cgroup path. If that's the case, we should reset it so
// that the cgroup '/neonvm-qemu-...' still works.
if selfCgroupPath == "/" {
selfCgroupPath = ""
}
// ... but also we should have some uniqueness just in case, so we're not sharing a root level
// cgroup if that *is* what's happening. This *should* only be relevant for local clusters.
//
// We don't want to just use the VM spec's .status.PodName because during migrations that will
// be equal to the source pod, not this one, which may be... somewhat confusing.
cgroupPath := fmt.Sprintf("%s/neonvm-qemu-%s", selfCgroupPath, selfPodName)
logger.Info("Determined QEMU cgroup path", zap.String("path", cgroupPath))
if err := setCgroupLimit(logger, initialCPU, cgroupPath); err != nil {
return "", fmt.Errorf("Failed to set cgroup limit: %w", err)
}
return cgroupPath, nil
}
func getSelfCgroupPath(logger *zap.Logger) (string, error) {
// There's some fun stuff here. For general information, refer to `man 7 cgroups` - specifically
// the section titled "/proc files" - for "/proc/cgroups" and "/proc/pid/cgroup".
//
// In general, the idea is this: If we start QEMU outside of the cgroup for the container we're
// running in, we run into multiple problems - it won't show up in metrics, and we'll have to
// clean up the cgroup ourselves. (not good!).
//
// So we'd like to start it in the same cgroup - the question is just how to find the name of
// the cgroup we're running in. Thankfully, this is visible in `/proc/self/cgroup`!
// The only difficulty is the file format.
//
// In cgroup v1 (which is what we have on EKS [as of 2023-07]), the contents of
// /proc/<pid>/cgroup tend to look like:
//
// 11:cpuset:/path/to/cgroup
// 10:perf_event:/path/to/cgroup
// 9:hugetlb:/path/to/cgroup
// 8:blkio:/path/to/cgroup
// 7:pids:/path/to/cgroup
// 6:freezer:/path/to/cgroup
// 5:memory:/path/to/cgroup
// 4:net_cls,net_prio:/path/to/cgroup
// 3:cpu,cpuacct:/path/to/cgroup
// 2:devices:/path/to/cgroup
// 1:name=systemd:/path/to/cgroup
//
// For cgroup v2, we have:
//
// 0::/path/to/cgroup
//
// The file format is defined to have 3 fields, separated by colons. The first field gives the
// Hierarchy ID, which is guaranteed to be 0 if the cgroup is part of a cgroup v2 ("unified")
// hierarchy.
// The second field is a comma-separated list of the controllers. Or, if it's cgroup v2, nothing.
// The third field is the "pathname" of the cgroup *in its hierarchy*, relative to the mount
// point of the hierarchy.
//
// So we're looking for EITHER:
// 1. an entry like '<N>:<controller...>,cpu,<controller...>:/path/to/cgroup (cgroup v1); OR
// 2. an entry like '0::/path/to/cgroup', and we'll return the path (cgroup v2)
// We primarily care about the 'cpu' controller, so for cgroup v1, we'll search for that instead
// of e.g. "name=systemd", although it *really* shouldn't matter because the paths will be the
// same anyways.
//
// Now: Technically it's possible to run a "hybrid" system with both cgroup v1 and v2
// hierarchies. If this is the case, it's possible for /proc/self/cgroup to show *some* v1
// hierarchies attached, in addition to the v2 "unified" hierarchy, for the same cgroup. To
// handle this, we should look for a cgroup v1 "cpu" controller, and if we can't find it, try
// for the cgroup v2 unified entry.
//
// As far as I (@sharnoff) can tell, the only case where that might actually get messed up is if
// the CPU controller isn't available for the cgroup we're running in, in which case there's
// nothing we can do about it! (other than e.g. using a cgroup higher up the chain, which would
// be really bad tbh).
// ---
// On to the show!
procSelfCgroupContents, err := os.ReadFile("/proc/self/cgroup")
if err != nil {
return "", fmt.Errorf("failed to read /proc/self/cgroup: %w", err)
}
logger.Info("Read /proc/self/cgroup", zap.String("contents", string(procSelfCgroupContents)))
// Collect all candidate paths from the lines of the file. If there isn't exactly one,
// something's wrong and we should make an error.
var v1Candidates []string
var v2Candidates []string
for lineno, line := range strings.Split(string(procSelfCgroupContents), "\n") {
if line == "" {
continue
}
// Split into the three ':'-delimited fields
fields := strings.Split(line, ":")
if len(fields) != 3 {
return "", fmt.Errorf("line %d of /proc/self/cgroup did not have 3 colon-delimited fields", lineno+1)
}
id := fields[0]
controllers := fields[1]
path := fields[2]
if id == "0" {
v2Candidates = append(v2Candidates, path)
continue
}
// It's not cgroup v2, otherwise id would have been 0. So, check if the comma-separated list
// of controllers contains 'cpu' as an entry.
for _, c := range strings.Split(controllers, ",") {
if c == "cpu" {
v1Candidates = append(v1Candidates, path)
break // ... and then continue to the next loop iteration
}
}
}
var errMsg string
// Check v1, then v2
if len(v1Candidates) == 1 {
return v1Candidates[0], nil
} else if len(v1Candidates) != 0 {
errMsg = "More than one applicable cgroup v1 entry in /proc/self/cgroup"
} else if len(v2Candidates) == 1 {
return v2Candidates[0], nil
} else if len(v2Candidates) != 0 {
errMsg = "More than one applicable cgroup v2 entry in /proc/self/cgroup"
} else {
errMsg = "Couldn't find applicable entry in /proc/self/cgroup"
}
return "", errors.New(errMsg)
}
func setCgroupLimit(logger *zap.Logger, r vmv1.MilliCPU, cgroupPath string) error {
r *= cpuLimitOvercommitFactor
isV2 := cgroups.Mode() == cgroups.Unified
period := cgroupPeriod
// quota may be greater than period if the cgroup is allowed
// to use more than 100% of a CPU.
quota := int64(float64(r) / float64(1000) * float64(cgroupPeriod))
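// Worked example: with the 100000µs period above, a post-overcommit value of
// r = 2500 MilliCPU yields quota = 2500/1000 * 100000 = 250000µs per period,
// i.e. 2.5 CPUs' worth of runtime.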
logger.Info(fmt.Sprintf("setting cgroup CPU limit %v %v", quota, period))
if isV2 {
resources := cgroup2.Resources{
CPU: &cgroup2.CPU{
Max: cgroup2.NewCPUMax(&quota, &period),
},
}
_, err := cgroup2.NewManager(cgroupMountPoint, cgroupPath, &resources)
if err != nil {
return err
}
} else {
_, err := cgroup1.New(cgroup1.StaticPath(cgroupPath), &specs.LinuxResources{
CPU: &specs.LinuxCPU{
Quota: &quota,
Period: &period,
},
})
if err != nil {
return err
}
}
return nil
}
package main
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"math"
"os"
"os/exec"
"path/filepath"
"strings"
"github.com/alessio/shellescape"
"github.com/kdomanski/iso9660"
"go.uber.org/zap"
"k8s.io/apimachinery/pkg/api/resource"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
const (
rootDiskPath = "/vm/images/rootdisk.qcow2"
runtimeDiskPath = "/vm/images/runtime.iso"
mountedDiskPath = "/vm/images"
sshAuthorizedKeysDiskPath = "/vm/images/ssh-authorized-keys.iso"
sshAuthorizedKeysMountPoint = "/vm/ssh"
swapName = "swapdisk"
)
// setupVMDisks creates the disks for the VM and returns the appropriate QEMU args
func setupVMDisks(
logger *zap.Logger,
diskCacheSettings string,
enableSSH bool,
swapSize *resource.Quantity,
extraDisks []vmv1.Disk,
) ([]string, error) {
var qemuCmd []string
qemuCmd = append(qemuCmd, "-drive", fmt.Sprintf("id=rootdisk,file=%s,if=virtio,media=disk,index=0,%s", rootDiskPath, diskCacheSettings))
qemuCmd = append(qemuCmd, "-drive", fmt.Sprintf("id=runtime,file=%s,if=virtio,media=cdrom,readonly=on,cache=none", runtimeDiskPath))
if enableSSH {
name := "ssh-authorized-keys"
if err := createISO9660FromPath(logger, name, sshAuthorizedKeysDiskPath, sshAuthorizedKeysMountPoint); err != nil {
return nil, fmt.Errorf("Failed to create ISO9660 image: %w", err)
}
qemuCmd = append(qemuCmd, "-drive", fmt.Sprintf("id=%s,file=%s,if=virtio,media=cdrom,cache=none", name, sshAuthorizedKeysDiskPath))
}
if swapSize != nil {
dPath := fmt.Sprintf("%s/swapdisk.qcow2", mountedDiskPath)
logger.Info("creating QCOW2 image for swap", zap.String("diskPath", dPath))
if err := createSwap(dPath, swapSize); err != nil {
return nil, fmt.Errorf("Failed to create swap disk: %w", err)
}
qemuCmd = append(qemuCmd, "-drive", fmt.Sprintf("id=%s,file=%s,if=virtio,media=disk,%s,discard=unmap", swapName, dPath, diskCacheSettings))
}
for _, disk := range extraDisks {
switch {
case disk.EmptyDisk != nil:
logger.Info("creating QCOW2 image with empty ext4 filesystem", zap.String("diskName", disk.Name))
dPath := fmt.Sprintf("%s/%s.qcow2", mountedDiskPath, disk.Name)
if err := createQCOW2(disk.Name, dPath, &disk.EmptyDisk.Size, nil); err != nil {
return nil, fmt.Errorf("Failed to create QCOW2 image: %w", err)
}
discard := ""
if disk.EmptyDisk.Discard {
discard = ",discard=unmap"
}
qemuCmd = append(qemuCmd, "-drive", fmt.Sprintf("id=%s,file=%s,if=virtio,media=disk,%s%s", disk.Name, dPath, diskCacheSettings, discard))
case disk.ConfigMap != nil || disk.Secret != nil:
dPath := fmt.Sprintf("%s/%s.iso", mountedDiskPath, disk.Name)
mnt := fmt.Sprintf("/vm/mounts%s", disk.MountPath)
logger.Info("creating iso9660 image", zap.String("diskPath", dPath), zap.String("diskName", disk.Name), zap.String("mountPath", mnt))
if err := createISO9660FromPath(logger, disk.Name, dPath, mnt); err != nil {
return nil, fmt.Errorf("Failed to create ISO9660 image: %w", err)
}
qemuCmd = append(qemuCmd, "-drive", fmt.Sprintf("id=%s,file=%s,if=virtio,media=cdrom,cache=none", disk.Name, dPath))
default:
// do nothing
}
}
return qemuCmd, nil
}
func resizeRootDisk(logger *zap.Logger, vmSpec *vmv1.VirtualMachineSpec) error {
// resize the rootDisk image to the specified size, if the new size is larger than the current one
type QemuImgOutputPartial struct {
VirtualSize int64 `json:"virtual-size"`
}
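// For reference, `qemu-img info --output=json` prints something like (abridged):
//
//	{"virtual-size": 2147483648, "filename": "rootdisk.qcow2", "format": "qcow2", ...}
//
// Only "virtual-size" (in bytes) is used here.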
// get current disk size by qemu-img info command
qemuImgOut, err := exec.Command(qemuImgBin, "info", "--output=json", rootDiskPath).Output()
if err != nil {
return fmt.Errorf("could not get root image size: %w", err)
}
var imageSize QemuImgOutputPartial
if err := json.Unmarshal(qemuImgOut, &imageSize); err != nil {
return fmt.Errorf("failed to unmarshal QEMU image size: %w", err)
}
imageSizeQuantity := resource.NewQuantity(imageSize.VirtualSize, resource.BinarySI)
// going to resize
if !vmSpec.Guest.RootDisk.Size.IsZero() {
if vmSpec.Guest.RootDisk.Size.Cmp(*imageSizeQuantity) == 1 {
logger.Info(fmt.Sprintf("resizing rootDisk from %s to %s", imageSizeQuantity.String(), vmSpec.Guest.RootDisk.Size.String()))
if err := execFg(qemuImgBin, "resize", rootDiskPath, fmt.Sprintf("%d", vmSpec.Guest.RootDisk.Size.Value())); err != nil {
return fmt.Errorf("failed to resize rootDisk: %w", err)
}
} else {
logger.Info(fmt.Sprintf("rootDisk.size (%s) is less than than image size (%s)", vmSpec.Guest.RootDisk.Size.String(), imageSizeQuantity.String()))
}
}
return nil
}
func createISO9660runtime(
diskPath string,
command []string,
args []string,
sysctl []string,
env []vmv1.EnvVar,
disks []vmv1.Disk,
enableSSH bool,
swapSize *resource.Quantity,
shmsize *resource.Quantity,
) error {
writer, err := iso9660.NewWriter()
if err != nil {
return err
}
defer writer.Cleanup() //nolint:errcheck // Nothing to do with the error, maybe log it ? TODO
if len(sysctl) != 0 {
err = writer.AddFile(bytes.NewReader([]byte(strings.Join(sysctl, "\n"))), "sysctl.conf")
if err != nil {
return err
}
}
if len(command) != 0 {
err = writer.AddFile(bytes.NewReader([]byte(shellescape.QuoteCommand(command))), "command.sh")
if err != nil {
return err
}
}
if len(args) != 0 {
err = writer.AddFile(bytes.NewReader([]byte(shellescape.QuoteCommand(args))), "args.sh")
if err != nil {
return err
}
}
if len(env) != 0 {
envstring := []string{}
for _, e := range env {
envstring = append(envstring, fmt.Sprintf(`export %s=%s`, e.Name, shellescape.Quote(e.Value)))
}
envstring = append(envstring, "")
err = writer.AddFile(bytes.NewReader([]byte(strings.Join(envstring, "\n"))), "env.sh")
if err != nil {
return err
}
}
mounts := []string{
"set -euo pipefail",
}
if enableSSH {
mounts = append(mounts, "/neonvm/bin/mkdir -p /mnt/ssh")
mounts = append(mounts, "/neonvm/bin/mount -t iso9660 -o ro,mode=0644 $(/neonvm/bin/blkid -L ssh-authorized-keys) /mnt/ssh")
}
if swapSize != nil {
mounts = append(mounts, fmt.Sprintf("/neonvm/bin/sh /neonvm/runtime/resize-swap-internal.sh %d", swapSize.Value()))
}
if len(disks) != 0 {
for _, disk := range disks {
if disk.MountPath != "" {
mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/mkdir -p %s`, disk.MountPath))
}
if disk.Watch != nil && *disk.Watch {
// do nothing as we will mount it into the VM via neonvm-daemon later
continue
}
switch {
case disk.EmptyDisk != nil:
opts := ""
if disk.EmptyDisk.Discard {
opts = "-o discard"
}
if disk.EmptyDisk.EnableQuotas {
mounts = append(mounts, fmt.Sprintf(`tune2fs -Q prjquota $(/neonvm/bin/blkid -L %s)`, disk.Name))
mounts = append(mounts, fmt.Sprintf(`tune2fs -E mount_opts=prjquota $(/neonvm/bin/blkid -L %s)`, disk.Name))
}
mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/mount %s $(/neonvm/bin/blkid -L %s) %s`, opts, disk.Name, disk.MountPath))
// Note: chmod must be after mount, otherwise it gets overwritten by mount.
mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/chmod 0777 %s`, disk.MountPath))
case disk.ConfigMap != nil || disk.Secret != nil:
mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/mount -t iso9660 -o ro,mode=0644 $(/neonvm/bin/blkid -L %s) %s`, disk.Name, disk.MountPath))
case disk.Tmpfs != nil:
mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/chmod 0777 %s`, disk.MountPath))
mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/mount -t tmpfs -o size=%d %s %s`, disk.Tmpfs.Size.Value(), disk.Name, disk.MountPath))
default:
// do nothing
}
}
}
if shmsize != nil {
mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/mount -o remount,size=%d /dev/shm`, shmsize.Value()))
}
mounts = append(mounts, "")
err = writer.AddFile(bytes.NewReader([]byte(strings.Join(mounts, "\n"))), "mounts.sh")
if err != nil {
return err
}
if swapSize != nil {
lines := []string{
`#!/neonvm/bin/sh`,
`set -euo pipefail`,
// this script may be run as root, so we should avoid potentially-malicious path
// injection
`export PATH="/neonvm/bin"`,
fmt.Sprintf(`swapdisk="$(/neonvm/bin/blkid -L %s)"`, swapName),
// disable swap. Allow it to fail if it's already disabled.
`swapoff "$swapdisk" || true`,
// if the requested size is zero, then... just exit. There's nothing we need to do.
`new_size="$1"`,
`if [ "$new_size" = '0' ]; then exit 0; fi`,
// re-make the swap.
// mkswap expects the size to be given in KiB, so divide the new size by 1K
fmt.Sprintf(`mkswap -L %s "$swapdisk" $(( new_size / 1024 ))`, swapName),
// ... and then re-enable the swap
//
// nb: busybox swapon only supports '-d', not its long form '--discard'.
`swapon -d "$swapdisk"`,
}
err = writer.AddFile(bytes.NewReader([]byte(strings.Join(lines, "\n"))), "resize-swap-internal.sh")
if err != nil {
return err
}
}
outputFile, err := os.OpenFile(diskPath, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0o644)
if err != nil {
return err
}
// uid=36(qemu) gid=34(kvm) groups=34(kvm)
err = outputFile.Chown(36, 34)
if err != nil {
return err
}
err = writer.WriteTo(outputFile, "vmruntime")
if err != nil {
return err
}
err = outputFile.Close()
if err != nil {
return err
}
return nil
}
func calcDirUsage(dirPath string) (int64, error) {
stat, err := os.Lstat(dirPath)
if err != nil {
return 0, err
}
size := stat.Size()
if !stat.IsDir() {
return size, nil
}
dir, err := os.Open(dirPath)
if err != nil {
return size, err
}
defer dir.Close()
files, err := dir.Readdir(-1)
if err != nil {
return size, err
}
for _, file := range files {
if file.Name() == "." || file.Name() == ".." {
continue
}
s, err := calcDirUsage(dirPath + "/" + file.Name())
if err != nil {
return size, err
}
size += s
}
return size, nil
}
func createSwap(diskPath string, swapSize *resource.Quantity) error {
tmpRawFile := "swap.raw"
if err := execFg(qemuImgBin, "create", "-q", "-f", "raw", tmpRawFile, fmt.Sprintf("%d", swapSize.Value())); err != nil {
return err
}
if err := execFg("mkswap", "-L", swapName, tmpRawFile); err != nil {
return err
}
if err := execFg(qemuImgBin, "convert", "-q", "-f", "raw", "-O", "qcow2", "-o", "cluster_size=2M,lazy_refcounts=on", tmpRawFile, diskPath); err != nil {
return err
}
if err := execFg("rm", "-f", tmpRawFile); err != nil {
return err
}
// uid=36(qemu) gid=34(kvm) groups=34(kvm)
if err := execFg("chown", "36:34", diskPath); err != nil {
return err
}
return nil
}
func createQCOW2(diskName string, diskPath string, diskSize *resource.Quantity, contentPath *string) error {
ext4blocksMin := int64(64)
ext4blockSize := int64(4096)
ext4blockCount := int64(0)
if diskSize != nil {
ext4blockCount = diskSize.Value() / ext4blockSize
} else if contentPath != nil {
dirSize, err := calcDirUsage(*contentPath)
if err != nil {
return err
}
ext4blockCount = int64(math.Ceil(float64(ext4blocksMin) + float64((dirSize / ext4blockSize))))
} else {
return errors.New("diskSize or contentPath should be specified")
}
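// Worked example: a 1GiB diskSize gives 1073741824 / 4096 = 262144 blocks, while a
// ~10MiB contentPath gives roughly 64 + 10485760/4096 = 2624 blocks.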
mkfsArgs := []string{
"-q", // quiet
"-L", // volume-label
diskName,
}
if contentPath != nil {
// [ -d root-directory|tarball ]
mkfsArgs = append(mkfsArgs, "-d", *contentPath)
}
mkfsArgs = append(
mkfsArgs,
"-b", // block-size
fmt.Sprintf("%d", ext4blockSize),
"ext4.raw", // device
fmt.Sprintf("%d", ext4blockCount), // fs-size
)
if err := execFg("mkfs.ext4", mkfsArgs...); err != nil {
return err
}
if err := execFg(qemuImgBin, "convert", "-q", "-f", "raw", "-O", "qcow2", "-o", "cluster_size=2M,lazy_refcounts=on", "ext4.raw", diskPath); err != nil {
return err
}
if err := execFg("rm", "-f", "ext4.raw"); err != nil {
return err
}
// uid=36(qemu) gid=34(kvm) groups=34(kvm)
if err := execFg("chown", "36:34", diskPath); err != nil {
return err
}
return nil
}
func createISO9660FromPath(logger *zap.Logger, diskName string, diskPath string, contentPath string) error {
writer, err := iso9660.NewWriter()
if err != nil {
return err
}
defer writer.Cleanup() //nolint:errcheck // Nothing to do with the error, maybe log it ? TODO
dir, err := os.Open(contentPath)
if err != nil {
return err
}
dirEntries, err := dir.ReadDir(0)
if err != nil {
return err
}
for _, file := range dirEntries {
fileName := fmt.Sprintf("%s/%s", contentPath, file.Name())
outputPath := file.Name()
if file.IsDir() {
continue
}
// try to resolve symlink and check resolved file IsDir
resolved, err := filepath.EvalSymlinks(fileName)
if err != nil {
return err
}
resolvedOpen, err := os.Open(resolved)
if err != nil {
return err
}
resolvedStat, err := resolvedOpen.Stat()
if err != nil {
return err
}
if resolvedStat.IsDir() {
continue
}
// run the file handling logic in a closure, so the defers happen within the loop body,
// rather than the outer function.
err = func() error {
logger.Info("adding file to ISO9660 disk", zap.String("path", outputPath))
fileToAdd, err := os.Open(fileName)
if err != nil {
return err
}
defer fileToAdd.Close()
return writer.AddFile(fileToAdd, outputPath)
}()
if err != nil {
return err
}
}
outputFile, err := os.OpenFile(diskPath, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0o644)
if err != nil {
return err
}
// uid=36(qemu) gid=34(kvm) groups=34(kvm)
err = outputFile.Chown(36, 34)
if err != nil {
return err
}
err = writer.WriteTo(outputFile, diskName)
if err != nil {
return err
}
err = outputFile.Close()
if err != nil {
return err
}
return nil
}
package main
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.uber.org/zap"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
)
type cpuServerCallbacks struct {
get func(*zap.Logger) (*vmv1.MilliCPU, error)
set func(*zap.Logger, vmv1.MilliCPU) error
ready func(*zap.Logger) bool
}
func listenForHTTPRequests(
ctx context.Context,
logger *zap.Logger,
port int32,
callbacks cpuServerCallbacks,
wg *sync.WaitGroup,
networkMonitoring bool,
) {
defer wg.Done()
mux := http.NewServeMux()
loggerHandlers := logger.Named("http-handlers")
cpuChangeLogger := loggerHandlers.Named("cpu_change")
mux.HandleFunc("/cpu_change", func(w http.ResponseWriter, r *http.Request) {
handleCPUChange(cpuChangeLogger, w, r, callbacks.set)
})
cpuCurrentLogger := loggerHandlers.Named("cpu_current")
mux.HandleFunc("/cpu_current", func(w http.ResponseWriter, r *http.Request) {
handleCPUCurrent(cpuCurrentLogger, w, r, callbacks.get)
})
mux.HandleFunc("/ready", func(w http.ResponseWriter, r *http.Request) {
if callbacks.ready(logger) {
w.WriteHeader(200)
} else {
w.WriteHeader(500)
}
})
if networkMonitoring {
reg := prometheus.NewRegistry()
metrics := NewMonitoringMetrics(reg)
mux.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
metrics.update(logger)
h := promhttp.HandlerFor(reg, promhttp.HandlerOpts{Registry: reg})
h.ServeHTTP(w, r)
})
}
server := http.Server{
Addr: fmt.Sprintf("0.0.0.0:%d", port),
Handler: mux,
ReadTimeout: 5 * time.Second,
ReadHeaderTimeout: 5 * time.Second,
WriteTimeout: 5 * time.Second,
}
errChan := make(chan error)
go func() {
errChan <- server.ListenAndServe()
}()
select {
case err := <-errChan:
if errors.Is(err, http.ErrServerClosed) {
logger.Info("http server closed")
} else if err != nil {
logger.Fatal("http server exited with error", zap.Error(err))
}
case <-ctx.Done():
err := server.Shutdown(context.Background())
logger.Info("shut down http server", zap.Error(err))
}
}
func handleCPUChange(
logger *zap.Logger,
w http.ResponseWriter,
r *http.Request,
set func(*zap.Logger, vmv1.MilliCPU) error,
) {
if r.Method != "POST" {
logger.Error("unexpected method", zap.String("method", r.Method))
w.WriteHeader(400)
return
}
body, err := io.ReadAll(r.Body)
if err != nil {
logger.Error("could not read body", zap.Error(err))
w.WriteHeader(400)
return
}
var parsed api.VCPUChange
if err = json.Unmarshal(body, &parsed); err != nil {
logger.Error("could not parse body", zap.Error(err))
w.WriteHeader(400)
return
}
// update cgroup
logger.Info("got CPU update", zap.Float64("CPU", parsed.VCPUs.AsFloat64()))
err = set(logger, parsed.VCPUs)
if err != nil {
logger.Error("could not set cgroup limit", zap.Error(err))
w.WriteHeader(500)
return
}
w.WriteHeader(200)
}
func handleCPUCurrent(
logger *zap.Logger,
w http.ResponseWriter,
r *http.Request,
get func(*zap.Logger) (*vmv1.MilliCPU, error),
) {
if r.Method != "GET" {
logger.Error("unexpected method", zap.String("method", r.Method))
w.WriteHeader(400)
return
}
cpus, err := get(logger)
if err != nil {
logger.Error("could not get cgroup quota", zap.Error(err))
w.WriteHeader(500)
return
}
resp := api.VCPUCgroup{VCPUs: *cpus}
body, err := json.Marshal(resp)
if err != nil {
logger.Error("could not marshal body", zap.Error(err))
w.WriteHeader(500)
return
}
w.Header().Add("Content-Type", "application/json")
w.Write(body) //nolint:errcheck // Not much to do with the error here. TODO: log it?
}
package main
import (
"bufio"
"bytes"
"context"
"encoding/base64"
"encoding/json"
"errors"
"flag"
"fmt"
"io"
"net"
"net/http"
"os"
"os/exec"
"os/signal"
"runtime"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/digitalocean/go-qemu/qmp"
"github.com/jpillora/backoff"
"github.com/samber/lo"
"go.uber.org/zap"
"k8s.io/apimachinery/pkg/api/resource"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/taskgroup"
)
const (
qemuBinArm64 = "qemu-system-aarch64"
qemuBinX8664 = "qemu-system-x86_64"
qemuImgBin = "qemu-img"
architectureArm64 = "arm64"
architectureAmd64 = "amd64"
defaultKernelPath = "/vm/kernel/vmlinuz"
qmpUnixSocketForSigtermHandler = "/vm/qmp-sigterm.sock"
logSerialSocket = "/vm/log.sock"
bufferedReaderSize = 4096
)
func checkKVM() bool {
info, err := os.Stat("/dev/kvm")
if err != nil {
return false
}
mode := info.Mode()
return mode&os.ModeCharDevice == os.ModeCharDevice
}
func checkDevTun() bool {
info, err := os.Stat("/dev/net/tun")
if err != nil {
return false
}
mode := info.Mode()
return mode&os.ModeCharDevice == os.ModeCharDevice
}
func runInitScript(logger *zap.Logger, script string) error {
if len(script) == 0 {
return nil
}
// creates a tmp file with the script content
tmpFile, err := os.CreateTemp(os.TempDir(), "init-script-")
if err != nil {
return err
}
defer os.Remove(tmpFile.Name()) // clean up
if _, err := tmpFile.Write([]byte(script)); err != nil {
return err
}
if err := tmpFile.Close(); err != nil {
return err
}
logger.Info("running init script", zap.String("path", tmpFile.Name()))
if err := execFg("/bin/sh", tmpFile.Name()); err != nil {
return err
}
return nil
}
type Config struct {
vmSpecDump string
vmStatusDump string
kernelPath string
appendKernelCmdline string
skipCgroupManagement bool
diskCacheSettings string
// autoMovableRatio value for VirtioMem provider. Validated in newConfig.
autoMovableRatio string
// cpuScalingMode is a mode to use for CPU scaling. Validated in newConfig.
cpuScalingMode vmv1.CpuScalingMode
// System CPU architecture. Set automatically equal to runtime.GOARCH.
architecture string
}
func newConfig(logger *zap.Logger) *Config {
cfg := &Config{
vmSpecDump: "",
vmStatusDump: "",
kernelPath: defaultKernelPath,
appendKernelCmdline: "",
skipCgroupManagement: false,
diskCacheSettings: "cache=none",
autoMovableRatio: "",
cpuScalingMode: "",
architecture: runtime.GOARCH,
}
flag.StringVar(&cfg.vmSpecDump, "vmspec", cfg.vmSpecDump,
"Base64 encoded VirtualMachine json specification")
flag.StringVar(&cfg.vmStatusDump, "vmstatus", cfg.vmStatusDump,
"Base64 encoded VirtualMachine json status")
flag.StringVar(&cfg.kernelPath, "kernelpath", cfg.kernelPath,
"Override path for kernel to use")
flag.StringVar(&cfg.appendKernelCmdline, "appendKernelCmdline",
cfg.appendKernelCmdline, "Additional kernel command line arguments")
flag.BoolVar(&cfg.skipCgroupManagement, "skip-cgroup-management",
cfg.skipCgroupManagement,
"Don't try to manage CPU")
flag.StringVar(&cfg.diskCacheSettings, "qemu-disk-cache-settings",
cfg.diskCacheSettings, "Cache settings to add to -drive args for VM disks")
flag.StringVar(&cfg.autoMovableRatio, "memhp-auto-movable-ratio",
cfg.autoMovableRatio, "Set value of kernel's memory_hotplug.auto_movable_ratio [virtio-mem only]")
flag.Func("cpu-scaling-mode", "Set CPU scaling mode", cfg.cpuScalingMode.FlagFunc)
flag.Parse()
if cfg.autoMovableRatio == "" {
logger.Fatal("missing required flag '-memhp-auto-movable-ratio'")
}
if cfg.cpuScalingMode == "" {
logger.Fatal("missing required flag '-cpu-scaling-mode'")
}
return cfg
}
func main() {
logger := zap.Must(zap.NewProduction()).Named("neonvm-runner")
if err := run(logger); err != nil {
logger.Fatal("Failed to run", zap.Error(err))
}
}
func run(logger *zap.Logger) error {
cfg := newConfig(logger)
vmSpecJson, err := base64.StdEncoding.DecodeString(cfg.vmSpecDump)
if err != nil {
return fmt.Errorf("failed to decode VirtualMachine Spec dump: %w", err)
}
vmStatusJson, err := base64.StdEncoding.DecodeString(cfg.vmStatusDump)
if err != nil {
return fmt.Errorf("failed to decode VirtualMachine Status dump: %w", err)
}
vmSpec := &vmv1.VirtualMachineSpec{}
if err := json.Unmarshal(vmSpecJson, vmSpec); err != nil {
return fmt.Errorf("failed to unmarshal VM spec: %w", err)
}
var vmStatus vmv1.VirtualMachineStatus
if err := json.Unmarshal(vmStatusJson, &vmStatus); err != nil {
return fmt.Errorf("failed to unmarshal VM Status: %w", err)
}
enableSSH := false
if vmSpec.EnableSSH != nil && *vmSpec.EnableSSH {
enableSSH = true
}
// Set hostname, with "vm-" prefix to distinguish it from the pod name
//
// This is just to reduce the risk of mixing things up when ssh'ing to different
// computes; the hostname isn't used for anything as such.
hostname, err := os.Hostname()
if err != nil {
logger.Warn("could not read pod's hostname", zap.Error(err))
} else {
hostname = fmt.Sprintf("vm-%s", hostname)
}
sysctl := []string{
"kernel.core_pattern=core",
"kernel.core_uses_pid=1",
}
var shmSize *resource.Quantity
var swapSize *resource.Quantity
if vmSpec.Guest.Settings != nil {
sysctl = append(sysctl, vmSpec.Guest.Settings.Sysctl...)
swapSize = vmSpec.Guest.Settings.Swap
// By default, Linux sets the size of /dev/shm to 1/2 of the physical memory. If
// swap is configured, we want to set /dev/shm higher, because we can autoscale
// the memory up.
//
// See https://github.com/neondatabase/autoscaling/issues/800
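// For example (illustrative numbers): with 2 memory slots of 1Gi at minimum, initial
// memory is 2Gi and the default /dev/shm would be 1Gi; a 4Gi swap then bumps /dev/shm
// up to 4Gi.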
initialMemorySize := vmSpec.Guest.MemorySlotSize.Value() * int64(vmSpec.Guest.MemorySlots.Min)
if swapSize != nil && swapSize.Value() > initialMemorySize/2 {
shmSize = swapSize
}
}
tg := taskgroup.NewGroup(logger)
tg.Go("init-script", func(logger *zap.Logger) error {
return runInitScript(logger, vmSpec.InitScript)
})
// create iso9660 disk with runtime options (command, args, envs, mounts)
tg.Go("iso9660-runtime", func(logger *zap.Logger) error {
disks := vmSpec.Disks
// add the tls path.
// this is needed just to `mkdir` the mount directory.
if vmSpec.TLS != nil {
disks = append(disks, vmv1.Disk{
Name: "tls-keys",
MountPath: vmSpec.TLS.MountPath,
Watch: lo.ToPtr(true),
ReadOnly: nil,
DiskSource: vmv1.DiskSource{
EmptyDisk: nil,
ConfigMap: nil,
Secret: nil,
Tmpfs: nil,
},
})
}
return createISO9660runtime(
runtimeDiskPath,
vmSpec.Guest.Command,
vmSpec.Guest.Args,
sysctl,
vmSpec.Guest.Env,
disks,
enableSSH,
swapSize,
shmSize,
)
})
tg.Go("rootDisk", func(logger *zap.Logger) error {
// resize the rootDisk image to the specified size, if the new size is larger than the current one
return resizeRootDisk(logger, vmSpec)
})
var qemuCmd []string
tg.Go("qemu-cmd", func(logger *zap.Logger) error {
var err error
qemuCmd, err = buildQEMUCmd(cfg, logger, vmSpec, &vmStatus, enableSSH, swapSize, hostname)
return err
})
if err := tg.Wait(); err != nil {
return err
}
err = runQEMU(cfg, logger, vmSpec, qemuCmd)
if err != nil {
return fmt.Errorf("failed to run QEMU: %w", err)
}
return nil
}
func buildQEMUCmd(
cfg *Config,
logger *zap.Logger,
vmSpec *vmv1.VirtualMachineSpec,
vmStatus *vmv1.VirtualMachineStatus,
enableSSH bool,
swapSize *resource.Quantity,
hostname string,
) ([]string, error) {
// prepare qemu command line
qemuCmd := []string{
"-runas", "qemu",
"-machine", getMachineType(cfg.architecture),
"-nographic",
"-no-reboot",
"-nodefaults",
"-only-migratable",
"-audiodev", "none,id=noaudio",
"-serial", "pty",
"-msg", "timestamp=on",
"-qmp", fmt.Sprintf("tcp:0.0.0.0:%d,server,wait=off", vmSpec.QMP),
"-qmp", fmt.Sprintf("tcp:0.0.0.0:%d,server,wait=off", vmSpec.QMPManual),
"-qmp", fmt.Sprintf("unix:%s,server,wait=off", qmpUnixSocketForSigtermHandler),
"-device", "virtio-serial",
"-chardev", fmt.Sprintf("socket,path=%s,server=on,wait=off,id=log", logSerialSocket),
"-device", "virtserialport,chardev=log,name=tech.neon.log.0",
}
qemuDiskArgs, err := setupVMDisks(logger, cfg.diskCacheSettings, enableSSH, swapSize, vmSpec.Disks)
if err != nil {
return nil, err
}
qemuCmd = append(qemuCmd, qemuDiskArgs...)
switch cfg.architecture {
case architectureArm64:
// add custom firmware to have ACPI working
qemuCmd = append(qemuCmd, "-bios", "/vm/QEMU_EFI_ARM.fd")
// arm virt has only one UART, setup virtio-serial to add more /dev/hvcX
qemuCmd = append(qemuCmd,
"-chardev", "stdio,id=virtio-console",
"-device", "virtconsole,chardev=virtio-console",
)
case architectureAmd64:
// on amd we have multiple UART ports so we can just use serial stdio
qemuCmd = append(qemuCmd, "-serial", "stdio")
default:
logger.Fatal("unsupported architecture", zap.String("architecture", cfg.architecture))
}
// cpu details
// NB: EnableAcceleration guaranteed non-nil because the k8s API server sets the default for us.
if *vmSpec.EnableAcceleration && checkKVM() {
logger.Info("using KVM acceleration")
qemuCmd = append(qemuCmd, "-enable-kvm")
} else {
logger.Warn("not using KVM acceleration")
}
qemuCmd = append(qemuCmd, "-cpu", "max")
// cpu scaling details
maxCPUs := vmSpec.Guest.CPUs.Max.RoundedUp()
minCPUs := vmSpec.Guest.CPUs.Min.RoundedUp()
switch cfg.cpuScalingMode {
case vmv1.CpuScalingModeSysfs:
// Boot with all CPUs plugged; we will online them on demand
qemuCmd = append(qemuCmd, "-smp", fmt.Sprintf(
"cpus=%d,maxcpus=%d,sockets=1,cores=%d,threads=1",
maxCPUs,
maxCPUs,
maxCPUs,
))
case vmv1.CpuScalingModeQMP:
// Boot with minCPUs hotplugged, but with slots reserved for maxCPUs.
qemuCmd = append(qemuCmd, "-smp", fmt.Sprintf(
"cpus=%d,maxcpus=%d,sockets=1,cores=%d,threads=1",
minCPUs,
maxCPUs,
maxCPUs,
))
default:
// we should never get here because we validate the flag in newConfig
panic(fmt.Errorf("unknown CPU scaling mode %s", cfg.cpuScalingMode))
}
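// For example, with CPUs min=1 and max=4: sysfs mode yields
// "-smp cpus=4,maxcpus=4,sockets=1,cores=4,threads=1" (all CPUs plugged, onlining is
// handled in the guest), while QMP mode yields
// "-smp cpus=1,maxcpus=4,sockets=1,cores=4,threads=1".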
// memory details
qemuCmd = append(qemuCmd, "-m", fmt.Sprintf(
"size=%db,slots=%d,maxmem=%db",
vmSpec.Guest.MemorySlotSize.Value()*int64(vmSpec.Guest.MemorySlots.Min),
vmSpec.Guest.MemorySlots.Max-vmSpec.Guest.MemorySlots.Min,
vmSpec.Guest.MemorySlotSize.Value()*int64(vmSpec.Guest.MemorySlots.Max),
))
// we don't actually have any slots because it's virtio-mem, but we're still using the API
// designed around DIMM slots, so we need to use them to calculate how much memory we expect
// to be able to plug in.
numSlots := vmSpec.Guest.MemorySlots.Max - vmSpec.Guest.MemorySlots.Min
virtioMemSize := int64(numSlots) * vmSpec.Guest.MemorySlotSize.Value()
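// For example, with a 1Gi slot size, MemorySlots.Min=4 and Max=16: the VM boots with
// size=4Gi, slots=12, maxmem=16Gi, and virtioMemSize is the remaining 12Gi available
// for hot-plug via virtio-mem.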
// We can add virtio-mem if it actually needs to be a non-zero size.
// Otherwise, QEMU fails with:
// property 'size' of memory-backend-ram doesn't take value '0'
if virtioMemSize != 0 {
qemuCmd = append(qemuCmd, "-object", fmt.Sprintf("memory-backend-ram,id=vmem0,size=%db", virtioMemSize))
qemuCmd = append(qemuCmd, "-device", "virtio-mem-pci,id=vm0,memdev=vmem0,block-size=8M,requested-size=0")
}
qemuNetArgs, err := setupVMNetworks(logger, vmSpec.Guest.Ports, vmSpec.ExtraNetwork)
if err != nil {
return nil, err
}
qemuCmd = append(qemuCmd, qemuNetArgs...)
// kernel details
qemuCmd = append(
qemuCmd,
"-kernel", cfg.kernelPath,
"-append", makeKernelCmdline(cfg, logger, vmSpec, vmStatus, hostname),
)
// should the runner receive a migration?
if os.Getenv("RECEIVE_MIGRATION") == "true" {
qemuCmd = append(qemuCmd, "-incoming", fmt.Sprintf("tcp:0:%d", vmv1.MigrationPort))
}
return qemuCmd, nil
}
const (
// this loglevel is used only during startup; it is later overridden during vminit
baseKernelCmdline = "panic=-1 init=/neonvm/bin/init loglevel=6 root=/dev/vda rw"
kernelCmdlineVirtioMemTmpl = "memhp_default_state=online memory_hotplug.online_policy=auto-movable memory_hotplug.auto_movable_ratio=%s"
)
func makeKernelCmdline(cfg *Config, logger *zap.Logger, vmSpec *vmv1.VirtualMachineSpec, vmStatus *vmv1.VirtualMachineStatus, hostname string) string {
cmdlineParts := []string{baseKernelCmdline}
cmdlineParts = append(cmdlineParts, fmt.Sprintf(kernelCmdlineVirtioMemTmpl, cfg.autoMovableRatio))
if vmSpec.ExtraNetwork != nil && vmSpec.ExtraNetwork.Enable {
netDetails := fmt.Sprintf("ip=%s:::%s:%s:eth1:off", vmStatus.ExtraNetIP, vmStatus.ExtraNetMask, vmStatus.PodName)
cmdlineParts = append(cmdlineParts, netDetails)
}
if len(hostname) != 0 {
cmdlineParts = append(cmdlineParts, fmt.Sprintf("hostname=%s", hostname))
}
if cfg.appendKernelCmdline != "" {
cmdlineParts = append(cmdlineParts, cfg.appendKernelCmdline)
}
if cfg.cpuScalingMode == vmv1.CpuScalingModeSysfs {
// Limit the number of online CPUs kernel boots with. More CPUs will be enabled on upscaling
cmdlineParts = append(cmdlineParts, fmt.Sprintf("maxcpus=%d", vmSpec.Guest.CPUs.Min.RoundedUp()))
}
switch cfg.architecture {
case architectureArm64:
// explicitly enable acpi if we run on arm
cmdlineParts = append(cmdlineParts, "acpi=on")
// use virtio-serial device kernel console
cmdlineParts = append(cmdlineParts, "console=hvc0")
case architectureAmd64:
cmdlineParts = append(cmdlineParts, "console=ttyS1")
default:
logger.Fatal("unsupported architecture", zap.String("architecture", cfg.architecture))
}
return strings.Join(cmdlineParts, " ")
}
func runQEMU(
cfg *Config,
logger *zap.Logger,
vmSpec *vmv1.VirtualMachineSpec,
qemuCmd []string,
) error {
selfPodName, ok := os.LookupEnv("K8S_POD_NAME")
if !ok {
return fmt.Errorf("environment variable K8S_POD_NAME missing")
}
var cgroupPath string
if !cfg.skipCgroupManagement {
var err error
cgroupPath, err = setupQEMUCgroup(logger, selfPodName, vmSpec.Guest.CPUs.Use)
if err != nil {
return err
}
}
ctx, cancel := context.WithCancel(context.Background())
wg := sync.WaitGroup{}
wg.Add(1)
go terminateQemuOnSigterm(ctx, logger, &wg)
var callbacks cpuServerCallbacks
// lastValue is used to store the last fractional CPU request.
// We need to store the value as-is because we can't convert it back from MilliCPU,
// and otherwise we would end up in an infinite reconciliation loop.
// This will eventually be dropped in favor of real fractional CPU scaling based on cgroups.
lastValue := &atomic.Uint32{}
lastValue.Store(uint32(vmSpec.Guest.CPUs.Min))
callbacks = cpuServerCallbacks{
get: func(logger *zap.Logger) (*vmv1.MilliCPU, error) {
return lo.ToPtr(vmv1.MilliCPU(lastValue.Load())), nil
},
set: func(logger *zap.Logger, cpu vmv1.MilliCPU) error {
if cfg.cpuScalingMode == vmv1.CpuScalingModeSysfs {
err := setNeonvmDaemonCPU(cpu)
if err != nil {
logger.Error("setting CPU through NeonVM Daemon failed", zap.Any("cpu", cpu), zap.Error(err))
return err
}
}
lastValue.Store(uint32(cpu))
return nil
},
ready: func(logger *zap.Logger) bool {
switch cfg.cpuScalingMode {
case vmv1.CpuScalingModeSysfs:
// check if the NeonVM Daemon is ready to accept requests
err := checkNeonvmDaemonCPU()
if err != nil {
logger.Warn("neonvm-daemon ready probe failed", zap.Error(err))
return false
}
return true
case vmv1.CpuScalingModeQMP:
// no readiness check for QMP mode
return true
default:
// explicit panic for an unknown CPU scaling mode,
// in case we add a new mode and forget to update this function
panic(fmt.Errorf("unknown CPU scaling mode %s", cfg.cpuScalingMode))
}
},
}
wg.Add(1)
monitoring := vmSpec.EnableNetworkMonitoring != nil && *vmSpec.EnableNetworkMonitoring
go listenForHTTPRequests(ctx, logger, vmSpec.RunnerPort, callbacks, &wg, monitoring)
wg.Add(1)
go forwardLogs(ctx, logger, &wg)
wg.Add(1)
go monitorFiles(ctx, logger, &wg, vmSpec)
qemuBin := getQemuBinaryName(cfg.architecture)
var bin string
var cmd []string
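// If cgroup management is enabled, wrap QEMU in cgexec so that the process starts inside the
// CPU cgroup created above.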
if !cfg.skipCgroupManagement {
bin = "cgexec"
cmd = append([]string{"-g", fmt.Sprintf("cpu:%s", cgroupPath), qemuBin}, qemuCmd...)
} else {
bin = qemuBin
cmd = qemuCmd
}
logger.Info(fmt.Sprintf("calling %s", bin), zap.Strings("args", cmd))
err := execFg(bin, cmd...)
if err != nil {
msg := "QEMU exited with error" // TODO: technically this might not be accurate. This can also happen if it fails to start.
logger.Error(msg, zap.Error(err))
err = fmt.Errorf("%s: %w", msg, err)
} else {
logger.Info("QEMU exited without error")
}
cancel()
wg.Wait()
return err
}
func getQemuBinaryName(architecture string) string {
switch architecture {
case architectureArm64:
return qemuBinArm64
case architectureAmd64:
return qemuBinX8664
default:
panic(fmt.Errorf("unknown architecture %s", architecture))
}
}
func getMachineType(architecture string) string {
switch architecture {
case architectureArm64:
// virt is the most up to date and generic ARM machine architecture
return "virt"
case architectureAmd64:
// q35 is the most up to date and generic x86_64 machine architecture
return "q35"
default:
panic(fmt.Errorf("unknown architecture %s", architecture))
}
}
func printWithNewline(slice []byte) error {
if len(slice) == 0 {
return nil
}
_, err := os.Stdout.Write(slice)
if err != nil {
return err
}
if slice[len(slice)-1] == '\n' {
return nil
}
_, err = os.Stdout.WriteString("\n")
return err
}
func drainLogsReader(reader *bufio.Reader, logger *zap.Logger) error {
for {
// ReadSlice actually can return no more than bufferedReaderSize bytes
slice, err := reader.ReadSlice('\n')
// If err != nil, slice might not have \n at the end
err2 := printWithNewline(slice)
err = errors.Join(err, err2)
if err != nil {
if errors.Is(err, os.ErrDeadlineExceeded) {
return nil
}
if errors.Is(err, io.EOF) {
logger.Warn("EOF while reading from log serial")
} else {
logger.Error("failed to read from log serial", zap.Error(err))
}
return err
}
}
}
// forwardLogs writes from socket to stdout line by line
func forwardLogs(ctx context.Context, logger *zap.Logger, wg *sync.WaitGroup) {
defer wg.Done()
delay := 3 * time.Second
var conn net.Conn
var reader *bufio.Reader
b := &backoff.Backoff{
Min: 100 * time.Millisecond,
Max: delay,
Factor: 2,
Jitter: true,
}
// Wait a bit to reduce the chance we attempt dialing before
// QEMU is started
select {
case <-time.After(200 * time.Millisecond):
case <-ctx.Done():
logger.Warn("QEMU shut down too soon to start forwarding logs")
}
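// Reconnect loop: each iteration (re)dials the log socket if necessary, reads lines until the
// read deadline expires, and then waits for the backoff duration (or context cancellation)
// before trying again.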
for {
func() {
if conn == nil {
var err error
conn, err = net.Dial("unix", logSerialSocket)
if err != nil {
logger.Error("failed to dial to logSerialSocket", zap.Error(err))
return
}
reader = bufio.NewReaderSize(conn, bufferedReaderSize)
}
b.Attempt()
err := conn.SetReadDeadline(time.Now().Add(delay))
if err != nil {
logger.Error("failed to set read deadline", zap.Error(err))
conn = nil
return
}
err = drainLogsReader(reader, logger)
if errors.Is(err, os.ErrDeadlineExceeded) {
// We've hit the deadline, meaning the reading session was successful.
b.Reset()
return
}
if err != nil {
conn = nil
}
}()
select {
case <-ctx.Done():
if conn != nil {
conn.Close()
}
if reader != nil {
_ = drainLogsReader(reader, logger)
}
return
case <-time.After(b.Duration()):
}
}
}
// monitorFiles watches a specific set of files and copies them into the guest VM via neonvm-daemon.
func monitorFiles(ctx context.Context, logger *zap.Logger, wg *sync.WaitGroup, vmSpec *vmv1.VirtualMachineSpec) {
defer wg.Done()
secrets := make(map[string]string)
secretsOrd := []string{}
for _, disk := range vmSpec.Disks {
if disk.Watch != nil && *disk.Watch {
// secrets/configmaps are mounted using the atomicwriter utility,
// which loads the directory into `..data`.
dataDir := fmt.Sprintf("/vm/mounts%s/..data", disk.MountPath)
secrets[dataDir] = disk.MountPath
secretsOrd = append(secretsOrd, dataDir)
}
}
if vmSpec.TLS != nil {
dataDir := fmt.Sprintf("/vm/mounts%s/..data", vmSpec.TLS.MountPath)
secrets[dataDir] = vmSpec.TLS.MountPath
secretsOrd = append(secretsOrd, dataDir)
}
if len(secretsOrd) == 0 {
return
}
// Faster loop for the initial upload.
// The VM might need the secrets in order for postgres to actually start up,
// so it's important we sync them as soon as the daemon is available.
for {
success := true
for _, hostpath := range secretsOrd {
guestpath := secrets[hostpath]
if err := sendFilesToNeonvmDaemon(ctx, hostpath, guestpath); err != nil {
success = false
logger.Error("failed to upload file to vm guest", zap.Error(err))
}
}
if success {
break
}
select {
case <-time.After(1 * time.Second):
continue
case <-ctx.Done():
return
}
}
// For the entire duration the VM is alive, periodically check whether any of the watched disks
// still match what's inside the VM, and if not, send the update.
ticker := time.NewTicker(30 * time.Second)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
// for each secret we are tracking
for hostpath, guestpath := range secrets {
// get the checksum for the pod directory
hostsum, err := util.ChecksumFlatDir(hostpath)
if err != nil {
logger.Error("failed to get dir checksum from host", zap.Error(err), zap.String("dir", hostpath))
continue
}
// get the checksum for the VM directory
guestsum, err := getFileChecksumFromNeonvmDaemon(ctx, guestpath)
if err != nil {
logger.Error("failed to get dir checksum from guest", zap.Error(err), zap.String("dir", guestpath))
continue
}
// if not equal, update the files inside the VM.
if guestsum != hostsum {
if err = sendFilesToNeonvmDaemon(ctx, hostpath, guestpath); err != nil {
logger.Error("failed to upload files to vm guest", zap.Error(err))
}
}
}
}
}
}
func terminateQemuOnSigterm(ctx context.Context, logger *zap.Logger, wg *sync.WaitGroup) {
logger = logger.Named("terminate-qemu-on-sigterm")
defer wg.Done()
logger.Info("watching OS signals")
c := make(chan os.Signal, 1) // we need a buffer of size 1 so the notifier is not blocked
signal.Notify(c, os.Interrupt, syscall.SIGTERM)
select {
case <-c:
case <-ctx.Done():
logger.Info("context canceled, not going to powerdown QEMU because it's already finished")
return
}
logger.Info("got signal, sending powerdown command to QEMU")
mon, err := qmp.NewSocketMonitor("unix", qmpUnixSocketForSigtermHandler, 2*time.Second)
if err != nil {
logger.Error("failed to connect to QEMU monitor", zap.Error(err))
return
}
if err := mon.Connect(); err != nil {
logger.Error("failed to start monitor connection", zap.Error(err))
return
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
qmpcmd := []byte(`{"execute": "system_powerdown"}`)
_, err = mon.Run(qmpcmd)
if err != nil {
logger.Error("failed to execute system_powerdown command", zap.Error(err))
return
}
logger.Info("system_powerdown command sent to QEMU")
}
//lint:ignore U1000 the function is not in use right now, but it's good to have for the future
func execBg(name string, arg ...string) error {
cmd := exec.Command(name, arg...)
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
if err := cmd.Start(); err != nil {
return err
}
return nil
}
func execFg(name string, arg ...string) error {
cmd := exec.Command(name, arg...)
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
if err := cmd.Run(); err != nil {
return err
}
return nil
}
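// setNeonvmDaemonCPU sends the requested CPU amount (in milli-CPUs) to the neonvm-daemon running
// inside the guest, which applies it via the CPU sysfs state interface.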
func setNeonvmDaemonCPU(cpu vmv1.MilliCPU) error {
_, vmIP, _, err := calcIPs(defaultNetworkCIDR)
if err != nil {
return fmt.Errorf("could not calculate VM IP address: %w", err)
}
ctx, cancel := context.WithTimeout(context.TODO(), time.Second)
defer cancel()
url := fmt.Sprintf("http://%s:25183/cpu", vmIP)
body := bytes.NewReader([]byte(fmt.Sprintf("%d", uint32(cpu))))
req, err := http.NewRequestWithContext(ctx, http.MethodPut, url, body)
if err != nil {
return fmt.Errorf("could not build request: %w", err)
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
return fmt.Errorf("could not send request: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
return fmt.Errorf("neonvm-daemon responded with status %d", resp.StatusCode)
}
return nil
}
// checkNeonvmDaemonCPU sends a GET request to the NeonVM Daemon to get the current CPU limit for the sake of readiness probe.
func checkNeonvmDaemonCPU() error {
_, vmIP, _, err := calcIPs(defaultNetworkCIDR)
if err != nil {
return fmt.Errorf("could not calculate VM IP address: %w", err)
}
ctx, cancel := context.WithTimeout(context.TODO(), time.Second)
defer cancel()
url := fmt.Sprintf("http://%s:25183/cpu", vmIP)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return fmt.Errorf("could not build request: %w", err)
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
return fmt.Errorf("could not send request: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
return fmt.Errorf("neonvm-daemon responded with status %d", resp.StatusCode)
}
return nil
}
type File struct {
// base64 encoded file contents
Data string `json:"data"`
}
func sendFilesToNeonvmDaemon(ctx context.Context, hostpath, guestpath string) error {
_, vmIP, _, err := calcIPs(defaultNetworkCIDR)
if err != nil {
return fmt.Errorf("could not calculate VM IP address: %w", err)
}
files, err := util.ReadAllFiles(hostpath)
if err != nil && !os.IsNotExist(err) {
return fmt.Errorf("could not open file: %w", err)
}
encodedFiles := make(map[string]File)
for k, v := range files {
encodedFiles[k] = File{Data: base64.StdEncoding.EncodeToString(v)}
}
body, err := json.Marshal(encodedFiles)
if err != nil {
return fmt.Errorf("could not encode files: %w", err)
}
ctx, cancel := context.WithTimeout(ctx, time.Second)
defer cancel()
// guestpath has a leading forward slash
url := fmt.Sprintf("http://%s:25183/files%s", vmIP, guestpath)
req, err := http.NewRequestWithContext(ctx, http.MethodPut, url, bytes.NewReader(body))
if err != nil {
return fmt.Errorf("could not build request: %w", err)
}
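// Use a client that doesn't follow redirects: returning http.ErrUseLastResponse from
// CheckRedirect makes Do return the redirect response itself instead of following it.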
client := &http.Client{
CheckRedirect: func(req *http.Request, via []*http.Request) error {
return http.ErrUseLastResponse
},
}
resp, err := client.Do(req)
if err != nil {
return fmt.Errorf("could not send request: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
return fmt.Errorf("neonvm-daemon responded with status %d", resp.StatusCode)
}
return nil
}
func getFileChecksumFromNeonvmDaemon(ctx context.Context, guestpath string) (string, error) {
_, vmIP, _, err := calcIPs(defaultNetworkCIDR)
if err != nil {
return "", fmt.Errorf("could not calculate VM IP address: %w", err)
}
ctx, cancel := context.WithTimeout(ctx, time.Second)
defer cancel()
// guestpath has a leading forward slash
url := fmt.Sprintf("http://%s:25183/files%s", vmIP, guestpath)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, http.NoBody)
if err != nil {
return "", fmt.Errorf("could not build request: %w", err)
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
return "", fmt.Errorf("could not send request: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
return "", fmt.Errorf("neonvm-daemon responded with status %d", resp.StatusCode)
}
checksum, err := io.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf("could not read response: %w", err)
}
return string(checksum), nil
}
package main
import (
"fmt"
"net"
"os"
"strings"
"github.com/cilium/cilium/pkg/mac"
"github.com/coreos/go-iptables/iptables"
"github.com/docker/docker/libnetwork/resolvconf"
"github.com/docker/libnetwork/types"
"github.com/prometheus/client_golang/prometheus"
"github.com/vishvananda/netlink"
"go.uber.org/zap"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/util"
)
const (
defaultNetworkBridgeName = "br-def"
defaultNetworkTapName = "tap-def"
defaultNetworkCIDR = "169.254.254.252/30"
overlayNetworkBridgeName = "br-overlay"
overlayNetworkTapName = "tap-overlay"
protocolTCP string = "6"
)
// setupVMNetworks creates the networks for the VM and returns the appropriate QEMU args
func setupVMNetworks(logger *zap.Logger, ports []vmv1.Port, extraNetwork *vmv1.ExtraNetwork) ([]string, error) {
// Create network tap devices.
//
// It is important to enable multiqueue support for virtio-net-pci devices, as we have seen them
// choke on traffic and drop packets otherwise. Set queues=4 and vectors=10 as a reasonable
// default; `vectors` should be set to 2*queues + 2 as per https://www.linux-kvm.org/page/Multiqueue.
// We also enable multiqueue support for all VM sizes, even small ones. As of the time of writing,
// it does not seem worth the trouble to dynamically adjust the number of queues based on VM size.
var qemuCmd []string
// default (pod) net details
macDefault, err := defaultNetwork(logger, defaultNetworkCIDR, ports)
if err != nil {
return nil, fmt.Errorf("Failed to set up default network: %w", err)
}
qemuCmd = append(qemuCmd, "-netdev", fmt.Sprintf("tap,id=default,ifname=%s,queues=4,script=no,downscript=no,vhost=on", defaultNetworkTapName))
qemuCmd = append(qemuCmd, "-device", fmt.Sprintf("virtio-net-pci,mq=on,vectors=10,netdev=default,mac=%s", macDefault.String()))
// overlay (multus) net details
if extraNetwork != nil && extraNetwork.Enable {
macOverlay, err := overlayNetwork(extraNetwork.Interface)
if err != nil {
return nil, fmt.Errorf("Failed to set up overlay network: %w", err)
}
qemuCmd = append(qemuCmd, "-netdev", fmt.Sprintf("tap,id=overlay,ifname=%s,queues=4,script=no,downscript=no,vhost=on", overlayNetworkTapName))
qemuCmd = append(qemuCmd, "-device", fmt.Sprintf("virtio-net-pci,mq=on,vectors=10,netdev=overlay,mac=%s", macOverlay.String()))
}
return qemuCmd, nil
}
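// calcIPs returns the first two host addresses of the given CIDR (network address + 1 and + 2)
// along with its netmask. For the default /30 these are the pod-side bridge address and the
// VM's address.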
func calcIPs(cidr string) (net.IP, net.IP, net.IPMask, error) {
_, ipv4Net, err := net.ParseCIDR(cidr)
if err != nil {
return nil, nil, nil, err
}
ip0 := ipv4Net.IP.To4()
mask := ipv4Net.Mask
ip1 := append(net.IP{}, ip0...)
ip1[3]++
ip2 := append(net.IP{}, ip1...)
ip2[3]++
return ip1, ip2, mask, nil
}
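// defaultNetwork sets up the pod side of the VM's default network: a linux bridge holding the
// pod-side IP, a multi-queue TAP device, masquerading plus DNAT rules for the configured ports,
// and a dnsmasq instance that serves the VM's address over DHCP. It returns the MAC address to
// assign to the guest interface.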
func defaultNetwork(logger *zap.Logger, cidr string, ports []vmv1.Port) (mac.MAC, error) {
// generate a random MAC for the default Guest interface
mac, err := mac.GenerateRandMAC()
if err != nil {
logger.Fatal("could not generate random MAC", zap.Error(err))
return nil, err
}
// create and configure the linux bridge
logger.Info("setup bridge interface", zap.String("name", defaultNetworkBridgeName))
bridge := &netlink.Bridge{
LinkAttrs: netlink.LinkAttrs{
Name: defaultNetworkBridgeName,
},
}
if err := netlink.LinkAdd(bridge); err != nil {
logger.Fatal("could not create bridge interface", zap.Error(err))
return nil, err
}
ipPod, ipVm, mask, err := calcIPs(cidr)
if err != nil {
logger.Fatal("could not parse IP", zap.Error(err))
return nil, err
}
bridgeAddr := &netlink.Addr{
IPNet: &net.IPNet{
IP: ipPod,
Mask: mask,
},
}
if err := netlink.AddrAdd(bridge, bridgeAddr); err != nil {
logger.Fatal("could not parse IP", zap.Error(err))
return nil, err
}
if err := netlink.LinkSetUp(bridge); err != nil {
logger.Fatal("could not set up bridge", zap.Error(err))
return nil, err
}
// create and configure the TAP interface
if !checkDevTun() {
logger.Info("create /dev/net/tun")
if err := execFg("mkdir", "-p", "/dev/net"); err != nil {
return nil, err
}
if err := execFg("mknod", "/dev/net/tun", "c", "10", "200"); err != nil {
return nil, err
}
if err := execFg("chown", "qemu:kvm", "/dev/net/tun"); err != nil {
return nil, err
}
}
logger.Info("setup tap interface", zap.String("name", defaultNetworkTapName))
tap := &netlink.Tuntap{
LinkAttrs: netlink.LinkAttrs{
Name: defaultNetworkTapName,
},
Mode: netlink.TUNTAP_MODE_TAP,
Flags: netlink.TUNTAP_MULTI_QUEUE_DEFAULTS,
}
if err := netlink.LinkAdd(tap); err != nil {
logger.Error("could not add tap device", zap.Error(err))
return nil, err
}
if err := netlink.LinkSetMaster(tap, bridge); err != nil {
logger.Error("could not set up tap as master", zap.Error(err))
return nil, err
}
if err := netlink.LinkSetUp(tap); err != nil {
logger.Error("could not set up tap device", zap.Error(err))
return nil, err
}
// setup masquerading outgoing (from VM) traffic
logger.Info("setup masquerading for outgoing traffic")
if err := execFg("iptables", "-t", "nat", "-A", "POSTROUTING", "-o", "eth0", "-j", "MASQUERADE"); err != nil {
logger.Error("could not setup masquerading for outgoing traffic", zap.Error(err))
return nil, err
}
// pass incoming traffic to .Guest.Spec.Ports into VM
var iptablesArgs []string
for _, port := range ports {
logger.Debug(fmt.Sprintf("setup DNAT rule for incoming traffic to port %d", port.Port))
iptablesArgs = []string{
"-t", "nat", "-A", "PREROUTING",
"-i", "eth0", "-p", fmt.Sprint(port.Protocol), "--dport", fmt.Sprint(port.Port),
"-j", "DNAT", "--to", fmt.Sprintf("%s:%d", ipVm.String(), port.Port),
}
if err := execFg("iptables", iptablesArgs...); err != nil {
logger.Error("could not set up DNAT rule for incoming traffic", zap.Error(err))
return nil, err
}
logger.Debug(fmt.Sprintf("setup DNAT rule for traffic originating from localhost to port %d", port.Port))
iptablesArgs = []string{
"-t", "nat", "-A", "OUTPUT",
"-m", "addrtype", "--src-type", "LOCAL", "--dst-type", "LOCAL",
"-p", fmt.Sprint(port.Protocol), "--dport", fmt.Sprint(port.Port),
"-j", "DNAT", "--to-destination", fmt.Sprintf("%s:%d", ipVm.String(), port.Port),
}
if err := execFg("iptables", iptablesArgs...); err != nil {
logger.Error("could not set up DNAT rule for traffic from localhost", zap.Error(err))
return nil, err
}
logger.Debug(fmt.Sprintf("setup ACCEPT rule for traffic originating from localhost to port %d", port.Port))
iptablesArgs = []string{
"-A", "OUTPUT",
"-s", "127.0.0.1", "-d", ipVm.String(),
"-p", fmt.Sprint(port.Protocol), "--dport", fmt.Sprint(port.Port),
"-j", "ACCEPT",
}
if err := execFg("iptables", iptablesArgs...); err != nil {
logger.Error("could not set up ACCEPT rule for traffic from localhost", zap.Error(err))
return nil, err
}
}
logger.Debug("setup MASQUERADE rule for traffic originating from localhost")
iptablesArgs = []string{
"-t", "nat", "-A", "POSTROUTING",
"-m", "addrtype", "--src-type", "LOCAL", "--dst-type", "UNICAST",
"-j", "MASQUERADE",
}
if err := execFg("iptables", iptablesArgs...); err != nil {
logger.Error("could not set up MASQUERADE rule for traffic from localhost", zap.Error(err))
return nil, err
}
// get dns details from /etc/resolv.conf
resolvConf, err := resolvconf.Get()
if err != nil {
logger.Error("could not get DNS details", zap.Error(err))
return nil, err
}
dns := resolvconf.GetNameservers(resolvConf.Content, types.IP)[0]
dnsSearch := strings.Join(resolvconf.GetSearchDomains(resolvConf.Content), ",")
// prepare the dnsmasq command line (instead of a config file)
logger.Info("run dnsmasq for interface", zap.String("name", defaultNetworkBridgeName))
dnsMaskCmd := []string{
// No DNS, DHCP only
"--port=0",
// Because we don't provide DNS, no need to load resolv.conf. This helps to
// avoid "dnsmasq: failed to create inotify: No file descriptors available"
// errors.
"--no-resolv",
"--bind-interfaces",
"--dhcp-authoritative",
fmt.Sprintf("--interface=%s", defaultNetworkBridgeName),
fmt.Sprintf("--dhcp-range=%s,static,%d.%d.%d.%d", ipVm.String(), mask[0], mask[1], mask[2], mask[3]),
fmt.Sprintf("--dhcp-host=%s,%s,infinite", mac.String(), ipVm.String()),
fmt.Sprintf("--dhcp-option=option:router,%s", ipPod.String()),
fmt.Sprintf("--dhcp-option=option:dns-server,%s", dns),
fmt.Sprintf("--dhcp-option=option:domain-search,%s", dnsSearch),
fmt.Sprintf("--shared-network=%s,%s", defaultNetworkBridgeName, ipVm.String()),
}
// run dnsmasq for default Guest interface
if err := execFg("dnsmasq", dnsMaskCmd...); err != nil {
logger.Error("could not run dnsmasq", zap.Error(err))
return nil, err
}
// Add the VM's IP address to /etc/hosts so we can access it easily from the pod.
// This is particularly useful for SSHing into the VM from the runner pod.
f, err := os.OpenFile("/etc/hosts", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
if err != nil {
return nil, err
}
defer f.Close()
record := fmt.Sprintf("%v guest-vm\n", ipVm)
if _, err := f.WriteString(record); err != nil {
return nil, err
}
return mac, nil
}
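// overlayNetwork creates the overlay bridge and multi-queue TAP device, removes any IP addresses
// from the existing overlay (multus) interface, and enslaves it to the bridge. It returns a newly
// generated MAC address for the guest side of the overlay network.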
func overlayNetwork(iface string) (mac.MAC, error) {
// generate a random MAC for the overlay Guest interface
mac, err := mac.GenerateRandMAC()
if err != nil {
return nil, err
}
// create and configure linux bridge
bridge := &netlink.Bridge{
LinkAttrs: netlink.LinkAttrs{
Name: overlayNetworkBridgeName,
Protinfo: &netlink.Protinfo{
Learning: false,
},
},
}
if err := netlink.LinkAdd(bridge); err != nil {
return nil, err
}
if err := netlink.LinkSetUp(bridge); err != nil {
return nil, err
}
// create and configure the TAP interface
tap := &netlink.Tuntap{
LinkAttrs: netlink.LinkAttrs{
Name: overlayNetworkTapName,
},
Mode: netlink.TUNTAP_MODE_TAP,
Flags: netlink.TUNTAP_MULTI_QUEUE_DEFAULTS,
}
if err := netlink.LinkAdd(tap); err != nil {
return nil, err
}
if err := netlink.LinkSetMaster(tap, bridge); err != nil {
return nil, err
}
if err := netlink.LinkSetUp(tap); err != nil {
return nil, err
}
// add overlay interface to bridge as well
overlayLink, err := netlink.LinkByName(iface)
if err != nil {
return nil, err
}
// first, delete any existing IP address(es) from the overlay interface
overlayAddrs, err := netlink.AddrList(overlayLink, netlink.FAMILY_V4)
if err != nil {
return nil, err
}
for _, a := range overlayAddrs {
ip := a.IPNet
if ip != nil {
if err := netlink.AddrDel(overlayLink, &a); err != nil {
return nil, err
}
}
}
// and now add overlay link to bridge
if err := netlink.LinkSetMaster(overlayLink, bridge); err != nil {
return nil, err
}
return mac, nil
}
type NetworkMonitoringMetrics struct {
IngressBytes, EgressBytes, Errors prometheus.Counter
IngressBytesRaw, EgressBytesRaw uint64 // Absolute values to calc increments for Counters
}
func NewMonitoringMetrics(reg *prometheus.Registry) *NetworkMonitoringMetrics {
m := &NetworkMonitoringMetrics{
IngressBytes: util.RegisterMetric(reg, prometheus.NewCounter(
prometheus.CounterOpts{
Name: "runner_vm_ingress_bytes",
Help: "Number of bytes received by the VM from the open internet",
},
)),
EgressBytes: util.RegisterMetric(reg, prometheus.NewCounter(
prometheus.CounterOpts{
Name: "runner_vm_egress_bytes",
Help: "Number of bytes sent by the VM to the open internet",
},
)),
IngressBytesRaw: 0,
EgressBytesRaw: 0,
Errors: util.RegisterMetric(reg, prometheus.NewCounter(
prometheus.CounterOpts{
Name: "runner_vm_network_fetch_errors_total",
Help: "Number of errors while fetching network monitoring data",
},
)),
}
return m
}
func shouldBeIgnored(ip net.IP) bool {
// We need to measure only external traffic to/from vm, so we filter internal traffic
// Don't filter on isUnspecified as it's an iptables rule, not a real ip
return ip.IsLoopback() || ip.IsPrivate()
}
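// getNetworkBytesCounter sums the byte counters of TCP rules in the given chain of the filter
// table, skipping traffic to or from loopback and private addresses so that only external
// traffic is counted.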
func getNetworkBytesCounter(iptables *iptables.IPTables, chain string) (uint64, error) {
cnt := uint64(0)
rules, err := iptables.Stats("filter", chain)
if err != nil {
return cnt, err
}
for _, rawStat := range rules {
stat, err := iptables.ParseStat(rawStat)
if err != nil {
return cnt, err
}
src, dest := stat.Source.IP, stat.Destination.IP
if stat.Protocol == protocolTCP && !shouldBeIgnored(src) && !shouldBeIgnored(dest) {
cnt += stat.Bytes
}
}
return cnt, nil
}
func (m *NetworkMonitoringMetrics) update(logger *zap.Logger) {
// Rules configured at github.com/neondatabase/cloud/blob/main/compute-init/compute-init.sh#L98
iptables, err := iptables.New()
if err != nil {
logger.Error("initializing iptables failed", zap.Error(err))
m.Errors.Inc()
return
}
ingress, err := getNetworkBytesCounter(iptables, "INPUT")
if err != nil {
logger.Error("getting iptables input counter failed", zap.Error(err))
m.Errors.Inc()
return
}
m.IngressBytes.Add(float64(ingress - m.IngressBytesRaw))
m.IngressBytesRaw = ingress
egress, err := getNetworkBytesCounter(iptables, "OUTPUT")
if err != nil {
logger.Error("getting iptables output counter failed", zap.Error(err))
m.Errors.Inc()
return
}
m.EgressBytes.Add(float64(egress - m.EgressBytesRaw))
m.EgressBytesRaw = egress
}
//go:build linux
package main
import (
"context"
"flag"
"log"
"net"
"os"
"syscall"
"time"
"github.com/coreos/go-iptables/iptables"
"github.com/vishvananda/netlink"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
)
const (
// vxlan interface details
VXLAN_IF_NAME = "neon-vxlan0"
VXLAN_BRIDGE_NAME = "neon-br0"
VXLAN_ID = 100
// iptables settings details
iptablesChainName = "NEON-EXTRANET"
extraNetCidr = "10.100.0.0/16"
)
var deleteIfaces = flag.Bool("delete", false, `delete VXLAN interfaces`)
func main() {
flag.Parse()
// creates the in-cluster config
config, err := rest.InClusterConfig()
if err != nil {
log.Fatal(err)
}
// creates the clientset
clientset, err := kubernetes.NewForConfig(config)
if err != nil {
log.Fatal(err)
}
// the -delete option is used to tear down the vxlan setup
if *deleteIfaces {
log.Printf("deleting vxlan interface %s", VXLAN_IF_NAME)
if err := deleteLink(VXLAN_IF_NAME); err != nil {
log.Print(err)
}
log.Printf("deleting bridge interface %s", VXLAN_BRIDGE_NAME)
if err := deleteLink(VXLAN_BRIDGE_NAME); err != nil {
log.Print(err)
}
log.Printf("deleting iptables nat rules")
if err := deleteIptablesRules(); err != nil {
log.Print(err)
}
os.Exit(0)
}
ownNodeIP := os.Getenv("MY_NODE_IP")
log.Printf("own node IP: %s", ownNodeIP)
// create linux bridge
log.Printf("creating linux bridge interface (name: %s)", VXLAN_BRIDGE_NAME)
if err := createBridgeInterface(VXLAN_BRIDGE_NAME); err != nil {
log.Fatal(err)
}
// create vxlan
log.Printf("creating vxlan interface (name: %s, id: %d)", VXLAN_IF_NAME, VXLAN_ID)
if err := createVxlanInterface(VXLAN_IF_NAME, VXLAN_ID, ownNodeIP, VXLAN_BRIDGE_NAME); err != nil {
log.Fatal(err)
}
for {
log.Print("getting nodes IP addresses")
nodeIPs, err := getNodesIPs(clientset)
if err != nil {
log.Fatal(err)
}
log.Printf("found %d ip addresses", len(nodeIPs))
// update FDB
log.Print("update FDB table")
if err := updateFDB(VXLAN_IF_NAME, nodeIPs, ownNodeIP); err != nil {
log.Fatal(err)
}
// upsert iptables nat rules
log.Printf("upsert iptables nat rules")
if err := upsertIptablesRules(); err != nil {
log.Print(err)
}
time.Sleep(30 * time.Second)
}
}
func getNodesIPs(clientset *kubernetes.Clientset) ([]string, error) {
ips := []string{}
nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
if err != nil {
return ips, err
}
for _, n := range nodes.Items {
for _, a := range n.Status.Addresses {
if a.Type == corev1.NodeInternalIP {
ips = append(ips, a.Address)
}
}
}
return ips, nil
}
func createBridgeInterface(name string) error {
// check if interface already exists
_, err := netlink.LinkByName(name)
if err == nil {
log.Printf("link with name %s already found", name)
return nil
}
_, notFound := err.(netlink.LinkNotFoundError) //nolint:errorlint // errors.Is doesn't work, we actually just want to know the type.
if !notFound {
return err
}
// create and configure the linux bridge
link := &netlink.Bridge{
LinkAttrs: netlink.LinkAttrs{
Name: name,
},
}
if err := netlink.LinkAdd(link); err != nil {
return err
}
if err := netlink.LinkSetUp(link); err != nil {
return err
}
return nil
}
func createVxlanInterface(name string, vxlanID int, ownIP string, bridgeName string) error {
// check if interface already exists
_, err := netlink.LinkByName(name)
if err == nil {
log.Printf("link with name %s already found", name)
return nil
}
_, notFound := err.(netlink.LinkNotFoundError) //nolint:errorlint // errors.Is doesn't work, we actually just want to know the type.
if !notFound {
return err
}
// create and configure the vxlan interface
link := &netlink.Vxlan{
LinkAttrs: netlink.LinkAttrs{
Name: name,
},
VxlanId: vxlanID,
SrcAddr: net.ParseIP(ownIP),
Port: 4789,
}
if err := netlink.LinkAdd(link); err != nil {
return err
}
// add vxlan to bridge
br, err := netlink.LinkByName(bridgeName)
if err != nil {
return err
}
if err := netlink.LinkSetMaster(link, br); err != nil {
return err
}
if err := netlink.LinkSetUp(link); err != nil {
return err
}
return nil
}
func updateFDB(vxlanName string, nodeIPs []string, ownIP string) error {
broadcastFdbMac, _ := net.ParseMAC("00:00:00:00:00:00")
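// An all-zeroes MAC entry in the VXLAN FDB tells the kernel to flood broadcast and
// unknown-unicast frames to the given remote node IP (head-end replication instead of multicast).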
// get vxlan interface details
link, err := netlink.LinkByName(vxlanName)
if err != nil {
return err
}
for _, ip := range nodeIPs {
if ip != ownIP {
if net.ParseIP(ip).To4() == nil {
log.Printf("not adding IPv6 addr %q to FDB broadcast entry, no support for it", ip)
continue
}
broadcastFdbEntry := netlink.Neigh{
LinkIndex: link.Attrs().Index,
Family: syscall.AF_BRIDGE,
State: netlink.NUD_PERMANENT,
Flags: netlink.NTF_SELF,
IP: net.ParseIP(ip),
HardwareAddr: broadcastFdbMac,
}
// add entry to FDB table
// a duplicate append will not cause an error.
log.Printf("add/update FDB broadcast entry via %s", ip)
if err := netlink.NeighAppend(&broadcastFdbEntry); err != nil {
return err
}
}
}
return nil
}
func deleteLink(name string) error {
// check if interface already exists
link, err := netlink.LinkByName(name)
if err == nil {
if err := netlink.LinkDel(link); err != nil {
return err
}
log.Printf("link with name %s was deleted", name)
return nil
}
_, notFound := err.(netlink.LinkNotFoundError) //nolint:errorlint // errors.Is doesn't work, we actually just want to know the type.
if !notFound {
return err
}
log.Printf("link with name %s not found", name)
return nil
}
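// upsertIptablesRules ensures the NEON-EXTRANET chain exists in the nat table and that traffic
// destined for the extra network CIDR is diverted into it from POSTROUTING and accepted there
// before any later rules (such as masquerading) can apply.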
func upsertIptablesRules() error {
// manage iptables
ipt, err := iptables.New(iptables.IPFamily(iptables.ProtocolIPv4), iptables.Timeout(5))
if err != nil {
return err
}
chainExists, err := ipt.ChainExists("nat", iptablesChainName)
if err != nil {
return err
}
if !chainExists {
err := ipt.NewChain("nat", iptablesChainName)
if err != nil {
return err
}
}
if err := insertRule(ipt, "nat", "POSTROUTING", 1, "-d", extraNetCidr, "-j", iptablesChainName); err != nil {
return err
}
if err := insertRule(ipt, "nat", iptablesChainName, 1, "-s", extraNetCidr, "-j", "ACCEPT"); err != nil {
return err
}
if err := insertRule(ipt, "nat", iptablesChainName, 2, "-d", extraNetCidr, "-j", "ACCEPT"); err != nil {
return err
}
return nil
}
func deleteIptablesRules() error {
// manage iptables
ipt, err := iptables.New(iptables.IPFamily(iptables.ProtocolIPv4), iptables.Timeout(5))
if err != nil {
return err
}
err = ipt.ClearAndDeleteChain("nat", iptablesChainName)
if err != nil {
return err
}
return nil
}
// insertRule acts like Insert except that it won't insert a duplicate (no matter the position in the chain)
func insertRule(ipt *iptables.IPTables, table, chain string, pos int, rulespec ...string) error {
exists, err := ipt.Exists(table, chain, rulespec...)
if err != nil {
return err
}
if !exists {
return ipt.Insert(table, chain, pos, rulespec...)
}
return nil
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package v1 contains API Schema definitions for the vm v1 API group
// +kubebuilder:object:generate=true
// +groupName=vm.neon.tech
package v1
import (
"sigs.k8s.io/controller-runtime/pkg/scheme"
"k8s.io/apimachinery/pkg/runtime/schema"
)
var (
// SchemeGroupVersion is group version used to register these objects
SchemeGroupVersion = schema.GroupVersion{Group: "vm.neon.tech", Version: "v1"}
// SchemeBuilder is used to add go types to the GroupVersionKind scheme
SchemeBuilder = &scheme.Builder{GroupVersion: SchemeGroupVersion}
// AddToScheme adds the types in this group-version to the given scheme.
AddToScheme = SchemeBuilder.AddToScheme
)
// Resource takes an unqualified resource and returns a Group qualified GroupResource
func Resource(resource string) schema.GroupResource {
return SchemeGroupVersion.WithResource(resource).GroupResource()
}
package v1
import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// IPPoolSpec defines the desired state of IPPool
type IPPoolSpec struct {
// Range is a RFC 4632/4291-style string that represents an IP address and prefix length in CIDR notation
Range string `json:"range"`
// Allocations is the set of allocated IPs for the given range. Its indices are a direct mapping to the
// IP with the same index/offset for the pool's range.
Allocations map[string]IPAllocation `json:"allocations"`
}
// IPAllocation represents metadata about the pod/container owner of a specific IP
// copied from the Whereabouts CNI, as their allocation functions are used
type IPAllocation struct {
ContainerID string `json:"id"`
PodRef string `json:"podref,omitempty"`
}
//+genclient
//+kubebuilder:object:root=true
//+kubebuilder:resource:singular=ippool
// IPPool is the Schema for the ippools API
type IPPool struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Spec IPPoolSpec `json:"spec,omitempty"`
}
// +kubebuilder:object:root=true
// IPPoolList contains a list of IPPool
type IPPoolList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []IPPool `json:"items"`
}
func init() {
SchemeBuilder.Register(&IPPool{}, &IPPoolList{}) //nolint:exhaustruct // just being used to provide the types
}
package v1
import (
"encoding/json"
"fmt"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// VirtualMachineOwnerForPod returns the OwnerReference for the VirtualMachine that owns the pod, if
// there is one.
//
// When a live migration is ongoing, only the source Pod will be marked as owned by the
// VirtualMachine.
func VirtualMachineOwnerForPod(pod *corev1.Pod) (_ metav1.OwnerReference, ok bool) {
gv := SchemeGroupVersion.String()
for _, ref := range pod.OwnerReferences {
if ref.APIVersion == gv && ref.Kind == "VirtualMachine" {
return ref, true
}
}
var empty metav1.OwnerReference
return empty, false
}
// MigrationRole represents the role that a Pod is taking during a live migration -- either the
// source or target of the migration.
type MigrationRole string
const (
MigrationRoleSource MigrationRole = "source"
MigrationRoleTarget MigrationRole = "target"
)
// MigrationOwnerForPod returns the OwnerReference for the live migration that this Pod is a part
// of, if there is one ongoing.
//
// The MigrationRole returned also indicates whether the Pod is the source or the target of the
// migration.
func MigrationOwnerForPod(pod *corev1.Pod) (metav1.OwnerReference, MigrationRole, bool) {
gv := SchemeGroupVersion.String()
for _, ref := range pod.OwnerReferences {
if ref.APIVersion == gv && ref.Kind == "VirtualMachineMigration" {
var role MigrationRole
if ref.Controller != nil && *ref.Controller {
// the migration only ever "controls" the target pod. When the migration is ongoing,
// the virtual machine controls the source, and when it's over, the migration stops
// owning the target and transfers "control" to the virtual machine object, while
// keeping the source pod as a non-controlling reference.
role = MigrationRoleTarget
} else {
role = MigrationRoleSource
}
return ref, role, true
}
}
var emptyRef metav1.OwnerReference
return emptyRef, "", false
}
// VirtualMachineUsageFromPod returns the resources currently used by the virtual machine, as
// described by the helper usage annotation on the pod.
//
// If the usage annotation is not present, this function returns (nil, nil).
func VirtualMachineUsageFromPod(pod *corev1.Pod) (*VirtualMachineUsage, error) {
return extractFromAnnotation[VirtualMachineUsage](pod, VirtualMachineUsageAnnotation)
}
// VirtualMachineResourcesFromPod returns the information about resources allocated to the virtual
// machine, as encoded by the helper annotation on the pod.
//
// If the annotation is not present, this function returns (nil, nil).
func VirtualMachineResourcesFromPod(pod *corev1.Pod) (*VirtualMachineResources, error) {
return extractFromAnnotation[VirtualMachineResources](pod, VirtualMachineResourcesAnnotation)
}
func extractFromAnnotation[T any](pod *corev1.Pod, annotation string) (*T, error) {
jsonString, ok := pod.Annotations[annotation]
if !ok {
return nil, nil
}
var value T
if err := json.Unmarshal([]byte(jsonString), &value); err != nil {
return nil, fmt.Errorf("could not unmarshal %s annotation: %w", annotation, err)
}
return &value, nil
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
"encoding/json"
"fmt"
"slices"
"time"
"go.uber.org/zap/zapcore"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
const (
// VirtualMachineNameLabel is the label assigned to each NeonVM Pod, providing the name of the
// VirtualMachine object for the VM running in it
//
// This label can be used both to find which VM is running in a Pod (by getting the value of the
// label) or to find which Pod a VM is running in (by searching for Pods with the label equal to
// the VM's name).
VirtualMachineNameLabel string = "vm.neon.tech/name"
// Label that determines the version of runner pod. May be missing on older runners
RunnerPodVersionLabel string = "vm.neon.tech/runner-version"
// VirtualMachineUsageAnnotation is the annotation added to each runner Pod, mirroring
// information about the resource allocations of the VM running in the pod.
//
// The value of this annotation is always a JSON-encoded VirtualMachineUsage object.
VirtualMachineUsageAnnotation string = "vm.neon.tech/usage"
// VirtualMachineResourcesAnnotation is the annotation added to each runner Pod, mirroring
// information about the resource allocations of the VM running in the pod.
//
// The value of this annotation is always a JSON-encoded VirtualMachineResources object.
VirtualMachineResourcesAnnotation string = "vm.neon.tech/resources"
)
// VirtualMachineUsage provides information about a VM's current usage. This is the type of the
// JSON-encoded data in the VirtualMachineUsageAnnotation attached to each runner pod.
type VirtualMachineUsage struct {
CPU *resource.Quantity `json:"cpu"`
Memory *resource.Quantity `json:"memory"`
}
// VirtualMachineResources provides information about a VM's resource allocations.
type VirtualMachineResources struct {
CPUs CPUs `json:"cpus"`
MemorySlots MemorySlots `json:"memorySlots"`
MemorySlotSize resource.Quantity `json:"memorySlotSize"`
}
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
// VirtualMachineSpec defines the desired state of VirtualMachine
type VirtualMachineSpec struct {
// +kubebuilder:validation:Minimum=1
// +kubebuilder:validation:Maximum=65535
// +kubebuilder:default:=20183
// +optional
QMP int32 `json:"qmp,omitempty"`
// +kubebuilder:validation:Minimum=1
// +kubebuilder:validation:Maximum=65535
// +kubebuilder:default:=20184
// +optional
QMPManual int32 `json:"qmpManual,omitempty"`
// +kubebuilder:validation:Minimum=1
// +kubebuilder:validation:Maximum=65535
// +kubebuilder:default:=25183
// +optional
RunnerPort int32 `json:"runnerPort,omitempty"`
// +kubebuilder:default:=5
// +optional
TerminationGracePeriodSeconds *int64 `json:"terminationGracePeriodSeconds"`
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
Affinity *corev1.Affinity `json:"affinity,omitempty"`
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
SchedulerName string `json:"schedulerName,omitempty"`
ServiceAccountName string `json:"serviceAccountName,omitempty"`
PodResources corev1.ResourceRequirements `json:"podResources,omitempty"`
// +kubebuilder:default:=Always
// +optional
RestartPolicy RestartPolicy `json:"restartPolicy"`
ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets,omitempty"`
// +kubebuilder:default:=amd64
// +optional
TargetArchitecture *CPUArchitecture `json:"targetArchitecture,omitempty"`
Guest Guest `json:"guest"`
// Running init containers is costly, so InitScript field should be preferred over ExtraInitContainers
ExtraInitContainers []corev1.Container `json:"extraInitContainers,omitempty"`
// InitScript will be executed in the main container before VM is started.
// +optional
InitScript string `json:"initScript,omitempty"`
// List of disk that can be mounted by virtual machine.
// +optional
Disks []Disk `json:"disks,omitempty"`
// Extra network interface attached to the network provided by the Multus CNI.
// +optional
ExtraNetwork *ExtraNetwork `json:"extraNetwork,omitempty"`
// +optional
ServiceLinks *bool `json:"service_links,omitempty"`
// Use KVM acceleration
// +kubebuilder:default:=true
// +optional
EnableAcceleration *bool `json:"enableAcceleration,omitempty"`
// Override for normal neonvm-runner image
// +optional
RunnerImage *string `json:"runnerImage,omitempty"`
// Enable SSH on the VM. It works only if the VM image is built using VM Builder that
// has SSH support (TODO: mention VM Builder version).
// +kubebuilder:default:=true
// +optional
EnableSSH *bool `json:"enableSSH,omitempty"`
// The TLS configuration to use for provisioning certificates
// +optional
TLS *TLSProvisioning `json:"tls,omitempty"`
// TargetRevision is the identifier set by external party to track when changes to the spec
// propagate to the VM.
//
// If a certain value is written into Spec.TargetRevision together with the changes, and
// the same value is observed in Status.CurrentRevision, it means that the changes were
// propagated to the VM.
// +optional
TargetRevision *RevisionWithTime `json:"targetRevision,omitempty"`
// Controls how CPU scaling is performed, either hotplug new CPUs with QMP, or enable them in sysfs.
// +kubebuilder:default:=QmpScaling
// +optional
CpuScalingMode *CpuScalingMode `json:"cpuScalingMode,omitempty"`
// Enable network monitoring on the VM
// +kubebuilder:default:=false
// +optional
EnableNetworkMonitoring *bool `json:"enableNetworkMonitoring,omitempty"`
}
type TLSProvisioning struct {
// The CertificateIssuer for the certificates issued to this VM
CertificateIssuer string `json:"certificateIssuer,omitempty"`
// This is required to set the duration that the certificate should be valid for before expiring
ExpireAfter metav1.Duration `json:"expireAfter,omitempty"`
// This is required to set the duration before certificate expiration that the certificate is renewed
RenewBefore metav1.Duration `json:"renewBefore,omitempty"`
// This is the common name for the TLS certificate
ServerName string `json:"serverName,omitempty"`
// Which directory in the VM these certificates should be mounted to.
// Will be exposed as `tls.key` and `tls.crt`.
// +kubebuilder:default:=/var/tls
// +optional
MountPath string `json:"mountPath,omitempty"`
}
func (spec *VirtualMachineSpec) Resources() VirtualMachineResources {
return VirtualMachineResources{
CPUs: spec.Guest.CPUs,
MemorySlots: spec.Guest.MemorySlots,
MemorySlotSize: spec.Guest.MemorySlotSize,
}
}
// +kubebuilder:validation:Enum=amd64;arm64
type CPUArchitecture string
const (
CPUArchitectureAMD64 CPUArchitecture = "amd64"
CPUArchitectureARM64 CPUArchitecture = "arm64"
)
// +kubebuilder:validation:Enum=QmpScaling;SysfsScaling
type CpuScalingMode string
// FlagFunc is a parsing function to be used with flag.Func
func (p *CpuScalingMode) FlagFunc(value string) error {
possibleValues := []string{
string(CpuScalingModeQMP),
string(CpuScalingModeSysfs),
}
if !slices.Contains(possibleValues, value) {
return fmt.Errorf("Unknown CpuScalingMode %q, must be one of %v", value, possibleValues)
}
*p = CpuScalingMode(value)
return nil
}
const (
// CpuScalingModeQMP is the value of the VirtualMachineSpec.CpuScalingMode field that indicates
// that the VM should use QMP to scale CPUs.
CpuScalingModeQMP CpuScalingMode = "QmpScaling"
// CpuScalingModeSysfs is the value of the VirtualMachineSpec.CpuScalingMode field that
// indicates that the VM should use the CPU sysfs state interface to scale CPUs.
CpuScalingModeSysfs CpuScalingMode = "SysfsScaling"
)
// +kubebuilder:validation:Enum=Always;OnFailure;Never
type RestartPolicy string
const (
RestartPolicyAlways RestartPolicy = "Always"
RestartPolicyOnFailure RestartPolicy = "OnFailure"
RestartPolicyNever RestartPolicy = "Never"
)
type Guest struct {
// +optional
KernelImage *string `json:"kernelImage,omitempty"`
// Set the maximum MOVABLE:KERNEL memory ratio in %.
// Kernel default is 301%.
// See https://docs.kernel.org/admin-guide/mm/memory-hotplug.html
// +optional
MemhpAutoMovableRatio *string `json:"memhpAutoMovableRatio,omitempty"`
// +optional
AppendKernelCmdline *string `json:"appendKernelCmdline,omitempty"`
// +optional
CPUs CPUs `json:"cpus"`
// +optional
// +kubebuilder:default:="1Gi"
MemorySlotSize resource.Quantity `json:"memorySlotSize"`
// +optional
MemorySlots MemorySlots `json:"memorySlots"`
// +optional
RootDisk RootDisk `json:"rootDisk"`
// Docker image Entrypoint array replacement.
// +optional
Command []string `json:"command,omitempty"`
// Arguments to the entrypoint.
// The docker image's cmd is used if this is not provided.
// +optional
Args []string `json:"args,omitempty"`
// List of environment variables to set in the vmstart process.
// +optional
Env []EnvVar `json:"env,omitempty" patchStrategy:"merge" patchMergeKey:"name"`
// List of ports to expose from the container.
// Cannot be updated.
// +optional
Ports []Port `json:"ports,omitempty"`
// Additional settings for the VM.
// Cannot be updated.
// +optional
Settings *GuestSettings `json:"settings,omitempty"`
}
const virtioMemBlockSizeBytes = 8 * 1024 * 1024 // 8 MiB
// ValidateMemorySize returns an error iff the memory settings are invalid for use with virtio-mem
// (the backing memory provider that we use)
func (g Guest) ValidateMemorySize() error {
if g.MemorySlotSize.Value()%virtioMemBlockSizeBytes != 0 {
return fmt.Errorf("memorySlotSize invalid for use with virtio-mem: must be a multiple of 8Mi")
}
return nil
}
// Flag is a bitmask of flags. The meaning is up to the user.
//
// Used in Revision below.
type Flag uint64
func (f *Flag) Set(flag Flag) {
*f |= flag
}
func (f *Flag) Clear(flag Flag) {
*f &= ^flag
}
func (f *Flag) Has(flag Flag) bool {
return *f&flag != 0
}
// Revision is an identifier, which can be assigned to a specific configuration of a VM.
// Later it can be used to track the application of the configuration.
type Revision struct {
Value int64 `json:"value"`
Flags Flag `json:"flags"`
}
// ZeroRevision is the default value when revisions updates are disabled.
var ZeroRevision = Revision{Value: 0, Flags: 0}
func (r Revision) Min(other Revision) Revision {
if r.Value < other.Value {
return r
}
return other
}
func (r Revision) WithTime(t time.Time) RevisionWithTime {
return RevisionWithTime{
Revision: r,
UpdatedAt: metav1.NewTime(t),
}
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that Revision can be used with zap.Object
func (r *Revision) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddInt64("value", r.Value)
enc.AddUint64("flags", uint64(r.Flags))
return nil
}
// RevisionWithTime contains a Revision and the time it was last updated.
type RevisionWithTime struct {
Revision `json:"revision"`
UpdatedAt metav1.Time `json:"updatedAt"`
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that RevisionWithTime can be used with zap.Object
func (r *RevisionWithTime) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddTime("updatedAt", r.UpdatedAt.Time)
return r.Revision.MarshalLogObject(enc)
}
type GuestSettings struct {
// Individual lines to add to a sysctl.conf file. See sysctl.conf(5) for more
// +optional
Sysctl []string `json:"sysctl,omitempty"`
// Swap adds a swap disk with the provided size.
//
// +optional
Swap *resource.Quantity `json:"swap,omitempty"`
}
type CPUs struct {
Min MilliCPU `json:"min"`
Max MilliCPU `json:"max"`
Use MilliCPU `json:"use"`
}
// MilliCPU is a special type to represent vCPUs * 1000
// e.g. 2 vCPU is 2000, 0.25 is 250
//
// +kubebuilder:validation:XIntOrString
// +kubebuilder:validation:Pattern=^[0-9]+((\.[0-9]*)?|m)
type MilliCPU uint32 // note: pattern is more restrictive than resource.Quantity, because we're just using it for CPU
// RoundedUp returns the smallest integer number of CPUs greater than or equal to the effective
// value of m.
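// For example, MilliCPU(250) rounds up to 1 and MilliCPU(2500) rounds up to 3.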
func (m MilliCPU) RoundedUp() uint32 {
r := uint32(m) / 1000
if m%1000 != 0 {
r += 1
}
return r
}
// MilliCPUFromResourceQuantity converts resource.Quantity into MilliCPU
func MilliCPUFromResourceQuantity(r resource.Quantity) MilliCPU {
return MilliCPU(r.MilliValue())
}
// ToResourceQuantity converts a MilliCPU to resource.Quantity
// this is useful for formatting/serialization
func (m MilliCPU) ToResourceQuantity() *resource.Quantity {
return resource.NewMilliQuantity(int64(m), resource.BinarySI)
}
// AsFloat64 converts the MilliCPU value into a float64 of CPU
//
// This should be preferred over calling m.ToResourceQuantity().AsApproximateFloat64(), because
// going through the resource.Quantity can produce less accurate floats.
func (m MilliCPU) AsFloat64() float64 {
return float64(m) / 1000
}
// this is used to parse the scheduler config and for communication between components;
// we use resource.Quantity as the underlying transport format for MilliCPU
func (m *MilliCPU) UnmarshalJSON(data []byte) error {
var quantity resource.Quantity
err := json.Unmarshal(data, &quantity)
if err != nil {
return err
}
*m = MilliCPUFromResourceQuantity(quantity)
return nil
}
func (m MilliCPU) MarshalJSON() ([]byte, error) {
// Marshal as an integer if we can, for backwards-compatibility with components that wouldn't be
// expecting a string here.
if m%1000 == 0 {
return json.Marshal(uint32(m / 1000))
}
return json.Marshal(m.ToResourceQuantity())
}
func (m MilliCPU) Format(state fmt.State, verb rune) {
switch {
case verb == 'v' && state.Flag('#'):
//nolint:errcheck // can't do anything about the write error
state.Write([]byte(fmt.Sprintf("%v", uint32(m))))
default:
//nolint:errcheck // can't do anything about the write error
state.Write([]byte(fmt.Sprintf("%v", m.AsFloat64())))
}
}
type MemorySlots struct {
// +kubebuilder:validation:Minimum=1
// +kubebuilder:validation:Maximum=512
// +kubebuilder:validation:ExclusiveMaximum=false
Min int32 `json:"min"`
// +kubebuilder:validation:Minimum=1
// +kubebuilder:validation:Maximum=512
// +kubebuilder:validation:ExclusiveMaximum=false
Max int32 `json:"max"`
// +kubebuilder:validation:Minimum=1
// +kubebuilder:validation:Maximum=512
// +kubebuilder:validation:ExclusiveMaximum=false
Use int32 `json:"use"`
}
type RootDisk struct {
Image string `json:"image"`
// +optional
Size resource.Quantity `json:"size,omitempty"`
// +optional
// +kubebuilder:default:="IfNotPresent"
ImagePullPolicy corev1.PullPolicy `json:"imagePullPolicy"`
// +optional
Execute []string `json:"execute,omitempty"`
}
type EnvVar struct {
// Name of the environment variable. Must be a C_IDENTIFIER.
Name string `json:"name"`
// +optional
// +kubebuilder:default:=""
Value string `json:"value,omitempty"`
}
type Port struct {
// If specified, this must be an IANA_SVC_NAME and unique within the pod. Each
// named port in a pod must have a unique name. Name for the port that can be
// referred to by services.
Name string `json:"name,omitempty"`
// Number of port to expose on the pod's IP address.
// This must be a valid port number, 0 < x < 65536.
// +kubebuilder:validation:Minimum=1
// +kubebuilder:validation:Maximum=65535
Port int `json:"port"`
// Protocol for port. Must be UDP or TCP.
// Defaults to "TCP".
// +kubebuilder:default:=TCP
Protocol Protocol `json:"protocol,omitempty"`
}
type Protocol string
const (
// ProtocolTCP is the TCP protocol.
ProtocolTCP Protocol = "TCP"
// ProtocolUDP is the UDP protocol.
ProtocolUDP Protocol = "UDP"
)
type Disk struct {
// Disk's name.
// Must be a DNS_LABEL and unique within the virtual machine.
Name string `json:"name"`
// Mounted read-only if true, read-write otherwise (false or unspecified).
// Defaults to false.
// +optional
// +kubebuilder:default:=false
ReadOnly *bool `json:"readOnly,omitempty"`
// Path within the virtual machine at which the disk should be mounted. Must
// not contain ':'.
MountPath string `json:"mountPath"`
// The disk source is monitored for changes if true, otherwise it is only read on VM startup (false or unspecified).
// This only works if the disk source is a configmap, a secret, or a projected volume.
// Defaults to false.
// +optional
// +kubebuilder:default:=false
Watch *bool `json:"watch,omitempty"`
// DiskSource represents the location and type of the mounted disk.
DiskSource `json:",inline"`
}
type DiskSource struct {
// EmptyDisk represents a temporary empty qcow2 disk that shares the VM's lifetime.
EmptyDisk *EmptyDiskSource `json:"emptyDisk,omitempty"`
// configMap represents a configMap that should populate this disk
// +optional
ConfigMap *corev1.ConfigMapVolumeSource `json:"configMap,omitempty"`
// Secret represents a secret that should populate this disk.
// +optional
Secret *corev1.SecretVolumeSource `json:"secret,omitempty"`
// TmpfsDisk represents a tmpfs.
// +optional
Tmpfs *TmpfsDiskSource `json:"tmpfs,omitempty"`
}
type EmptyDiskSource struct {
Size resource.Quantity `json:"size"`
// Discard enables the "discard" mount option for the filesystem
Discard bool `json:"discard,omitempty"`
// EnableQuotas enables the "prjquota" mount option for the ext4 filesystem.
// More info here:
// https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/9/html/managing_file_systems/limiting-storage-space-usage-on-ext4-with-quotas_managing-file-systems
EnableQuotas bool `json:"enableQuotas,omitempty"`
}
type TmpfsDiskSource struct {
Size resource.Quantity `json:"size"`
}
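// Illustrative sketch (not part of the original source): one way a Disk can be declared in Go,
// mounting a ConfigMap read-only with change watching enabled. The ConfigMap name "app-config"
// and the mount path are hypothetical.
func exampleConfigMapDisk() Disk {
	readOnly := true
	watch := true
	return Disk{
		Name:      "app-config",
		ReadOnly:  &readOnly,
		MountPath: "/etc/app",
		Watch:     &watch,
		DiskSource: DiskSource{
			ConfigMap: &corev1.ConfigMapVolumeSource{
				LocalObjectReference: corev1.LocalObjectReference{Name: "app-config"},
			},
		},
	}
}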
type ExtraNetwork struct {
// Enable extra network interface
// +kubebuilder:default:=false
// +optional
Enable bool `json:"enable"`
// Interface name.
// +kubebuilder:default:=net1
// +optional
Interface string `json:"interface"`
// Multus Network name specified in network-attachments-definition.
// +optional
MultusNetwork string `json:"multusNetwork,omitempty"`
}
// VirtualMachineStatus defines the observed state of VirtualMachine
type VirtualMachineStatus struct {
// Represents the observations of a VirtualMachine's current state.
// VirtualMachine.status.conditions.type are: "Available", "Progressing", and "Degraded"
// VirtualMachine.status.conditions.status are one of True, False, Unknown.
// VirtualMachine.status.conditions.reason the value should be a CamelCase string and producers of specific
// condition types may define expected values and meanings for this field, and whether the values
// are considered a guaranteed API.
// VirtualMachine.status.conditions.Message is a human readable message indicating details about the transition.
// For further information see: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties
Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"`
// The phase of a VM is a simple, high-level summary of where the VM is in its lifecycle.
// +optional
Phase VmPhase `json:"phase,omitempty"`
// Number of times the VM runner pod has been recreated
// +optional
RestartCount int32 `json:"restartCount"`
// +optional
PodName string `json:"podName,omitempty"`
// +optional
PodIP string `json:"podIP,omitempty"`
// +optional
ExtraNetIP string `json:"extraNetIP,omitempty"`
// +optional
ExtraNetMask string `json:"extraNetMask,omitempty"`
// +optional
Node string `json:"node,omitempty"`
// +optional
CPUs *MilliCPU `json:"cpus,omitempty"`
// +optional
MemorySize *resource.Quantity `json:"memorySize,omitempty"`
// +optional
SSHSecretName string `json:"sshSecretName,omitempty"`
// +optional
TLSSecretName string `json:"tlsSecretName,omitempty"`
// CurrentRevision is updated with Spec.TargetRevision's value once
// the changes are propagated to the VM.
// +optional
CurrentRevision *RevisionWithTime `json:"currentRevision,omitempty"`
}
type VmPhase string
const (
// VmPending means the VM has been accepted by the system, but vm-runner pod
// has not been started. This includes time before being bound to a node, as well as time spent
// pulling images onto the host.
VmPending VmPhase = "Pending"
// VmRunning means the vm-runner pod has been bound to a node and has been started.
VmRunning VmPhase = "Running"
// VmSucceeded means that all containers in the vm-runner pod have voluntarily terminated
// with a container exit code of 0, and the system is not going to restart any of these containers.
VmSucceeded VmPhase = "Succeeded"
// VmFailed means that all containers in the vm-runner pod have terminated, and at least one container has
// terminated in a failure (exited with a non-zero exit code or was stopped by the system).
VmFailed VmPhase = "Failed"
// VmPreMigrating means that the VM is preparing to start migration
VmPreMigrating VmPhase = "PreMigrating"
// VmMigrating means that the VM is migrating to another node
VmMigrating VmPhase = "Migrating"
// VmScaling means that devices are being plugged into or unplugged from the VM
VmScaling VmPhase = "Scaling"
)
// IsAlive returns whether the guest in the VM is expected to be running
func (p VmPhase) IsAlive() bool {
switch p {
case VmRunning, VmPreMigrating, VmMigrating, VmScaling:
return true
default:
return false
}
}
//+genclient
//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:resource:singular=neonvm
// VirtualMachine is the Schema for the virtualmachines API
// +kubebuilder:printcolumn:name="Cpus",type=string,JSONPath=`.status.cpus`
// +kubebuilder:printcolumn:name="Memory",type=string,JSONPath=`.status.memorySize`
// +kubebuilder:printcolumn:name="Pod",type=string,JSONPath=`.status.podName`
// +kubebuilder:printcolumn:name="ExtraIP",type=string,JSONPath=`.status.extraNetIP`
// +kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.phase`
// +kubebuilder:printcolumn:name="Restarts",type=string,JSONPath=`.status.restarts`
// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
// +kubebuilder:printcolumn:name="Node",type=string,priority=1,JSONPath=`.status.node`
// +kubebuilder:printcolumn:name="Image",type=string,priority=1,JSONPath=`.spec.guest.rootDisk.image`
// +kubebuilder:printcolumn:name="CPUScalingMode",type=string,priority=1,JSONPath=`.spec.cpuScalingMode`
// +kubebuilder:printcolumn:name="TargetArchitecture",type=string,priority=1,JSONPath=`.spec.targetArchitecture`
type VirtualMachine struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Spec VirtualMachineSpec `json:"spec,omitempty"`
Status VirtualMachineStatus `json:"status,omitempty"`
}
func (vm *VirtualMachine) Cleanup() {
vm.Status.PodName = ""
vm.Status.PodIP = ""
vm.Status.Node = ""
vm.Status.CPUs = nil
vm.Status.MemorySize = nil
}
func (vm *VirtualMachine) HasRestarted() bool {
return vm.Status.RestartCount > 0
}
//+kubebuilder:object:root=true
// VirtualMachineList contains a list of VirtualMachine
type VirtualMachineList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []VirtualMachine `json:"items"`
}
func init() {
SchemeBuilder.Register(&VirtualMachine{}, &VirtualMachineList{}) //nolint:exhaustruct // just being used to provide the types
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
"errors"
"fmt"
"reflect"
"slices"
"sigs.k8s.io/controller-runtime/pkg/webhook"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
"k8s.io/apimachinery/pkg/runtime"
)
//+kubebuilder:webhook:path=/mutate-vm-neon-tech-v1-virtualmachine,mutating=true,failurePolicy=fail,sideEffects=None,groups=vm.neon.tech,resources=virtualmachines,verbs=create;update,versions=v1,name=mvirtualmachine.kb.io,admissionReviewVersions=v1
var _ webhook.Defaulter = &VirtualMachine{}
// Default implements webhook.Defaulter
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachine) Default() {
// Nothing to do.
}
//+kubebuilder:webhook:path=/validate-vm-neon-tech-v1-virtualmachine,mutating=false,failurePolicy=fail,sideEffects=None,groups=vm.neon.tech,resources=virtualmachines,verbs=create;update,versions=v1,name=vvirtualmachine.kb.io,admissionReviewVersions=v1
var _ webhook.Validator = &VirtualMachine{}
// ValidateCreate implements webhook.Validator
//
// The controller wraps this logic so it can inject extra control.
func (r *VirtualMachine) ValidateCreate() (admission.Warnings, error) {
// validate .spec.guest.cpus.use and .spec.guest.cpus.max
if r.Spec.Guest.CPUs.Use < r.Spec.Guest.CPUs.Min {
return nil, fmt.Errorf(".spec.guest.cpus.use (%v) should be greater than or equal to the .spec.guest.cpus.min (%v)",
r.Spec.Guest.CPUs.Use,
r.Spec.Guest.CPUs.Min)
}
if r.Spec.Guest.CPUs.Use > r.Spec.Guest.CPUs.Max {
return nil, fmt.Errorf(".spec.guest.cpus.use (%v) should be less than or equal to the .spec.guest.cpus.max (%v)",
r.Spec.Guest.CPUs.Use,
r.Spec.Guest.CPUs.Max)
}
if err := r.Spec.Guest.ValidateMemorySize(); err != nil {
return nil, fmt.Errorf(".spec.guest: %w", err)
}
// validate .spec.guest.memorySlots.use and .spec.guest.memorySlots.max
if r.Spec.Guest.MemorySlots.Use < r.Spec.Guest.MemorySlots.Min {
return nil, fmt.Errorf(".spec.guest.memorySlots.use (%d) should be greater than or equal to the .spec.guest.memorySlots.min (%d)",
r.Spec.Guest.MemorySlots.Use,
r.Spec.Guest.MemorySlots.Min)
}
if r.Spec.Guest.MemorySlots.Use > r.Spec.Guest.MemorySlots.Max {
return nil, fmt.Errorf(".spec.guest.memorySlots.use (%d) should be less than or equal to the .spec.guest.memorySlots.max (%d)",
r.Spec.Guest.MemorySlots.Use,
r.Spec.Guest.MemorySlots.Max)
}
// validate .spec.disk names
reservedDiskNames := []string{
"virtualmachineimages",
"rootdisk",
"runtime",
"swapdisk",
"sysfscgroup",
"ssh-privatekey",
"ssh-publickey",
"ssh-authorized-keys",
"tls",
}
for _, disk := range r.Spec.Disks {
if slices.Contains(reservedDiskNames, disk.Name) {
return nil, fmt.Errorf("'%s' is reserved for .spec.disks[].name", disk.Name)
}
if len(disk.Name) > 32 {
return nil, fmt.Errorf("disk name '%s' too long, should be less than or equal to 32", disk.Name)
}
}
// validate .spec.guest.ports[].name
for _, port := range r.Spec.Guest.Ports {
if len(port.Name) != 0 && port.Name == "qmp" {
return nil, errors.New("'qmp' is reserved name for .spec.guest.ports[].name")
}
}
return nil, nil
}
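// Illustrative sketch (not part of the original source): how ValidateCreate rejects a spec whose
// .spec.guest.cpus.use falls below .spec.guest.cpus.min. The CPU values are hypothetical and
// assume the CPUs struct exposes Min/Max/Use as MilliCPU, as used in the checks above; the CPU
// bounds are checked first, so validation returns before anything else runs.
func exampleRejectedCreate() error {
	vm := &VirtualMachine{
		Spec: VirtualMachineSpec{
			Guest: Guest{
				CPUs: CPUs{Min: 1000, Max: 4000, Use: 500}, // use (0.5) < min (1), so creation is rejected
			},
		},
	}
	_, err := vm.ValidateCreate()
	return err // non-nil: ".spec.guest.cpus.use (0.5) should be greater than or equal to the .spec.guest.cpus.min (1)"
}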
// ValidateUpdate implements webhook.Validator
//
// The controller wraps this logic so it can inject extra control.
func (r *VirtualMachine) ValidateUpdate(old runtime.Object) (admission.Warnings, error) {
// process immutable fields
before, ok := old.(*VirtualMachine)
if !ok {
return nil, fmt.Errorf("expected a *VirtualMachine but got %T", old)
}
immutableFields := []struct {
fieldName string
getter func(*VirtualMachine) any
}{
{".spec.guest.cpus.min", func(v *VirtualMachine) any { return v.Spec.Guest.CPUs.Min }},
{".spec.guest.cpus.max", func(v *VirtualMachine) any { return v.Spec.Guest.CPUs.Max }},
{".spec.guest.memorySlots.min", func(v *VirtualMachine) any { return v.Spec.Guest.MemorySlots.Min }},
{".spec.guest.memorySlots.max", func(v *VirtualMachine) any { return v.Spec.Guest.MemorySlots.Max }},
{".spec.guest.ports", func(v *VirtualMachine) any { return v.Spec.Guest.Ports }},
{".spec.guest.rootDisk", func(v *VirtualMachine) any { return v.Spec.Guest.RootDisk }},
{".spec.guest.command", func(v *VirtualMachine) any { return v.Spec.Guest.Command }},
{".spec.guest.args", func(v *VirtualMachine) any { return v.Spec.Guest.Args }},
{".spec.guest.env", func(v *VirtualMachine) any { return v.Spec.Guest.Env }},
{".spec.guest.settings", func(v *VirtualMachine) any { return v.Spec.Guest.Settings }},
{".spec.disks", func(v *VirtualMachine) any { return v.Spec.Disks }},
{".spec.podResources", func(v *VirtualMachine) any { return v.Spec.PodResources }},
{".spec.enableAcceleration", func(v *VirtualMachine) any { return v.Spec.EnableAcceleration }},
{".spec.enableSSH", func(v *VirtualMachine) any { return v.Spec.EnableSSH }},
{".spec.initScript", func(v *VirtualMachine) any { return v.Spec.InitScript }},
{".spec.enableNetworkMonitoring", func(v *VirtualMachine) any { return v.Spec.EnableNetworkMonitoring }},
}
for _, info := range immutableFields {
if !reflect.DeepEqual(info.getter(r), info.getter(before)) {
return nil, fmt.Errorf("%s is immutable", info.fieldName)
}
}
fieldsAllowedToChangeFromNilOnly := []struct {
fieldName string
getter func(*VirtualMachine) any
}{
{".spec.cpuScalingMode", func(v *VirtualMachine) any { return v.Spec.CpuScalingMode }},
{".spec.targetArchitecture", func(v *VirtualMachine) any { return v.Spec.TargetArchitecture }},
}
for _, info := range fieldsAllowedToChangeFromNilOnly {
beforeValue := info.getter(before)
newValue := info.getter(r)
if !reflect.ValueOf(beforeValue).IsNil() && (reflect.ValueOf(newValue).IsNil() || !reflect.DeepEqual(newValue, beforeValue)) {
return nil, fmt.Errorf("%s is not allowed to be changed once it's set", info.fieldName)
}
}
// validate .spec.guest.cpu.use
if r.Spec.Guest.CPUs.Use < r.Spec.Guest.CPUs.Min {
return nil, fmt.Errorf(".cpus.use (%v) should be greater than or equal to the .cpus.min (%v)",
r.Spec.Guest.CPUs.Use,
r.Spec.Guest.CPUs.Min)
}
if r.Spec.Guest.CPUs.Use > r.Spec.Guest.CPUs.Max {
return nil, fmt.Errorf(".cpus.use (%v) should be less than or equal to the .cpus.max (%v)",
r.Spec.Guest.CPUs.Use,
r.Spec.Guest.CPUs.Max)
}
// validate .spec.guest.memorySlots.use
if r.Spec.Guest.MemorySlots.Use < r.Spec.Guest.MemorySlots.Min {
return nil, fmt.Errorf(".memorySlots.use (%d) should be greater than or equal to the .memorySlots.min (%d)",
r.Spec.Guest.MemorySlots.Use,
r.Spec.Guest.MemorySlots.Min)
}
if r.Spec.Guest.MemorySlots.Use > r.Spec.Guest.MemorySlots.Max {
return nil, fmt.Errorf(".memorySlots.use (%d) should be less than or equal to the .memorySlots.max (%d)",
r.Spec.Guest.MemorySlots.Use,
r.Spec.Guest.MemorySlots.Max)
}
return nil, nil
}
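// Illustrative sketch (not part of the original source): the immutable-field check in
// ValidateUpdate fires whenever a tracked field differs between the old and new objects.
// Here only .spec.guest.cpus.max changes, which is enough to reject the update.
func exampleRejectedUpdate(old *VirtualMachine) error {
	updated := old.DeepCopy()
	updated.Spec.Guest.CPUs.Max += 1000 // bump max by one whole CPU
	_, err := updated.ValidateUpdate(old)
	return err // non-nil: ".spec.guest.cpus.max is immutable"
}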
// ValidateDelete implements webhook.Validator
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachine) ValidateDelete() (admission.Warnings, error) {
// No deletion validation required currently.
return nil, nil
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
const MigrationPort int32 = 20187
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
// VirtualMachineMigrationSpec defines the desired state of VirtualMachineMigration
type VirtualMachineMigrationSpec struct {
VmName string `json:"vmName"`
// TODO: not implemented
// +optional
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
// TODO: not implemented
// +optional
NodeAffinity *corev1.NodeAffinity `json:"nodeAffinity,omitempty"`
// +optional
// +kubebuilder:default:=true
PreventMigrationToSameHost bool `json:"preventMigrationToSameHost"`
// TODO: not implemented
// Set 1 hour as default timeout for migration
// +optional
// +kubebuilder:default:=3600
CompletionTimeout int32 `json:"completionTimeout"`
// Trigger incremental disk copy migration by default; otherwise, a full disk copy is used for the migration
// +optional
// +kubebuilder:default:=true
Incremental bool `json:"incremental"`
// Allow PostCopy migration; disabled by default
// +optional
// +kubebuilder:default:=false
AllowPostCopy bool `json:"allowPostCopy"`
// Use auto-converge by default
// +optional
// +kubebuilder:default:=true
AutoConverge bool `json:"autoConverge"`
// Set 1 GiB/sec as the default migration bandwidth
// +optional
// +kubebuilder:default:="1Gi"
MaxBandwidth resource.Quantity `json:"maxBandwidth"`
}
// VirtualMachineMigrationStatus defines the observed state of VirtualMachineMigration
type VirtualMachineMigrationStatus struct {
// Represents the observations of a VirtualMachineMigration's current state.
// VirtualMachineMigration.status.conditions.type are: "Available", "Progressing", and "Degraded"
// VirtualMachineMigration.status.conditions.status are one of True, False, Unknown.
// VirtualMachineMigration.status.conditions.reason the value should be a CamelCase string and producers of specific
// condition types may define expected values and meanings for this field, and whether the values
// are considered a guaranteed API.
// VirtualMachineMigration.status.conditions.Message is a human readable message indicating details about the transition.
// For further information see: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties
Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"`
// The phase of a VM migration is a simple, high-level summary of where the migration is in its lifecycle.
// +optional
Phase VmmPhase `json:"phase,omitempty"`
// +optional
SourcePodName string `json:"sourcePodName,omitempty"`
// +optional
TargetPodName string `json:"targetPodName,omitempty"`
// +optional
SourcePodIP string `json:"sourcePodIP,omitempty"`
// +optional
TargetPodIP string `json:"targetPodIP,omitempty"`
// +optional
SourceNode string `json:"sourceNode,omitempty"`
// +optional
TargetNode string `json:"targetNode,omitempty"`
// +optional
Info MigrationInfo `json:"info,omitempty"`
}
type MigrationInfo struct {
// +optional
Status string `json:"status,omitempty"`
// +optional
TotalTimeMs int64 `json:"totalTimeMs,omitempty"`
// +optional
SetupTimeMs int64 `json:"setupTimeMs,omitempty"`
// +optional
DowntimeMs int64 `json:"downtimeMs,omitempty"`
// +optional
Ram MigrationInfoRam `json:"ram,omitempty"`
// +optional
Compression MigrationInfoCompression `json:"compression,omitempty"`
}
type MigrationInfoRam struct {
// +optional
Transferred int64 `json:"transferred,omitempty"`
// +optional
Remaining int64 `json:"remaining,omitempty"`
// +optional
Total int64 `json:"total,omitempty"`
}
type MigrationInfoCompression struct {
// +optional
CompressedSize int64 `json:"compressedSize,omitempty"`
// +optional
CompressionRate int64 `json:"compressionRate,omitempty"`
}
type VmmPhase string
const (
// VmmPending means the migration has been accepted by the system, but target vm-runner pod
// has not been started. This includes time before being bound to a node, as well as time spent
// pulling images onto the host.
VmmPending VmmPhase = "Pending"
// VmmRunning means the target vm-runner pod has been bound to a node and has been started.
VmmRunning VmmPhase = "Running"
// VmmSucceeded means that the migration finished successfully
VmmSucceeded VmmPhase = "Succeeded"
// VmmFailed means that the migration failed
VmmFailed VmmPhase = "Failed"
)
//+genclient
//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:resource:singular=neonvmm
// VirtualMachineMigration is the Schema for the virtualmachinemigrations API
// +kubebuilder:printcolumn:name="VM",type=string,JSONPath=`.spec.vmName`
// +kubebuilder:printcolumn:name="Source",type=string,JSONPath=`.status.sourcePodName`
// +kubebuilder:printcolumn:name="SourceIP",type=string,priority=1,JSONPath=`.status.sourcePodIP`
// +kubebuilder:printcolumn:name="Target",type=string,JSONPath=`.status.targetPodName`
// +kubebuilder:printcolumn:name="TargetIP",type=string,priority=1,JSONPath=`.status.targetPodIP`
// +kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.phase`
// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
type VirtualMachineMigration struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Spec VirtualMachineMigrationSpec `json:"spec,omitempty"`
Status VirtualMachineMigrationStatus `json:"status,omitempty"`
}
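// Illustrative sketch (not part of the original source): the minimal migration object a client
// might create. Only spec.vmName is required; the kubebuilder defaults above (incremental copy,
// auto-converge, 1Gi max bandwidth, 1 hour timeout) are filled in by the API server on admission.
// The object and VM names are hypothetical.
func exampleMigration() *VirtualMachineMigration {
	return &VirtualMachineMigration{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "example-vm-migration",
			Namespace: "default",
		},
		Spec: VirtualMachineMigrationSpec{
			VmName: "example-vm",
		},
	}
}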
//+kubebuilder:object:root=true
// VirtualMachineMigrationList contains a list of VirtualMachineMigration
type VirtualMachineMigrationList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []VirtualMachineMigration `json:"items"`
}
func init() {
SchemeBuilder.Register(&VirtualMachineMigration{}, &VirtualMachineMigrationList{}) //nolint:exhaustruct // just being used to provide the types
}
/*
Copyright 2023.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
"sigs.k8s.io/controller-runtime/pkg/webhook"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
"k8s.io/apimachinery/pkg/runtime"
)
//+kubebuilder:webhook:path=/mutate-vm-neon-tech-v1-virtualmachinemigration,mutating=true,failurePolicy=fail,sideEffects=None,groups=vm.neon.tech,resources=virtualmachinemigrations,verbs=create;update,versions=v1,name=mvirtualmachinemigration.kb.io,admissionReviewVersions=v1
var _ webhook.Defaulter = &VirtualMachineMigration{}
// Default implements webhook.Defaulter
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachineMigration) Default() {
// TODO: implement defaults
}
//+kubebuilder:webhook:path=/validate-vm-neon-tech-v1-virtualmachinemigration,mutating=false,failurePolicy=fail,sideEffects=None,groups=vm.neon.tech,resources=virtualmachinemigrations,verbs=create;update,versions=v1,name=vvirtualmachinemigration.kb.io,admissionReviewVersions=v1
var _ webhook.Validator = &VirtualMachineMigration{}
// ValidateCreate implements webhook.Validator
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachineMigration) ValidateCreate() (admission.Warnings, error) {
// TODO: implement creation validation webhook (?)
return nil, nil
}
// ValidateUpdate implements webhook.Validator
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachineMigration) ValidateUpdate(old runtime.Object) (admission.Warnings, error) {
// TODO: implement update validation webhook
return nil, nil
}
// ValidateDelete implements webhook.Validator
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachineMigration) ValidateDelete() (admission.Warnings, error) {
// TODO: implement deletion validation webhook (?)
return nil, nil
}
//go:build !ignore_autogenerated
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by controller-gen. DO NOT EDIT.
package v1
import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
)
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *CPUs) DeepCopyInto(out *CPUs) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CPUs.
func (in *CPUs) DeepCopy() *CPUs {
if in == nil {
return nil
}
out := new(CPUs)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Disk) DeepCopyInto(out *Disk) {
*out = *in
if in.ReadOnly != nil {
in, out := &in.ReadOnly, &out.ReadOnly
*out = new(bool)
**out = **in
}
if in.Watch != nil {
in, out := &in.Watch, &out.Watch
*out = new(bool)
**out = **in
}
in.DiskSource.DeepCopyInto(&out.DiskSource)
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Disk.
func (in *Disk) DeepCopy() *Disk {
if in == nil {
return nil
}
out := new(Disk)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *DiskSource) DeepCopyInto(out *DiskSource) {
*out = *in
if in.EmptyDisk != nil {
in, out := &in.EmptyDisk, &out.EmptyDisk
*out = new(EmptyDiskSource)
(*in).DeepCopyInto(*out)
}
if in.ConfigMap != nil {
in, out := &in.ConfigMap, &out.ConfigMap
*out = new(corev1.ConfigMapVolumeSource)
(*in).DeepCopyInto(*out)
}
if in.Secret != nil {
in, out := &in.Secret, &out.Secret
*out = new(corev1.SecretVolumeSource)
(*in).DeepCopyInto(*out)
}
if in.Tmpfs != nil {
in, out := &in.Tmpfs, &out.Tmpfs
*out = new(TmpfsDiskSource)
(*in).DeepCopyInto(*out)
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DiskSource.
func (in *DiskSource) DeepCopy() *DiskSource {
if in == nil {
return nil
}
out := new(DiskSource)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *EmptyDiskSource) DeepCopyInto(out *EmptyDiskSource) {
*out = *in
out.Size = in.Size.DeepCopy()
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EmptyDiskSource.
func (in *EmptyDiskSource) DeepCopy() *EmptyDiskSource {
if in == nil {
return nil
}
out := new(EmptyDiskSource)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *EnvVar) DeepCopyInto(out *EnvVar) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvVar.
func (in *EnvVar) DeepCopy() *EnvVar {
if in == nil {
return nil
}
out := new(EnvVar)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ExtraNetwork) DeepCopyInto(out *ExtraNetwork) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtraNetwork.
func (in *ExtraNetwork) DeepCopy() *ExtraNetwork {
if in == nil {
return nil
}
out := new(ExtraNetwork)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Guest) DeepCopyInto(out *Guest) {
*out = *in
if in.KernelImage != nil {
in, out := &in.KernelImage, &out.KernelImage
*out = new(string)
**out = **in
}
if in.MemhpAutoMovableRatio != nil {
in, out := &in.MemhpAutoMovableRatio, &out.MemhpAutoMovableRatio
*out = new(string)
**out = **in
}
if in.AppendKernelCmdline != nil {
in, out := &in.AppendKernelCmdline, &out.AppendKernelCmdline
*out = new(string)
**out = **in
}
out.CPUs = in.CPUs
out.MemorySlotSize = in.MemorySlotSize.DeepCopy()
out.MemorySlots = in.MemorySlots
in.RootDisk.DeepCopyInto(&out.RootDisk)
if in.Command != nil {
in, out := &in.Command, &out.Command
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.Args != nil {
in, out := &in.Args, &out.Args
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.Env != nil {
in, out := &in.Env, &out.Env
*out = make([]EnvVar, len(*in))
copy(*out, *in)
}
if in.Ports != nil {
in, out := &in.Ports, &out.Ports
*out = make([]Port, len(*in))
copy(*out, *in)
}
if in.Settings != nil {
in, out := &in.Settings, &out.Settings
*out = new(GuestSettings)
(*in).DeepCopyInto(*out)
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Guest.
func (in *Guest) DeepCopy() *Guest {
if in == nil {
return nil
}
out := new(Guest)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *GuestSettings) DeepCopyInto(out *GuestSettings) {
*out = *in
if in.Sysctl != nil {
in, out := &in.Sysctl, &out.Sysctl
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.Swap != nil {
in, out := &in.Swap, &out.Swap
x := (*in).DeepCopy()
*out = &x
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GuestSettings.
func (in *GuestSettings) DeepCopy() *GuestSettings {
if in == nil {
return nil
}
out := new(GuestSettings)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *IPAllocation) DeepCopyInto(out *IPAllocation) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new IPAllocation.
func (in *IPAllocation) DeepCopy() *IPAllocation {
if in == nil {
return nil
}
out := new(IPAllocation)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *IPPool) DeepCopyInto(out *IPPool) {
*out = *in
out.TypeMeta = in.TypeMeta
in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
in.Spec.DeepCopyInto(&out.Spec)
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new IPPool.
func (in *IPPool) DeepCopy() *IPPool {
if in == nil {
return nil
}
out := new(IPPool)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *IPPool) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *IPPoolList) DeepCopyInto(out *IPPoolList) {
*out = *in
out.TypeMeta = in.TypeMeta
in.ListMeta.DeepCopyInto(&out.ListMeta)
if in.Items != nil {
in, out := &in.Items, &out.Items
*out = make([]IPPool, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new IPPoolList.
func (in *IPPoolList) DeepCopy() *IPPoolList {
if in == nil {
return nil
}
out := new(IPPoolList)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *IPPoolList) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *IPPoolSpec) DeepCopyInto(out *IPPoolSpec) {
*out = *in
if in.Allocations != nil {
in, out := &in.Allocations, &out.Allocations
*out = make(map[string]IPAllocation, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new IPPoolSpec.
func (in *IPPoolSpec) DeepCopy() *IPPoolSpec {
if in == nil {
return nil
}
out := new(IPPoolSpec)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *MemorySlots) DeepCopyInto(out *MemorySlots) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MemorySlots.
func (in *MemorySlots) DeepCopy() *MemorySlots {
if in == nil {
return nil
}
out := new(MemorySlots)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *MigrationInfo) DeepCopyInto(out *MigrationInfo) {
*out = *in
out.Ram = in.Ram
out.Compression = in.Compression
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MigrationInfo.
func (in *MigrationInfo) DeepCopy() *MigrationInfo {
if in == nil {
return nil
}
out := new(MigrationInfo)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *MigrationInfoCompression) DeepCopyInto(out *MigrationInfoCompression) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MigrationInfoCompression.
func (in *MigrationInfoCompression) DeepCopy() *MigrationInfoCompression {
if in == nil {
return nil
}
out := new(MigrationInfoCompression)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *MigrationInfoRam) DeepCopyInto(out *MigrationInfoRam) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MigrationInfoRam.
func (in *MigrationInfoRam) DeepCopy() *MigrationInfoRam {
if in == nil {
return nil
}
out := new(MigrationInfoRam)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Port) DeepCopyInto(out *Port) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Port.
func (in *Port) DeepCopy() *Port {
if in == nil {
return nil
}
out := new(Port)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Revision) DeepCopyInto(out *Revision) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Revision.
func (in *Revision) DeepCopy() *Revision {
if in == nil {
return nil
}
out := new(Revision)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *RevisionWithTime) DeepCopyInto(out *RevisionWithTime) {
*out = *in
out.Revision = in.Revision
in.UpdatedAt.DeepCopyInto(&out.UpdatedAt)
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RevisionWithTime.
func (in *RevisionWithTime) DeepCopy() *RevisionWithTime {
if in == nil {
return nil
}
out := new(RevisionWithTime)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *RootDisk) DeepCopyInto(out *RootDisk) {
*out = *in
out.Size = in.Size.DeepCopy()
if in.Execute != nil {
in, out := &in.Execute, &out.Execute
*out = make([]string, len(*in))
copy(*out, *in)
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RootDisk.
func (in *RootDisk) DeepCopy() *RootDisk {
if in == nil {
return nil
}
out := new(RootDisk)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *TLSProvisioning) DeepCopyInto(out *TLSProvisioning) {
*out = *in
out.ExpireAfter = in.ExpireAfter
out.RenewBefore = in.RenewBefore
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TLSProvisioning.
func (in *TLSProvisioning) DeepCopy() *TLSProvisioning {
if in == nil {
return nil
}
out := new(TLSProvisioning)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *TmpfsDiskSource) DeepCopyInto(out *TmpfsDiskSource) {
*out = *in
out.Size = in.Size.DeepCopy()
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TmpfsDiskSource.
func (in *TmpfsDiskSource) DeepCopy() *TmpfsDiskSource {
if in == nil {
return nil
}
out := new(TmpfsDiskSource)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachine) DeepCopyInto(out *VirtualMachine) {
*out = *in
out.TypeMeta = in.TypeMeta
in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
in.Spec.DeepCopyInto(&out.Spec)
in.Status.DeepCopyInto(&out.Status)
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachine.
func (in *VirtualMachine) DeepCopy() *VirtualMachine {
if in == nil {
return nil
}
out := new(VirtualMachine)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *VirtualMachine) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineList) DeepCopyInto(out *VirtualMachineList) {
*out = *in
out.TypeMeta = in.TypeMeta
in.ListMeta.DeepCopyInto(&out.ListMeta)
if in.Items != nil {
in, out := &in.Items, &out.Items
*out = make([]VirtualMachine, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineList.
func (in *VirtualMachineList) DeepCopy() *VirtualMachineList {
if in == nil {
return nil
}
out := new(VirtualMachineList)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *VirtualMachineList) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineMigration) DeepCopyInto(out *VirtualMachineMigration) {
*out = *in
out.TypeMeta = in.TypeMeta
in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
in.Spec.DeepCopyInto(&out.Spec)
in.Status.DeepCopyInto(&out.Status)
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineMigration.
func (in *VirtualMachineMigration) DeepCopy() *VirtualMachineMigration {
if in == nil {
return nil
}
out := new(VirtualMachineMigration)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *VirtualMachineMigration) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineMigrationList) DeepCopyInto(out *VirtualMachineMigrationList) {
*out = *in
out.TypeMeta = in.TypeMeta
in.ListMeta.DeepCopyInto(&out.ListMeta)
if in.Items != nil {
in, out := &in.Items, &out.Items
*out = make([]VirtualMachineMigration, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineMigrationList.
func (in *VirtualMachineMigrationList) DeepCopy() *VirtualMachineMigrationList {
if in == nil {
return nil
}
out := new(VirtualMachineMigrationList)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *VirtualMachineMigrationList) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineMigrationSpec) DeepCopyInto(out *VirtualMachineMigrationSpec) {
*out = *in
if in.NodeSelector != nil {
in, out := &in.NodeSelector, &out.NodeSelector
*out = make(map[string]string, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
if in.NodeAffinity != nil {
in, out := &in.NodeAffinity, &out.NodeAffinity
*out = new(corev1.NodeAffinity)
(*in).DeepCopyInto(*out)
}
out.MaxBandwidth = in.MaxBandwidth.DeepCopy()
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineMigrationSpec.
func (in *VirtualMachineMigrationSpec) DeepCopy() *VirtualMachineMigrationSpec {
if in == nil {
return nil
}
out := new(VirtualMachineMigrationSpec)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineMigrationStatus) DeepCopyInto(out *VirtualMachineMigrationStatus) {
*out = *in
if in.Conditions != nil {
in, out := &in.Conditions, &out.Conditions
*out = make([]metav1.Condition, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
out.Info = in.Info
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineMigrationStatus.
func (in *VirtualMachineMigrationStatus) DeepCopy() *VirtualMachineMigrationStatus {
if in == nil {
return nil
}
out := new(VirtualMachineMigrationStatus)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineResources) DeepCopyInto(out *VirtualMachineResources) {
*out = *in
out.CPUs = in.CPUs
out.MemorySlots = in.MemorySlots
out.MemorySlotSize = in.MemorySlotSize.DeepCopy()
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineResources.
func (in *VirtualMachineResources) DeepCopy() *VirtualMachineResources {
if in == nil {
return nil
}
out := new(VirtualMachineResources)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineSpec) DeepCopyInto(out *VirtualMachineSpec) {
*out = *in
if in.TerminationGracePeriodSeconds != nil {
in, out := &in.TerminationGracePeriodSeconds, &out.TerminationGracePeriodSeconds
*out = new(int64)
**out = **in
}
if in.NodeSelector != nil {
in, out := &in.NodeSelector, &out.NodeSelector
*out = make(map[string]string, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
if in.Affinity != nil {
in, out := &in.Affinity, &out.Affinity
*out = new(corev1.Affinity)
(*in).DeepCopyInto(*out)
}
if in.Tolerations != nil {
in, out := &in.Tolerations, &out.Tolerations
*out = make([]corev1.Toleration, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
in.PodResources.DeepCopyInto(&out.PodResources)
if in.ImagePullSecrets != nil {
in, out := &in.ImagePullSecrets, &out.ImagePullSecrets
*out = make([]corev1.LocalObjectReference, len(*in))
copy(*out, *in)
}
if in.TargetArchitecture != nil {
in, out := &in.TargetArchitecture, &out.TargetArchitecture
*out = new(CPUArchitecture)
**out = **in
}
in.Guest.DeepCopyInto(&out.Guest)
if in.ExtraInitContainers != nil {
in, out := &in.ExtraInitContainers, &out.ExtraInitContainers
*out = make([]corev1.Container, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
if in.Disks != nil {
in, out := &in.Disks, &out.Disks
*out = make([]Disk, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
if in.ExtraNetwork != nil {
in, out := &in.ExtraNetwork, &out.ExtraNetwork
*out = new(ExtraNetwork)
**out = **in
}
if in.ServiceLinks != nil {
in, out := &in.ServiceLinks, &out.ServiceLinks
*out = new(bool)
**out = **in
}
if in.EnableAcceleration != nil {
in, out := &in.EnableAcceleration, &out.EnableAcceleration
*out = new(bool)
**out = **in
}
if in.RunnerImage != nil {
in, out := &in.RunnerImage, &out.RunnerImage
*out = new(string)
**out = **in
}
if in.EnableSSH != nil {
in, out := &in.EnableSSH, &out.EnableSSH
*out = new(bool)
**out = **in
}
if in.TLS != nil {
in, out := &in.TLS, &out.TLS
*out = new(TLSProvisioning)
**out = **in
}
if in.TargetRevision != nil {
in, out := &in.TargetRevision, &out.TargetRevision
*out = new(RevisionWithTime)
(*in).DeepCopyInto(*out)
}
if in.CpuScalingMode != nil {
in, out := &in.CpuScalingMode, &out.CpuScalingMode
*out = new(CpuScalingMode)
**out = **in
}
if in.EnableNetworkMonitoring != nil {
in, out := &in.EnableNetworkMonitoring, &out.EnableNetworkMonitoring
*out = new(bool)
**out = **in
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineSpec.
func (in *VirtualMachineSpec) DeepCopy() *VirtualMachineSpec {
if in == nil {
return nil
}
out := new(VirtualMachineSpec)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineStatus) DeepCopyInto(out *VirtualMachineStatus) {
*out = *in
if in.Conditions != nil {
in, out := &in.Conditions, &out.Conditions
*out = make([]metav1.Condition, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
if in.CPUs != nil {
in, out := &in.CPUs, &out.CPUs
*out = new(MilliCPU)
**out = **in
}
if in.MemorySize != nil {
in, out := &in.MemorySize, &out.MemorySize
x := (*in).DeepCopy()
*out = &x
}
if in.CurrentRevision != nil {
in, out := &in.CurrentRevision, &out.CurrentRevision
*out = new(RevisionWithTime)
(*in).DeepCopyInto(*out)
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineStatus.
func (in *VirtualMachineStatus) DeepCopy() *VirtualMachineStatus {
if in == nil {
return nil
}
out := new(VirtualMachineStatus)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineUsage) DeepCopyInto(out *VirtualMachineUsage) {
*out = *in
if in.CPU != nil {
in, out := &in.CPU, &out.CPU
x := (*in).DeepCopy()
*out = &x
}
if in.Memory != nil {
in, out := &in.Memory, &out.Memory
x := (*in).DeepCopy()
*out = &x
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineUsage.
func (in *VirtualMachineUsage) DeepCopy() *VirtualMachineUsage {
if in == nil {
return nil
}
out := new(VirtualMachineUsage)
in.DeepCopyInto(out)
return out
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package versioned
import (
"fmt"
"net/http"
neonvmv1 "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned/typed/neonvm/v1"
discovery "k8s.io/client-go/discovery"
rest "k8s.io/client-go/rest"
flowcontrol "k8s.io/client-go/util/flowcontrol"
)
type Interface interface {
Discovery() discovery.DiscoveryInterface
NeonvmV1() neonvmv1.NeonvmV1Interface
}
// Clientset contains the clients for groups.
type Clientset struct {
*discovery.DiscoveryClient
neonvmV1 *neonvmv1.NeonvmV1Client
}
// NeonvmV1 retrieves the NeonvmV1Client
func (c *Clientset) NeonvmV1() neonvmv1.NeonvmV1Interface {
return c.neonvmV1
}
// Discovery retrieves the DiscoveryClient
func (c *Clientset) Discovery() discovery.DiscoveryInterface {
if c == nil {
return nil
}
return c.DiscoveryClient
}
// NewForConfig creates a new Clientset for the given config.
// If config's RateLimiter is not set and QPS and Burst are acceptable,
// NewForConfig will generate a rate-limiter in configShallowCopy.
// NewForConfig is equivalent to NewForConfigAndClient(c, httpClient),
// where httpClient was generated with rest.HTTPClientFor(c).
func NewForConfig(c *rest.Config) (*Clientset, error) {
configShallowCopy := *c
if configShallowCopy.UserAgent == "" {
configShallowCopy.UserAgent = rest.DefaultKubernetesUserAgent()
}
// share the transport between all clients
httpClient, err := rest.HTTPClientFor(&configShallowCopy)
if err != nil {
return nil, err
}
return NewForConfigAndClient(&configShallowCopy, httpClient)
}
// NewForConfigAndClient creates a new Clientset for the given config and http client.
// Note the http client provided takes precedence over the configured transport values.
// If config's RateLimiter is not set and QPS and Burst are acceptable,
// NewForConfigAndClient will generate a rate-limiter in configShallowCopy.
func NewForConfigAndClient(c *rest.Config, httpClient *http.Client) (*Clientset, error) {
configShallowCopy := *c
if configShallowCopy.RateLimiter == nil && configShallowCopy.QPS > 0 {
if configShallowCopy.Burst <= 0 {
return nil, fmt.Errorf("burst is required to be greater than 0 when RateLimiter is not set and QPS is set to greater than 0")
}
configShallowCopy.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(configShallowCopy.QPS, configShallowCopy.Burst)
}
var cs Clientset
var err error
cs.neonvmV1, err = neonvmv1.NewForConfigAndClient(&configShallowCopy, httpClient)
if err != nil {
return nil, err
}
cs.DiscoveryClient, err = discovery.NewDiscoveryClientForConfigAndClient(&configShallowCopy, httpClient)
if err != nil {
return nil, err
}
return &cs, nil
}
// NewForConfigOrDie creates a new Clientset for the given config and
// panics if there is an error in the config.
func NewForConfigOrDie(c *rest.Config) *Clientset {
cs, err := NewForConfig(c)
if err != nil {
panic(err)
}
return cs
}
// New creates a new Clientset for the given RESTClient.
func New(c rest.Interface) *Clientset {
var cs Clientset
cs.neonvmV1 = neonvmv1.New(c)
cs.DiscoveryClient = discovery.NewDiscoveryClient(c)
return &cs
}
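// Illustrative usage sketch (not part of the generated source): fetching a VirtualMachine through
// the typed clientset. This assumes the standard client-gen accessors
// (NeonvmV1().VirtualMachines(namespace).Get) and that "context" and metav1
// ("k8s.io/apimachinery/pkg/apis/meta/v1") are imported at the call site; names are hypothetical.
//
//	cfg, err := rest.InClusterConfig()
//	if err != nil {
//		return err
//	}
//	cs, err := versioned.NewForConfig(cfg)
//	if err != nil {
//		return err
//	}
//	vm, err := cs.NeonvmV1().VirtualMachines("default").Get(context.TODO(), "example-vm", metav1.GetOptions{})
//	if err != nil {
//		return err
//	}
//	fmt.Println(vm.Status.Phase)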
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package fake
import (
clientset "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
neonvmv1 "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned/typed/neonvm/v1"
fakeneonvmv1 "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned/typed/neonvm/v1/fake"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/discovery"
fakediscovery "k8s.io/client-go/discovery/fake"
"k8s.io/client-go/testing"
)
// NewSimpleClientset returns a clientset that will respond with the provided objects.
// It's backed by a very simple object tracker that processes creates, updates and deletions as-is,
// without applying any validations and/or defaults. It shouldn't be considered a replacement
// for a real clientset and is mostly useful in simple unit tests.
func NewSimpleClientset(objects ...runtime.Object) *Clientset {
o := testing.NewObjectTracker(scheme, codecs.UniversalDecoder())
for _, obj := range objects {
if err := o.Add(obj); err != nil {
panic(err)
}
}
cs := &Clientset{tracker: o}
cs.discovery = &fakediscovery.FakeDiscovery{Fake: &cs.Fake}
cs.AddReactor("*", "*", testing.ObjectReaction(o))
cs.AddWatchReactor("*", func(action testing.Action) (handled bool, ret watch.Interface, err error) {
gvr := action.GetResource()
ns := action.GetNamespace()
watch, err := o.Watch(gvr, ns)
if err != nil {
return false, nil, err
}
return true, watch, nil
})
return cs
}
// Clientset implements clientset.Interface. Meant to be embedded into a
// struct to get a default implementation. This makes faking out just the method
// you want to test easier.
type Clientset struct {
testing.Fake
discovery *fakediscovery.FakeDiscovery
tracker testing.ObjectTracker
}
func (c *Clientset) Discovery() discovery.DiscoveryInterface {
return c.discovery
}
func (c *Clientset) Tracker() testing.ObjectTracker {
return c.tracker
}
var (
_ clientset.Interface = &Clientset{}
_ testing.FakeClient = &Clientset{}
)
// NeonvmV1 retrieves the NeonvmV1Client
func (c *Clientset) NeonvmV1() neonvmv1.NeonvmV1Interface {
return &fakeneonvmv1.FakeNeonvmV1{Fake: &c.Fake}
}
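// Illustrative usage sketch (not part of the generated source): seeding the fake clientset for a
// unit test. This assumes the standard client-gen accessors and that vmv1 aliases
// "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"; object names are hypothetical.
// The object tracker applies no defaulting or validation, so objects come back exactly as seeded.
//
//	cs := fake.NewSimpleClientset(&vmv1.VirtualMachine{
//		ObjectMeta: metav1.ObjectMeta{Name: "example-vm", Namespace: "default"},
//	})
//	vm, err := cs.NeonvmV1().VirtualMachines("default").Get(context.TODO(), "example-vm", metav1.GetOptions{})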
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package fake
import (
neonvmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
schema "k8s.io/apimachinery/pkg/runtime/schema"
serializer "k8s.io/apimachinery/pkg/runtime/serializer"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
)
var scheme = runtime.NewScheme()
var codecs = serializer.NewCodecFactory(scheme)
var localSchemeBuilder = runtime.SchemeBuilder{
neonvmv1.AddToScheme,
}
// AddToScheme adds all types of this clientset into the given scheme. This allows composition
// of clientsets, like in:
//
// import (
// "k8s.io/client-go/kubernetes"
// clientsetscheme "k8s.io/client-go/kubernetes/scheme"
// aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme"
// )
//
// kclientset, _ := kubernetes.NewForConfig(c)
// _ = aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme)
//
// After this, RawExtensions in Kubernetes types will serialize kube-aggregator types
// correctly.
var AddToScheme = localSchemeBuilder.AddToScheme
func init() {
v1.AddToGroupVersion(scheme, schema.GroupVersion{Version: "v1"})
utilruntime.Must(AddToScheme(scheme))
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package scheme
import (
neonvmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
schema "k8s.io/apimachinery/pkg/runtime/schema"
serializer "k8s.io/apimachinery/pkg/runtime/serializer"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
)
var Scheme = runtime.NewScheme()
var Codecs = serializer.NewCodecFactory(Scheme)
var ParameterCodec = runtime.NewParameterCodec(Scheme)
var localSchemeBuilder = runtime.SchemeBuilder{
neonvmv1.AddToScheme,
}
// AddToScheme adds all types of this clientset into the given scheme. This allows composition
// of clientsets, like in:
//
// import (
// "k8s.io/client-go/kubernetes"
// clientsetscheme "k8s.io/client-go/kubernetes/scheme"
// aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme"
// )
//
// kclientset, _ := kubernetes.NewForConfig(c)
// _ = aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme)
//
// After this, RawExtensions in Kubernetes types will serialize kube-aggregator types
// correctly.
var AddToScheme = localSchemeBuilder.AddToScheme
func init() {
v1.AddToGroupVersion(Scheme, schema.GroupVersion{Version: "v1"})
utilruntime.Must(AddToScheme(Scheme))
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package fake
import (
"context"
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
labels "k8s.io/apimachinery/pkg/labels"
types "k8s.io/apimachinery/pkg/types"
watch "k8s.io/apimachinery/pkg/watch"
testing "k8s.io/client-go/testing"
)
// FakeIPPools implements IPPoolInterface
type FakeIPPools struct {
Fake *FakeNeonvmV1
ns string
}
var ippoolsResource = v1.SchemeGroupVersion.WithResource("ippools")
var ippoolsKind = v1.SchemeGroupVersion.WithKind("IPPool")
// Get takes name of the iPPool, and returns the corresponding iPPool object, and an error if there is any.
func (c *FakeIPPools) Get(ctx context.Context, name string, options metav1.GetOptions) (result *v1.IPPool, err error) {
obj, err := c.Fake.
Invokes(testing.NewGetAction(ippoolsResource, c.ns, name), &v1.IPPool{})
if obj == nil {
return nil, err
}
return obj.(*v1.IPPool), err
}
// List takes label and field selectors, and returns the list of IPPools that match those selectors.
func (c *FakeIPPools) List(ctx context.Context, opts metav1.ListOptions) (result *v1.IPPoolList, err error) {
obj, err := c.Fake.
Invokes(testing.NewListAction(ippoolsResource, ippoolsKind, c.ns, opts), &v1.IPPoolList{})
if obj == nil {
return nil, err
}
label, _, _ := testing.ExtractFromListOptions(opts)
if label == nil {
label = labels.Everything()
}
list := &v1.IPPoolList{ListMeta: obj.(*v1.IPPoolList).ListMeta}
for _, item := range obj.(*v1.IPPoolList).Items {
if label.Matches(labels.Set(item.Labels)) {
list.Items = append(list.Items, item)
}
}
return list, err
}
// Watch returns a watch.Interface that watches the requested iPPools.
func (c *FakeIPPools) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) {
return c.Fake.
InvokesWatch(testing.NewWatchAction(ippoolsResource, c.ns, opts))
}
// Create takes the representation of an iPPool and creates it. Returns the server's representation of the iPPool, and an error, if there is any.
func (c *FakeIPPools) Create(ctx context.Context, iPPool *v1.IPPool, opts metav1.CreateOptions) (result *v1.IPPool, err error) {
obj, err := c.Fake.
Invokes(testing.NewCreateAction(ippoolsResource, c.ns, iPPool), &v1.IPPool{})
if obj == nil {
return nil, err
}
return obj.(*v1.IPPool), err
}
// Update takes the representation of an iPPool and updates it. Returns the server's representation of the iPPool, and an error, if there is any.
func (c *FakeIPPools) Update(ctx context.Context, iPPool *v1.IPPool, opts metav1.UpdateOptions) (result *v1.IPPool, err error) {
obj, err := c.Fake.
Invokes(testing.NewUpdateAction(ippoolsResource, c.ns, iPPool), &v1.IPPool{})
if obj == nil {
return nil, err
}
return obj.(*v1.IPPool), err
}
// Delete takes name of the iPPool and deletes it. Returns an error if one occurs.
func (c *FakeIPPools) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
_, err := c.Fake.
Invokes(testing.NewDeleteActionWithOptions(ippoolsResource, c.ns, name, opts), &v1.IPPool{})
return err
}
// DeleteCollection deletes a collection of objects.
func (c *FakeIPPools) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error {
action := testing.NewDeleteCollectionAction(ippoolsResource, c.ns, listOpts)
_, err := c.Fake.Invokes(action, &v1.IPPoolList{})
return err
}
// Patch applies the patch and returns the patched iPPool.
func (c *FakeIPPools) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.IPPool, err error) {
obj, err := c.Fake.
Invokes(testing.NewPatchSubresourceAction(ippoolsResource, c.ns, name, pt, data, subresources...), &v1.IPPool{})
if obj == nil {
return nil, err
}
return obj.(*v1.IPPool), err
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package fake
import (
v1 "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned/typed/neonvm/v1"
rest "k8s.io/client-go/rest"
testing "k8s.io/client-go/testing"
)
type FakeNeonvmV1 struct {
*testing.Fake
}
func (c *FakeNeonvmV1) IPPools(namespace string) v1.IPPoolInterface {
return &FakeIPPools{c, namespace}
}
func (c *FakeNeonvmV1) VirtualMachines(namespace string) v1.VirtualMachineInterface {
return &FakeVirtualMachines{c, namespace}
}
func (c *FakeNeonvmV1) VirtualMachineMigrations(namespace string) v1.VirtualMachineMigrationInterface {
return &FakeVirtualMachineMigrations{c, namespace}
}
// RESTClient returns a RESTClient that is used to communicate
// with the API server by this client implementation.
func (c *FakeNeonvmV1) RESTClient() rest.Interface {
var ret *rest.RESTClient
return ret
}
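// Tests can also inspect what was issued against the fakes: the embedded
// testing.Fake records every action. A short sketch (the clientset variable is
// illustrative; the verb/resource strings match the fakes in this package):
//
//	for _, action := range client.Actions() {
//		if action.Matches("create", "virtualmachines") {
//			// a Create call reached the fake VirtualMachines client
//		}
//	}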
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package fake
import (
"context"
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
labels "k8s.io/apimachinery/pkg/labels"
types "k8s.io/apimachinery/pkg/types"
watch "k8s.io/apimachinery/pkg/watch"
testing "k8s.io/client-go/testing"
)
// FakeVirtualMachines implements VirtualMachineInterface
type FakeVirtualMachines struct {
Fake *FakeNeonvmV1
ns string
}
var virtualmachinesResource = v1.SchemeGroupVersion.WithResource("virtualmachines")
var virtualmachinesKind = v1.SchemeGroupVersion.WithKind("VirtualMachine")
// Get takes name of the virtualMachine, and returns the corresponding virtualMachine object, and an error if there is any.
func (c *FakeVirtualMachines) Get(ctx context.Context, name string, options metav1.GetOptions) (result *v1.VirtualMachine, err error) {
obj, err := c.Fake.
Invokes(testing.NewGetAction(virtualmachinesResource, c.ns, name), &v1.VirtualMachine{})
if obj == nil {
return nil, err
}
return obj.(*v1.VirtualMachine), err
}
// List takes label and field selectors, and returns the list of VirtualMachines that match those selectors.
func (c *FakeVirtualMachines) List(ctx context.Context, opts metav1.ListOptions) (result *v1.VirtualMachineList, err error) {
obj, err := c.Fake.
Invokes(testing.NewListAction(virtualmachinesResource, virtualmachinesKind, c.ns, opts), &v1.VirtualMachineList{})
if obj == nil {
return nil, err
}
label, _, _ := testing.ExtractFromListOptions(opts)
if label == nil {
label = labels.Everything()
}
list := &v1.VirtualMachineList{ListMeta: obj.(*v1.VirtualMachineList).ListMeta}
for _, item := range obj.(*v1.VirtualMachineList).Items {
if label.Matches(labels.Set(item.Labels)) {
list.Items = append(list.Items, item)
}
}
return list, err
}
// Watch returns a watch.Interface that watches the requested virtualMachines.
func (c *FakeVirtualMachines) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) {
return c.Fake.
InvokesWatch(testing.NewWatchAction(virtualmachinesResource, c.ns, opts))
}
// Create takes the representation of a virtualMachine and creates it. Returns the server's representation of the virtualMachine, and an error, if there is any.
func (c *FakeVirtualMachines) Create(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.CreateOptions) (result *v1.VirtualMachine, err error) {
obj, err := c.Fake.
Invokes(testing.NewCreateAction(virtualmachinesResource, c.ns, virtualMachine), &v1.VirtualMachine{})
if obj == nil {
return nil, err
}
return obj.(*v1.VirtualMachine), err
}
// Update takes the representation of a virtualMachine and updates it. Returns the server's representation of the virtualMachine, and an error, if there is any.
func (c *FakeVirtualMachines) Update(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.UpdateOptions) (result *v1.VirtualMachine, err error) {
obj, err := c.Fake.
Invokes(testing.NewUpdateAction(virtualmachinesResource, c.ns, virtualMachine), &v1.VirtualMachine{})
if obj == nil {
return nil, err
}
return obj.(*v1.VirtualMachine), err
}
// UpdateStatus was generated because the type contains a Status member.
// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus().
func (c *FakeVirtualMachines) UpdateStatus(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.UpdateOptions) (*v1.VirtualMachine, error) {
obj, err := c.Fake.
Invokes(testing.NewUpdateSubresourceAction(virtualmachinesResource, "status", c.ns, virtualMachine), &v1.VirtualMachine{})
if obj == nil {
return nil, err
}
return obj.(*v1.VirtualMachine), err
}
// Delete takes name of the virtualMachine and deletes it. Returns an error if one occurs.
func (c *FakeVirtualMachines) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
_, err := c.Fake.
Invokes(testing.NewDeleteActionWithOptions(virtualmachinesResource, c.ns, name, opts), &v1.VirtualMachine{})
return err
}
// DeleteCollection deletes a collection of objects.
func (c *FakeVirtualMachines) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error {
action := testing.NewDeleteCollectionAction(virtualmachinesResource, c.ns, listOpts)
_, err := c.Fake.Invokes(action, &v1.VirtualMachineList{})
return err
}
// Patch applies the patch and returns the patched virtualMachine.
func (c *FakeVirtualMachines) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.VirtualMachine, err error) {
obj, err := c.Fake.
Invokes(testing.NewPatchSubresourceAction(virtualmachinesResource, c.ns, name, pt, data, subresources...), &v1.VirtualMachine{})
if obj == nil {
return nil, err
}
return obj.(*v1.VirtualMachine), err
}
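// The embedded testing.Fake also allows per-verb, per-resource overrides, which is
// handy for simulating API errors. A sketch (not generated code; "client" and the
// error message are illustrative, and k8stesting is k8s.io/client-go/testing):
//
//	client.PrependReactor("update", "virtualmachines",
//		func(action k8stesting.Action) (bool, runtime.Object, error) {
//			// Claim the action as handled and force the update to fail.
//			return true, nil, errors.New("injected update failure")
//		})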
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package fake
import (
"context"
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
labels "k8s.io/apimachinery/pkg/labels"
types "k8s.io/apimachinery/pkg/types"
watch "k8s.io/apimachinery/pkg/watch"
testing "k8s.io/client-go/testing"
)
// FakeVirtualMachineMigrations implements VirtualMachineMigrationInterface
type FakeVirtualMachineMigrations struct {
Fake *FakeNeonvmV1
ns string
}
var virtualmachinemigrationsResource = v1.SchemeGroupVersion.WithResource("virtualmachinemigrations")
var virtualmachinemigrationsKind = v1.SchemeGroupVersion.WithKind("VirtualMachineMigration")
// Get takes name of the virtualMachineMigration, and returns the corresponding virtualMachineMigration object, and an error if there is any.
func (c *FakeVirtualMachineMigrations) Get(ctx context.Context, name string, options metav1.GetOptions) (result *v1.VirtualMachineMigration, err error) {
obj, err := c.Fake.
Invokes(testing.NewGetAction(virtualmachinemigrationsResource, c.ns, name), &v1.VirtualMachineMigration{})
if obj == nil {
return nil, err
}
return obj.(*v1.VirtualMachineMigration), err
}
// List takes label and field selectors, and returns the list of VirtualMachineMigrations that match those selectors.
func (c *FakeVirtualMachineMigrations) List(ctx context.Context, opts metav1.ListOptions) (result *v1.VirtualMachineMigrationList, err error) {
obj, err := c.Fake.
Invokes(testing.NewListAction(virtualmachinemigrationsResource, virtualmachinemigrationsKind, c.ns, opts), &v1.VirtualMachineMigrationList{})
if obj == nil {
return nil, err
}
label, _, _ := testing.ExtractFromListOptions(opts)
if label == nil {
label = labels.Everything()
}
list := &v1.VirtualMachineMigrationList{ListMeta: obj.(*v1.VirtualMachineMigrationList).ListMeta}
for _, item := range obj.(*v1.VirtualMachineMigrationList).Items {
if label.Matches(labels.Set(item.Labels)) {
list.Items = append(list.Items, item)
}
}
return list, err
}
// Watch returns a watch.Interface that watches the requested virtualMachineMigrations.
func (c *FakeVirtualMachineMigrations) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) {
return c.Fake.
InvokesWatch(testing.NewWatchAction(virtualmachinemigrationsResource, c.ns, opts))
}
// Create takes the representation of a virtualMachineMigration and creates it. Returns the server's representation of the virtualMachineMigration, and an error, if there is any.
func (c *FakeVirtualMachineMigrations) Create(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.CreateOptions) (result *v1.VirtualMachineMigration, err error) {
obj, err := c.Fake.
Invokes(testing.NewCreateAction(virtualmachinemigrationsResource, c.ns, virtualMachineMigration), &v1.VirtualMachineMigration{})
if obj == nil {
return nil, err
}
return obj.(*v1.VirtualMachineMigration), err
}
// Update takes the representation of a virtualMachineMigration and updates it. Returns the server's representation of the virtualMachineMigration, and an error, if there is any.
func (c *FakeVirtualMachineMigrations) Update(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.UpdateOptions) (result *v1.VirtualMachineMigration, err error) {
obj, err := c.Fake.
Invokes(testing.NewUpdateAction(virtualmachinemigrationsResource, c.ns, virtualMachineMigration), &v1.VirtualMachineMigration{})
if obj == nil {
return nil, err
}
return obj.(*v1.VirtualMachineMigration), err
}
// UpdateStatus was generated because the type contains a Status member.
// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus().
func (c *FakeVirtualMachineMigrations) UpdateStatus(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.UpdateOptions) (*v1.VirtualMachineMigration, error) {
obj, err := c.Fake.
Invokes(testing.NewUpdateSubresourceAction(virtualmachinemigrationsResource, "status", c.ns, virtualMachineMigration), &v1.VirtualMachineMigration{})
if obj == nil {
return nil, err
}
return obj.(*v1.VirtualMachineMigration), err
}
// Delete takes name of the virtualMachineMigration and deletes it. Returns an error if one occurs.
func (c *FakeVirtualMachineMigrations) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
_, err := c.Fake.
Invokes(testing.NewDeleteActionWithOptions(virtualmachinemigrationsResource, c.ns, name, opts), &v1.VirtualMachineMigration{})
return err
}
// DeleteCollection deletes a collection of objects.
func (c *FakeVirtualMachineMigrations) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error {
action := testing.NewDeleteCollectionAction(virtualmachinemigrationsResource, c.ns, listOpts)
_, err := c.Fake.Invokes(action, &v1.VirtualMachineMigrationList{})
return err
}
// Patch applies the patch and returns the patched virtualMachineMigration.
func (c *FakeVirtualMachineMigrations) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.VirtualMachineMigration, err error) {
obj, err := c.Fake.
Invokes(testing.NewPatchSubresourceAction(virtualmachinemigrationsResource, c.ns, name, pt, data, subresources...), &v1.VirtualMachineMigration{})
if obj == nil {
return nil, err
}
return obj.(*v1.VirtualMachineMigration), err
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package v1
import (
"context"
"time"
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
scheme "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned/scheme"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
types "k8s.io/apimachinery/pkg/types"
watch "k8s.io/apimachinery/pkg/watch"
rest "k8s.io/client-go/rest"
)
// IPPoolsGetter has a method to return a IPPoolInterface.
// A group's client should implement this interface.
type IPPoolsGetter interface {
IPPools(namespace string) IPPoolInterface
}
// IPPoolInterface has methods to work with IPPool resources.
type IPPoolInterface interface {
Create(ctx context.Context, iPPool *v1.IPPool, opts metav1.CreateOptions) (*v1.IPPool, error)
Update(ctx context.Context, iPPool *v1.IPPool, opts metav1.UpdateOptions) (*v1.IPPool, error)
Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error
DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error
Get(ctx context.Context, name string, opts metav1.GetOptions) (*v1.IPPool, error)
List(ctx context.Context, opts metav1.ListOptions) (*v1.IPPoolList, error)
Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error)
Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.IPPool, err error)
IPPoolExpansion
}
// iPPools implements IPPoolInterface
type iPPools struct {
client rest.Interface
ns string
}
// newIPPools returns an iPPools
func newIPPools(c *NeonvmV1Client, namespace string) *iPPools {
return &iPPools{
client: c.RESTClient(),
ns: namespace,
}
}
// Get takes name of the iPPool, and returns the corresponding iPPool object, and an error if there is any.
func (c *iPPools) Get(ctx context.Context, name string, options metav1.GetOptions) (result *v1.IPPool, err error) {
result = &v1.IPPool{}
err = c.client.Get().
Namespace(c.ns).
Resource("ippools").
Name(name).
VersionedParams(&options, scheme.ParameterCodec).
Do(ctx).
Into(result)
return
}
// List takes label and field selectors, and returns the list of IPPools that match those selectors.
func (c *iPPools) List(ctx context.Context, opts metav1.ListOptions) (result *v1.IPPoolList, err error) {
var timeout time.Duration
if opts.TimeoutSeconds != nil {
timeout = time.Duration(*opts.TimeoutSeconds) * time.Second
}
result = &v1.IPPoolList{}
err = c.client.Get().
Namespace(c.ns).
Resource("ippools").
VersionedParams(&opts, scheme.ParameterCodec).
Timeout(timeout).
Do(ctx).
Into(result)
return
}
// Watch returns a watch.Interface that watches the requested iPPools.
func (c *iPPools) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) {
var timeout time.Duration
if opts.TimeoutSeconds != nil {
timeout = time.Duration(*opts.TimeoutSeconds) * time.Second
}
opts.Watch = true
return c.client.Get().
Namespace(c.ns).
Resource("ippools").
VersionedParams(&opts, scheme.ParameterCodec).
Timeout(timeout).
Watch(ctx)
}
// Create takes the representation of an iPPool and creates it. Returns the server's representation of the iPPool, and an error, if there is any.
func (c *iPPools) Create(ctx context.Context, iPPool *v1.IPPool, opts metav1.CreateOptions) (result *v1.IPPool, err error) {
result = &v1.IPPool{}
err = c.client.Post().
Namespace(c.ns).
Resource("ippools").
VersionedParams(&opts, scheme.ParameterCodec).
Body(iPPool).
Do(ctx).
Into(result)
return
}
// Update takes the representation of an iPPool and updates it. Returns the server's representation of the iPPool, and an error, if there is any.
func (c *iPPools) Update(ctx context.Context, iPPool *v1.IPPool, opts metav1.UpdateOptions) (result *v1.IPPool, err error) {
result = &v1.IPPool{}
err = c.client.Put().
Namespace(c.ns).
Resource("ippools").
Name(iPPool.Name).
VersionedParams(&opts, scheme.ParameterCodec).
Body(iPPool).
Do(ctx).
Into(result)
return
}
// Delete takes name of the iPPool and deletes it. Returns an error if one occurs.
func (c *iPPools) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
return c.client.Delete().
Namespace(c.ns).
Resource("ippools").
Name(name).
Body(&opts).
Do(ctx).
Error()
}
// DeleteCollection deletes a collection of objects.
func (c *iPPools) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error {
var timeout time.Duration
if listOpts.TimeoutSeconds != nil {
timeout = time.Duration(*listOpts.TimeoutSeconds) * time.Second
}
return c.client.Delete().
Namespace(c.ns).
Resource("ippools").
VersionedParams(&listOpts, scheme.ParameterCodec).
Timeout(timeout).
Body(&opts).
Do(ctx).
Error()
}
// Patch applies the patch and returns the patched iPPool.
func (c *iPPools) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.IPPool, err error) {
result = &v1.IPPool{}
err = c.client.Patch(pt).
Namespace(c.ns).
Resource("ippools").
Name(name).
SubResource(subresources...).
VersionedParams(&opts, scheme.ParameterCodec).
Body(data).
Do(ctx).
Into(result)
return
}
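// Usage sketch for this typed client through the NeonvmV1 interface ("nv1", the
// namespace, and ctx are illustrative, not part of the generated code):
//
//	pools, err := nv1.IPPools("default").List(ctx, metav1.ListOptions{})
//	if err != nil {
//		return err
//	}
//	for i := range pools.Items {
//		fmt.Println(pools.Items[i].Name)
//	}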
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package v1
import (
"net/http"
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned/scheme"
rest "k8s.io/client-go/rest"
)
type NeonvmV1Interface interface {
RESTClient() rest.Interface
IPPoolsGetter
VirtualMachinesGetter
VirtualMachineMigrationsGetter
}
// NeonvmV1Client is used to interact with features provided by the neonvm group.
type NeonvmV1Client struct {
restClient rest.Interface
}
func (c *NeonvmV1Client) IPPools(namespace string) IPPoolInterface {
return newIPPools(c, namespace)
}
func (c *NeonvmV1Client) VirtualMachines(namespace string) VirtualMachineInterface {
return newVirtualMachines(c, namespace)
}
func (c *NeonvmV1Client) VirtualMachineMigrations(namespace string) VirtualMachineMigrationInterface {
return newVirtualMachineMigrations(c, namespace)
}
// NewForConfig creates a new NeonvmV1Client for the given config.
// NewForConfig is equivalent to NewForConfigAndClient(c, httpClient),
// where httpClient was generated with rest.HTTPClientFor(c).
func NewForConfig(c *rest.Config) (*NeonvmV1Client, error) {
config := *c
if err := setConfigDefaults(&config); err != nil {
return nil, err
}
httpClient, err := rest.HTTPClientFor(&config)
if err != nil {
return nil, err
}
return NewForConfigAndClient(&config, httpClient)
}
// NewForConfigAndClient creates a new NeonvmV1Client for the given config and http client.
// Note the http client provided takes precedence over the configured transport values.
func NewForConfigAndClient(c *rest.Config, h *http.Client) (*NeonvmV1Client, error) {
config := *c
if err := setConfigDefaults(&config); err != nil {
return nil, err
}
client, err := rest.RESTClientForConfigAndClient(&config, h)
if err != nil {
return nil, err
}
return &NeonvmV1Client{client}, nil
}
// NewForConfigOrDie creates a new NeonvmV1Client for the given config and
// panics if there is an error in the config.
func NewForConfigOrDie(c *rest.Config) *NeonvmV1Client {
client, err := NewForConfig(c)
if err != nil {
panic(err)
}
return client
}
// New creates a new NeonvmV1Client for the given RESTClient.
func New(c rest.Interface) *NeonvmV1Client {
return &NeonvmV1Client{c}
}
func setConfigDefaults(config *rest.Config) error {
gv := v1.SchemeGroupVersion
config.GroupVersion = &gv
config.APIPath = "/apis"
config.NegotiatedSerializer = scheme.Codecs.WithoutConversion()
if config.UserAgent == "" {
config.UserAgent = rest.DefaultKubernetesUserAgent()
}
return nil
}
// RESTClient returns a RESTClient that is used to communicate
// with the API server by this client implementation.
func (c *NeonvmV1Client) RESTClient() rest.Interface {
if c == nil {
return nil
}
return c.restClient
}
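// Construction sketch (not generated code; kubeconfigPath, ctx, and the object
// names are illustrative):
//
//	cfg, err := rest.InClusterConfig() // or clientcmd.BuildConfigFromFlags("", kubeconfigPath)
//	if err != nil {
//		return err
//	}
//	nv1, err := NewForConfig(cfg)
//	if err != nil {
//		return err
//	}
//	vm, err := nv1.VirtualMachines("default").Get(ctx, "example-vm", metav1.GetOptions{})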
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package v1
import (
"context"
"time"
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
scheme "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned/scheme"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
types "k8s.io/apimachinery/pkg/types"
watch "k8s.io/apimachinery/pkg/watch"
rest "k8s.io/client-go/rest"
)
// VirtualMachinesGetter has a method to return a VirtualMachineInterface.
// A group's client should implement this interface.
type VirtualMachinesGetter interface {
VirtualMachines(namespace string) VirtualMachineInterface
}
// VirtualMachineInterface has methods to work with VirtualMachine resources.
type VirtualMachineInterface interface {
Create(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.CreateOptions) (*v1.VirtualMachine, error)
Update(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.UpdateOptions) (*v1.VirtualMachine, error)
UpdateStatus(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.UpdateOptions) (*v1.VirtualMachine, error)
Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error
DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error
Get(ctx context.Context, name string, opts metav1.GetOptions) (*v1.VirtualMachine, error)
List(ctx context.Context, opts metav1.ListOptions) (*v1.VirtualMachineList, error)
Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error)
Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.VirtualMachine, err error)
VirtualMachineExpansion
}
// virtualMachines implements VirtualMachineInterface
type virtualMachines struct {
client rest.Interface
ns string
}
// newVirtualMachines returns a VirtualMachines
func newVirtualMachines(c *NeonvmV1Client, namespace string) *virtualMachines {
return &virtualMachines{
client: c.RESTClient(),
ns: namespace,
}
}
// Get takes name of the virtualMachine, and returns the corresponding virtualMachine object, and an error if there is any.
func (c *virtualMachines) Get(ctx context.Context, name string, options metav1.GetOptions) (result *v1.VirtualMachine, err error) {
result = &v1.VirtualMachine{}
err = c.client.Get().
Namespace(c.ns).
Resource("virtualmachines").
Name(name).
VersionedParams(&options, scheme.ParameterCodec).
Do(ctx).
Into(result)
return
}
// List takes label and field selectors, and returns the list of VirtualMachines that match those selectors.
func (c *virtualMachines) List(ctx context.Context, opts metav1.ListOptions) (result *v1.VirtualMachineList, err error) {
var timeout time.Duration
if opts.TimeoutSeconds != nil {
timeout = time.Duration(*opts.TimeoutSeconds) * time.Second
}
result = &v1.VirtualMachineList{}
err = c.client.Get().
Namespace(c.ns).
Resource("virtualmachines").
VersionedParams(&opts, scheme.ParameterCodec).
Timeout(timeout).
Do(ctx).
Into(result)
return
}
// Watch returns a watch.Interface that watches the requested virtualMachines.
func (c *virtualMachines) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) {
var timeout time.Duration
if opts.TimeoutSeconds != nil {
timeout = time.Duration(*opts.TimeoutSeconds) * time.Second
}
opts.Watch = true
return c.client.Get().
Namespace(c.ns).
Resource("virtualmachines").
VersionedParams(&opts, scheme.ParameterCodec).
Timeout(timeout).
Watch(ctx)
}
// Create takes the representation of a virtualMachine and creates it. Returns the server's representation of the virtualMachine, and an error, if there is any.
func (c *virtualMachines) Create(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.CreateOptions) (result *v1.VirtualMachine, err error) {
result = &v1.VirtualMachine{}
err = c.client.Post().
Namespace(c.ns).
Resource("virtualmachines").
VersionedParams(&opts, scheme.ParameterCodec).
Body(virtualMachine).
Do(ctx).
Into(result)
return
}
// Update takes the representation of a virtualMachine and updates it. Returns the server's representation of the virtualMachine, and an error, if there is any.
func (c *virtualMachines) Update(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.UpdateOptions) (result *v1.VirtualMachine, err error) {
result = &v1.VirtualMachine{}
err = c.client.Put().
Namespace(c.ns).
Resource("virtualmachines").
Name(virtualMachine.Name).
VersionedParams(&opts, scheme.ParameterCodec).
Body(virtualMachine).
Do(ctx).
Into(result)
return
}
// UpdateStatus was generated because the type contains a Status member.
// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus().
func (c *virtualMachines) UpdateStatus(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.UpdateOptions) (result *v1.VirtualMachine, err error) {
result = &v1.VirtualMachine{}
err = c.client.Put().
Namespace(c.ns).
Resource("virtualmachines").
Name(virtualMachine.Name).
SubResource("status").
VersionedParams(&opts, scheme.ParameterCodec).
Body(virtualMachine).
Do(ctx).
Into(result)
return
}
// Delete takes name of the virtualMachine and deletes it. Returns an error if one occurs.
func (c *virtualMachines) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
return c.client.Delete().
Namespace(c.ns).
Resource("virtualmachines").
Name(name).
Body(&opts).
Do(ctx).
Error()
}
// DeleteCollection deletes a collection of objects.
func (c *virtualMachines) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error {
var timeout time.Duration
if listOpts.TimeoutSeconds != nil {
timeout = time.Duration(*listOpts.TimeoutSeconds) * time.Second
}
return c.client.Delete().
Namespace(c.ns).
Resource("virtualmachines").
VersionedParams(&listOpts, scheme.ParameterCodec).
Timeout(timeout).
Body(&opts).
Do(ctx).
Error()
}
// Patch applies the patch and returns the patched virtualMachine.
func (c *virtualMachines) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.VirtualMachine, err error) {
result = &v1.VirtualMachine{}
err = c.client.Patch(pt).
Namespace(c.ns).
Resource("virtualmachines").
Name(name).
SubResource(subresources...).
VersionedParams(&opts, scheme.ParameterCodec).
Body(data).
Do(ctx).
Into(result)
return
}
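// Because status changes go through the "status" subresource, a typical flow is
// read-modify-write via Get and UpdateStatus. A sketch (the Phase assignment is
// illustrative only; nv1, ns, name, and ctx are assumed to be in scope):
//
//	vm, err := nv1.VirtualMachines(ns).Get(ctx, name, metav1.GetOptions{})
//	if err != nil {
//		return err
//	}
//	vm.Status.Phase = "Running" // hypothetical value, shown only to illustrate the flow
//	if _, err := nv1.VirtualMachines(ns).UpdateStatus(ctx, vm, metav1.UpdateOptions{}); err != nil {
//		return err
//	}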
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package v1
import (
"context"
"time"
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
scheme "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned/scheme"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
types "k8s.io/apimachinery/pkg/types"
watch "k8s.io/apimachinery/pkg/watch"
rest "k8s.io/client-go/rest"
)
// VirtualMachineMigrationsGetter has a method to return a VirtualMachineMigrationInterface.
// A group's client should implement this interface.
type VirtualMachineMigrationsGetter interface {
VirtualMachineMigrations(namespace string) VirtualMachineMigrationInterface
}
// VirtualMachineMigrationInterface has methods to work with VirtualMachineMigration resources.
type VirtualMachineMigrationInterface interface {
Create(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.CreateOptions) (*v1.VirtualMachineMigration, error)
Update(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.UpdateOptions) (*v1.VirtualMachineMigration, error)
UpdateStatus(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.UpdateOptions) (*v1.VirtualMachineMigration, error)
Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error
DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error
Get(ctx context.Context, name string, opts metav1.GetOptions) (*v1.VirtualMachineMigration, error)
List(ctx context.Context, opts metav1.ListOptions) (*v1.VirtualMachineMigrationList, error)
Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error)
Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.VirtualMachineMigration, err error)
VirtualMachineMigrationExpansion
}
// virtualMachineMigrations implements VirtualMachineMigrationInterface
type virtualMachineMigrations struct {
client rest.Interface
ns string
}
// newVirtualMachineMigrations returns a VirtualMachineMigrations
func newVirtualMachineMigrations(c *NeonvmV1Client, namespace string) *virtualMachineMigrations {
return &virtualMachineMigrations{
client: c.RESTClient(),
ns: namespace,
}
}
// Get takes name of the virtualMachineMigration, and returns the corresponding virtualMachineMigration object, and an error if there is any.
func (c *virtualMachineMigrations) Get(ctx context.Context, name string, options metav1.GetOptions) (result *v1.VirtualMachineMigration, err error) {
result = &v1.VirtualMachineMigration{}
err = c.client.Get().
Namespace(c.ns).
Resource("virtualmachinemigrations").
Name(name).
VersionedParams(&options, scheme.ParameterCodec).
Do(ctx).
Into(result)
return
}
// List takes label and field selectors, and returns the list of VirtualMachineMigrations that match those selectors.
func (c *virtualMachineMigrations) List(ctx context.Context, opts metav1.ListOptions) (result *v1.VirtualMachineMigrationList, err error) {
var timeout time.Duration
if opts.TimeoutSeconds != nil {
timeout = time.Duration(*opts.TimeoutSeconds) * time.Second
}
result = &v1.VirtualMachineMigrationList{}
err = c.client.Get().
Namespace(c.ns).
Resource("virtualmachinemigrations").
VersionedParams(&opts, scheme.ParameterCodec).
Timeout(timeout).
Do(ctx).
Into(result)
return
}
// Watch returns a watch.Interface that watches the requested virtualMachineMigrations.
func (c *virtualMachineMigrations) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) {
var timeout time.Duration
if opts.TimeoutSeconds != nil {
timeout = time.Duration(*opts.TimeoutSeconds) * time.Second
}
opts.Watch = true
return c.client.Get().
Namespace(c.ns).
Resource("virtualmachinemigrations").
VersionedParams(&opts, scheme.ParameterCodec).
Timeout(timeout).
Watch(ctx)
}
// Create takes the representation of a virtualMachineMigration and creates it. Returns the server's representation of the virtualMachineMigration, and an error, if there is any.
func (c *virtualMachineMigrations) Create(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.CreateOptions) (result *v1.VirtualMachineMigration, err error) {
result = &v1.VirtualMachineMigration{}
err = c.client.Post().
Namespace(c.ns).
Resource("virtualmachinemigrations").
VersionedParams(&opts, scheme.ParameterCodec).
Body(virtualMachineMigration).
Do(ctx).
Into(result)
return
}
// Update takes the representation of a virtualMachineMigration and updates it. Returns the server's representation of the virtualMachineMigration, and an error, if there is any.
func (c *virtualMachineMigrations) Update(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.UpdateOptions) (result *v1.VirtualMachineMigration, err error) {
result = &v1.VirtualMachineMigration{}
err = c.client.Put().
Namespace(c.ns).
Resource("virtualmachinemigrations").
Name(virtualMachineMigration.Name).
VersionedParams(&opts, scheme.ParameterCodec).
Body(virtualMachineMigration).
Do(ctx).
Into(result)
return
}
// UpdateStatus was generated because the type contains a Status member.
// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus().
func (c *virtualMachineMigrations) UpdateStatus(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.UpdateOptions) (result *v1.VirtualMachineMigration, err error) {
result = &v1.VirtualMachineMigration{}
err = c.client.Put().
Namespace(c.ns).
Resource("virtualmachinemigrations").
Name(virtualMachineMigration.Name).
SubResource("status").
VersionedParams(&opts, scheme.ParameterCodec).
Body(virtualMachineMigration).
Do(ctx).
Into(result)
return
}
// Delete takes name of the virtualMachineMigration and deletes it. Returns an error if one occurs.
func (c *virtualMachineMigrations) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
return c.client.Delete().
Namespace(c.ns).
Resource("virtualmachinemigrations").
Name(name).
Body(&opts).
Do(ctx).
Error()
}
// DeleteCollection deletes a collection of objects.
func (c *virtualMachineMigrations) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error {
var timeout time.Duration
if listOpts.TimeoutSeconds != nil {
timeout = time.Duration(*listOpts.TimeoutSeconds) * time.Second
}
return c.client.Delete().
Namespace(c.ns).
Resource("virtualmachinemigrations").
VersionedParams(&listOpts, scheme.ParameterCodec).
Timeout(timeout).
Body(&opts).
Do(ctx).
Error()
}
// Patch applies the patch and returns the patched virtualMachineMigration.
func (c *virtualMachineMigrations) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.VirtualMachineMigration, err error) {
result = &v1.VirtualMachineMigration{}
err = c.client.Patch(pt).
Namespace(c.ns).
Resource("virtualmachinemigrations").
Name(name).
SubResource(subresources...).
VersionedParams(&opts, scheme.ParameterCodec).
Body(data).
Do(ctx).
Into(result)
return
}
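// Patch sketch using a JSON merge patch (the payload and label key are
// illustrative; nv1, ns, name, and ctx are assumed to be in scope):
//
//	patch := []byte(`{"metadata":{"labels":{"example":"true"}}}`)
//	vmm, err := nv1.VirtualMachineMigrations(ns).Patch(
//		ctx, name, types.MergePatchType, patch, metav1.PatchOptions{})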
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by informer-gen. DO NOT EDIT.
package externalversions
import (
reflect "reflect"
sync "sync"
time "time"
versioned "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
internalinterfaces "github.com/neondatabase/autoscaling/neonvm/client/informers/externalversions/internalinterfaces"
neonvm "github.com/neondatabase/autoscaling/neonvm/client/informers/externalversions/neonvm"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
schema "k8s.io/apimachinery/pkg/runtime/schema"
cache "k8s.io/client-go/tools/cache"
)
// SharedInformerOption defines the functional option type for SharedInformerFactory.
type SharedInformerOption func(*sharedInformerFactory) *sharedInformerFactory
type sharedInformerFactory struct {
client versioned.Interface
namespace string
tweakListOptions internalinterfaces.TweakListOptionsFunc
lock sync.Mutex
defaultResync time.Duration
customResync map[reflect.Type]time.Duration
transform cache.TransformFunc
informers map[reflect.Type]cache.SharedIndexInformer
// startedInformers is used for tracking which informers have been started.
// This allows Start() to be called multiple times safely.
startedInformers map[reflect.Type]bool
// wg tracks how many goroutines were started.
wg sync.WaitGroup
// shuttingDown is true when Shutdown has been called. It may still be running
// because it needs to wait for goroutines.
shuttingDown bool
}
// WithCustomResyncConfig sets a custom resync period for the specified informer types.
func WithCustomResyncConfig(resyncConfig map[v1.Object]time.Duration) SharedInformerOption {
return func(factory *sharedInformerFactory) *sharedInformerFactory {
for k, v := range resyncConfig {
factory.customResync[reflect.TypeOf(k)] = v
}
return factory
}
}
// WithTweakListOptions sets a custom filter on all listers of the configured SharedInformerFactory.
func WithTweakListOptions(tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerOption {
return func(factory *sharedInformerFactory) *sharedInformerFactory {
factory.tweakListOptions = tweakListOptions
return factory
}
}
// WithNamespace limits the SharedInformerFactory to the specified namespace.
func WithNamespace(namespace string) SharedInformerOption {
return func(factory *sharedInformerFactory) *sharedInformerFactory {
factory.namespace = namespace
return factory
}
}
// WithTransform sets a transform on all informers.
func WithTransform(transform cache.TransformFunc) SharedInformerOption {
return func(factory *sharedInformerFactory) *sharedInformerFactory {
factory.transform = transform
return factory
}
}
// NewSharedInformerFactory constructs a new instance of sharedInformerFactory for all namespaces.
func NewSharedInformerFactory(client versioned.Interface, defaultResync time.Duration) SharedInformerFactory {
return NewSharedInformerFactoryWithOptions(client, defaultResync)
}
// NewFilteredSharedInformerFactory constructs a new instance of sharedInformerFactory.
// Listers obtained via this SharedInformerFactory will be subject to the same filters
// as specified here.
// Deprecated: Please use NewSharedInformerFactoryWithOptions instead
func NewFilteredSharedInformerFactory(client versioned.Interface, defaultResync time.Duration, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerFactory {
return NewSharedInformerFactoryWithOptions(client, defaultResync, WithNamespace(namespace), WithTweakListOptions(tweakListOptions))
}
// NewSharedInformerFactoryWithOptions constructs a new instance of a SharedInformerFactory with additional options.
func NewSharedInformerFactoryWithOptions(client versioned.Interface, defaultResync time.Duration, options ...SharedInformerOption) SharedInformerFactory {
factory := &sharedInformerFactory{
client: client,
namespace: v1.NamespaceAll,
defaultResync: defaultResync,
informers: make(map[reflect.Type]cache.SharedIndexInformer),
startedInformers: make(map[reflect.Type]bool),
customResync: make(map[reflect.Type]time.Duration),
}
// Apply all options
for _, opt := range options {
factory = opt(factory)
}
return factory
}
func (f *sharedInformerFactory) Start(stopCh <-chan struct{}) {
f.lock.Lock()
defer f.lock.Unlock()
if f.shuttingDown {
return
}
for informerType, informer := range f.informers {
if !f.startedInformers[informerType] {
f.wg.Add(1)
// We need a new variable in each loop iteration,
// otherwise the goroutine would use the loop variable
// and that keeps changing.
informer := informer
go func() {
defer f.wg.Done()
informer.Run(stopCh)
}()
f.startedInformers[informerType] = true
}
}
}
func (f *sharedInformerFactory) Shutdown() {
f.lock.Lock()
f.shuttingDown = true
f.lock.Unlock()
// Will return immediately if there is nothing to wait for.
f.wg.Wait()
}
func (f *sharedInformerFactory) WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool {
informers := func() map[reflect.Type]cache.SharedIndexInformer {
f.lock.Lock()
defer f.lock.Unlock()
informers := map[reflect.Type]cache.SharedIndexInformer{}
for informerType, informer := range f.informers {
if f.startedInformers[informerType] {
informers[informerType] = informer
}
}
return informers
}()
res := map[reflect.Type]bool{}
for informType, informer := range informers {
res[informType] = cache.WaitForCacheSync(stopCh, informer.HasSynced)
}
return res
}
// InformerFor returns the SharedIndexInformer for obj using an internal
// client.
func (f *sharedInformerFactory) InformerFor(obj runtime.Object, newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer {
f.lock.Lock()
defer f.lock.Unlock()
informerType := reflect.TypeOf(obj)
informer, exists := f.informers[informerType]
if exists {
return informer
}
resyncPeriod, exists := f.customResync[informerType]
if !exists {
resyncPeriod = f.defaultResync
}
informer = newFunc(f.client, resyncPeriod)
informer.SetTransform(f.transform)
f.informers[informerType] = informer
return informer
}
// SharedInformerFactory provides shared informers for resources in all known
// API group versions.
//
// It is typically used like this:
//
// ctx, cancel := context.WithCancel(context.Background())
// defer cancel()
// factory := NewSharedInformerFactory(client, resyncPeriod)
// defer factory.Shutdown() // Returns immediately if nothing was started.
// genericInformer := factory.ForResource(resource)
// typedInformer := factory.SomeAPIGroup().V1().SomeType()
// factory.Start(ctx.Done()) // Start processing these informers.
// synced := factory.WaitForCacheSync(ctx.Done())
// for v, ok := range synced {
// if !ok {
// fmt.Fprintf(os.Stderr, "caches failed to sync: %v", v)
// return
// }
// }
//
// // Informers can also be created after Start, but then
// // Start must be called again:
// anotherGenericInformer := factory.ForResource(resource)
// factory.Start(ctx.Done())
type SharedInformerFactory interface {
internalinterfaces.SharedInformerFactory
// Start initializes all requested informers. They are handled in goroutines
// which run until the stop channel gets closed.
Start(stopCh <-chan struct{})
// Shutdown marks a factory as shutting down. At that point no new
// informers can be started anymore and Start will return without
// doing anything.
//
// In addition, Shutdown blocks until all goroutines have terminated. For that
// to happen, the close channel(s) that they were started with must be closed,
// either before Shutdown gets called or while it is waiting.
//
// Shutdown may be called multiple times, even concurrently. All such calls will
// block until all goroutines have terminated.
Shutdown()
// WaitForCacheSync blocks until all started informers' caches were synced
// or the stop channel gets closed.
WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool
// ForResource gives generic access to a shared informer of the matching type.
ForResource(resource schema.GroupVersionResource) (GenericInformer, error)
// InformerFor returns the SharedIndexInformer for obj using an internal
// client.
InformerFor(obj runtime.Object, newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer
Neonvm() neonvm.Interface
}
func (f *sharedInformerFactory) Neonvm() neonvm.Interface {
return neonvm.New(f, f.namespace, f.tweakListOptions)
}
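// A concrete sketch for this clientset's group, following the generic pattern in
// the SharedInformerFactory comment above (names such as "client" and the
// namespace are illustrative):
//
//	factory := NewSharedInformerFactoryWithOptions(client, 30*time.Second, WithNamespace("default"))
//	vms := factory.Neonvm().V1().VirtualMachines()
//	vms.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
//		AddFunc: func(obj interface{}) { /* react to a new VirtualMachine */ },
//	})
//	factory.Start(ctx.Done())
//	factory.WaitForCacheSync(ctx.Done())
//	running, err := vms.Lister().VirtualMachines("default").List(labels.Everything())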
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by informer-gen. DO NOT EDIT.
package externalversions
import (
"fmt"
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
schema "k8s.io/apimachinery/pkg/runtime/schema"
cache "k8s.io/client-go/tools/cache"
)
// GenericInformer is type of SharedIndexInformer which will locate and delegate to other
// sharedInformers based on type
type GenericInformer interface {
Informer() cache.SharedIndexInformer
Lister() cache.GenericLister
}
type genericInformer struct {
informer cache.SharedIndexInformer
resource schema.GroupResource
}
// Informer returns the SharedIndexInformer.
func (f *genericInformer) Informer() cache.SharedIndexInformer {
return f.informer
}
// Lister returns the GenericLister.
func (f *genericInformer) Lister() cache.GenericLister {
return cache.NewGenericLister(f.Informer().GetIndexer(), f.resource)
}
// ForResource gives generic access to a shared informer of the matching type
// TODO extend this to unknown resources with a client pool
func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) {
switch resource {
// Group=neonvm, Version=v1
case v1.SchemeGroupVersion.WithResource("ippools"):
return &genericInformer{resource: resource.GroupResource(), informer: f.Neonvm().V1().IPPools().Informer()}, nil
case v1.SchemeGroupVersion.WithResource("virtualmachines"):
return &genericInformer{resource: resource.GroupResource(), informer: f.Neonvm().V1().VirtualMachines().Informer()}, nil
case v1.SchemeGroupVersion.WithResource("virtualmachinemigrations"):
return &genericInformer{resource: resource.GroupResource(), informer: f.Neonvm().V1().VirtualMachineMigrations().Informer()}, nil
}
return nil, fmt.Errorf("no informer found for %v", resource)
}
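// ForResource sketch for generic, untyped access (the GVR literal mirrors the
// cases above; factory, ctx, and labels are assumed to be in scope):
//
//	gi, err := factory.ForResource(v1.SchemeGroupVersion.WithResource("virtualmachines"))
//	if err != nil {
//		return err
//	}
//	objs, err := gi.Lister().ByNamespace("default").List(labels.Everything())
//	// objs is []runtime.Object; callers type-assert items to *v1.VirtualMachine as needed.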
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by informer-gen. DO NOT EDIT.
package neonvm
import (
internalinterfaces "github.com/neondatabase/autoscaling/neonvm/client/informers/externalversions/internalinterfaces"
v1 "github.com/neondatabase/autoscaling/neonvm/client/informers/externalversions/neonvm/v1"
)
// Interface provides access to each of this group's versions.
type Interface interface {
// V1 provides access to shared informers for resources in V1.
V1() v1.Interface
}
type group struct {
factory internalinterfaces.SharedInformerFactory
namespace string
tweakListOptions internalinterfaces.TweakListOptionsFunc
}
// New returns a new Interface.
func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface {
return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions}
}
// V1 returns a new v1.Interface.
func (g *group) V1() v1.Interface {
return v1.New(g.factory, g.namespace, g.tweakListOptions)
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by informer-gen. DO NOT EDIT.
package v1
import (
internalinterfaces "github.com/neondatabase/autoscaling/neonvm/client/informers/externalversions/internalinterfaces"
)
// Interface provides access to all the informers in this group version.
type Interface interface {
// IPPools returns a IPPoolInformer.
IPPools() IPPoolInformer
// VirtualMachines returns a VirtualMachineInformer.
VirtualMachines() VirtualMachineInformer
// VirtualMachineMigrations returns a VirtualMachineMigrationInformer.
VirtualMachineMigrations() VirtualMachineMigrationInformer
}
type version struct {
factory internalinterfaces.SharedInformerFactory
namespace string
tweakListOptions internalinterfaces.TweakListOptionsFunc
}
// New returns a new Interface.
func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface {
return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions}
}
// IPPools returns a IPPoolInformer.
func (v *version) IPPools() IPPoolInformer {
return &iPPoolInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions}
}
// VirtualMachines returns a VirtualMachineInformer.
func (v *version) VirtualMachines() VirtualMachineInformer {
return &virtualMachineInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions}
}
// VirtualMachineMigrations returns a VirtualMachineMigrationInformer.
func (v *version) VirtualMachineMigrations() VirtualMachineMigrationInformer {
return &virtualMachineMigrationInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions}
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by informer-gen. DO NOT EDIT.
package v1
import (
"context"
time "time"
neonvmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
versioned "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
internalinterfaces "github.com/neondatabase/autoscaling/neonvm/client/informers/externalversions/internalinterfaces"
v1 "github.com/neondatabase/autoscaling/neonvm/client/listers/neonvm/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
watch "k8s.io/apimachinery/pkg/watch"
cache "k8s.io/client-go/tools/cache"
)
// IPPoolInformer provides access to a shared informer and lister for
// IPPools.
type IPPoolInformer interface {
Informer() cache.SharedIndexInformer
Lister() v1.IPPoolLister
}
type iPPoolInformer struct {
factory internalinterfaces.SharedInformerFactory
tweakListOptions internalinterfaces.TweakListOptionsFunc
namespace string
}
// NewIPPoolInformer constructs a new informer for IPPool type.
// Always prefer using an informer factory to get a shared informer instead of getting an independent
// one. This reduces memory footprint and number of connections to the server.
func NewIPPoolInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer {
return NewFilteredIPPoolInformer(client, namespace, resyncPeriod, indexers, nil)
}
// NewFilteredIPPoolInformer constructs a new informer for IPPool type.
// Always prefer using an informer factory to get a shared informer instead of getting an independent
// one. This reduces memory footprint and number of connections to the server.
func NewFilteredIPPoolInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer {
return cache.NewSharedIndexInformer(
&cache.ListWatch{
ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
if tweakListOptions != nil {
tweakListOptions(&options)
}
return client.NeonvmV1().IPPools(namespace).List(context.TODO(), options)
},
WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
if tweakListOptions != nil {
tweakListOptions(&options)
}
return client.NeonvmV1().IPPools(namespace).Watch(context.TODO(), options)
},
},
&neonvmv1.IPPool{},
resyncPeriod,
indexers,
)
}
func (f *iPPoolInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer {
return NewFilteredIPPoolInformer(client, f.namespace, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions)
}
func (f *iPPoolInformer) Informer() cache.SharedIndexInformer {
return f.factory.InformerFor(&neonvmv1.IPPool{}, f.defaultInformer)
}
func (f *iPPoolInformer) Lister() v1.IPPoolLister {
return v1.NewIPPoolLister(f.Informer().GetIndexer())
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by informer-gen. DO NOT EDIT.
package v1
import (
"context"
time "time"
neonvmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
versioned "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
internalinterfaces "github.com/neondatabase/autoscaling/neonvm/client/informers/externalversions/internalinterfaces"
v1 "github.com/neondatabase/autoscaling/neonvm/client/listers/neonvm/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
watch "k8s.io/apimachinery/pkg/watch"
cache "k8s.io/client-go/tools/cache"
)
// VirtualMachineInformer provides access to a shared informer and lister for
// VirtualMachines.
type VirtualMachineInformer interface {
Informer() cache.SharedIndexInformer
Lister() v1.VirtualMachineLister
}
type virtualMachineInformer struct {
factory internalinterfaces.SharedInformerFactory
tweakListOptions internalinterfaces.TweakListOptionsFunc
namespace string
}
// NewVirtualMachineInformer constructs a new informer for VirtualMachine type.
// Always prefer using an informer factory to get a shared informer instead of getting an independent
// one. This reduces memory footprint and number of connections to the server.
func NewVirtualMachineInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer {
return NewFilteredVirtualMachineInformer(client, namespace, resyncPeriod, indexers, nil)
}
// NewFilteredVirtualMachineInformer constructs a new informer for VirtualMachine type.
// Always prefer using an informer factory to get a shared informer instead of getting an independent
// one. This reduces memory footprint and number of connections to the server.
func NewFilteredVirtualMachineInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer {
return cache.NewSharedIndexInformer(
&cache.ListWatch{
ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
if tweakListOptions != nil {
tweakListOptions(&options)
}
return client.NeonvmV1().VirtualMachines(namespace).List(context.TODO(), options)
},
WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
if tweakListOptions != nil {
tweakListOptions(&options)
}
return client.NeonvmV1().VirtualMachines(namespace).Watch(context.TODO(), options)
},
},
&neonvmv1.VirtualMachine{},
resyncPeriod,
indexers,
)
}
func (f *virtualMachineInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer {
return NewFilteredVirtualMachineInformer(client, f.namespace, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions)
}
func (f *virtualMachineInformer) Informer() cache.SharedIndexInformer {
return f.factory.InformerFor(&neonvmv1.VirtualMachine{}, f.defaultInformer)
}
func (f *virtualMachineInformer) Lister() v1.VirtualMachineLister {
return v1.NewVirtualMachineLister(f.Informer().GetIndexer())
}
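// Downstream usage sketch (illustrative, not part of the generated API):
// callers normally obtain a shared informer via the generated factory, but the
// standalone constructor above can also be wired up directly when a dedicated
// informer is acceptable. Names like `clientset`, `ctx`, and the "default"
// namespace below are placeholders.
//
//	informer := NewVirtualMachineInformer(clientset, "default", 5*time.Minute, cache.Indexers{})
//	lister := v1.NewVirtualMachineLister(informer.GetIndexer())
//	go informer.Run(ctx.Done())
//	cache.WaitForCacheSync(ctx.Done(), informer.HasSynced)
//	vms, _ := lister.VirtualMachines("default").List(labels.Everything())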
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by informer-gen. DO NOT EDIT.
package v1
import (
"context"
time "time"
neonvmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
versioned "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
internalinterfaces "github.com/neondatabase/autoscaling/neonvm/client/informers/externalversions/internalinterfaces"
v1 "github.com/neondatabase/autoscaling/neonvm/client/listers/neonvm/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
watch "k8s.io/apimachinery/pkg/watch"
cache "k8s.io/client-go/tools/cache"
)
// VirtualMachineMigrationInformer provides access to a shared informer and lister for
// VirtualMachineMigrations.
type VirtualMachineMigrationInformer interface {
Informer() cache.SharedIndexInformer
Lister() v1.VirtualMachineMigrationLister
}
type virtualMachineMigrationInformer struct {
factory internalinterfaces.SharedInformerFactory
tweakListOptions internalinterfaces.TweakListOptionsFunc
namespace string
}
// NewVirtualMachineMigrationInformer constructs a new informer for VirtualMachineMigration type.
// Always prefer using an informer factory to get a shared informer instead of getting an independent
// one. This reduces memory footprint and number of connections to the server.
func NewVirtualMachineMigrationInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer {
return NewFilteredVirtualMachineMigrationInformer(client, namespace, resyncPeriod, indexers, nil)
}
// NewFilteredVirtualMachineMigrationInformer constructs a new informer for VirtualMachineMigration type.
// Always prefer using an informer factory to get a shared informer instead of getting an independent
// one. This reduces memory footprint and number of connections to the server.
func NewFilteredVirtualMachineMigrationInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer {
return cache.NewSharedIndexInformer(
&cache.ListWatch{
ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
if tweakListOptions != nil {
tweakListOptions(&options)
}
return client.NeonvmV1().VirtualMachineMigrations(namespace).List(context.TODO(), options)
},
WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
if tweakListOptions != nil {
tweakListOptions(&options)
}
return client.NeonvmV1().VirtualMachineMigrations(namespace).Watch(context.TODO(), options)
},
},
&neonvmv1.VirtualMachineMigration{},
resyncPeriod,
indexers,
)
}
func (f *virtualMachineMigrationInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer {
return NewFilteredVirtualMachineMigrationInformer(client, f.namespace, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions)
}
func (f *virtualMachineMigrationInformer) Informer() cache.SharedIndexInformer {
return f.factory.InformerFor(&neonvmv1.VirtualMachineMigration{}, f.defaultInformer)
}
func (f *virtualMachineMigrationInformer) Lister() v1.VirtualMachineMigrationLister {
return v1.NewVirtualMachineMigrationLister(f.Informer().GetIndexer())
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by lister-gen. DO NOT EDIT.
package v1
import (
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/tools/cache"
)
// IPPoolLister helps list IPPools.
// All objects returned here must be treated as read-only.
type IPPoolLister interface {
// List lists all IPPools in the indexer.
// Objects returned here must be treated as read-only.
List(selector labels.Selector) (ret []*v1.IPPool, err error)
// IPPools returns an object that can list and get IPPools.
IPPools(namespace string) IPPoolNamespaceLister
IPPoolListerExpansion
}
// iPPoolLister implements the IPPoolLister interface.
type iPPoolLister struct {
indexer cache.Indexer
}
// NewIPPoolLister returns a new IPPoolLister.
func NewIPPoolLister(indexer cache.Indexer) IPPoolLister {
return &iPPoolLister{indexer: indexer}
}
// List lists all IPPools in the indexer.
func (s *iPPoolLister) List(selector labels.Selector) (ret []*v1.IPPool, err error) {
err = cache.ListAll(s.indexer, selector, func(m interface{}) {
ret = append(ret, m.(*v1.IPPool))
})
return ret, err
}
// IPPools returns an object that can list and get IPPools.
func (s *iPPoolLister) IPPools(namespace string) IPPoolNamespaceLister {
return iPPoolNamespaceLister{indexer: s.indexer, namespace: namespace}
}
// IPPoolNamespaceLister helps list and get IPPools.
// All objects returned here must be treated as read-only.
type IPPoolNamespaceLister interface {
// List lists all IPPools in the indexer for a given namespace.
// Objects returned here must be treated as read-only.
List(selector labels.Selector) (ret []*v1.IPPool, err error)
// Get retrieves the IPPool from the indexer for a given namespace and name.
// Objects returned here must be treated as read-only.
Get(name string) (*v1.IPPool, error)
IPPoolNamespaceListerExpansion
}
// iPPoolNamespaceLister implements the IPPoolNamespaceLister
// interface.
type iPPoolNamespaceLister struct {
indexer cache.Indexer
namespace string
}
// List lists all IPPools in the indexer for a given namespace.
func (s iPPoolNamespaceLister) List(selector labels.Selector) (ret []*v1.IPPool, err error) {
err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) {
ret = append(ret, m.(*v1.IPPool))
})
return ret, err
}
// Get retrieves the IPPool from the indexer for a given namespace and name.
func (s iPPoolNamespaceLister) Get(name string) (*v1.IPPool, error) {
obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name)
if err != nil {
return nil, err
}
if !exists {
return nil, errors.NewNotFound(v1.Resource("ippool"), name)
}
return obj.(*v1.IPPool), nil
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by lister-gen. DO NOT EDIT.
package v1
import (
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/tools/cache"
)
// VirtualMachineLister helps list VirtualMachines.
// All objects returned here must be treated as read-only.
type VirtualMachineLister interface {
// List lists all VirtualMachines in the indexer.
// Objects returned here must be treated as read-only.
List(selector labels.Selector) (ret []*v1.VirtualMachine, err error)
// VirtualMachines returns an object that can list and get VirtualMachines.
VirtualMachines(namespace string) VirtualMachineNamespaceLister
VirtualMachineListerExpansion
}
// virtualMachineLister implements the VirtualMachineLister interface.
type virtualMachineLister struct {
indexer cache.Indexer
}
// NewVirtualMachineLister returns a new VirtualMachineLister.
func NewVirtualMachineLister(indexer cache.Indexer) VirtualMachineLister {
return &virtualMachineLister{indexer: indexer}
}
// List lists all VirtualMachines in the indexer.
func (s *virtualMachineLister) List(selector labels.Selector) (ret []*v1.VirtualMachine, err error) {
err = cache.ListAll(s.indexer, selector, func(m interface{}) {
ret = append(ret, m.(*v1.VirtualMachine))
})
return ret, err
}
// VirtualMachines returns an object that can list and get VirtualMachines.
func (s *virtualMachineLister) VirtualMachines(namespace string) VirtualMachineNamespaceLister {
return virtualMachineNamespaceLister{indexer: s.indexer, namespace: namespace}
}
// VirtualMachineNamespaceLister helps list and get VirtualMachines.
// All objects returned here must be treated as read-only.
type VirtualMachineNamespaceLister interface {
// List lists all VirtualMachines in the indexer for a given namespace.
// Objects returned here must be treated as read-only.
List(selector labels.Selector) (ret []*v1.VirtualMachine, err error)
// Get retrieves the VirtualMachine from the indexer for a given namespace and name.
// Objects returned here must be treated as read-only.
Get(name string) (*v1.VirtualMachine, error)
VirtualMachineNamespaceListerExpansion
}
// virtualMachineNamespaceLister implements the VirtualMachineNamespaceLister
// interface.
type virtualMachineNamespaceLister struct {
indexer cache.Indexer
namespace string
}
// List lists all VirtualMachines in the indexer for a given namespace.
func (s virtualMachineNamespaceLister) List(selector labels.Selector) (ret []*v1.VirtualMachine, err error) {
err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) {
ret = append(ret, m.(*v1.VirtualMachine))
})
return ret, err
}
// Get retrieves the VirtualMachine from the indexer for a given namespace and name.
func (s virtualMachineNamespaceLister) Get(name string) (*v1.VirtualMachine, error) {
obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name)
if err != nil {
return nil, err
}
if !exists {
return nil, errors.NewNotFound(v1.Resource("virtualmachine"), name)
}
return obj.(*v1.VirtualMachine), nil
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by lister-gen. DO NOT EDIT.
package v1
import (
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/tools/cache"
)
// VirtualMachineMigrationLister helps list VirtualMachineMigrations.
// All objects returned here must be treated as read-only.
type VirtualMachineMigrationLister interface {
// List lists all VirtualMachineMigrations in the indexer.
// Objects returned here must be treated as read-only.
List(selector labels.Selector) (ret []*v1.VirtualMachineMigration, err error)
// VirtualMachineMigrations returns an object that can list and get VirtualMachineMigrations.
VirtualMachineMigrations(namespace string) VirtualMachineMigrationNamespaceLister
VirtualMachineMigrationListerExpansion
}
// virtualMachineMigrationLister implements the VirtualMachineMigrationLister interface.
type virtualMachineMigrationLister struct {
indexer cache.Indexer
}
// NewVirtualMachineMigrationLister returns a new VirtualMachineMigrationLister.
func NewVirtualMachineMigrationLister(indexer cache.Indexer) VirtualMachineMigrationLister {
return &virtualMachineMigrationLister{indexer: indexer}
}
// List lists all VirtualMachineMigrations in the indexer.
func (s *virtualMachineMigrationLister) List(selector labels.Selector) (ret []*v1.VirtualMachineMigration, err error) {
err = cache.ListAll(s.indexer, selector, func(m interface{}) {
ret = append(ret, m.(*v1.VirtualMachineMigration))
})
return ret, err
}
// VirtualMachineMigrations returns an object that can list and get VirtualMachineMigrations.
func (s *virtualMachineMigrationLister) VirtualMachineMigrations(namespace string) VirtualMachineMigrationNamespaceLister {
return virtualMachineMigrationNamespaceLister{indexer: s.indexer, namespace: namespace}
}
// VirtualMachineMigrationNamespaceLister helps list and get VirtualMachineMigrations.
// All objects returned here must be treated as read-only.
type VirtualMachineMigrationNamespaceLister interface {
// List lists all VirtualMachineMigrations in the indexer for a given namespace.
// Objects returned here must be treated as read-only.
List(selector labels.Selector) (ret []*v1.VirtualMachineMigration, err error)
// Get retrieves the VirtualMachineMigration from the indexer for a given namespace and name.
// Objects returned here must be treated as read-only.
Get(name string) (*v1.VirtualMachineMigration, error)
VirtualMachineMigrationNamespaceListerExpansion
}
// virtualMachineMigrationNamespaceLister implements the VirtualMachineMigrationNamespaceLister
// interface.
type virtualMachineMigrationNamespaceLister struct {
indexer cache.Indexer
namespace string
}
// List lists all VirtualMachineMigrations in the indexer for a given namespace.
func (s virtualMachineMigrationNamespaceLister) List(selector labels.Selector) (ret []*v1.VirtualMachineMigration, err error) {
err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) {
ret = append(ret, m.(*v1.VirtualMachineMigration))
})
return ret, err
}
// Get retrieves the VirtualMachineMigration from the indexer for a given namespace and name.
func (s virtualMachineMigrationNamespaceLister) Get(name string) (*v1.VirtualMachineMigration, error) {
obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name)
if err != nil {
return nil, err
}
if !exists {
return nil, errors.NewNotFound(v1.Resource("virtualmachinemigration"), name)
}
return obj.(*v1.VirtualMachineMigration), nil
}
package agent
import (
"fmt"
"os"
)
// EnvArgs stores the static configuration data assigned to the autoscaler agent by its
// environment
type EnvArgs struct {
// ConfigPath gives the path to read static configuration from. It is taken from the CONFIG_PATH
// environment variable.
ConfigPath string
// K8sNodeName is the Kubernetes node the autoscaler agent is running on. It is taken from the
// K8S_NODE_NAME environment variable, which is set equal to the pod's Spec.NodeName.
//
// The Kubernetes documentation doesn't say this, but the NodeName is always populated with the
// final node the pod was placed on by the time the environment variables are set.
K8sNodeName string
// K8sPodIP is the IP address of the Kubernetes pod that this autoscaler-agent is running in
K8sPodIP string
}
func getEnvVar(err *error, requireNonempty bool, varName string) string {
if *err != nil {
return ""
}
s := os.Getenv(varName)
if s == "" && requireNonempty {
*err = fmt.Errorf("Missing %s in environment", varName)
}
return s
}
func ArgsFromEnv() (EnvArgs, error) {
var err error
args := EnvArgs{
ConfigPath: getEnvVar(&err, true, "CONFIG_PATH"),
K8sNodeName: getEnvVar(&err, true, "K8S_NODE_NAME"),
K8sPodIP: getEnvVar(&err, true, "K8S_POD_IP"),
}
if err != nil {
return EnvArgs{}, err
}
return args, nil
}
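// illustrateArgsFromEnv is a hypothetical sketch (not called by the agent) of
// the intended pattern: read every required variable once at startup and fail
// fast if any are missing, rather than consulting os.Getenv at each use site.
func illustrateArgsFromEnv() error {
    args, err := ArgsFromEnv()
    if err != nil {
        // e.g. "Missing K8S_NODE_NAME in environment"
        return fmt.Errorf("Error reading environment args: %w", err)
    }
    fmt.Printf("running on node %q with pod IP %q\n", args.K8sNodeName, args.K8sPodIP)
    return nil
}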
package billing
import (
"context"
"errors"
"fmt"
"math"
"time"
"go.uber.org/zap"
"k8s.io/apimachinery/pkg/types"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/reporting"
"github.com/neondatabase/autoscaling/pkg/util/taskgroup"
)
type Config struct {
Clients ClientsConfig `json:"clients"`
CPUMetricName string `json:"cpuMetricName"`
ActiveTimeMetricName string `json:"activeTimeMetricName"`
CollectEverySeconds uint `json:"collectEverySeconds"`
AccumulateEverySeconds uint `json:"accumulateEverySeconds"`
}
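// For orientation, a hypothetical JSON fragment matching the struct above
// (the metric names and intervals are made-up examples; see ClientsConfig for
// the per-client fields):
//
//	"billing": {
//	    "clients": { ... },
//	    "cpuMetricName": "effective_compute_seconds",
//	    "activeTimeMetricName": "active_time_seconds",
//	    "collectEverySeconds": 4,
//	    "accumulateEverySeconds": 24
//	}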
type metricsState struct {
historical map[metricsKey]vmMetricsHistory
present map[metricsKey]vmMetricsInstant
lastCollectTime *time.Time
pushWindowStart time.Time
}
type metricsKey struct {
uid types.UID
endpointID string
}
type vmMetricsHistory struct {
lastSlice *metricsTimeSlice
total vmMetricsSeconds
}
type metricsTimeSlice struct {
metrics vmMetricsInstant
startTime time.Time
endTime time.Time
}
func (m *metricsTimeSlice) Duration() time.Duration { return m.endTime.Sub(m.startTime) }
type vmMetricsInstant struct {
// cpu stores the cpu allocation at a particular instant.
cpu vmv1.MilliCPU
}
// vmMetricsSeconds is like vmMetricsInstant, but the values cover the allocation over time
type vmMetricsSeconds struct {
// cpu stores the CPU seconds allocated to the VM, roughly equivalent to the integral of CPU
// usage over time.
cpu float64
// activeTime stores the total time that the VM was active
activeTime time.Duration
}
type MetricsCollector struct {
conf *Config
sink *reporting.EventSink[*IncrementalEvent]
metrics PromMetrics
}
func NewMetricsCollector(
ctx context.Context,
parentLogger *zap.Logger,
conf *Config,
metrics PromMetrics,
) (*MetricsCollector, error) {
logger := parentLogger.Named("billing")
clients, err := createClients(ctx, logger, conf.Clients)
if err != nil {
return nil, err
}
sink := reporting.NewEventSink(logger, metrics.reporting, clients...)
return &MetricsCollector{
conf: conf,
sink: sink,
metrics: metrics,
}, nil
}
func (mc *MetricsCollector) Run(
ctx context.Context,
logger *zap.Logger,
store VMStoreForNode,
) error {
tg := taskgroup.NewGroup(logger, taskgroup.WithParentContext(ctx))
// note: sink has its own context, so that it is canceled only after runCollector finishes.
sinkCtx, cancelSink := context.WithCancel(context.Background())
defer cancelSink() // make sure resources are cleaned up
tg.Go("collect", func(logger *zap.Logger) error {
defer cancelSink() // cancel event sending *only when we're done collecting*
return mc.runCollector(tg.Ctx(), logger, store)
})
tg.Go("sink-run", func(logger *zap.Logger) error {
err := mc.sink.Run(sinkCtx) // note: NOT tg.Ctx(); see more above.
if err != nil {
return fmt.Errorf("billing events sink failed: %w", err)
}
return nil
})
return tg.Wait()
}
func (mc *MetricsCollector) runCollector(
ctx context.Context,
logger *zap.Logger,
store VMStoreForNode,
) error {
collectTicker := time.NewTicker(time.Second * time.Duration(mc.conf.CollectEverySeconds))
defer collectTicker.Stop()
// Offset the accumulate ticker by half a second relative to the collect ticker, so their relative firing order is a bit more deterministic.
time.Sleep(500 * time.Millisecond)
accumulateTicker := time.NewTicker(time.Second * time.Duration(mc.conf.AccumulateEverySeconds))
defer accumulateTicker.Stop()
state := metricsState{
historical: make(map[metricsKey]vmMetricsHistory),
present: make(map[metricsKey]vmMetricsInstant),
lastCollectTime: nil,
pushWindowStart: time.Now(),
}
state.collect(logger, store, mc.metrics)
for {
select {
case <-collectTicker.C:
logger.Info("Collecting billing state")
if store.Stopped() && ctx.Err() == nil {
err := errors.New("VM store stopped but background context is still live")
logger.Panic("Validation check failed", zap.Error(err))
return err
}
state.collect(logger, store, mc.metrics)
case <-accumulateTicker.C:
logger.Info("Creating billing batch")
state.drainEnqueue(logger, mc.conf, GetHostname(), mc.sink)
case <-ctx.Done():
return nil
}
}
}
func (s *metricsState) collect(logger *zap.Logger, store VMStoreForNode, metrics PromMetrics) {
now := time.Now()
metricsBatch := metrics.forBatch()
defer metricsBatch.finish() // This doesn't *really* need to be deferred, but it's up here so we don't forget
old := s.present
s.present = make(map[metricsKey]vmMetricsInstant)
var vmsOnThisNode []*vmv1.VirtualMachine
if store.Failing() {
logger.Error("VM store is currently stopped. No events will be recorded")
} else {
vmsOnThisNode = store.ListIndexed(func(i *VMNodeIndex) []*vmv1.VirtualMachine {
return i.List()
})
}
for _, vm := range vmsOnThisNode {
endpointID, isEndpoint := vm.Annotations[api.AnnotationBillingEndpointID]
metricsBatch.inc(isEndpointFlag(isEndpoint), autoscalingEnabledFlag(api.HasAutoscalingEnabled(vm)), vm.Status.Phase)
if !isEndpoint {
// we're only reporting metrics for VMs with endpoint IDs, and this VM doesn't have one
continue
}
if !vm.Status.Phase.IsAlive() || vm.Status.CPUs == nil {
continue
}
key := metricsKey{
uid: vm.UID,
endpointID: endpointID,
}
presentMetrics := vmMetricsInstant{
cpu: *vm.Status.CPUs,
}
if oldMetrics, ok := old[key]; ok {
// The VM was present from s.lastCollectTime to now. Add a time slice to its metrics history.
timeSlice := metricsTimeSlice{
metrics: vmMetricsInstant{
// strategically under-bill by assigning the minimum to the entire time slice.
cpu: min(oldMetrics.cpu, presentMetrics.cpu),
},
// note: we know s.lastCollectTime != nil because otherwise old would be empty.
startTime: *s.lastCollectTime,
endTime: now,
}
vmHistory, ok := s.historical[key]
if !ok {
vmHistory = vmMetricsHistory{
lastSlice: nil,
total: vmMetricsSeconds{cpu: 0, activeTime: time.Duration(0)},
}
}
// append the slice, merging with the previous if the resource usage was the same
vmHistory.appendSlice(timeSlice)
s.historical[key] = vmHistory
}
s.present[key] = presentMetrics
}
s.lastCollectTime = &now
}
func (h *vmMetricsHistory) appendSlice(timeSlice metricsTimeSlice) {
// Try to extend the existing period of continuous usage
if h.lastSlice != nil && h.lastSlice.tryMerge(timeSlice) {
return
}
// Something's new. Push previous time slice, start new one:
h.finalizeCurrentTimeSlice()
h.lastSlice = &timeSlice
}
// finalizeCurrentTimeSlice pushes the current time slice onto h.total
//
// This ends up rounding down the total time spent on a given time slice, so it's best to defer
// calling this function until it's actually needed.
func (h *vmMetricsHistory) finalizeCurrentTimeSlice() {
if h.lastSlice == nil {
return
}
duration := h.lastSlice.Duration()
if duration < 0 {
panic("negative duration")
}
// TODO: This approach is imperfect. Floating-point math is probably *fine*, but really not
// something we want to rely on. A "proper" solution is a lot of work, but long-term valuable.
metricsSeconds := vmMetricsSeconds{
cpu: duration.Seconds() * h.lastSlice.metrics.cpu.AsFloat64(),
activeTime: duration,
}
h.total.cpu += metricsSeconds.cpu
h.total.activeTime += metricsSeconds.activeTime
h.lastSlice = nil
}
// tryMerge attempts to merge s and next (assuming that next is after s), returning true only if
// that merging was successful.
//
// Merging may fail if s.endTime != next.startTime or s.metrics != next.metrics.
func (s *metricsTimeSlice) tryMerge(next metricsTimeSlice) bool {
merged := s.endTime == next.startTime && s.metrics == next.metrics
if merged {
s.endTime = next.endTime
}
return merged
}
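// illustrateTimeSliceMerge is a hypothetical sketch (not called anywhere) of
// the merge rule above: two back-to-back slices with identical metrics collapse
// into one longer slice, while a change in CPU (or a gap in time) leaves them
// separate. It assumes vmv1.MilliCPU can be constructed directly from an
// integer number of millicores.
func illustrateTimeSliceMerge() {
    t0 := time.Date(2024, time.January, 1, 0, 0, 0, 0, time.UTC)
    quarterCPU := vmMetricsInstant{cpu: vmv1.MilliCPU(250)}
    first := metricsTimeSlice{metrics: quarterCPU, startTime: t0, endTime: t0.Add(5 * time.Second)}
    second := metricsTimeSlice{metrics: quarterCPU, startTime: t0.Add(5 * time.Second), endTime: t0.Add(10 * time.Second)}
    merged := first.tryMerge(second)
    fmt.Println(merged, first.Duration()) // "true 10s"
    // A slice with different metrics does not merge; appendSlice would finalize
    // `first` into the running totals and start tracking the new slice instead.
    halfCPU := metricsTimeSlice{
        metrics:   vmMetricsInstant{cpu: vmv1.MilliCPU(500)},
        startTime: t0.Add(10 * time.Second),
        endTime:   t0.Add(15 * time.Second),
    }
    fmt.Println(first.tryMerge(halfCPU)) // "false"
}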
func logAddedEvent(logger *zap.Logger, event *IncrementalEvent) *IncrementalEvent {
logger.Info(
"Adding event to batch",
zap.String("IdempotencyKey", event.IdempotencyKey),
zap.String("EndpointID", event.EndpointID),
zap.String("MetricName", event.MetricName),
zap.Int("Value", event.Value),
)
return event
}
// drainEnqueue clears the current history, adding it as events to the queue
func (s *metricsState) drainEnqueue(
logger *zap.Logger,
conf *Config,
hostname string,
sink *reporting.EventSink[*IncrementalEvent],
) {
now := time.Now()
countInBatch := 0
batchSize := 2 * len(s.historical)
enqueue := sink.Enqueue
for key, history := range s.historical {
history.finalizeCurrentTimeSlice()
countInBatch += 1
enqueue(logAddedEvent(logger, enrichEvents(now, hostname, countInBatch, batchSize, &IncrementalEvent{
MetricName: conf.CPUMetricName,
Type: "", // set by enrichEvents
IdempotencyKey: "", // set by enrichEvents
EndpointID: key.endpointID,
// TODO: maybe we should store start/stop time in the vmMetricsHistory object itself?
// That way we can be aligned to collection, rather than pushing.
StartTime: s.pushWindowStart,
StopTime: now,
Value: int(math.Round(history.total.cpu)),
})))
countInBatch += 1
enqueue(logAddedEvent(logger, enrichEvents(now, hostname, countInBatch, batchSize, &IncrementalEvent{
MetricName: conf.ActiveTimeMetricName,
Type: "", // set by enrichEvents
IdempotencyKey: "", // set by enrichEvents
EndpointID: key.endpointID,
StartTime: s.pushWindowStart,
StopTime: now,
Value: int(math.Round(history.total.activeTime.Seconds())),
})))
}
s.pushWindowStart = now
s.historical = make(map[metricsKey]vmMetricsHistory)
}
package billing
// Management of billing clients
import (
"context"
"fmt"
"net/http"
"time"
"github.com/lithammer/shortuuid"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/reporting"
)
type ClientsConfig struct {
AzureBlob *AzureBlobStorageClientConfig `json:"azureBlob"`
HTTP *HTTPClientConfig `json:"http"`
S3 *S3ClientConfig `json:"s3"`
}
type S3ClientConfig struct {
reporting.BaseClientConfig
reporting.S3ClientConfig
PrefixInBucket string `json:"prefixInBucket"`
}
type AzureBlobStorageClientConfig struct {
reporting.BaseClientConfig
reporting.AzureBlobStorageClientConfig
PrefixInContainer string `json:"prefixInContainer"`
}
type HTTPClientConfig struct {
reporting.BaseClientConfig
URL string `json:"url"`
}
type billingClient = reporting.Client[*IncrementalEvent]
func createClients(ctx context.Context, logger *zap.Logger, cfg ClientsConfig) ([]billingClient, error) {
var clients []billingClient
if c := cfg.HTTP; c != nil {
client := reporting.NewHTTPClient(http.DefaultClient, reporting.HTTPClientConfig{
URL: fmt.Sprintf("%s/usage_events", c.URL),
Method: http.MethodPost,
})
logger.Info("Created HTTP client for billing events", zap.Any("config", c))
clients = append(clients, billingClient{
Name: "http",
Base: client,
BaseConfig: c.BaseClientConfig,
NewBatchBuilder: jsonArrayBatch(reporting.NewByteBuffer), // note: NOT gzipped.
})
}
if c := cfg.AzureBlob; c != nil {
generateKey := newBlobStorageKeyGenerator(c.PrefixInContainer)
client, err := reporting.NewAzureBlobStorageClient(c.AzureBlobStorageClientConfig, generateKey)
if err != nil {
return nil, fmt.Errorf("error creating Azure Blob Storage client: %w", err)
}
logger.Info("Created Azure Blob Storage client for billing events", zap.Any("config", c))
clients = append(clients, billingClient{
Name: "azureblob",
Base: client,
BaseConfig: c.BaseClientConfig,
NewBatchBuilder: jsonArrayBatch(reporting.NewGZIPBuffer),
})
}
if c := cfg.S3; c != nil {
generateKey := newBlobStorageKeyGenerator(c.PrefixInBucket)
client, err := reporting.NewS3Client(ctx, c.S3ClientConfig, generateKey)
if err != nil {
return nil, fmt.Errorf("error creating S3 client: %w", err)
}
logger.Info("Created S3 client for billing events", zap.Any("config", c))
clients = append(clients, billingClient{
Name: "s3",
Base: client,
BaseConfig: c.BaseClientConfig,
NewBatchBuilder: jsonArrayBatch(reporting.NewGZIPBuffer),
})
}
return clients, nil
}
func jsonArrayBatch[B reporting.IOBuffer](buf func() B) func() reporting.BatchBuilder[*IncrementalEvent] {
return func() reporting.BatchBuilder[*IncrementalEvent] {
return reporting.NewJSONArrayBuilder[*IncrementalEvent](buf(), "events")
}
}
// Returns a function to generate keys for the placement of billing events data into blob storage.
//
// Example: prefixInContainer/year=2021/month=01/day=26/hh:mm:ssZ_{uuid}.ndjson.gz
//
// NOTE: This key format is different from the one we use for scaling events, but similar to the one
// proxy/storage use.
func newBlobStorageKeyGenerator(prefix string) func() string {
return func() string {
now := time.Now()
id := shortuuid.New()
return fmt.Sprintf("%s/year=%d/month=%02d/day=%02d/%s_%s.ndjson.gz",
prefix,
now.Year(), now.Month(), now.Day(),
now.Format("15:04:05Z"),
id,
)
}
}
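// illustrateKeyGenerator is a small usage sketch (the "billing" prefix is
// illustrative): each batch upload asks the generator for a fresh key, so two
// batches written in the same second still land at distinct blob names thanks
// to the uuid suffix.
func illustrateKeyGenerator() {
    generateKey := newBlobStorageKeyGenerator("billing")
    // e.g. billing/year=2024/month=01/day=26/15:04:05Z_<uuid>.ndjson.gz
    fmt.Println(generateKey())
}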
package billing
import (
"fmt"
"time"
)
type Event interface {
*AbsoluteEvent | *IncrementalEvent
// eventMethods must be separate from Event so that we can assert that *AbsoluteEvent and
// *IncrementalEvent both implement it - Go does not allow converting to a value of type Event
// because it contains "*AbsoluteEvent | *IncrementalEvent", and such constraints can only be
// used inside of generics.
eventMethods
}
// eventMethods is a requirement for Event, but exists separately so that we can assert that the
// event types implement it.
//
// The reason this interface even exists in the first place is because we're not allowed to assume
// that a type E implementing Event actually has the common fields from AbsoluteEvent and
// IncrementalEvent, even though it's constrained to either of those types.
type eventMethods interface {
setType()
getIdempotencyKey() *string
}
var (
_ eventMethods = (*AbsoluteEvent)(nil)
_ eventMethods = (*IncrementalEvent)(nil)
)
type AbsoluteEvent struct {
IdempotencyKey string `json:"idempotency_key"`
MetricName string `json:"metric"`
Type string `json:"type"`
TenantID string `json:"tenant_id"`
TimelineID string `json:"timeline_id"`
Time time.Time `json:"time"`
Value int `json:"value"`
}
// setType implements eventMethods
func (e *AbsoluteEvent) setType() {
e.Type = "absolute"
}
// getIdempotencyKey implements eventMethods
func (e *AbsoluteEvent) getIdempotencyKey() *string {
return &e.IdempotencyKey
}
type IncrementalEvent struct {
IdempotencyKey string `json:"idempotency_key"`
MetricName string `json:"metric"`
Type string `json:"type"`
EndpointID string `json:"endpoint_id"`
StartTime time.Time `json:"start_time"`
StopTime time.Time `json:"stop_time"`
Value int `json:"value"`
}
// setType implements eventMethods
func (e *IncrementalEvent) setType() {
e.Type = "incremental"
}
// getIdempotencyKey implements eventMethods
func (e *IncrementalEvent) getIdempotencyKey() *string {
return &e.IdempotencyKey
}
// enrichEvents sets the event's Type and IdempotencyKey fields, so that users of this API don't need to
// manually set them
func enrichEvents[E Event](now time.Time, hostname string, countInBatch, batchSize int, event E) E {
event.setType()
// RFC3339 with microsecond precision. Millisecond precision would risk collisions; nanoseconds are overkill.
// And everything's in UTC, so there's no sense including the offset.
formattedTime := now.In(time.UTC).Format("2006-01-02T15:04:05.999999Z")
key := event.getIdempotencyKey()
if *key == "" {
*key = fmt.Sprintf("%s-%s-%d/%d", formattedTime, hostname, countInBatch, batchSize)
}
return event
}
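// illustrateEnrichEvents is an illustrative sketch of what enrichEvents fills
// in, assuming the event arrives with Type and IdempotencyKey left blank (as
// in drainEnqueue above). The metric name, endpoint ID, and hostname are
// placeholders: the type is forced to "incremental" and the key encodes the
// timestamp, hostname, and the event's position within the batch.
func illustrateEnrichEvents() {
    now := time.Date(2024, time.January, 26, 15, 4, 5, 123456000, time.UTC)
    event := enrichEvents(now, "agent-host", 1, 2, &IncrementalEvent{
        MetricName: "effective_compute_seconds",
        EndpointID: "ep-example",
        StartTime:  now.Add(-24 * time.Second),
        StopTime:   now,
        Value:      6,
    })
    fmt.Println(event.Type)           // "incremental"
    fmt.Println(event.IdempotencyKey) // "2024-01-26T15:04:05.123456Z-agent-host-1/2"
}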
package billing
import (
"fmt"
"os"
)
var hostname string
func init() {
var err error
hostname, err = os.Hostname()
if err != nil {
panic(fmt.Errorf("failed to get hostname: %w", err))
}
}
// GetHostname returns the hostname to be used for enriching billing events (see Enrich())
//
// This function MUST NOT be run before init has finished.
func GetHostname() string {
return hostname
}
package billing
// Types and implementation relating to VMNodeIndex, which provides indexing for watch.Watch for
// efficient lookup of VMs on a particular node.
import (
"k8s.io/apimachinery/pkg/types"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/util/watch"
)
type VMStoreForNode = watch.IndexedStore[vmv1.VirtualMachine, *VMNodeIndex]
// VMNodeIndex is a watch.Index that stores all of the VMs for a particular node
//
// We have to implement this ourselves because K8s does not (as of 2023-04-04) support field
// selectors on CRDs, so we can't have the API server filter out VMs for us.
//
// For more info, see: https://github.com/kubernetes/kubernetes/issues/53459
// This comment was particularly instructive:
// https://github.com/kubernetes/kubernetes/issues/53459#issuecomment-1146200268
type VMNodeIndex struct {
forNode map[types.UID]*vmv1.VirtualMachine
node string
}
func NewVMNodeIndex(node string) *VMNodeIndex {
return &VMNodeIndex{
forNode: make(map[types.UID]*vmv1.VirtualMachine),
node: node,
}
}
func (i *VMNodeIndex) Add(vm *vmv1.VirtualMachine) {
if vm.Status.Node == i.node {
i.forNode[vm.UID] = vm
}
}
func (i *VMNodeIndex) Update(oldVM, newVM *vmv1.VirtualMachine) {
i.Delete(oldVM)
i.Add(newVM)
}
func (i *VMNodeIndex) Delete(vm *vmv1.VirtualMachine) {
// note: delete is a no-op if the key isn't present.
delete(i.forNode, vm.UID)
}
func (i *VMNodeIndex) List() []*vmv1.VirtualMachine {
items := make([]*vmv1.VirtualMachine, 0, len(i.forNode))
for _, vm := range i.forNode {
items = append(items, vm)
}
return items
}
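// illustrateVMNodeIndex is a hedged sketch (fabricated names and UIDs) of the
// index's behavior: only VMs whose Status.Node matches the index's node are
// retained, so List() returns just the VMs scheduled onto this node.
func illustrateVMNodeIndex() int {
    idx := NewVMNodeIndex("node-a")
    onNode := &vmv1.VirtualMachine{}
    onNode.UID = types.UID("uid-1")
    onNode.Status.Node = "node-a"
    elsewhere := &vmv1.VirtualMachine{}
    elsewhere.UID = types.UID("uid-2")
    elsewhere.Status.Node = "node-b"
    idx.Add(onNode)
    idx.Add(elsewhere) // ignored: wrong node
    return len(idx.List()) // 1
}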
package billing
// Prometheus metrics for the agent's billing subsystem
import (
"strconv"
"github.com/prometheus/client_golang/prometheus"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/reporting"
"github.com/neondatabase/autoscaling/pkg/util"
)
type PromMetrics struct {
reporting *reporting.EventSinkMetrics
vmsProcessedTotal *prometheus.CounterVec
vmsCurrent *prometheus.GaugeVec
}
func NewPromMetrics(reg prometheus.Registerer) PromMetrics {
return PromMetrics{
reporting: reporting.NewEventSinkMetrics("autoscaling_agent_billing", reg),
vmsProcessedTotal: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_billing_vms_processed_total",
Help: "Total number of times the autoscaler-agent's billing subsystem processes any VM",
},
[]string{"is_endpoint", "autoscaling_enabled", "phase"},
)),
vmsCurrent: util.RegisterMetric(reg, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "autoscaling_agent_billing_vms_current",
Help: "Total current VMs visible to the autoscaler-agent's billing subsystem, labeled by some bits of metadata",
},
[]string{"is_endpoint", "autoscaling_enabled", "phase"},
)),
}
}
type batchMetrics struct {
total map[batchMetricsLabels]int
vmsProcessedTotal *prometheus.CounterVec
vmsCurrent *prometheus.GaugeVec
}
type batchMetricsLabels struct {
isEndpoint string
autoscalingEnabled string
phase string
}
func (m PromMetrics) forBatch() batchMetrics {
return batchMetrics{
total: make(map[batchMetricsLabels]int),
vmsProcessedTotal: m.vmsProcessedTotal,
vmsCurrent: m.vmsCurrent,
}
}
type (
isEndpointFlag bool
autoscalingEnabledFlag bool
)
func (b batchMetrics) inc(isEndpoint isEndpointFlag, autoscalingEnabled autoscalingEnabledFlag, phase vmv1.VmPhase) {
key := batchMetricsLabels{
isEndpoint: strconv.FormatBool(bool(isEndpoint)),
autoscalingEnabled: strconv.FormatBool(bool(autoscalingEnabled)),
phase: string(phase),
}
b.total[key] = b.total[key] + 1
b.vmsProcessedTotal.
WithLabelValues(key.isEndpoint, key.autoscalingEnabled, key.phase).
Inc()
}
func (b batchMetrics) finish() {
b.vmsCurrent.Reset()
for key, count := range b.total {
b.vmsCurrent.WithLabelValues(key.isEndpoint, key.autoscalingEnabled, key.phase).Set(float64(count))
}
}
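// illustrateBatchMetrics is a sketch of how these metrics are used during one
// collection pass (see (*metricsState).collect above): every VM increments the
// processed counter, and finish() rewrites the current-VMs gauge from the
// per-label totals so stale label combinations from previous passes don't
// linger. The registry and phase value here are placeholders.
func illustrateBatchMetrics() {
    metrics := NewPromMetrics(prometheus.NewRegistry())
    batch := metrics.forBatch()
    phase := vmv1.VmPhase("Running") // assumed to match one of the real phase values
    batch.inc(isEndpointFlag(true), autoscalingEnabledFlag(true), phase)
    batch.inc(isEndpointFlag(false), autoscalingEnabledFlag(false), phase)
    batch.finish() // gauge now reads 1 for each of the two label combinations
}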
package agent
import (
"encoding/json"
"fmt"
"os"
"github.com/tychoish/fun/erc"
"github.com/neondatabase/autoscaling/pkg/agent/billing"
"github.com/neondatabase/autoscaling/pkg/agent/scalingevents"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/reporting"
)
type Config struct {
RefreshStateIntervalSeconds uint `json:"refreshStateIntervalSeconds"`
Billing billing.Config `json:"billing"`
ScalingEvents scalingevents.Config `json:"scalingEvents"`
Scaling ScalingConfig `json:"scaling"`
Metrics MetricsConfig `json:"metrics"`
Scheduler SchedulerConfig `json:"scheduler"`
Monitor MonitorConfig `json:"monitor"`
NeonVM NeonVMConfig `json:"neonvm"`
DumpState *DumpStateConfig `json:"dumpState"`
}
type RateThresholdConfig struct {
IntervalSeconds uint `json:"intervalSeconds"`
Threshold uint `json:"threshold"`
}
type MonitorConfig struct {
ResponseTimeoutSeconds uint `json:"responseTimeoutSeconds"`
// ConnectionTimeoutSeconds gives how long we may take to connect to the
// monitor before cancelling.
ConnectionTimeoutSeconds uint `json:"connectionTimeoutSeconds"`
// ConnectionRetryMinWaitSeconds gives the minimum amount of time we must wait between attempts
// to connect to the vm-monitor, regardless of whether they're successful.
ConnectionRetryMinWaitSeconds uint `json:"connectionRetryMinWaitSeconds"`
// ServerPort is the port that the dispatcher serves from
ServerPort uint16 `json:"serverPort"`
// UnhealthyAfterSilenceDurationSeconds gives the duration, in seconds, after which failing to
// receive a successful request from the monitor indicates that it is probably unhealthy.
UnhealthyAfterSilenceDurationSeconds uint `json:"unhealthyAfterSilenceDurationSeconds"`
// UnhealthyStartupGracePeriodSeconds gives the duration, in seconds, after which we will no
// longer excuse total VM monitor failures - i.e. when unhealthyAfterSilenceDurationSeconds
// kicks in.
UnhealthyStartupGracePeriodSeconds uint `json:"unhealthyStartupGracePeriodSeconds"`
// MaxHealthCheckSequentialFailuresSeconds gives the duration, in seconds, after which we
// should restart the connection to the vm-monitor if health checks aren't succeeding.
MaxHealthCheckSequentialFailuresSeconds uint `json:"maxHealthCheckSequentialFailuresSeconds"`
// MaxFailedRequestRate defines the maximum rate of failed monitor requests, above which
// a VM is considered stuck.
MaxFailedRequestRate RateThresholdConfig `json:"maxFailedRequestRate"`
// RetryFailedRequestSeconds gives the duration, in seconds, that we must wait before retrying a
// request that previously failed.
RetryFailedRequestSeconds uint `json:"retryFailedRequestSeconds"`
// RetryDeniedDownscaleSeconds gives the duration, in seconds, that we must wait before retrying
// a downscale request that was previously denied
RetryDeniedDownscaleSeconds uint `json:"retryDeniedDownscaleSeconds"`
// RequestedUpscaleValidSeconds gives the duration, in seconds, that requested upscaling should
// be respected for, before allowing re-downscaling.
RequestedUpscaleValidSeconds uint `json:"requestedUpscaleValidSeconds"`
}
// DumpStateConfig configures the endpoint to dump all internal state
type DumpStateConfig struct {
// Port is the port to serve on
Port uint16 `json:"port"`
// TimeoutSeconds gives the maximum duration, in seconds, that we allow for a request to dump
// internal state.
TimeoutSeconds uint `json:"timeoutSeconds"`
}
// ScalingConfig defines the scheduling we use for scaling up and down
type ScalingConfig struct {
// ComputeUnit is the desired ratio between CPU and memory that the autoscaler-agent should
// uphold when making changes to a VM
ComputeUnit api.Resources `json:"computeUnit"`
// DefaultConfig gives the default scaling config, to be used if there is no configuration
// supplied with the "autoscaling.neon.tech/config" annotation.
DefaultConfig api.ScalingConfig `json:"defaultConfig"`
}
// MetricsConfig defines a few parameters for metrics requests to the VM
type MetricsConfig struct {
System MetricsSourceConfig `json:"system"`
LFC MetricsSourceConfig `json:"lfc"`
}
type MetricsSourceConfig struct {
// Port is the port that VMs are expected to provide the metrics on
//
// For system metrics, vm-builder installs vector (from vector.dev) to expose them on port 9100.
Port uint16 `json:"port"`
// RequestTimeoutSeconds gives the timeout duration, in seconds, for metrics requests
RequestTimeoutSeconds uint `json:"requestTimeoutSeconds"`
// SecondsBetweenRequests sets the number of seconds to wait between metrics requests
SecondsBetweenRequests uint `json:"secondsBetweenRequests"`
}
// SchedulerConfig defines a few parameters for scheduler requests
type SchedulerConfig struct {
// SchedulerName is the name of the scheduler we're expecting to communicate with.
//
// Any VMs that don't have a matching Spec.SchedulerName will not be autoscaled.
SchedulerName string `json:"schedulerName"`
// RequestTimeoutSeconds gives the timeout duration, in seconds, for requests to the scheduler
//
// If zero, requests will have no timeout.
RequestTimeoutSeconds uint `json:"requestTimeoutSeconds"`
// RequestAtLeastEverySeconds gives the maximum duration we should go without attempting a
// request to the scheduler, even if nothing's changed.
RequestAtLeastEverySeconds uint `json:"requestAtLeastEverySeconds"`
// RetryFailedRequestSeconds gives the duration, in seconds, that we must wait after a previous
// failed request before making another one.
RetryFailedRequestSeconds uint `json:"retryFailedRequestSeconds"`
// RetryDeniedUpscaleSeconds gives the duration, in seconds, that we must wait before resending
// a request for resources that were not approved
RetryDeniedUpscaleSeconds uint `json:"retryDeniedUpscaleSeconds"`
// RequestPort defines the port to access the scheduler's ✨special✨ API with
RequestPort uint16 `json:"requestPort"`
// MaxFailedRequestRate defines the maximum rate of failed scheduler requests, above which
// a VM is considered stuck.
MaxFailedRequestRate RateThresholdConfig `json:"maxFailedRequestRate"`
}
// NeonVMConfig defines a few parameters for NeonVM requests
type NeonVMConfig struct {
// RequestTimeoutSeconds gives the timeout duration, in seconds, for VM patch requests
RequestTimeoutSeconds uint `json:"requestTimeoutSeconds"`
// RetryFailedRequestSeconds gives the duration, in seconds, that we must wait after a previous
// failed request before making another one.
RetryFailedRequestSeconds uint `json:"retryFailedRequestSeconds"`
// MaxFailedRequestRate defines the maximum rate of failed NeonVM requests, above which
// a VM is considered stuck.
MaxFailedRequestRate RateThresholdConfig `json:"maxFailedRequestRate"`
}
func ReadConfig(path string) (*Config, error) {
file, err := os.Open(path)
if err != nil {
return nil, fmt.Errorf("Error opening config file %q: %w", path, err)
}
defer file.Close()
var config Config
jsonDecoder := json.NewDecoder(file)
jsonDecoder.DisallowUnknownFields()
if err = jsonDecoder.Decode(&config); err != nil {
return nil, fmt.Errorf("Error decoding JSON config in %q: %w", path, err)
}
if err = config.validate(); err != nil {
return nil, fmt.Errorf("Invalid config: %w", err)
}
return &config, nil
}
func (c *Config) validate() error {
ec := &erc.Collector{}
const (
emptyTmpl = "field %q cannot be empty"
zeroTmpl = "field %q cannot be zero"
)
validateBaseReportingConfig := func(cfg *reporting.BaseClientConfig, key string) {
erc.Whenf(ec, cfg.PushEverySeconds == 0, zeroTmpl, fmt.Sprintf("%s.pushEverySeconds", key))
erc.Whenf(ec, cfg.PushRequestTimeoutSeconds == 0, zeroTmpl, fmt.Sprintf("%s.pushRequestTimeoutSeconds", key))
erc.Whenf(ec, cfg.MaxBatchSize == 0, zeroTmpl, fmt.Sprintf("%s.maxBatchSize", key))
}
validateS3ReportingConfig := func(cfg *reporting.S3ClientConfig, key string) {
erc.Whenf(ec, cfg.Bucket == "", emptyTmpl, fmt.Sprintf("%s.bucket", key))
erc.Whenf(ec, cfg.Region == "", emptyTmpl, fmt.Sprintf("%s.region", key))
}
validateAzureBlobReportingConfig := func(cfg *reporting.AzureBlobStorageClientConfig, key string) {
erc.Whenf(ec, cfg.Endpoint == "", emptyTmpl, fmt.Sprintf("%s.endpoint", key))
erc.Whenf(ec, cfg.Container == "", emptyTmpl, fmt.Sprintf("%s.container", key))
}
erc.Whenf(ec, c.Billing.ActiveTimeMetricName == "", emptyTmpl, ".billing.activeTimeMetricName")
erc.Whenf(ec, c.Billing.CPUMetricName == "", emptyTmpl, ".billing.cpuMetricName")
erc.Whenf(ec, c.Billing.CollectEverySeconds == 0, zeroTmpl, ".billing.collectEverySeconds")
erc.Whenf(ec, c.Billing.AccumulateEverySeconds == 0, zeroTmpl, ".billing.accumulateEverySeconds")
if c.Billing.Clients.AzureBlob != nil {
validateBaseReportingConfig(&c.Billing.Clients.AzureBlob.BaseClientConfig, ".billing.clients.azureBlob")
validateAzureBlobReportingConfig(&c.Billing.Clients.AzureBlob.AzureBlobStorageClientConfig, ".billing.clients.azureBlob")
erc.Whenf(ec, c.Billing.Clients.AzureBlob.PrefixInContainer == "", emptyTmpl, ".billing.clients.azureBlob.prefixInContainer")
}
if c.Billing.Clients.HTTP != nil {
validateBaseReportingConfig(&c.Billing.Clients.HTTP.BaseClientConfig, ".billing.clients.http")
erc.Whenf(ec, c.Billing.Clients.HTTP.URL == "", emptyTmpl, ".billing.clients.http.url")
}
if c.Billing.Clients.S3 != nil {
validateBaseReportingConfig(&c.Billing.Clients.S3.BaseClientConfig, ".billing.clients.s3")
validateS3ReportingConfig(&c.Billing.Clients.S3.S3ClientConfig, ".billing.clients.s3")
erc.Whenf(ec, c.Billing.Clients.S3.PrefixInBucket == "", emptyTmpl, ".billing.clients.s3.prefixInBucket")
}
erc.Whenf(ec, c.ScalingEvents.CUMultiplier == 0, zeroTmpl, ".scalingEvents.cuMultiplier")
erc.Whenf(ec, c.ScalingEvents.RereportThreshold == 0, zeroTmpl, ".scalingEvents.rereportThreshold")
erc.Whenf(ec, c.ScalingEvents.RegionName == "", emptyTmpl, ".scalingEvents.regionName")
if c.ScalingEvents.Clients.AzureBlob != nil {
validateBaseReportingConfig(&c.ScalingEvents.Clients.AzureBlob.BaseClientConfig, ".scalingEvents.clients.azureBlob")
validateAzureBlobReportingConfig(&c.ScalingEvents.Clients.AzureBlob.AzureBlobStorageClientConfig, ".scalingEvents.clients.azureBlob")
erc.Whenf(ec, c.ScalingEvents.Clients.AzureBlob.PrefixInContainer == "", emptyTmpl, ".scalingEvents.clients.azureBlob.prefixInContainer")
}
if c.ScalingEvents.Clients.S3 != nil {
validateBaseReportingConfig(&c.ScalingEvents.Clients.S3.BaseClientConfig, ".scalingEvents.clients.s3")
validateS3ReportingConfig(&c.ScalingEvents.Clients.S3.S3ClientConfig, ".scalingEvents.clients.s3")
erc.Whenf(ec, c.ScalingEvents.Clients.S3.PrefixInBucket == "", emptyTmpl, ".scalingEvents.clients.s3.prefixInBucket")
}
erc.Whenf(ec, c.DumpState != nil && c.DumpState.Port == 0, zeroTmpl, ".dumpState.port")
erc.Whenf(ec, c.DumpState != nil && c.DumpState.TimeoutSeconds == 0, zeroTmpl, ".dumpState.timeoutSeconds")
validateMetricsConfig := func(cfg MetricsSourceConfig, key string) {
erc.Whenf(ec, cfg.Port == 0, zeroTmpl, fmt.Sprintf(".metrics.%s.port", key))
erc.Whenf(ec, cfg.RequestTimeoutSeconds == 0, zeroTmpl, fmt.Sprintf(".metrics.%s.requestTimeoutSeconds", key))
erc.Whenf(ec, cfg.SecondsBetweenRequests == 0, zeroTmpl, fmt.Sprintf(".metrics.%s.secondsBetweenRequests", key))
}
validateMetricsConfig(c.Metrics.System, "system")
validateMetricsConfig(c.Metrics.LFC, "lfc")
erc.Whenf(ec, c.Scaling.ComputeUnit.VCPU == 0, zeroTmpl, ".scaling.computeUnit.vCPUs")
erc.Whenf(ec, c.Scaling.ComputeUnit.Mem == 0, zeroTmpl, ".scaling.computeUnit.mem")
erc.Whenf(ec, c.NeonVM.RequestTimeoutSeconds == 0, zeroTmpl, ".neonvm.requestTimeoutSeconds")
erc.Whenf(ec, c.NeonVM.RetryFailedRequestSeconds == 0, zeroTmpl, ".neonvm.retryFailedRequestSeconds")
erc.Whenf(ec, c.NeonVM.MaxFailedRequestRate.IntervalSeconds == 0, zeroTmpl, ".neonvm.maxFailedRequestRate.intervalSeconds")
erc.Whenf(ec, c.Monitor.ResponseTimeoutSeconds == 0, zeroTmpl, ".monitor.responseTimeoutSeconds")
erc.Whenf(ec, c.Monitor.ConnectionTimeoutSeconds == 0, zeroTmpl, ".monitor.connectionTimeoutSeconds")
erc.Whenf(ec, c.Monitor.ConnectionRetryMinWaitSeconds == 0, zeroTmpl, ".monitor.connectionRetryMinWaitSeconds")
erc.Whenf(ec, c.Monitor.ServerPort == 0, zeroTmpl, ".monitor.serverPort")
erc.Whenf(ec, c.Monitor.UnhealthyAfterSilenceDurationSeconds == 0, zeroTmpl, ".monitor.unhealthyAfterSilenceDurationSeconds")
erc.Whenf(ec, c.Monitor.UnhealthyStartupGracePeriodSeconds == 0, zeroTmpl, ".monitor.unhealthyStartupGracePeriodSeconds")
erc.Whenf(ec, c.Monitor.MaxHealthCheckSequentialFailuresSeconds == 0, zeroTmpl, ".monitor.maxHealthCheckSequentialFailuresSeconds")
erc.Whenf(ec, c.Monitor.RetryFailedRequestSeconds == 0, zeroTmpl, ".monitor.retryFailedRequestSeconds")
erc.Whenf(ec, c.Monitor.RetryDeniedDownscaleSeconds == 0, zeroTmpl, ".monitor.retryDeniedDownscaleSeconds")
erc.Whenf(ec, c.Monitor.RequestedUpscaleValidSeconds == 0, zeroTmpl, ".monitor.requestedUpscaleValidSeconds")
erc.Whenf(ec, c.Monitor.MaxFailedRequestRate.IntervalSeconds == 0, zeroTmpl, ".monitor.maxFailedRequestRate.intervalSeconds")
// add all errors if there are any: https://github.com/neondatabase/autoscaling/pull/195#discussion_r1170893494
ec.Add(c.Scaling.DefaultConfig.ValidateDefaults())
erc.Whenf(ec, c.Scheduler.RequestPort == 0, zeroTmpl, ".scheduler.requestPort")
erc.Whenf(ec, c.Scheduler.RequestTimeoutSeconds == 0, zeroTmpl, ".scheduler.requestTimeoutSeconds")
erc.Whenf(ec, c.Scheduler.RequestAtLeastEverySeconds == 0, zeroTmpl, ".scheduler.requestAtLeastEverySeconds")
erc.Whenf(ec, c.Scheduler.RetryFailedRequestSeconds == 0, zeroTmpl, ".scheduler.retryFailedRequestSeconds")
erc.Whenf(ec, c.Scheduler.RetryDeniedUpscaleSeconds == 0, zeroTmpl, ".scheduler.retryDeniedUpscaleSeconds")
erc.Whenf(ec, c.Scheduler.SchedulerName == "", emptyTmpl, ".scheduler.schedulerName")
erc.Whenf(ec, c.Scheduler.MaxFailedRequestRate.IntervalSeconds == 0, zeroTmpl, ".scheduler.maxFailedRequestRate.intervalSeconds")
return ec.Resolve()
}
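// The validation above leans on erc.Whenf to accumulate every complaint
// instead of returning at the first failure. A minimal standalone sketch of
// the same pattern (the field names here are made up):
func illustrateErcPattern(port uint16, name string) error {
    ec := &erc.Collector{}
    erc.Whenf(ec, port == 0, "field %q cannot be zero", ".example.port")
    erc.Whenf(ec, name == "", "field %q cannot be empty", ".example.name")
    return ec.Resolve() // nil if both checks passed
}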
package core
import (
"time"
"go.uber.org/zap/zapcore"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
)
type ActionSet struct {
Wait *ActionWait `json:"wait,omitempty"`
PluginRequest *ActionPluginRequest `json:"pluginRequest,omitempty"`
NeonVMRequest *ActionNeonVMRequest `json:"neonvmRequest,omitempty"`
MonitorDownscale *ActionMonitorDownscale `json:"monitorDownscale,omitempty"`
MonitorUpscale *ActionMonitorUpscale `json:"monitorUpscale,omitempty"`
}
type ActionWait struct {
Duration time.Duration `json:"duration"`
}
type ActionPluginRequest struct {
LastPermit *api.Resources `json:"current"`
Target api.Resources `json:"target"`
Metrics *api.Metrics `json:"metrics"`
TargetRevision vmv1.RevisionWithTime `json:"targetRevision"`
}
type ActionNeonVMRequest struct {
Current api.Resources `json:"current"`
Target api.Resources `json:"target"`
TargetRevision vmv1.RevisionWithTime `json:"targetRevision"`
}
type ActionMonitorDownscale struct {
Current api.Resources `json:"current"`
Target api.Resources `json:"target"`
TargetRevision vmv1.RevisionWithTime `json:"targetRevision"`
}
type ActionMonitorUpscale struct {
Current api.Resources `json:"current"`
Target api.Resources `json:"target"`
TargetRevision vmv1.RevisionWithTime `json:"targetRevision"`
}
func addObjectPtr[T zapcore.ObjectMarshaler](enc zapcore.ObjectEncoder, key string, value *T) error {
if value != nil {
return enc.AddObject(key, *value)
} else {
// nil ObjectMarshaler is not sound, but nil reflected is, and it shortcuts reflection
return enc.AddReflected(key, nil)
}
}
func (s ActionSet) MarshalLogObject(enc zapcore.ObjectEncoder) error {
_ = addObjectPtr(enc, "wait", s.Wait)
_ = addObjectPtr(enc, "pluginRequest", s.PluginRequest)
_ = addObjectPtr(enc, "neonvmRequest", s.NeonVMRequest)
_ = addObjectPtr(enc, "monitorDownscale", s.MonitorDownscale)
_ = addObjectPtr(enc, "monitorUpscale", s.MonitorUpscale)
return nil
}
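// Because ActionSet implements zapcore.ObjectMarshaler, a whole set of actions can be attached to
// a log line as one structured field. A minimal sketch (the logger and the `actions` value here
// are assumptions for illustration, not part of this package):
//
//	actions := state.NextActions(time.Now())
//	logger.Info("Next actions", zap.Object("actions", actions))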
// MarshalLogObject implements zapcore.ObjectMarshaler, so that ActionWait can be used with zap.Object
func (a ActionWait) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddDuration("duration", a.Duration)
return nil
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that ActionPluginRequest can be used with zap.Object
func (a ActionPluginRequest) MarshalLogObject(enc zapcore.ObjectEncoder) error {
_ = addObjectPtr(enc, "lastPermit", a.LastPermit)
_ = enc.AddObject("target", a.Target)
_ = enc.AddReflected("metrics", a.Metrics)
return nil
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that ActionNeonVMRequest can be used with zap.Object
func (a ActionNeonVMRequest) MarshalLogObject(enc zapcore.ObjectEncoder) error {
_ = enc.AddObject("current", a.Current)
_ = enc.AddObject("target", a.Target)
return nil
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that ActionMonitorDownscale can be used with zap.Object
func (a ActionMonitorDownscale) MarshalLogObject(enc zapcore.ObjectEncoder) error {
_ = enc.AddObject("current", a.Current)
_ = enc.AddObject("target", a.Target)
return nil
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that ActionMonitorUpscale can be used with zap.Object
func (a ActionMonitorUpscale) MarshalLogObject(enc zapcore.ObjectEncoder) error {
_ = enc.AddObject("current", a.Current)
_ = enc.AddObject("target", a.Target)
return nil
}
package core
// Implementation of (*State).Dump()
import (
"encoding/json"
"time"
"github.com/neondatabase/autoscaling/pkg/api"
)
func shallowCopy[T any](ptr *T) *T {
if ptr == nil {
return nil
} else {
x := *ptr
return &x
}
}
// StateDump provides introspection into the current values of the fields of State
//
// It implements json.Marshaler.
type StateDump struct {
internal state
}
func (d StateDump) MarshalJSON() ([]byte, error) {
return json.Marshal(d.internal)
}
// Dump produces a JSON-serializable copy of the State
func (s *State) Dump() StateDump {
return StateDump{
internal: state{
Debug: s.internal.Debug,
Config: s.internal.Config,
VM: s.internal.VM,
Plugin: s.internal.Plugin.deepCopy(),
Monitor: s.internal.Monitor.deepCopy(),
NeonVM: s.internal.NeonVM.deepCopy(),
Metrics: shallowCopy[SystemMetrics](s.internal.Metrics),
LFCMetrics: shallowCopy[LFCMetrics](s.internal.LFCMetrics),
TargetRevision: s.internal.TargetRevision,
LastDesiredResources: s.internal.LastDesiredResources,
},
}
}
func (s *pluginState) deepCopy() pluginState {
return pluginState{
OngoingRequest: s.OngoingRequest,
LastRequest: shallowCopy[pluginRequested](s.LastRequest),
LastFailureAt: shallowCopy[time.Time](s.LastFailureAt),
Permit: shallowCopy[api.Resources](s.Permit),
CurrentRevision: s.CurrentRevision,
}
}
func (s *monitorState) deepCopy() monitorState {
return monitorState{
OngoingRequest: shallowCopy[ongoingMonitorRequest](s.OngoingRequest),
RequestedUpscale: shallowCopy[requestedUpscale](s.RequestedUpscale),
DeniedDownscale: shallowCopy[deniedDownscale](s.DeniedDownscale),
Approved: shallowCopy[api.Resources](s.Approved),
DownscaleFailureAt: shallowCopy[time.Time](s.DownscaleFailureAt),
UpscaleFailureAt: shallowCopy[time.Time](s.UpscaleFailureAt),
CurrentRevision: s.CurrentRevision,
}
}
func (s *neonvmState) deepCopy() neonvmState {
return neonvmState{
LastSuccess: shallowCopy[api.Resources](s.LastSuccess),
OngoingRequested: shallowCopy[api.Resources](s.OngoingRequested),
RequestFailedAt: shallowCopy[time.Time](s.RequestFailedAt),
TargetRevision: s.TargetRevision,
CurrentRevision: s.CurrentRevision,
}
}
package core
// extracted components of how "goal CU" is determined
import (
"math"
"github.com/samber/lo"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"golang.org/x/exp/constraints"
"github.com/neondatabase/autoscaling/pkg/api"
)
type ScalingGoal struct {
HasAllMetrics bool
Parts ScalingGoalParts
}
type ScalingGoalParts struct {
CPU *float64
Mem *float64
LFC *float64
}
func (g *ScalingGoal) GoalCU() uint32 {
return uint32(math.Ceil(max(
math.Round(lo.FromPtr(g.Parts.CPU)), // for historical compatibility, use round() instead of ceil()
lo.FromPtr(g.Parts.Mem),
lo.FromPtr(g.Parts.LFC),
)))
}
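// Worked example with illustrative numbers: if the per-part goals are CPU=2.4, Mem=2.6, LFC=1.1,
// then round(2.4) = 2 and GoalCU() = ceil(max(2, 2.6, 1.1)) = ceil(2.6) = 3. Any missing part is
// treated as zero via lo.FromPtr.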
func calculateGoalCU(
warn func(string),
cfg api.ScalingConfig,
computeUnit api.Resources,
systemMetrics *SystemMetrics,
lfcMetrics *LFCMetrics,
) (ScalingGoal, []zap.Field) {
hasAllMetrics := systemMetrics != nil && (!*cfg.EnableLFCMetrics || lfcMetrics != nil)
if !hasAllMetrics {
warn("Making scaling decision without all required metrics available")
}
var logFields []zap.Field
var parts ScalingGoalParts
var wss *api.Bytes // estimated working set size
if lfcMetrics != nil {
var lfcLogFunc func(zapcore.ObjectEncoder) error
var lfcGoalCU float64
lfcGoalCU, wss, lfcLogFunc = calculateLFCGoalCU(warn, cfg, computeUnit, *lfcMetrics)
parts.LFC = lo.ToPtr(lfcGoalCU)
if lfcLogFunc != nil {
logFields = append(logFields, zap.Object("lfc", zapcore.ObjectMarshalerFunc(lfcLogFunc)))
}
}
if systemMetrics != nil {
cpuGoalCU := calculateCPUGoalCU(cfg, computeUnit, *systemMetrics)
parts.CPU = lo.ToPtr(cpuGoalCU)
memGoalCU := calculateMemGoalCU(cfg, computeUnit, *systemMetrics)
parts.Mem = lo.ToPtr(memGoalCU)
}
if systemMetrics != nil && wss != nil {
memTotalGoalCU := calculateMemTotalGoalCU(cfg, computeUnit, *systemMetrics, *wss)
parts.Mem = lo.ToPtr(max(*parts.Mem, memTotalGoalCU))
}
return ScalingGoal{HasAllMetrics: hasAllMetrics, Parts: parts}, logFields
}
// For CPU:
// Goal compute unit is at the point where (CPUs) × (LoadAverageFractionTarget) == (load average),
// which we can get by dividing LA by LAFT, and then dividing by the number of CPUs per CU
func calculateCPUGoalCU(
cfg api.ScalingConfig,
computeUnit api.Resources,
systemMetrics SystemMetrics,
) float64 {
stableThreshold := *cfg.CPUStableZoneRatio * systemMetrics.LoadAverage5Min
mixedThreshold := stableThreshold + *cfg.CPUMixedZoneRatio*systemMetrics.LoadAverage5Min
diff := math.Abs(systemMetrics.LoadAverage1Min - systemMetrics.LoadAverage5Min)
// load1Weight is 0 when diff < stableThreshold, and 1 when diff > mixedThreshold.
// If diff is between the thresholds, it'll be between 0 and 1.
load1Weight := blendingFactor(diff, stableThreshold, mixedThreshold)
blendedLoadAverage := load1Weight*systemMetrics.LoadAverage1Min + (1-load1Weight)*systemMetrics.LoadAverage5Min
goalCPUs := blendedLoadAverage / *cfg.LoadAverageFractionTarget
cpuGoalCU := goalCPUs / computeUnit.VCPU.AsFloat64()
return cpuGoalCU
}
func blendingFactor[T constraints.Float](value, t1, t2 T) T {
if value <= t1 {
return 0
}
if value >= t2 {
return 1
}
// 1e-6 is just a precaution; if t1 == t2, we'd have returned earlier.
return (value - t1) / (t2 - t1 + 1e-6)
}
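// Worked example of the blending above, with illustrative (non-default) numbers: suppose
// CPUStableZoneRatio = 0.25, CPUMixedZoneRatio = 0.15, load5 = 4.0, and load1 = 5.2. Then:
//
//	stableThreshold = 0.25 * 4.0 = 1.0
//	mixedThreshold  = 1.0 + 0.15*4.0 = 1.6
//	diff            = |5.2 - 4.0| = 1.2
//	load1Weight     = (1.2 - 1.0) / (1.6 - 1.0) ≈ 0.33
//	blended LA      ≈ 0.33*5.2 + 0.67*4.0 ≈ 4.4
//
// With LoadAverageFractionTarget = 0.9 and 0.25 vCPU per CU, goalCPUs ≈ 4.9 and cpuGoalCU ≈ 19.6.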
// For Mem:
// Goal compute unit is at the point where (Mem) * (MemoryUsageFractionTarget) == (Mem Usage)
// We can get the desired memory allocation in bytes by dividing MU by MUFT, and then convert
// that to CUs.
func calculateMemGoalCU(
cfg api.ScalingConfig,
computeUnit api.Resources,
systemMetrics SystemMetrics,
) float64 {
// goal memory size, just looking at allocated memory (not including page cache...)
memGoalBytes := math.Round(systemMetrics.MemoryUsageBytes / *cfg.MemoryUsageFractionTarget)
return memGoalBytes / float64(computeUnit.Mem)
}
// goal memory size, looking at allocated memory and min(page cache usage, LFC working set size)
func calculateMemTotalGoalCU(
cfg api.ScalingConfig,
computeUnit api.Resources,
systemMetrics SystemMetrics,
wss api.Bytes,
) float64 {
lfcCached := min(float64(wss), systemMetrics.MemoryCachedBytes)
totalGoalBytes := (lfcCached + systemMetrics.MemoryUsageBytes) / *cfg.MemoryTotalFractionTarget
return totalGoalBytes / float64(computeUnit.Mem)
}
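// Worked example for the two memory goals, with illustrative numbers: suppose
// MemoryUsageBytes = 3 GiB, MemoryUsageFractionTarget = 0.75, and computeUnit.Mem = 1 GiB. Then
// calculateMemGoalCU gives round(3 GiB / 0.75) = 4 GiB, i.e. 4 CU. If additionally
// MemoryCachedBytes = 2 GiB, wss = 1 GiB, and MemoryTotalFractionTarget = 0.9, then
// calculateMemTotalGoalCU gives (min(1 GiB, 2 GiB) + 3 GiB) / 0.9 ≈ 4.44 GiB, i.e. ≈ 4.44 CU.
// The caller (calculateGoalCU) takes the larger of the two as the memory part of the goal.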
func calculateLFCGoalCU(
warn func(string),
cfg api.ScalingConfig,
computeUnit api.Resources,
lfcMetrics LFCMetrics,
) (float64, *api.Bytes, func(zapcore.ObjectEncoder) error) {
wssValues := lfcMetrics.ApproximateworkingSetSizeBuckets
// At this point, we can assume that the values are equally spaced at 1 minute apart,
// starting at 1 minute.
offsetIndex := *cfg.LFCMinWaitBeforeDownscaleMinutes - 1 // -1 because values start at 1m
windowSize := *cfg.LFCWindowSizeMinutes
// Handle invalid metrics:
if len(wssValues) < offsetIndex+windowSize {
warn("not enough working set size values to make scaling determination")
return 0, nil, nil
} else {
var estimateWss float64
if *cfg.LFCUseLargestWindow {
estimateWss = wssValues[len(wssValues)-1]
} else {
estimateWss = EstimateTrueWorkingSetSize(wssValues, WssEstimatorConfig{
MaxAllowedIncreaseFactor: 3.0, // hard-code this for now.
InitialOffset: offsetIndex,
WindowSize: windowSize,
})
}
projectSliceEnd := offsetIndex // start at offsetIndex to avoid panics if not monotonically non-decreasing
for ; projectSliceEnd < len(wssValues) && wssValues[projectSliceEnd] <= estimateWss; projectSliceEnd++ {
}
projectLen := 0.5 // hard-code this for now.
predictedHighestNextMinute := ProjectNextHighest(wssValues[:projectSliceEnd], projectLen)
// predictedHighestNextMinute is still in units of 8KiB pages. Let's convert that
// into bytes...
estimateWssMem := predictedHighestNextMinute * 8192
// ... and then invert the discount from only some of the memory going towards LFC...
requiredMem := estimateWssMem / *cfg.LFCToMemoryRatio
// ... and then convert that into the actual CU required to fit the working set:
requiredCU := requiredMem / computeUnit.Mem.AsFloat64()
lfcLogFields := func(obj zapcore.ObjectEncoder) error {
obj.AddFloat64("estimateWssPages", estimateWss)
obj.AddFloat64("predictedNextWssPages", predictedHighestNextMinute)
obj.AddFloat64("requiredCU", requiredCU)
return nil
}
return requiredCU, lo.ToPtr(api.Bytes(estimateWssMem)), lfcLogFields
}
}
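// Worked example of the unit conversions in calculateLFCGoalCU, with illustrative numbers: if the
// projected working set is 262144 pages of 8 KiB each, that's 262144 * 8192 bytes = 2 GiB. With
// LFCToMemoryRatio = 0.75, requiredMem = 2 GiB / 0.75 ≈ 2.67 GiB, and with computeUnit.Mem = 1 GiB
// that yields requiredCU ≈ 2.67.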
package core
// Definition of the Metrics type, plus reading it from vector.dev's prometheus format host metrics
import (
"cmp"
"fmt"
"io"
"slices"
"strconv"
"time"
promtypes "github.com/prometheus/client_model/go"
promfmt "github.com/prometheus/common/expfmt"
"github.com/tychoish/fun/erc"
"github.com/neondatabase/autoscaling/pkg/api"
)
type SystemMetrics struct {
LoadAverage1Min float64
LoadAverage5Min float64
MemoryUsageBytes float64
MemoryCachedBytes float64
}
func (m SystemMetrics) ToAPI() api.Metrics {
return api.Metrics{
LoadAverage1Min: float32(m.LoadAverage1Min),
LoadAverage5Min: nil,
MemoryUsageBytes: nil,
}
}
type LFCMetrics struct {
CacheHitsTotal float64
CacheMissesTotal float64
CacheWritesTotal float64
// lfc_approximate_working_set_size_windows; currently requires that the values are spaced
// exactly one minute apart
ApproximateworkingSetSizeBuckets []float64
}
// FromPrometheus represents metric types that can be parsed from prometheus output.
type FromPrometheus interface {
fromPrometheus(map[string]*promtypes.MetricFamily) error
}
// ParseMetrics reads the prometheus text-format content, parses it, and uses M's implementation of
// FromPrometheus to populate it before returning.
func ParseMetrics(content io.Reader, metrics FromPrometheus) error {
var parser promfmt.TextParser
mfs, err := parser.TextToMetricFamilies(content)
if err != nil {
return fmt.Errorf("failed to parse content as prometheus text format: %w", err)
}
if err := metrics.fromPrometheus(mfs); err != nil {
return fmt.Errorf("failed to extract metrics: %w", err)
}
return nil
}
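// A minimal usage sketch for ParseMetrics (the metrics endpoint here is hypothetical, and error
// handling is elided):
//
//	resp, err := http.Get("http://localhost:2112/metrics")
//	if err != nil { /* handle error */ }
//	defer resp.Body.Close()
//
//	var m SystemMetrics
//	if err := ParseMetrics(resp.Body, &m); err != nil { /* handle error */ }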
func extractFloatGauge(mf *promtypes.MetricFamily) (float64, error) {
if mf.GetType() != promtypes.MetricType_GAUGE {
return 0, fmt.Errorf("wrong metric type: expected %s but got %s", promtypes.MetricType_GAUGE, mf.GetType())
} else if len(mf.Metric) != 1 {
return 0, fmt.Errorf("expected 1 metric, found %d", len(mf.Metric))
}
return mf.Metric[0].GetGauge().GetValue(), nil
}
// Helper function to return an error for a missing metric
func missingMetric(name string) error {
return fmt.Errorf("missing expected metric %s", name)
}
// fromPrometheus implements FromPrometheus, so SystemMetrics can be used with ParseMetrics.
func (m *SystemMetrics) fromPrometheus(mfs map[string]*promtypes.MetricFamily) error {
ec := &erc.Collector{}
getFloat := func(metricName string) float64 {
if mf := mfs[metricName]; mf != nil {
f, err := extractFloatGauge(mf)
ec.Add(err) // does nothing if err == nil
return f
} else {
ec.Add(missingMetric(metricName))
return 0
}
}
load1 := getFloat("host_load1")
load5 := getFloat("host_load5")
memTotal := getFloat("host_memory_total_bytes")
memAvailable := getFloat("host_memory_available_bytes")
memCached := getFloat("host_memory_cached_bytes")
tmp := SystemMetrics{
LoadAverage1Min: load1,
LoadAverage5Min: load5,
// Add an extra 100 MiB to account for kernel memory usage
MemoryUsageBytes: memTotal - memAvailable + 100*(1<<20),
MemoryCachedBytes: memCached,
}
if err := ec.Resolve(); err != nil {
return err
}
*m = tmp
return nil
}
// fromPrometheus implements FromPrometheus, so LFCMetrics can be used with ParseMetrics.
func (m *LFCMetrics) fromPrometheus(mfs map[string]*promtypes.MetricFamily) error {
ec := &erc.Collector{}
getFloat := func(metricName string) float64 {
if mf := mfs[metricName]; mf != nil {
f, err := extractFloatGauge(mf)
ec.Add(err) // does nothing if err == nil
return f
} else {
ec.Add(missingMetric(metricName))
return 0
}
}
wssBuckets, err := extractWorkingSetSizeWindows(mfs)
ec.Add(err)
tmp := LFCMetrics{
CacheHitsTotal: getFloat("lfc_hits"),
CacheMissesTotal: getFloat("lfc_misses"),
CacheWritesTotal: getFloat("lfc_writes"),
ApproximateworkingSetSizeBuckets: wssBuckets,
}
if err := ec.Resolve(); err != nil {
return err
}
*m = tmp
return nil
}
func extractWorkingSetSizeWindows(mfs map[string]*promtypes.MetricFamily) ([]float64, error) {
metricName := "lfc_approximate_working_set_size_windows"
mf := mfs[metricName]
if mf == nil {
return nil, missingMetric(metricName)
}
if mf.GetType() != promtypes.MetricType_GAUGE {
return nil, fmt.Errorf("wrong metric type: expected %s, but got %s", promtypes.MetricType_GAUGE, mf.GetType())
} else if len(mf.Metric) < 1 {
return nil, fmt.Errorf("expected >= metric, found %d", len(mf.Metric))
}
type pair struct {
duration time.Duration
value float64
}
var pairs []pair
for _, m := range mf.Metric {
// Find the duration label
durationLabel := "duration_seconds"
durationIndex := slices.IndexFunc(m.Label, func(l *promtypes.LabelPair) bool {
return l.GetName() == durationLabel
})
if durationIndex == -1 {
return nil, fmt.Errorf("metric missing label %q", durationLabel)
}
durationSeconds, err := strconv.Atoi(m.Label[durationIndex].GetValue())
if err != nil {
return nil, fmt.Errorf("couldn't parse metric's %q label as int: %w", durationLabel, err)
}
pairs = append(pairs, pair{
duration: time.Second * time.Duration(durationSeconds),
value: m.GetGauge().GetValue(),
})
}
slices.SortFunc(pairs, func(x, y pair) int {
return cmp.Compare(x.duration, y.duration)
})
// Check that the values are as expected: they should all be 1 minute apart, starting
// at 1 minute.
// NOTE: this assumption is relied on elsewhere for scaling on ApproximateworkingSetSizeBuckets.
// Please search for usages before changing this behavior.
if pairs[0].duration != time.Minute {
return nil, fmt.Errorf("expected smallest duration to be %v, got %v", time.Minute, pairs[0].duration)
}
for i := range pairs {
expected := time.Minute * time.Duration(i+1)
if pairs[i].duration != expected {
return nil, fmt.Errorf(
"expected duration values to be exactly 1m apart, got unexpected value %v instead of %v",
pairs[i].duration,
expected,
)
}
}
var values []float64
for _, p := range pairs {
values = append(values, p.value)
}
return values, nil
}
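// For reference, the input this function expects looks roughly like the following prometheus text
// format (values are illustrative):
//
//	# TYPE lfc_approximate_working_set_size_windows gauge
//	lfc_approximate_working_set_size_windows{duration_seconds="60"} 10240
//	lfc_approximate_working_set_size_windows{duration_seconds="120"} 15360
//	lfc_approximate_working_set_size_windows{duration_seconds="180"} 17408
//
// which would be returned as []float64{10240, 15360, 17408}.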
package revsource
import (
"errors"
"time"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
const (
Upscale vmv1.Flag = 1 << iota
Downscale
)
// MaxRevisions is the maximum number of revisions that can be stored in the RevisionSource.
// This is to prevent memory leaks.
// Upon reaching it, the oldest revisions are discarded.
const MaxRevisions = 100
// RevisionSource can generate and observe revisions.
// Each Revision is a value and a set of flags (for meta-information).
// Once RevisionSource observes a previously generated Revision, it reports (via the callback)
// the time that has passed since that Revision was generated.
type RevisionSource struct {
cb ObserveCallback
// The in-flight revisions are stored in-order.
// After the revision is observed, it is removed from the measurements, and the offset is increased.
measurements []time.Time
offset int64
}
func NewRevisionSource(initialRevision int64, cb ObserveCallback) *RevisionSource {
return &RevisionSource{
cb: cb,
measurements: nil,
offset: initialRevision + 1, // Will start from the next one
}
}
func (c *RevisionSource) nextValue() int64 {
return c.offset + int64(len(c.measurements))
}
func (c *RevisionSource) Next(now time.Time, flags vmv1.Flag) vmv1.Revision {
ret := vmv1.Revision{
Value: c.nextValue(),
Flags: flags,
}
c.measurements = append(c.measurements, now)
if len(c.measurements) > MaxRevisions {
c.measurements = c.measurements[1:]
c.offset++
}
return ret
}
func (c *RevisionSource) Observe(moment time.Time, rev vmv1.Revision) error {
if rev.Value < c.offset {
// Already observed
return nil
}
idx := rev.Value - c.offset
if idx >= int64(len(c.measurements)) {
return errors.New("revision is in the future")
}
diff := moment.Sub(c.measurements[idx])
if c.cb != nil {
c.cb(diff, rev.Flags)
}
// Forget the measurement, and all the measurements before it.
c.offset = rev.Value + 1
c.measurements = c.measurements[idx+1:]
return nil
}
type ObserveCallback func(dur time.Duration, flags vmv1.Flag)
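// A minimal usage sketch for RevisionSource (the callback body is illustrative only):
//
//	src := NewRevisionSource(0, func(d time.Duration, flags vmv1.Flag) {
//		// e.g. record d in a latency histogram, labeled by the Upscale/Downscale flags
//	})
//	rev := src.Next(time.Now(), Upscale) // hand rev out as the new target revision
//	// ... later, once rev has been seen to propagate:
//	_ = src.Observe(time.Now(), rev) // invokes the callback with the elapsed time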
// Propagate sets the target revision to be current, optionally measuring the time it took
// for propagation.
func Propagate(
now time.Time,
target vmv1.RevisionWithTime,
currentSlot *vmv1.Revision,
cb ObserveCallback,
) {
if currentSlot == nil {
return
}
if currentSlot.Value >= target.Value {
return
}
if cb != nil {
diff := now.Sub(target.UpdatedAt.Time)
cb(diff, target.Flags)
}
*currentSlot = target.Revision
}
package core
// The core scaling logic at the heart of the autoscaler-agent. This file implements everything with
// mostly pure-ish functions, so that all the making & receiving requests can be done elsewhere.
//
// Broadly our strategy is to mimic the kind of eventual consistency that is itself used in
// Kubernetes. The scaling logic wasn't always implemented like this, but because the
// autoscaler-agent *fundamentally* exists in an eventual consistency world, we have to either:
// (a) make assumptions that we know are false; or
// (b) design our system so it assumes less.
// We used to solve this by (a). We ran into¹ issues² going that way, because sometimes those false
// assumptions come back to haunt you.
//
// That said, there's still some tricky semantics we want to maintain. Internally, the
// autoscaler-agent must be designed around eventual consistency, but the API we expose to the
// vm-monitor is strictly synchronous. As such, there's some subtle logic to make sure that we're
// not violating our own guarantees unless required to.
//
// ---
// ¹ https://github.com/neondatabase/autoscaling/issues/23
// ² https://github.com/neondatabase/autoscaling/issues/350
import (
"errors"
"fmt"
"strings"
"time"
"github.com/samber/lo"
"go.uber.org/zap"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/agent/core/revsource"
"github.com/neondatabase/autoscaling/pkg/api"
)
type ObservabilityCallbacks struct {
PluginLatency revsource.ObserveCallback
MonitorLatency revsource.ObserveCallback
NeonVMLatency revsource.ObserveCallback
ActualScaling ReportActualScalingEventCallback
HypotheticalScaling ReportHypotheticalScalingEventCallback
}
type (
ReportActualScalingEventCallback func(timestamp time.Time, current uint32, target uint32)
ReportHypotheticalScalingEventCallback func(timestamp time.Time, current uint32, target uint32, parts ScalingGoalParts)
)
type RevisionSource interface {
Next(ts time.Time, flags vmv1.Flag) vmv1.Revision
Observe(moment time.Time, rev vmv1.Revision) error
}
// Config represents some of the static configuration underlying the decision-making of State
type Config struct {
// ComputeUnit is the desired ratio between CPU and memory, copied from the global
// autoscaler-agent config.
ComputeUnit api.Resources
// DefaultScalingConfig is just copied from the global autoscaler-agent config.
// If the VM's ScalingConfig is nil, we use this field instead.
DefaultScalingConfig api.ScalingConfig
// NeonVMRetryWait gives the amount of time to wait to retry after a failed request
NeonVMRetryWait time.Duration
// PluginRequestTick gives the period at which we should be making requests to the scheduler
// plugin, even if nothing's changed.
PluginRequestTick time.Duration
// PluginRetryWait gives the amount of time to wait to retry after a failed request
PluginRetryWait time.Duration
// PluginDeniedRetryWait gives the amount of time we must wait before re-requesting resources
// that were not fully granted.
PluginDeniedRetryWait time.Duration
// MonitorDeniedDownscaleCooldown gives the time we must wait between making duplicate
// downscale requests to the vm-monitor where the previous failed.
MonitorDeniedDownscaleCooldown time.Duration
// MonitorRequestedUpscaleValidPeriod gives the duration for which requested upscaling from the
// vm-monitor must be respected.
MonitorRequestedUpscaleValidPeriod time.Duration
// MonitorRetryWait gives the amount of time to wait to retry after a *failed* request.
MonitorRetryWait time.Duration
// Log provides an outlet for (*State).NextActions() to give informative messages or warnings
// about conditions that are impeding its ability to execute.
Log LogConfig `json:"-"`
// RevisionSource is the source of revisions to track the progress during scaling.
RevisionSource RevisionSource `json:"-"`
// ObservabilityCallbacks are the callbacks to submit datapoints for observability.
ObservabilityCallbacks ObservabilityCallbacks `json:"-"`
}
type LogConfig struct {
// Info, if not nil, will be called to provide information during normal functioning.
// For example, we log the calculated desired resources on every call to NextActions.
Info func(string, ...zap.Field)
// Warn, if not nil, will be called to log conditions that are impeding the ability to move the
// current resources to what's desired.
// A typical warning may be something like "wanted to do X but couldn't because of Y".
Warn func(string, ...zap.Field)
}
// State holds all of the necessary internal state for a VM in order to make scaling
// decisions
type State struct {
internal state
}
// one level of indirection below State so that the fields can be public, and JSON-serializable
type state struct {
Config Config
// unused. Exists to make it easier to add print debugging (via .config.Warn) for a single call
// to NextActions.
Debug bool
// VM gives the current state of the VM - or at least, the state of the fields we care about.
//
// NB: any contents behind pointers in VM are immutable. Any time the field is updated, we
// replace it with a fresh object.
VM api.VmInfo
// Plugin records all state relevant to communications with the scheduler plugin
Plugin pluginState
// Monitor records all state relevant to communications with the vm-monitor
Monitor monitorState
// NeonVM records all state relevant to the NeonVM k8s API
NeonVM neonvmState
Metrics *SystemMetrics
LFCMetrics *LFCMetrics
// TargetRevision is the revision the agent works towards.
TargetRevision vmv1.Revision
// LastDesiredResources is the last target the agent wanted to scale to.
LastDesiredResources *api.Resources
}
type pluginState struct {
// OngoingRequest is true iff there is currently an ongoing request to *this* scheduler plugin.
OngoingRequest bool
// LastRequest, if not nil, gives information about the most recently started request to the
// plugin (maybe unfinished!)
LastRequest *pluginRequested
// LastFailureAt, if not nil, gives the time of the most recent request failure
LastFailureAt *time.Time
// Permit, if not nil, stores the Permit in the most recent PluginResponse. This field will be
// nil if we have not been able to contact *any* scheduler.
Permit *api.Resources
// CurrentRevision is the most recent revision the plugin has acknowledged.
CurrentRevision vmv1.Revision
}
type pluginRequested struct {
At time.Time
Resources api.Resources
}
type monitorState struct {
OngoingRequest *ongoingMonitorRequest
// RequestedUpscale, if not nil, stores the most recent *unresolved* upscaling requested by the
// vm-monitor, along with the time at which it occurred.
RequestedUpscale *requestedUpscale
// DeniedDownscale, if not nil, stores the result of the latest denied /downscale request.
DeniedDownscale *deniedDownscale
// Approved stores the most recent Resources associated with either (a) an accepted downscale
// request, or (b) a successful upscale notification.
Approved *api.Resources
// DownscaleFailureAt, if not nil, stores the time at which a downscale request most recently
// failed (where "failed" means that some unexpected error occurred, not that it was merely
// denied).
DownscaleFailureAt *time.Time
// UpscaleFailureAt, if not nil, stores the time at which an upscale request most recently
// failed
UpscaleFailureAt *time.Time
// CurrentRevision is the most recent revision the monitor has acknowledged.
CurrentRevision vmv1.Revision
}
func (ms *monitorState) active() bool {
return ms.Approved != nil
}
type ongoingMonitorRequest struct {
Kind monitorRequestKind
Requested api.Resources
}
type monitorRequestKind string
const (
monitorRequestKindDownscale monitorRequestKind = "downscale"
monitorRequestKindUpscale monitorRequestKind = "upscale"
)
type requestedUpscale struct {
At time.Time
Base api.Resources
Requested api.MoreResources
}
type deniedDownscale struct {
At time.Time
Current api.Resources
Requested api.Resources
}
type neonvmState struct {
LastSuccess *api.Resources
// OngoingRequested, if not nil, gives the resources requested
OngoingRequested *api.Resources
RequestFailedAt *time.Time
// TargetRevision is the revision the agent works towards. Unlike for the monitor/plugin, we
// store it not only in the action, but also here. This is needed because, for NeonVM, propagation
// happens after the changes are actually applied, by which point the action object is long gone.
TargetRevision vmv1.RevisionWithTime
CurrentRevision vmv1.Revision
}
func (ns *neonvmState) ongoingRequest() bool {
return ns.OngoingRequested != nil
}
func NewState(vm api.VmInfo, config Config) *State {
return &State{
internal: state{
Config: config,
Debug: false,
VM: vm,
Plugin: pluginState{
OngoingRequest: false,
LastRequest: nil,
LastFailureAt: nil,
Permit: nil,
CurrentRevision: vmv1.ZeroRevision,
},
Monitor: monitorState{
OngoingRequest: nil,
RequestedUpscale: nil,
DeniedDownscale: nil,
Approved: nil,
DownscaleFailureAt: nil,
UpscaleFailureAt: nil,
CurrentRevision: vmv1.ZeroRevision,
},
NeonVM: neonvmState{
LastSuccess: nil,
OngoingRequested: nil,
RequestFailedAt: nil,
TargetRevision: vmv1.ZeroRevision.WithTime(time.Time{}),
CurrentRevision: vmv1.ZeroRevision,
},
Metrics: nil,
LFCMetrics: nil,
LastDesiredResources: nil,
TargetRevision: vmv1.ZeroRevision,
},
}
}
func (s *state) info(msg string, fields ...zap.Field) {
if s.Config.Log.Info != nil {
s.Config.Log.Info(msg, fields...)
}
}
func (s *state) warn(msg string /* , fields ...zap.Field */) {
if s.Config.Log.Warn != nil {
s.Config.Log.Warn(msg /* , fields... */)
}
}
func (s *state) warnf(msg string, args ...any) {
s.warn(fmt.Sprintf(msg, args...))
}
// NextActions is used to implement the state machine. It's a pure function that *just* indicates
// what the executor should do.
func (s *State) NextActions(now time.Time) ActionSet {
return s.internal.nextActions(now)
}
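// A minimal sketch of how an executor is expected to drive the state machine (the surrounding
// names here are assumptions for illustration, not part of this package):
//
//	st := core.NewState(vmInfo, config)
//	for {
//		actions := st.NextActions(time.Now())
//		// ... execute each non-nil action, updating st via the Plugin()/Monitor()/NeonVM() handles ...
//		if actions.Wait != nil {
//			time.Sleep(actions.Wait.Duration) // or wake earlier if the state changes
//		}
//	}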
func (s *state) nextActions(now time.Time) ActionSet {
var actions ActionSet
desiredResources, calcDesiredResourcesWait := s.desiredResourcesFromMetricsOrRequestedUpscaling(now)
if calcDesiredResourcesWait == nil {
// our handling later on is easier if we can assume it's non-nil
calcDesiredResourcesWait = func(ActionSet) *time.Duration { return nil }
}
// ----
// Requests to the scheduler plugin:
var pluginRequiredWait *time.Duration
actions.PluginRequest, pluginRequiredWait = s.calculatePluginAction(now, desiredResources)
// ----
// Requests to NeonVM:
var pluginRequested *api.Resources
var pluginRequestedPhase string = "<this string should not appear>"
if s.Plugin.OngoingRequest {
pluginRequested = &s.Plugin.LastRequest.Resources
pluginRequestedPhase = "ongoing"
} else if actions.PluginRequest != nil {
pluginRequested = &actions.PluginRequest.Target
pluginRequestedPhase = "planned"
}
var neonvmRequiredWait *time.Duration
actions.NeonVMRequest, neonvmRequiredWait = s.calculateNeonVMAction(now, desiredResources, pluginRequested, pluginRequestedPhase)
// ----
// Requests to vm-monitor (upscaling)
//
// NB: upscaling takes priority over downscaling requests, because otherwise we'd potentially
// forego notifying the vm-monitor of increased resources because we were busy asking if it
// could downscale.
var monitorUpscaleRequiredWait *time.Duration
actions.MonitorUpscale, monitorUpscaleRequiredWait = s.calculateMonitorUpscaleAction(now, desiredResources)
// ----
// Requests to vm-monitor (downscaling)
plannedUpscale := actions.MonitorUpscale != nil
var monitorDownscaleRequiredWait *time.Duration
actions.MonitorDownscale, monitorDownscaleRequiredWait = s.calculateMonitorDownscaleAction(now, desiredResources, plannedUpscale)
// --- and that's all the request types! ---
// If there's anything waiting, we should also note how long we should wait for.
// There's two components we could be waiting on: the scheduler plugin, and the vm-monitor.
maximumDuration := time.Duration(int64(uint64(1)<<63 - 1))
requiredWait := maximumDuration
requiredWaits := []*time.Duration{
calcDesiredResourcesWait(actions),
pluginRequiredWait,
neonvmRequiredWait,
monitorUpscaleRequiredWait,
monitorDownscaleRequiredWait,
}
for _, w := range requiredWaits {
if w != nil {
requiredWait = min(requiredWait, *w)
}
}
// If we're waiting on anything, add it as an action
if requiredWait != maximumDuration {
actions.Wait = &ActionWait{Duration: requiredWait}
}
return actions
}
func (s *state) calculatePluginAction(
now time.Time,
desiredResources api.Resources,
) (*ActionPluginRequest, *time.Duration) {
logFailureReason := func(reason string) {
s.warnf("Wanted to make a request to the scheduler plugin, but %s", reason)
}
// additional resources we want to request OR previous downscaling we need to inform the plugin of
// NOTE: only valid if s.plugin.permit != nil AND there's no ongoing NeonVM request.
requestResources := s.clampResources(
s.VM.Using(),
desiredResources,
ptr(s.VM.Using()), // don't decrease below VM using (decrease happens *before* telling the plugin)
nil, // but any increase is ok
)
// resources if we're just informing the plugin of current resource usage.
currentResources := s.VM.Using()
if s.NeonVM.OngoingRequested != nil {
// include any ongoing NeonVM request, because we're already using that.
currentResources = currentResources.Max(*s.NeonVM.OngoingRequested)
}
// We want to make a request to the scheduler plugin if:
// 1. it's been long enough since the previous request (so we're obligated by PluginRequestTick); or
// 2.a. we want to request resources / inform it of downscale;
// b. there isn't any ongoing, conflicting request; and
// c. we haven't recently been denied these resources
var timeUntilNextRequestTick time.Duration
if s.Plugin.LastRequest != nil {
timeUntilNextRequestTick = s.Config.PluginRequestTick - now.Sub(s.Plugin.LastRequest.At)
}
timeForRequest := timeUntilNextRequestTick <= 0
var timeUntilRetryBackoffExpires time.Duration
requestPreviouslyDenied := !s.Plugin.OngoingRequest &&
s.Plugin.LastRequest != nil &&
s.Plugin.Permit != nil &&
s.Plugin.LastRequest.Resources.HasFieldGreaterThan(*s.Plugin.Permit)
if requestPreviouslyDenied {
timeUntilRetryBackoffExpires = s.Plugin.LastRequest.At.Add(s.Config.PluginDeniedRetryWait).Sub(now)
}
waitingOnRetryBackoff := timeUntilRetryBackoffExpires > 0
// changing the resources we're requesting from the plugin
wantToRequestNewResources := s.Plugin.LastRequest != nil && s.Plugin.Permit != nil &&
requestResources != *s.Plugin.Permit
// ... and this isn't a duplicate (or, at least it's been long enough)
shouldRequestNewResources := wantToRequestNewResources && !waitingOnRetryBackoff
permittedRequestResources := requestResources
if !shouldRequestNewResources {
permittedRequestResources = currentResources
}
// Can't make a duplicate request
if s.Plugin.OngoingRequest {
// ... but if the desired request is different from what we would be making,
// then it's worth logging
if s.Plugin.LastRequest.Resources != permittedRequestResources {
logFailureReason("there's already an ongoing request for different resources")
}
return nil, nil
}
// Can't make a request if we failed too recently
if s.Plugin.LastFailureAt != nil {
timeUntilFailureBackoffExpires := s.Plugin.LastFailureAt.Add(s.Config.PluginRetryWait).Sub(now)
if timeUntilFailureBackoffExpires > 0 {
logFailureReason("previous request failed too recently")
return nil, &timeUntilFailureBackoffExpires
}
}
// At this point, all that's left is either making the request, or saying to wait.
// The rest of the complication is just around accurate logging.
if timeForRequest || shouldRequestNewResources {
return &ActionPluginRequest{
LastPermit: s.Plugin.Permit,
Target: permittedRequestResources,
// convert maybe-nil '*SystemMetrics' into maybe-nil '*api.Metrics'
Metrics: func() *api.Metrics {
if s.Metrics != nil {
return lo.ToPtr(s.Metrics.ToAPI())
} else {
return nil
}
}(),
TargetRevision: s.TargetRevision.WithTime(now),
}, nil
} else {
if wantToRequestNewResources && waitingOnRetryBackoff {
logFailureReason("previous request for more resources was denied too recently")
}
waitTime := timeUntilNextRequestTick
if waitingOnRetryBackoff {
waitTime = min(waitTime, timeUntilRetryBackoffExpires)
}
return nil, &waitTime
}
}
func ptr[T any](t T) *T { return &t }
func (s *state) calculateNeonVMAction(
now time.Time,
desiredResources api.Resources,
pluginRequested *api.Resources,
pluginRequestedPhase string,
) (*ActionNeonVMRequest, *time.Duration) {
targetRevision := s.TargetRevision
if desiredResources.HasFieldLessThan(s.VM.Using()) && s.Monitor.CurrentRevision.Value > 0 {
// We are downscaling, so we needed a permit from the monitor
targetRevision = targetRevision.Min(s.Monitor.CurrentRevision)
}
if desiredResources.HasFieldGreaterThan(s.VM.Using()) && s.Plugin.CurrentRevision.Value > 0 {
// We are upscaling, so we needed a permit from the plugin
targetRevision = targetRevision.Min(s.Plugin.CurrentRevision)
}
// clamp desiredResources to what we're allowed to make a request for
desiredResources = s.clampResources(
s.VM.Using(), // current: what we're using already
desiredResources, // target: desired resources
ptr(s.monitorApprovedLowerBound()), // lower bound: downscaling that the monitor has approved
ptr(s.pluginApprovedUpperBound()), // upper bound: upscaling that the plugin has approved
)
// If we're already using the desired resources, then no need to make a request
if s.VM.Using() == desiredResources {
return nil, nil
}
conflictingPluginRequest := pluginRequested != nil && pluginRequested.HasFieldLessThan(desiredResources)
if !s.NeonVM.ongoingRequest() && !conflictingPluginRequest {
// We *should* be all clear to make a request; not allowed to make one if we failed too
// recently
if s.NeonVM.RequestFailedAt != nil {
timeUntilFailureBackoffExpires := s.NeonVM.RequestFailedAt.Add(s.Config.NeonVMRetryWait).Sub(now)
if timeUntilFailureBackoffExpires > 0 {
s.warn("Wanted to make a request to NeonVM API, but recent request failed too recently")
return nil, &timeUntilFailureBackoffExpires
}
}
s.NeonVM.TargetRevision = targetRevision.WithTime(now)
return &ActionNeonVMRequest{
Current: s.VM.Using(),
Target: desiredResources,
TargetRevision: s.NeonVM.TargetRevision,
}, nil
} else {
var reqs []string
if s.Plugin.OngoingRequest {
reqs = append(reqs, fmt.Sprintf("plugin request %s", pluginRequestedPhase))
}
if s.NeonVM.ongoingRequest() && *s.NeonVM.OngoingRequested != desiredResources {
reqs = append(reqs, "NeonVM request (for different resources) ongoing")
}
if len(reqs) != 0 {
s.warnf("Wanted to make a request to NeonVM API, but there's already %s", strings.Join(reqs, " and "))
}
return nil, nil
}
}
func (s *state) calculateMonitorUpscaleAction(
now time.Time,
desiredResources api.Resources,
) (*ActionMonitorUpscale, *time.Duration) {
// can't do anything if we don't have an active connection to the vm-monitor
if !s.Monitor.active() {
return nil, nil
}
requestResources := s.clampResources(
*s.Monitor.Approved, // current: last resources we got the OK from the monitor on
s.VM.Using(), // target: what the VM is currently using
ptr(*s.Monitor.Approved), // don't decrease below what the monitor is currently set to (this is an *upscale* request)
ptr(desiredResources.Max(*s.Monitor.Approved)), // don't increase above desired resources
)
// Clamp the request resources so we're not increasing by more than 1 CU:
requestResources = s.clampResources(
*s.Monitor.Approved,
requestResources,
nil, // no lower bound
ptr(requestResources.Add(s.Config.ComputeUnit)), // upper bound: must not increase by >1 CU
)
// Check validity of the request that we would send, before sending it
if requestResources.HasFieldLessThan(*s.Monitor.Approved) {
panic(fmt.Errorf(
"resources for vm-monitor upscaling are less than what was last approved: %+v has field less than %+v",
requestResources,
*s.Monitor.Approved,
))
}
wantToDoRequest := requestResources != *s.Monitor.Approved
if !wantToDoRequest {
return nil, nil
}
// Can't make another request if there's already one ongoing
if s.Monitor.OngoingRequest != nil {
var requestDescription string
if s.Monitor.OngoingRequest.Kind == monitorRequestKindUpscale && s.Monitor.OngoingRequest.Requested != requestResources {
requestDescription = "upscale request (for different resources)"
} else if s.Monitor.OngoingRequest.Kind == monitorRequestKindDownscale {
requestDescription = "downscale request"
}
if requestDescription != "" {
s.warnf("Wanted to send vm-monitor upscale request, but waiting on ongoing %s", requestDescription)
}
return nil, nil
}
// Can't make another request if we failed too recently:
if s.Monitor.UpscaleFailureAt != nil {
timeUntilFailureBackoffExpires := s.Monitor.UpscaleFailureAt.Add(s.Config.MonitorRetryWait).Sub(now)
if timeUntilFailureBackoffExpires > 0 {
s.warn("Wanted to send vm-monitor upscale request, but failed too recently")
return nil, &timeUntilFailureBackoffExpires
}
}
// Otherwise, we can make the request:
return &ActionMonitorUpscale{
Current: *s.Monitor.Approved,
Target: requestResources,
TargetRevision: s.TargetRevision.WithTime(now),
}, nil
}
func (s *state) calculateMonitorDownscaleAction(
now time.Time,
desiredResources api.Resources,
plannedUpscaleRequest bool,
) (*ActionMonitorDownscale, *time.Duration) {
// can't do anything if we don't have an active connection to the vm-monitor
if !s.Monitor.active() {
if desiredResources.HasFieldLessThan(s.VM.Using()) {
s.warn("Wanted to send vm-monitor downscale request, but there's no active connection")
}
return nil, nil
}
requestResources := s.clampResources(
*s.Monitor.Approved, // current: what the monitor is already aware of
desiredResources, // target: what we'd like the VM to be using
nil, // lower bound: any decrease is fine
ptr(*s.Monitor.Approved), // upper bound: don't increase (this is only downscaling!)
)
// Clamp the request resources so we're not decreasing by more than 1 CU:
requestResources = s.clampResources(
*s.Monitor.Approved,
requestResources,
ptr(s.Monitor.Approved.SaturatingSub(s.Config.ComputeUnit)), // Must not decrease by >1 CU
nil, // no upper bound
)
// Check validity of the request that we would send, before sending it
if requestResources.HasFieldGreaterThan(*s.Monitor.Approved) {
panic(fmt.Errorf(
"resources for vm-monitor downscaling are greater than what was last approved: %+v has field greater than %+v",
requestResources,
*s.Monitor.Approved,
))
}
wantToDoRequest := requestResources != *s.Monitor.Approved
if !wantToDoRequest {
return nil, nil
}
// Can't make another request if there's already one ongoing (or if an upscaling request is
// planned)
if plannedUpscaleRequest {
s.warn("Wanted to send vm-monitor downscale request, but waiting on other planned upscale request")
return nil, nil
} else if s.Monitor.OngoingRequest != nil {
var requestDescription string
if s.Monitor.OngoingRequest.Kind == monitorRequestKindDownscale && s.Monitor.OngoingRequest.Requested != requestResources {
requestDescription = "downscale request (for different resources)"
} else if s.Monitor.OngoingRequest.Kind == monitorRequestKindUpscale {
requestDescription = "upscale request"
}
if requestDescription != "" {
s.warnf("Wanted to send vm-monitor downscale request, but waiting on other ongoing %s", requestDescription)
}
return nil, nil
}
// Can't make another request if we failed too recently:
if s.Monitor.DownscaleFailureAt != nil {
timeUntilFailureBackoffExpires := s.Monitor.DownscaleFailureAt.Add(s.Config.MonitorRetryWait).Sub(now)
if timeUntilFailureBackoffExpires > 0 {
s.warn("Wanted to send vm-monitor downscale request but failed too recently")
return nil, &timeUntilFailureBackoffExpires
}
}
// Can't make another request if a recent request for resources less than or equal to the
// proposed request was denied. In general though, this should be handled by
// DesiredResourcesFromMetricsOrRequestedUpscaling, so we're better off panicking here.
if s.timeUntilDeniedDownscaleExpired(now) > 0 && !s.Monitor.DeniedDownscale.Requested.HasFieldLessThan(requestResources) {
panic(errors.New(
"Wanted to send vm-monitor downscale request, but too soon after previously denied downscaling that should have been handled earlier",
))
}
// Nothing else to check, we're good to make the request
return &ActionMonitorDownscale{
Current: *s.Monitor.Approved,
Target: requestResources,
TargetRevision: s.TargetRevision.WithTime(now),
}, nil
}
func (s *state) scalingConfig() api.ScalingConfig {
// nb: WithOverrides allows its arg to be nil, in which case it does nothing.
return s.Config.DefaultScalingConfig.WithOverrides(s.VM.Config.ScalingConfig)
}
// public version, for testing.
func (s *State) DesiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) (api.Resources, func(ActionSet) *time.Duration) {
return s.internal.desiredResourcesFromMetricsOrRequestedUpscaling(now)
}
func (s *state) desiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) (api.Resources, func(ActionSet) *time.Duration) {
// There's some annoying edge cases that this function has to be able to handle properly. For
// the sake of completeness, they are:
//
// 1. s.vm.Using() is not a multiple of s.computeUnit
// 2. s.vm.Max() is less than s.computeUnit (or: has at least one resource that is)
// 3. s.vm.Using() is a fractional multiple of s.computeUnit, but !allowDecrease and rounding up
// is greater than s.vm.Max()
// 4. s.vm.Using() is much larger than s.vm.Min() and not a multiple of s.computeUnit, but load
// is low so we should just decrease *anyways*.
//
// ---
//
// Broadly, the implementation works like this:
// For CPU:
// Based on load average, calculate the "goal" number of CPUs (and therefore compute units)
//
// For Memory:
// Based on memory usage, calculate the VM's desired memory allocation and extrapolate a
// goal number of CUs from that.
//
// 1. Take the maximum of these two goal CUs to create a unified goal CU
// 2. Cap the goal CU by min/max, etc
// 3. that's it!
reportGoals := func(goalCU uint32, parts ScalingGoalParts) {
currentCU, ok := s.VM.Using().DivResources(s.Config.ComputeUnit)
if !ok {
return // skip reporting if the current CU is not right.
}
if report := s.Config.ObservabilityCallbacks.HypotheticalScaling; report != nil {
report(now, uint32(currentCU), goalCU, parts)
}
}
sg, goalCULogFields := calculateGoalCU(
s.warn,
s.scalingConfig(),
s.Config.ComputeUnit,
s.Metrics,
s.LFCMetrics,
)
goalCU := sg.GoalCU()
// If we don't have all the metrics we need, we'll later prevent downscaling to avoid flushing
// the VM's cache on autoscaler-agent restart if we have SystemMetrics but not LFCMetrics.
hasAllMetrics := sg.HasAllMetrics
if hasAllMetrics {
reportGoals(goalCU, sg.Parts)
}
// Copy the initial value of the goal CU so that we can accurately track whether either
// requested upscaling or denied downscaling affected the outcome.
// Otherwise as written, it'd be possible to update goalCU from requested upscaling and
// incorrectly miss that denied downscaling could have had the same effect.
initialGoalCU := goalCU
var requestedUpscalingAffectedResult bool
// Update goalCU based on any explicitly requested upscaling
timeUntilRequestedUpscalingExpired := s.timeUntilRequestedUpscalingExpired(now)
requestedUpscalingInEffect := timeUntilRequestedUpscalingExpired > 0
if requestedUpscalingInEffect {
reqCU := s.requiredCUForRequestedUpscaling(s.Config.ComputeUnit, *s.Monitor.RequestedUpscale)
if reqCU > initialGoalCU {
// FIXME: this isn't quite correct, because if initialGoalCU is already equal to the
// maximum goal CU we *could* have, this won't actually have an effect.
requestedUpscalingAffectedResult = true
goalCU = max(goalCU, reqCU)
}
}
var deniedDownscaleAffectedResult bool
// Update goalCU based on any previously denied downscaling
timeUntilDeniedDownscaleExpired := s.timeUntilDeniedDownscaleExpired(now)
deniedDownscaleInEffect := timeUntilDeniedDownscaleExpired > 0
if deniedDownscaleInEffect {
reqCU := s.requiredCUForDeniedDownscale(s.Config.ComputeUnit, s.Monitor.DeniedDownscale.Requested)
if reqCU > initialGoalCU {
deniedDownscaleAffectedResult = true
goalCU = max(goalCU, reqCU)
}
}
// resources for the desired "goal" compute units
goalResources := s.Config.ComputeUnit.Mul(uint16(goalCU))
// If we don't have all the metrics we need to make a proper decision, make sure that we aren't
// going to scale down below the current resources.
// Otherwise, we can make an under-informed decision that has undesirable impacts (e.g., scaling
// down because we don't have LFC metrics and flushing the cache because of it).
if !hasAllMetrics {
goalResources = goalResources.Max(s.VM.Using())
}
// bound goalResources by the minimum and maximum resource amounts for the VM
result := goalResources.Min(s.VM.Max()).Max(s.VM.Min())
// ... but if we aren't allowed to downscale, then we *must* make sure that the VM's usage value
// won't decrease to the previously denied amount, even if it's greater than the maximum.
//
// We can run into situations like this when VM scale-down on bounds change fails, so we end up
// with a usage value greater than the maximum.
//
// It's not a great situation to be in, but it's easier to make the policy "give the users a
// little extra if we mess up" than "oops we OOM-killed your DB, hope you weren't doing anything".
if deniedDownscaleInEffect {
// roughly equivalent to "result >= s.monitor.deniedDownscale.requested"
if !result.HasFieldGreaterThan(s.Monitor.DeniedDownscale.Requested) {
// This can only happen if s.vm.Max() is less than goalResources, because otherwise this
// would have been factored into goalCU, affecting goalResources. Hence, the warning.
s.warn("Can't decrease desired resources to within VM maximum because of vm-monitor previously denied downscale request")
}
preMaxResult := result
result = result.Max(s.minRequiredResourcesForDeniedDownscale(s.Config.ComputeUnit, *s.Monitor.DeniedDownscale))
if result != preMaxResult {
deniedDownscaleAffectedResult = true
}
}
// Check that the result is sound.
//
// With the current (naive) implementation, this is trivially ok. In future versions, it might
// not be so simple, so it's good to have this integrity check here.
if !deniedDownscaleAffectedResult && result.HasFieldGreaterThan(s.VM.Max()) {
panic(fmt.Errorf(
"produced invalid desired state: result has field greater than max. this = %+v", *s,
))
} else if result.HasFieldLessThan(s.VM.Min()) {
panic(fmt.Errorf(
"produced invalid desired state: result has field less than min. this = %+v", *s,
))
}
calculateWaitTime := func(actions ActionSet) *time.Duration {
var waiting bool
waitTime := time.Duration(int64(1<<63 - 1)) // time.Duration is an int64. As an "unset" value, use the maximum.
if deniedDownscaleAffectedResult && actions.MonitorDownscale == nil && s.Monitor.OngoingRequest == nil {
waitTime = min(waitTime, timeUntilDeniedDownscaleExpired)
waiting = true
}
if requestedUpscalingAffectedResult {
waitTime = min(waitTime, timeUntilRequestedUpscalingExpired)
waiting = true
}
if waiting {
return &waitTime
} else {
return nil
}
}
s.updateTargetRevision(now, result, s.VM.Using())
// TODO: we are both saving the result into LastDesiredResources and returning it. This is
// redundant, and we should remove one of the two.
s.LastDesiredResources = &result
logFields := []zap.Field{
zap.Object("current", s.VM.Using()),
zap.Object("target", result),
zap.Object("targetRevision", &s.TargetRevision),
}
logFields = append(logFields, goalCULogFields...)
s.info("Calculated desired resources", logFields...)
return result, calculateWaitTime
}
func (s *state) updateTargetRevision(now time.Time, desired api.Resources, current api.Resources) {
if s.LastDesiredResources == nil {
s.LastDesiredResources = &current
}
if *s.LastDesiredResources == desired {
// Nothing changed, so no need to update the target revision
return
}
var flags vmv1.Flag
if desired.HasFieldGreaterThan(*s.LastDesiredResources) {
flags.Set(revsource.Upscale)
}
if desired.HasFieldLessThan(*s.LastDesiredResources) {
flags.Set(revsource.Downscale)
}
s.TargetRevision = s.Config.RevisionSource.Next(now, flags)
}
func (s *state) updateNeonVMCurrentRevision(currentRevision vmv1.RevisionWithTime) {
revsource.Propagate(currentRevision.UpdatedAt.Time,
s.NeonVM.TargetRevision,
&s.NeonVM.CurrentRevision,
s.Config.ObservabilityCallbacks.NeonVMLatency,
)
err := s.Config.RevisionSource.Observe(currentRevision.UpdatedAt.Time, currentRevision.Revision)
if err != nil {
s.warnf("Failed to observe clock source: %v", err)
}
// We also zero out LastDesiredResources, because we're now starting from a fresh set of
// current resources.
s.LastDesiredResources = nil
}
func (s *state) timeUntilRequestedUpscalingExpired(now time.Time) time.Duration {
if s.Monitor.RequestedUpscale != nil {
return s.Monitor.RequestedUpscale.At.Add(s.Config.MonitorRequestedUpscaleValidPeriod).Sub(now)
} else {
return 0
}
}
// NB: we could just use s.Config.ComputeUnit and s.Monitor.RequestedUpscale from inside the
// function, but the latter is sometimes nil. This way, it's clear that it's the caller's
// responsibility to ensure that the values are non-nil.
func (s *state) requiredCUForRequestedUpscaling(computeUnit api.Resources, requestedUpscale requestedUpscale) uint32 {
var required uint32
requested := requestedUpscale.Requested
base := requestedUpscale.Base
// note: 1 + floor(x / M) gives the minimum integer value greater than x / M.
if requested.Cpu {
required = max(required, 1+uint32(base.VCPU/computeUnit.VCPU))
}
if requested.Memory {
required = max(required, 1+uint32(base.Mem/computeUnit.Mem))
}
return required
}
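// Worked example with illustrative numbers: with computeUnit.VCPU = 2 and base.VCPU = 5, upscaling
// CPU requires 1 + floor(5/2) = 3 CUs (6 vCPU), the smallest whole number of CUs strictly greater
// than the current 5 vCPU.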
func (s *state) timeUntilDeniedDownscaleExpired(now time.Time) time.Duration {
if s.Monitor.DeniedDownscale != nil {
return s.Monitor.DeniedDownscale.At.Add(s.Config.MonitorDeniedDownscaleCooldown).Sub(now)
} else {
return 0
}
}
// NB: like requiredCUForRequestedUpscaling, we make the caller provide the values so that it's
// more clear that it's the caller's responsibility to ensure the values are non-nil.
func (s *state) requiredCUForDeniedDownscale(computeUnit, deniedResources api.Resources) uint32 {
// note: floor(x / M) + 1 gives the minimum integer value greater than x / M.
requiredFromCPU := 1 + uint32(deniedResources.VCPU/computeUnit.VCPU)
requiredFromMem := 1 + uint32(deniedResources.Mem/computeUnit.Mem)
return max(requiredFromCPU, requiredFromMem)
}
func (s *state) minRequiredResourcesForDeniedDownscale(computeUnit api.Resources, denied deniedDownscale) api.Resources {
// for each resource, increase the value by one CU's worth, but not greater than the value we
// were at while attempting to downscale.
//
// phrasing it like this cleanly handles some subtle edge cases when denied.current isn't a
// multiple of the compute unit.
return api.Resources{
VCPU: min(denied.Current.VCPU, computeUnit.VCPU*(1+denied.Requested.VCPU/computeUnit.VCPU)),
Mem: min(denied.Current.Mem, computeUnit.Mem*(1+denied.Requested.Mem/computeUnit.Mem)),
}
}
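// Worked example with illustrative numbers: if computeUnit.Mem = 1 GiB, denied.Requested.Mem = 2.5 GiB,
// and denied.Current.Mem = 5 GiB, the memory result is min(5 GiB, 1 GiB * (1 + 2)) = 3 GiB: the
// smallest whole number of CUs strictly above the denied 2.5 GiB, capped at the 5 GiB we were
// using when the downscale was denied.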
// clampResources uses the directionality of the difference between s.vm.Using() and desired to
// clamp the desired resources with the upper *or* lower bound
func (s *state) clampResources(
current api.Resources,
desired api.Resources,
lowerBound *api.Resources,
upperBound *api.Resources,
) api.Resources {
// Internal validity checks:
if lowerBound != nil && lowerBound.HasFieldGreaterThan(current) {
panic(fmt.Errorf(
"clampResources called with invalid arguments: lowerBound=%+v has field greater than current=%+v",
lowerBound,
current,
))
} else if upperBound != nil && upperBound.HasFieldLessThan(current) {
panic(fmt.Errorf(
"clampResources called with invalid arguments: upperBound=%+v has field less than current=%+v",
upperBound,
current,
))
}
cpu := desired.VCPU
if desired.VCPU < current.VCPU && lowerBound != nil {
cpu = max(desired.VCPU, lowerBound.VCPU)
} else if desired.VCPU > current.VCPU && upperBound != nil {
cpu = min(desired.VCPU, upperBound.VCPU)
}
mem := desired.Mem
if desired.Mem < current.Mem && lowerBound != nil {
mem = max(desired.Mem, lowerBound.Mem)
} else if desired.Mem > current.Mem && upperBound != nil {
mem = min(desired.Mem, upperBound.Mem)
}
return api.Resources{VCPU: cpu, Mem: mem}
}
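// Worked example with illustrative numbers: with current = 4 vCPU, desired = 2 vCPU, and
// lowerBound = 3 vCPU, the CPU result is max(2, 3) = 3 vCPU (we're decreasing, so only the lower
// bound applies). If instead desired = 6 vCPU with upperBound = 5 vCPU, the result is
// min(6, 5) = 5 vCPU. Memory is clamped independently in the same way.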
func (s *state) monitorApprovedLowerBound() api.Resources {
if s.Monitor.Approved != nil {
return *s.Monitor.Approved
} else {
return s.VM.Using()
}
}
func (s *state) pluginApprovedUpperBound() api.Resources {
if s.Plugin.Permit != nil {
return *s.Plugin.Permit
} else {
return s.VM.Using()
}
}
//////////////////////////////////////////
// PUBLIC FUNCTIONS TO UPDATE THE STATE //
//////////////////////////////////////////
// Debug sets s.debug = enabled. This method is exclusively meant to be used in tests, to make it
// easier to enable print debugging only for a single call to NextActions, via s.warn() or otherwise.
func (s *State) Debug(enabled bool) {
s.internal.Debug = enabled
}
func (s *State) UpdatedVM(vm api.VmInfo) {
// FIXME: overriding this is required right now because we trust that a successful request to
// NeonVM means the VM was already updated, which... isn't true, and otherwise we could run into
// sync issues.
// A first-pass solution is possible by reading the values of VirtualMachine.Spec, but the
// "proper" solution would read from VirtualMachine.Status, which (at time of writing) isn't
// sound. For more, see:
// - https://github.com/neondatabase/autoscaling/pull/371#issuecomment-1752110131
// - https://github.com/neondatabase/autoscaling/issues/462
vm.SetUsing(s.internal.VM.Using())
s.internal.VM = vm
if vm.CurrentRevision != nil {
s.internal.updateNeonVMCurrentRevision(*vm.CurrentRevision)
}
// Make sure that if LFC metrics are disabled & later enabled, we don't make decisions based on
// stale data.
if !*s.internal.scalingConfig().EnableLFCMetrics {
s.internal.LFCMetrics = nil
}
}
func (s *State) UpdateSystemMetrics(metrics SystemMetrics) {
s.internal.Metrics = &metrics
}
func (s *State) UpdateLFCMetrics(metrics LFCMetrics) {
s.internal.LFCMetrics = &metrics
}
// PluginHandle provides write access to the scheduler plugin pieces of an UpdateState
type PluginHandle struct {
s *state
}
func (s *State) Plugin() PluginHandle {
return PluginHandle{&s.internal}
}
func (h PluginHandle) StartingRequest(now time.Time, resources api.Resources) {
h.s.Plugin.LastRequest = &pluginRequested{
At: now,
Resources: resources,
}
h.s.Plugin.OngoingRequest = true
}
func (h PluginHandle) RequestFailed(now time.Time) {
h.s.Plugin.OngoingRequest = false
h.s.Plugin.LastFailureAt = &now
}
func (h PluginHandle) RequestSuccessful(
now time.Time,
targetRevision vmv1.RevisionWithTime,
resp api.PluginResponse,
) (_err error) {
h.s.Plugin.OngoingRequest = false
defer func() {
if _err != nil {
h.s.Plugin.LastFailureAt = &now
}
}()
if err := resp.Permit.ValidateNonZero(); err != nil {
return fmt.Errorf("Invalid permit: %w", err)
}
// Errors from resp in connection with the prior request
if resp.Permit.HasFieldGreaterThan(h.s.Plugin.LastRequest.Resources) {
return fmt.Errorf(
"Permit has resources greater than request (%+v vs. %+v)",
resp.Permit, h.s.Plugin.LastRequest.Resources,
)
}
// Errors from resp in connection with the prior request AND the VM state
if vmUsing := h.s.VM.Using(); resp.Permit.HasFieldLessThan(vmUsing) {
return fmt.Errorf("Permit has resources less than VM (%+v vs %+v)", resp.Permit, vmUsing)
}
// All good - set everything.
// NOTE: We don't set the compute unit, even though the plugin response contains it. We're in
// the process of moving the source of truth for ComputeUnit from the scheduler plugin to the
// autoscaler-agent.
h.s.Plugin.Permit = &resp.Permit
revsource.Propagate(now,
targetRevision,
&h.s.Plugin.CurrentRevision,
h.s.Config.ObservabilityCallbacks.PluginLatency,
)
return nil
}
// MonitorHandle provides write access to the vm-monitor pieces of an UpdateState
type MonitorHandle struct {
s *state
}
func (s *State) Monitor() MonitorHandle {
return MonitorHandle{&s.internal}
}
func (h MonitorHandle) Reset() {
h.s.Monitor = monitorState{
OngoingRequest: nil,
RequestedUpscale: nil,
DeniedDownscale: nil,
Approved: nil,
DownscaleFailureAt: nil,
UpscaleFailureAt: nil,
CurrentRevision: vmv1.ZeroRevision,
}
}
func (h MonitorHandle) Active(active bool) {
if active {
approved := h.s.VM.Using()
h.s.Monitor.Approved = &approved // TODO: this is racy
} else {
h.s.Monitor.Approved = nil
}
}
func (h MonitorHandle) UpscaleRequested(now time.Time, resources api.MoreResources) {
h.s.Monitor.RequestedUpscale = &requestedUpscale{
At: now,
Base: *h.s.Monitor.Approved,
Requested: resources,
}
}
func (h MonitorHandle) StartingUpscaleRequest(now time.Time, resources api.Resources) {
h.s.Monitor.OngoingRequest = &ongoingMonitorRequest{
Kind: monitorRequestKindUpscale,
Requested: resources,
}
h.s.Monitor.UpscaleFailureAt = nil
}
func (h MonitorHandle) UpscaleRequestSuccessful(now time.Time) {
h.s.Monitor.Approved = &h.s.Monitor.OngoingRequest.Requested
h.s.Monitor.OngoingRequest = nil
}
func (h MonitorHandle) UpscaleRequestFailed(now time.Time) {
h.s.Monitor.OngoingRequest = nil
h.s.Monitor.UpscaleFailureAt = &now
}
func (h MonitorHandle) StartingDownscaleRequest(now time.Time, resources api.Resources) {
h.s.Monitor.OngoingRequest = &ongoingMonitorRequest{
Kind: monitorRequestKindDownscale,
Requested: resources,
}
h.s.Monitor.DownscaleFailureAt = nil
}
func (h MonitorHandle) DownscaleRequestAllowed(now time.Time, rev vmv1.RevisionWithTime) {
h.s.Monitor.Approved = &h.s.Monitor.OngoingRequest.Requested
h.s.Monitor.OngoingRequest = nil
revsource.Propagate(now,
rev,
&h.s.Monitor.CurrentRevision,
h.s.Config.ObservabilityCallbacks.MonitorLatency,
)
}
// DownscaleRequestDenied records that the downscale request itself completed, but the vm-monitor
// denied the requested downscaling.
func (h MonitorHandle) DownscaleRequestDenied(now time.Time, targetRevision vmv1.RevisionWithTime) {
h.s.Monitor.DeniedDownscale = &deniedDownscale{
At: now,
Current: *h.s.Monitor.Approved,
Requested: h.s.Monitor.OngoingRequest.Requested,
}
h.s.Monitor.OngoingRequest = nil
revsource.Propagate(now,
targetRevision,
&h.s.Monitor.CurrentRevision,
h.s.Config.ObservabilityCallbacks.MonitorLatency,
)
}
func (h MonitorHandle) DownscaleRequestFailed(now time.Time) {
h.s.Monitor.OngoingRequest = nil
h.s.Monitor.DownscaleFailureAt = &now
}
type NeonVMHandle struct {
s *state
}
func (s *State) NeonVM() NeonVMHandle {
return NeonVMHandle{&s.internal}
}
func (h NeonVMHandle) StartingRequest(now time.Time, resources api.Resources) {
if report := h.s.Config.ObservabilityCallbacks.ActualScaling; report != nil {
currentCU, currentOk := h.s.VM.Using().DivResources(h.s.Config.ComputeUnit)
targetCU, targetOk := resources.DivResources(h.s.Config.ComputeUnit)
if currentOk && targetOk {
report(now, uint32(currentCU), uint32(targetCU))
}
}
// FIXME: add time to ongoing request info (or maybe only in RequestFailed?)
h.s.NeonVM.OngoingRequested = &resources
}
func (h NeonVMHandle) RequestSuccessful(now time.Time) {
if h.s.NeonVM.OngoingRequested == nil {
panic("received NeonVM().RequestSuccessful() update without ongoing request")
}
resources := *h.s.NeonVM.OngoingRequested
// FIXME: This is actually incorrect; we shouldn't trust that the VM has already been updated
// just because the request completed. It takes longer for the reconcile cycle(s) to make the
// necessary changes.
// See the comments in (*State).UpdatedVM() for more info.
h.s.VM.SetUsing(resources)
h.s.NeonVM.OngoingRequested = nil
}
func (h NeonVMHandle) RequestFailed(now time.Time) {
h.s.NeonVM.OngoingRequested = nil
h.s.NeonVM.RequestFailedAt = &now
}
package testhelpers
import (
"errors"
"fmt"
"reflect"
"testing"
"github.com/samber/lo"
"github.com/stretchr/testify/assert"
)
type Assert struct {
t *testing.T
storedWarnings *[]string
waitingOnPreparedCall *bool
tinfo transactionInfo
}
type transactionInfo struct {
expectedWarnings []string
}
// NewAssert creates a new Assert object wrapping the provided *testing.T
func NewAssert(t *testing.T) Assert {
return Assert{
t: t,
storedWarnings: &[]string{},
waitingOnPreparedCall: lo.ToPtr(false),
tinfo: transactionInfo{
expectedWarnings: []string{},
},
}
}
// StoredWarnings returns a reference to the warnings that will be checked, intended to be used with
// the InitialStateOpt constructor WithStoredWarnings
func (a Assert) StoredWarnings() *[]string {
return a.storedWarnings
}
// WithWarnings returns an Assert that expects the given warnings to be emitted on each operation
func (a Assert) WithWarnings(warnings ...string) Assert {
a.tinfo.expectedWarnings = warnings
return a
}
// Do calls the function with the provided arguments, checking that no unexpected warnings were
// generated
//
// This is only valid for functions that return nothing.
func (a Assert) Do(f any, args ...any) {
a.Call(f, args...).Equals( /* empty args list means no returns */ )
}
// NoError calls the function with the provided arguments, checking that the error it returns is
// nil, and that no unexpected warnings were generated.
func (a Assert) NoError(f any, args ...any) {
a.Call(f, args...).Equals(nil)
}
// Call sets up a prepared function call, which will not be executed until one of its methods is
// actually called, which will perform all the relevant checks.
//
// Variadic functions are not supported.
func (a Assert) Call(f any, args ...any) PreparedFunctionCall {
if *a.waitingOnPreparedCall {
panic(errors.New("previous Call() constructed but not executed (must use `Do()`, `NoError()`, or `Call().Equals()`)"))
}
fv := reflect.ValueOf(f)
fTy := fv.Type()
if fTy.Kind() != reflect.Func {
panic(errors.New("f must be a function"))
} else if fTy.IsVariadic() {
panic(errors.New("f is variadic"))
}
var argValues []reflect.Value
for _, arg := range args {
argValues = append(argValues, reflect.ValueOf(arg))
}
*a.waitingOnPreparedCall = true
return PreparedFunctionCall{a: a, f: fv, args: argValues}
}
// PreparedFunctionCall is a function call that has been set up by (Assert).Call() but not executed
type PreparedFunctionCall struct {
a Assert
f reflect.Value
args []reflect.Value
}
// Equals calls the prepared function, checking that all the return values are equal to what's
// expected, and that no unexpected warnings were generated.
func (f PreparedFunctionCall) Equals(expected ...any) {
*f.a.waitingOnPreparedCall = false
fTy := f.f.Type()
numOut := fTy.NumOut()
if len(expected) != numOut {
panic(fmt.Errorf(
"Mismatched number of out parameters from function: func has %d but expected len is %d",
numOut,
len(expected),
))
}
type unknownInterface any
var actualReturnTypes []reflect.Type
var expectedReturnTypes []reflect.Type
for i := 0; i < numOut; i += 1 {
actual := fTy.Out(i)
actualReturnTypes = append(actualReturnTypes, actual)
// Can't call reflect.Value.Type on nil, so if we're given a nil value, we have to be a
// little more permissive.
var expectedTy reflect.Type
if expected[i] != nil {
expectedTy = reflect.TypeOf(expected[i])
} else if actual.Kind() == reflect.Interface {
// well, the actual value can be a nil interface too, so it's probably fine
expectedTy = actual
} else {
// but... if the actual value isn't an interface, there's a problem
expectedTy = reflect.TypeOf((*unknownInterface)(nil)).Elem()
}
expectedReturnTypes = append(expectedReturnTypes, expectedTy)
}
if !reflect.DeepEqual(expectedReturnTypes, actualReturnTypes) {
panic(fmt.Errorf(
"provided return types not equal to the function's: function has %v, but expected has %v",
actualReturnTypes,
expectedReturnTypes,
))
}
returnValues := f.f.Call(f.args)
for i := range returnValues {
assert.Equal(f.a.t, expected[i], returnValues[i].Interface())
}
assert.Equal(f.a.t, f.a.tinfo.expectedWarnings, *f.a.storedWarnings)
if f.a.t.Failed() {
f.a.t.FailNow()
}
*f.a.storedWarnings = []string{}
}
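// A rough usage sketch (the function names here are hypothetical stand-ins for whatever the test
// is exercising):
//
//	a := NewAssert(t)
//	a.Do(doSomething, arg1, arg2)            // function with no return values
//	a.NoError(doSomethingFallible, arg)      // single error return, expected to be nil
//	a.WithWarnings("some warning").Call(compute, arg).Equals(expectedValue, nil)
//
// After each executed call, the warnings accumulated in *StoredWarnings() are compared against the
// expected set and then cleared.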
package testhelpers
import (
"fmt"
"testing"
"time"
"github.com/stretchr/testify/require"
)
// FakeClock is a small facility that makes it easy to operate on durations relative to a fixed
// start time, rather than on absolute times.
type FakeClock struct {
t *testing.T
base time.Time
now time.Time
}
// NewFakeClock creates a new fake clock, with the initial time set to an unspecified, round number.
func NewFakeClock(t *testing.T) *FakeClock {
base, err := time.Parse(time.RFC3339, "2000-01-01T00:00:00Z") // a nice round number, to make things easier
if err != nil {
panic(err)
}
return &FakeClock{t: t, base: base, now: base}
}
// Now returns the current time of the clock
func (c *FakeClock) Now() time.Time {
return c.now
}
// Elapsed returns the total time added (via Inc) since the clock was started
func (c *FakeClock) Elapsed() Elapsed {
return Elapsed{c.t, c.now.Sub(c.base)}
}
// Inc adds duration to the current time of the clock
func (c *FakeClock) Inc(duration time.Duration) Elapsed {
if duration < 0 {
panic(fmt.Errorf("(*FakeClock).Inc() called with negative duration %s", duration))
}
c.now = c.now.Add(duration)
return c.Elapsed()
}
type Elapsed struct {
t *testing.T
time.Duration
}
func (e Elapsed) AssertEquals(expected time.Duration) {
require.Equal(e.t, expected, e.Duration)
}
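// A minimal usage sketch: the clock only moves when Inc is called, so tests can assert on exactly
// how much (fake) time has passed.
//
//	clock := NewFakeClock(t)
//	clock.Inc(500 * time.Millisecond)
//	clock.Inc(500 * time.Millisecond).AssertEquals(time.Second)
//	now := clock.Now() // the base time plus 1s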
package testhelpers
import (
"fmt"
"testing"
"go.uber.org/zap"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/agent/core"
"github.com/neondatabase/autoscaling/pkg/api"
)
type InitialVmInfoConfig struct {
ComputeUnit api.Resources
MemorySlotSize api.Bytes
MinCU uint16
MaxCU uint16
}
type InitialStateConfig struct {
VM InitialVmInfoConfig
Core core.Config
}
type InitialStateOpt interface {
modifyStateConfig(*core.Config)
}
type VmInfoOpt interface {
InitialStateOpt
modifyVmInfoConfig(*InitialVmInfoConfig)
modifyVmInfoWithConfig(InitialVmInfoConfig, *api.VmInfo)
}
func CreateInitialState(config InitialStateConfig, opts ...InitialStateOpt) *core.State {
vmOpts := []VmInfoOpt{}
for _, o := range opts {
if vo, ok := o.(VmInfoOpt); ok {
vmOpts = append(vmOpts, vo)
}
}
vm := CreateVmInfo(config.VM, vmOpts...)
for _, o := range opts {
o.modifyStateConfig(&config.Core)
}
return core.NewState(vm, config.Core)
}
func CreateVmInfo(config InitialVmInfoConfig, opts ...VmInfoOpt) api.VmInfo {
for _, o := range opts {
o.modifyVmInfoConfig(&config)
}
if config.ComputeUnit.Mem%config.MemorySlotSize != 0 {
panic(fmt.Errorf(
"compute unit is not divisible by memory slot size: %v is not divisible by %v",
config.ComputeUnit.Mem,
config.MemorySlotSize,
))
}
vm := api.VmInfo{
Name: "test",
Namespace: "test",
Cpu: api.VmCpuInfo{
Min: vmv1.MilliCPU(config.MinCU) * config.ComputeUnit.VCPU,
Use: vmv1.MilliCPU(config.MinCU) * config.ComputeUnit.VCPU,
Max: vmv1.MilliCPU(config.MaxCU) * config.ComputeUnit.VCPU,
},
Mem: api.VmMemInfo{
SlotSize: config.MemorySlotSize,
Min: config.MinCU * uint16(config.ComputeUnit.Mem/config.MemorySlotSize),
Use: config.MinCU * uint16(config.ComputeUnit.Mem/config.MemorySlotSize),
Max: config.MaxCU * uint16(config.ComputeUnit.Mem/config.MemorySlotSize),
},
Config: api.VmConfig{
AutoMigrationEnabled: false,
AlwaysMigrate: false,
ScalingConfig: nil,
ScalingEnabled: true,
},
CurrentRevision: nil,
}
for _, o := range opts {
o.modifyVmInfoWithConfig(config, &vm)
}
return vm
}
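// For a concrete (hypothetical) configuration with ComputeUnit = {VCPU: 250m, Mem: 1Gi},
// MemorySlotSize = 1Gi, MinCU = 1, and MaxCU = 4, the resulting VmInfo has
// Cpu{Min: 250m, Use: 250m, Max: 1000m} and Mem{SlotSize: 1Gi, Min: 1, Use: 1, Max: 4};
// i.e. CPU bounds are absolute milli-CPU values, while memory Min/Use/Max are counted in slots.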
type (
coreConfigModifier func(*core.Config)
vmInfoConfigModifier func(*InitialVmInfoConfig)
vmInfoModifier func(InitialVmInfoConfig, *api.VmInfo)
)
var (
_ VmInfoOpt = vmInfoConfigModifier(nil)
_ VmInfoOpt = vmInfoModifier(nil)
)
func (m coreConfigModifier) modifyStateConfig(c *core.Config) { (func(*core.Config))(m)(c) }
func (m vmInfoConfigModifier) modifyStateConfig(*core.Config) {}
func (m vmInfoModifier) modifyStateConfig(*core.Config) {}
func (m vmInfoModifier) modifyVmInfoConfig(*InitialVmInfoConfig) {}
func (m vmInfoConfigModifier) modifyVmInfoConfig(c *InitialVmInfoConfig) {
(func(*InitialVmInfoConfig))(m)(c)
}
func (m vmInfoConfigModifier) modifyVmInfoWithConfig(InitialVmInfoConfig, *api.VmInfo) {}
func (m vmInfoModifier) modifyVmInfoWithConfig(c InitialVmInfoConfig, vm *api.VmInfo) {
(func(InitialVmInfoConfig, *api.VmInfo))(m)(c, vm)
}
func WithConfigSetting(f func(*core.Config)) InitialStateOpt {
return coreConfigModifier(f)
}
func WithStoredWarnings(warnings *[]string) InitialStateOpt {
return WithConfigSetting(func(c *core.Config) {
warn := c.Log.Warn
c.Log.Warn = func(msg string, fields ...zap.Field) {
*warnings = append(*warnings, msg)
if warn != nil {
warn(msg, fields...)
}
}
})
}
func WithTestingLogfWarnings(t *testing.T) InitialStateOpt {
return WithConfigSetting(func(c *core.Config) {
warn := c.Log.Warn
c.Log.Warn = func(msg string, fields ...zap.Field) {
t.Log(msg)
if warn != nil {
warn(msg, fields...)
}
}
})
}
func WithMinMaxCU(minCU, maxCU uint16) VmInfoOpt {
return vmInfoConfigModifier(func(c *InitialVmInfoConfig) {
c.MinCU = minCU
c.MaxCU = maxCU
})
}
func WithCurrentCU(cu uint16) VmInfoOpt {
return vmInfoModifier(func(c InitialVmInfoConfig, vm *api.VmInfo) {
vm.SetUsing(c.ComputeUnit.Mul(cu))
})
}
func WithCurrentRevision(rev vmv1.RevisionWithTime) VmInfoOpt {
return vmInfoModifier(func(c InitialVmInfoConfig, vm *api.VmInfo) {
vm.CurrentRevision = &rev
})
}
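// A small usage sketch combining the options above (values are hypothetical; 'a' is an Assert from
// NewAssert(t) and the config is whatever InitialStateConfig the test defines):
//
//	state := CreateInitialState(
//		defaultInitialStateConfig,
//		WithStoredWarnings(a.StoredWarnings()),
//		WithMinMaxCU(1, 4),
//		WithCurrentCU(2),
//	)
//
// VmInfoOpt options (like WithMinMaxCU and WithCurrentCU) also apply when building a VmInfo
// directly via CreateVmInfo; plain InitialStateOpt options only affect the core.Config.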
package testhelpers
import (
"time"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
type ExpectedRevision struct {
vmv1.Revision
Now func() time.Time
}
func NewExpectedRevision(now func() time.Time) *ExpectedRevision {
return &ExpectedRevision{
Now: now,
Revision: vmv1.ZeroRevision,
}
}
func (e *ExpectedRevision) WithTime() vmv1.RevisionWithTime {
return e.Revision.WithTime(e.Now())
}
type NilRevisionSource struct{}
func (c *NilRevisionSource) Next(_ time.Time, _ vmv1.Flag) vmv1.Revision {
return vmv1.Revision{
Value: 0,
Flags: 0,
}
}
func (c *NilRevisionSource) Observe(_ time.Time, _ vmv1.Revision) error { return nil }
package core
// Working set size estimation
// For more, see: https://www.notion.so/neondatabase/874ef1cc942a4e6592434dbe9e609350
import (
"fmt"
)
type WssEstimatorConfig struct {
// MaxAllowedIncreaseFactor is the maximum tolerable increase in slope between windows.
// If the slope increases by more than this factor, we will cut off the working set size as the
// border between the two windows.
MaxAllowedIncreaseFactor float64
// InitialOffset is the index of the minimum working set size we must consider.
//
// In practice, this is taken from the scaling config's LFCMinWaitBeforeDownscaleMinutes, with
// the expectation that datapoints are all one minute apart, starting at 1m. So a value of 15m
// translates to an InitialOffset of 14 (-1 because indexes start at zero, but the first
// datapoint is 1m).
InitialOffset int
// WindowSize sets the offset for datapoints used in the calculation of the slope before & after
// a point. For window size W, we calculate the slope at point P as value[P]-value[P-(W-1)].
// This value must be >= 2.
//
// In practice, this value is taken from the scaling config's LFCWindowSizeMinutes, with the
// expectation that datapoints are all one minute apart. So, a value of 5 minutes translates to
// a WindowSize of 5.
WindowSize int
}
// EstimateTrueWorkingSetSize returns an estimate of the "true" current working set size, given a
// series of datapoints for the observed working set size over increasing time intervals.
//
// In practice, the 'series' is e.g., values of 'neon.lfc_approximate_working_set_size_seconds(d)'
// for equidistant values of 'd' from 1 minute to 60 minutes.
//
// This function panics if:
// * cfg.WindowSize < 2
// * cfg.InitialOffset < cfg.WindowSize - 1
func EstimateTrueWorkingSetSize(
series []float64,
cfg WssEstimatorConfig,
) float64 {
if cfg.WindowSize < 2 {
panic(fmt.Errorf("cfg.WindowSize must be >= 2 (got %v)", cfg.WindowSize))
} else if cfg.InitialOffset < cfg.WindowSize-1 {
panic(fmt.Errorf("cfg.InitialOffset must be >= cfg.WindowSize - 1 (got %v < %v - 1)", cfg.InitialOffset, cfg.WindowSize))
}
// For a window size of e.g. 5 points, we're looking back from series[t] to series[t-4], because
// series[t] is already included. (and similarly for looking forward to series[t+4]).
// 'w' is a shorthand for that -1 to make the code in the loop below cleaner.
w := cfg.WindowSize - 1
for t := cfg.InitialOffset; t < len(series)-w; t += 1 {
// In theory the HLL estimator will guarantee that - at any instant - increasing the
// duration for the working set will not decrease the value.
// However in practice, the individual values are not calculated at the same time, so we
// must still account for the possibility that series[t] < series[t-w], or similarly for
// series[t+w] and series[t].
// Hence, max(0.0, ...)
d0 := max(0.0, series[t]-series[t-w])
d1 := max(0.0, series[t+w]-series[t])
if d1 > d0*cfg.MaxAllowedIncreaseFactor {
return series[t]
}
}
return series[len(series)-1]
}
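// A small worked example with hypothetical datapoints: take
// cfg = WssEstimatorConfig{MaxAllowedIncreaseFactor: 2.0, InitialOffset: 1, WindowSize: 2}
// (so w = 1) and series = []float64{10, 11, 12, 20, 30}. At t=1 the slopes before/after are 1 and
// 1, which is fine; at t=2 they are 1 and 8, and 8 > 1*2.0, so we cut off and return
// series[2] = 12 as the estimated working set size.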
// ProjectNextHighest looks at the rate of change between points in 'series', returning the maximum
// value if any of these slopes were to continue for 'projectLen' additional datapoints.
//
// For example, given the series '0, 1, 3, 4, 5' and a projectLen of 3, ProjectNextHighest will
// return 9, because the increase from 1 → 3 would reach 9 if it continued for another 3
// datapoints (→ 5 → 7 → 9).
//
// Internally, ProjectNextHighest is used to allow preemptive scale-up when we can see that the
// observed working set size is increasing, but we don't know how big it'll get.
// In short, this function helps answer: "How much should we scale-up to accommodate expected
// increases in demand?".
func ProjectNextHighest(series []float64, projectLen float64) float64 {
if len(series) < 2 {
panic(fmt.Errorf("Cannot ProjectNextHighest with series of length %d (must be >= 2)", len(series)))
}
highest := series[0]
for i := 1; i < len(series); i += 1 {
x0 := series[i-1]
x1 := max(x0, series[i]) // ignore decreases
predicted := x1 + (x1-x0)*projectLen
highest = max(highest, predicted)
}
return highest
}
package agent
// The Dispatcher is our interface with the monitor. We interact via a websocket
// connection through a simple RPC-style protocol.
import (
"context"
"encoding/json"
"errors"
"fmt"
"sync"
"sync/atomic"
"time"
"go.uber.org/zap"
"nhooyr.io/websocket"
"nhooyr.io/websocket/wsjson"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
const (
MinMonitorProtocolVersion api.MonitorProtoVersion = api.MonitorProtoV1_0
MaxMonitorProtocolVersion api.MonitorProtoVersion = api.MonitorProtoV1_0
)
// This struct represents the result of a dispatcher.Call. Because the SignalSender
// passed in can only be generic over one type, we have this mock enum. Only
// one field should ever be non-nil, and it should always be clear which field
// is readable. For example, the caller of dispatcher.Call with an api.HealthCheck
// message should only read the HealthCheck field.
type MonitorResult struct {
Result *api.DownscaleResult
Confirmation *api.UpscaleConfirmation
HealthCheck *api.HealthCheck
}
// The Dispatcher is the main object managing the websocket connection to the
// monitor. For more information on the protocol, see pkg/api/types.go
type Dispatcher struct {
// The underlying connection we are managing
conn *websocket.Conn
// When someone sends a message, the dispatcher will attach a transaction id
// to it so that it knows when a response is back. When it receives a message
// with the same transaction id, it knows that that is the response to the original
// message and will send it down the SignalSender so the original sender can use it.
waiters map[uint64]util.SignalSender[waiterResult]
// lock guards mutating the waiters, exitError, and (closing) exitSignal fields.
// conn and lastTransactionID are each thread-safe on their own.
// runner, exit, and protoVersion are never modified.
lock sync.Mutex
// The runner that this dispatcher is part of
runner *Runner
exit func(status websocket.StatusCode, err error, transformErr func(error) error)
exitError error
exitSignal chan struct{}
// lastTransactionID is the last transaction id. When we need a new one
// we simply bump it and take the new number.
//
// In order to prevent collisions between the IDs generated here vs by
// the monitor, we only generate even IDs, and the monitor only generates
// odd ones. So generating a new value is done by adding 2.
lastTransactionID atomic.Uint64
protoVersion api.MonitorProtoVersion
}
type waiterResult struct {
err error
res *MonitorResult
}
// Create a new Dispatcher, establishing a connection with the vm-monitor and setting up all the
// background threads to manage the connection.
func NewDispatcher(
ctx context.Context,
logger *zap.Logger,
addr string,
runner *Runner,
sendUpscaleRequested func(request api.MoreResources, withLock func()),
) (_finalDispatcher *Dispatcher, _ error) {
// Create a new root-level context for this Dispatcher so that we can cancel if need be
ctx, cancelRootContext := context.WithCancel(ctx)
defer func() {
// cancel on failure or panic
if _finalDispatcher == nil {
cancelRootContext()
}
}()
connectTimeout := time.Second * time.Duration(runner.global.config.Monitor.ConnectionTimeoutSeconds)
conn, protoVersion, err := connectToMonitor(ctx, logger, addr, connectTimeout)
if err != nil {
return nil, err
}
disp := &Dispatcher{
conn: conn,
waiters: make(map[uint64]util.SignalSender[waiterResult]),
runner: runner,
lock: sync.Mutex{},
exit: nil, // set below
exitError: nil,
exitSignal: make(chan struct{}),
lastTransactionID: atomic.Uint64{}, // Note: initialized to 0, so it's even, as required.
protoVersion: *protoVersion,
}
disp.exit = func(status websocket.StatusCode, err error, transformErr func(error) error) {
disp.lock.Lock()
defer disp.lock.Unlock()
if disp.Exited() {
return
}
close(disp.exitSignal)
disp.exitError = err
cancelRootContext()
var closeReason string
if err != nil {
if transformErr != nil {
closeReason = transformErr(err).Error()
} else {
closeReason = err.Error()
}
} else {
closeReason = "normal exit"
}
// Run the actual websocket closing in a separate goroutine so we don't block while holding
// the lock. It can take up to 10s to close:
//
// > [Close] will write a WebSocket close frame with a timeout of 5s and then wait 5s for
// > the peer to send a close frame.
//
// This *potentially* runs us into race issues, but those are probably less bad to deal
// with, tbh.
go disp.conn.Close(status, closeReason)
}
go func() {
<-ctx.Done()
disp.exit(websocket.StatusNormalClosure, nil, nil)
}()
msgHandlerLogger := logger.Named("message-handler")
runner.spawnBackgroundWorker(ctx, msgHandlerLogger, "vm-monitor message handler", func(c context.Context, l *zap.Logger) {
disp.run(c, l, sendUpscaleRequested)
})
runner.spawnBackgroundWorker(ctx, logger.Named("health-checks"), "vm-monitor health checks", func(ctx context.Context, logger *zap.Logger) {
timeout := time.Second * time.Duration(runner.global.config.Monitor.ResponseTimeoutSeconds)
// FIXME: make this duration configurable
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
// track when sequential failures started, so that if they've been going on for longer than
// continuedFailureAbortTimeout (below), we can abort the connection
var firstSequentialFailure *time.Time
continuedFailureAbortTimeout := time.Second * time.Duration(runner.global.config.Monitor.MaxHealthCheckSequentialFailuresSeconds)
// if we don't have any errors, we will log only every 10th successful health check
const logEveryNth = 10
var okSequence int
var failSequence int
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
}
startTime := time.Now()
_, err := disp.Call(ctx, logger, timeout, "HealthCheck", api.HealthCheck{})
endTime := time.Now()
logFields := []zap.Field{
zap.Duration("duration", endTime.Sub(startTime)),
}
if okSequence != 0 {
logFields = append(logFields, zap.Int("okSequence", okSequence))
}
if failSequence != 0 {
logFields = append(logFields, zap.Int("failSequence", failSequence))
}
if err != nil {
// health check failed, reset the ok sequence count
okSequence = 0
failSequence++
logger.Error("vm-monitor health check failed", append(logFields, zap.Error(err))...)
if firstSequentialFailure == nil {
now := time.Now()
firstSequentialFailure = &now
} else if since := time.Since(*firstSequentialFailure); since > continuedFailureAbortTimeout {
err := fmt.Errorf("vm-monitor has been failing health checks for at least %s", continuedFailureAbortTimeout)
logger.Error(fmt.Sprintf("%s, triggering connection restart", err.Error()))
disp.exit(websocket.StatusInternalError, err, nil)
}
} else {
// health check was successful, so reset the sequential failures count
failSequence = 0
okSequence++
firstSequentialFailure = nil
if okSequence%logEveryNth == 0 {
logger.Info("vm-monitor health check successful", logFields...)
}
runner.status.update(runner.global, func(s podStatus) podStatus {
now := time.Now()
s.lastSuccessfulMonitorComm = &now
return s
})
}
}
})
return disp, nil
}
func connectToMonitor(
ctx context.Context,
logger *zap.Logger,
addr string,
timeout time.Duration,
) (_ *websocket.Conn, _ *api.MonitorProtoVersion, finalErr error) {
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
logger.Info("Connecting to vm-monitor via websocket", zap.String("addr", addr))
// We do not need to close the response body according to docs.
// Doing so causes memory bugs.
c, _, err := websocket.Dial(ctx, addr, nil) //nolint:bodyclose // see comment above
if err != nil {
return nil, nil, fmt.Errorf("error establishing websocket connection to %s: %w", addr, err)
}
// If we return early, make sure we close the websocket
var failureReason websocket.StatusCode
defer func() {
if finalErr != nil {
if failureReason == 0 {
failureReason = websocket.StatusInternalError
}
c.Close(failureReason, finalErr.Error())
}
}()
versionRange := api.VersionRange[api.MonitorProtoVersion]{
Min: MinMonitorProtocolVersion,
Max: MaxMonitorProtocolVersion,
}
logger.Info("Sending protocol version range", zap.Any("range", versionRange))
// Figure out protocol version
err = wsjson.Write(ctx, c, versionRange)
if err != nil {
return nil, nil, fmt.Errorf("error sending protocol range to monitor: %w", err)
}
logger.Info("Reading monitor version response")
var resp api.MonitorProtocolResponse
err = wsjson.Read(ctx, c, &resp)
if err != nil {
logger.Error("Failed to read monitor response", zap.Error(err))
failureReason = websocket.StatusProtocolError
return nil, nil, fmt.Errorf("Error reading vm-monitor response during protocol handshake: %w", err)
}
logger.Info("Got monitor version response", zap.Any("response", resp))
if resp.Error != nil {
logger.Error("Got error response from vm-monitor", zap.Any("response", resp), zap.String("error", *resp.Error))
failureReason = websocket.StatusProtocolError
return nil, nil, fmt.Errorf("Monitor returned error during protocol handshake: %q", *resp.Error)
}
logger.Info("negotiated protocol version with monitor", zap.Any("response", resp), zap.String("version", resp.Version.String()))
return c, &resp.Version, nil
}
// ExitSignal returns a channel that is closed when the Dispatcher is no longer running
func (disp *Dispatcher) ExitSignal() <-chan struct{} {
return disp.exitSignal
}
// Exited returns whether the Dispatcher is no longer running
//
// Exited will return true iff the channel returned by ExitSignal is closed.
func (disp *Dispatcher) Exited() bool {
select {
case <-disp.exitSignal:
return true
default:
return false
}
}
// ExitError returns the error that caused the dispatcher to exit, if there was one
func (disp *Dispatcher) ExitError() error {
disp.lock.Lock()
defer disp.lock.Unlock()
return disp.exitError
}
// temporary method to hopefully help with https://github.com/neondatabase/autoscaling/issues/503
func (disp *Dispatcher) lenWaiters() int {
disp.lock.Lock()
defer disp.lock.Unlock()
return len(disp.waiters)
}
// Send a message down the connection. Only call this method with types that
// SerializeMonitorMessage can handle.
func (disp *Dispatcher) send(ctx context.Context, logger *zap.Logger, id uint64, message any) error {
data, err := api.SerializeMonitorMessage(message, id)
if err != nil {
return fmt.Errorf("error serializing message: %w", err)
}
// wsjson.Write serializes whatever is passed in, and go serializes []byte
// by base64 encoding it, so use RawMessage to avoid serializing to []byte
// (done by SerializeMonitorMessage), and then base64 encoding again
raw := json.RawMessage(data)
logger.Debug("sending message to monitor", zap.ByteString("message", raw))
return wsjson.Write(ctx, disp.conn, &raw)
}
// registerWaiter registers a util.SignalSender to get notified when a
// message with the given id arrives.
func (disp *Dispatcher) registerWaiter(id uint64, sender util.SignalSender[waiterResult]) {
disp.lock.Lock()
defer disp.lock.Unlock()
disp.waiters[id] = sender
}
// unregisterWaiter deletes a preexisting waiter without interacting with it.
func (disp *Dispatcher) unregisterWaiter(id uint64) {
disp.lock.Lock()
defer disp.lock.Unlock()
delete(disp.waiters, id)
}
// Make a request to the monitor and wait for a response. The value passed as message must be a
// valid value to send to the monitor. See the docs for SerializeMonitorMessage for more.
//
// This function must NOT be called while holding disp.runner.lock.
func (disp *Dispatcher) Call(
ctx context.Context,
logger *zap.Logger,
timeout time.Duration,
messageType string,
message any,
) (*MonitorResult, error) {
id := disp.lastTransactionID.Add(2)
sender, receiver := util.NewSingleSignalPair[waiterResult]()
status := "internal error"
defer func() {
disp.runner.global.metrics.monitorRequestsOutbound.WithLabelValues(messageType, status).Inc()
}()
// register the waiter *before* sending, so that we avoid a potential race where we'd get a
// reply to the message before being ready to receive it.
disp.registerWaiter(id, sender)
err := disp.send(ctx, logger, id, message)
if err != nil {
logger.Error("failed to send message", zap.Any("message", message), zap.Error(err))
disp.unregisterWaiter(id)
status = "[error: failed to send]"
return nil, err
}
timer := time.NewTimer(timeout)
defer timer.Stop()
select {
case result := <-receiver.Recv():
if result.err != nil {
status = fmt.Sprintf("[error: %s]", result.err)
return nil, errors.New("monitor experienced an internal error")
}
status = "ok"
return result.res, nil
case <-timer.C:
err := fmt.Errorf("timed out waiting %v for monitor response", timeout)
disp.unregisterWaiter(id)
status = "[error: timed out waiting for response]"
return nil, err
}
}
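// As a usage sketch (mirroring the health-check loop in NewDispatcher):
//
//	result, err := disp.Call(ctx, logger, timeout, "HealthCheck", api.HealthCheck{})
//
// Note that the messageType string ("HealthCheck" here) is only used to label the
// outbound-request metrics; the actual wire-level type comes from serializing the message value
// itself.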
func extractField[T any](data map[string]interface{}, key string) (*T, error) {
field, ok := data[key]
if !ok {
return nil, fmt.Errorf("data had no key %q", key)
}
coerced, ok := field.(T)
if !ok {
return nil, fmt.Errorf("data[%q] was not of type %T", key, *new(T))
}
return &coerced, nil
}
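// For example, HandleMessage below uses this to pull the required fields out of the unstructured
// message:
//
//	typeStr, err := extractField[string](unstructured, "type")
//	f, err := extractField[float64](unstructured, "id") // JSON numbers decode as float64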
type messageHandlerFuncs struct {
handleUpscaleRequest func(api.UpscaleRequest)
handleUpscaleConfirmation func(api.UpscaleConfirmation, uint64) error
handleDownscaleResult func(api.DownscaleResult, uint64) error
handleMonitorError func(api.InternalError, uint64) error
handleHealthCheck func(api.HealthCheck, uint64) error
}
// Handle messages from the monitor. Make sure that all message types the monitor
// can send are included in the inner switch statement.
func (disp *Dispatcher) HandleMessage(
ctx context.Context,
logger *zap.Logger,
handlers messageHandlerFuncs,
) error {
// Deserialization has several steps:
// 1. Deserialize into an unstructured map[string]interface{}
// 2. Read the `type` field to know the type of the message
// 3. Then try to deserialize again, but into that specific type
// 4. All messages also come with an integer id under the key `id`
// wsjson.Read tries to deserialize the message. If we were to read to a
// []byte, it would base64 encode it as part of deserialization. json.RawMessage
// avoids this, and we manually deserialize later
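// For instance, a downscale reply from the monitor looks roughly like
// {"type": "DownscaleResult", "id": 6, ...} -- the "type" and "id" keys are what steps 2 and 4
// above rely on; the remaining fields depend on the message type.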
var message json.RawMessage
if err := wsjson.Read(ctx, disp.conn, &message); err != nil {
return fmt.Errorf("Error receiving message: %w", err)
}
logger.Debug("(pre-decoding): received a message", zap.ByteString("message", message))
var unstructured map[string]interface{}
if err := json.Unmarshal(message, &unstructured); err != nil {
return fmt.Errorf("Error deserializing message: %q", string(message))
}
typeStr, err := extractField[string](unstructured, "type")
if err != nil {
return fmt.Errorf("Error extracting 'type' field: %w", err)
}
// go thinks all json numbers are float64 so we first deserialize to that to
// avoid the type error, then cast to uint64
f, err := extractField[float64](unstructured, "id")
if err != nil {
return fmt.Errorf("Error extracting 'id field: %w", err)
}
id := uint64(*f)
var rootErr error
// now that we have the waiter's ID, make sure that if there's some failure past this point, we
// propagate that along to the monitor and remove it
defer func() {
// speculatively determine the root error, to send that along to the instance of Call
// waiting for it.
var err error
panicPayload := recover()
if panicPayload != nil {
err = errors.New("panicked")
} else if rootErr != nil {
err = rootErr
} else {
// if HandleMessage bailed without panicking or setting rootErr, but *also* without
// sending a message to the waiter, we should make sure that *something* gets sent, so
// the message doesn't just time out. But we don't have more information, so the error
// is still just "unknown".
err = errors.New("unknown")
}
disp.lock.Lock()
defer disp.lock.Unlock()
if sender, ok := disp.waiters[id]; ok {
sender.Send(waiterResult{err: err, res: nil})
delete(disp.waiters, id)
} else if rootErr != nil {
// we had some error while handling the message with this ID, and there wasn't a
// corresponding waiter. We should make note of this in the metrics:
status := fmt.Sprintf("[error: %s]", rootErr)
disp.runner.global.metrics.monitorRequestsInbound.WithLabelValues(*typeStr, status).Inc()
}
// resume panicking if we were before
if panicPayload != nil {
panic(panicPayload)
}
}()
// Helper function to handle common unmarshalling logic
unmarshal := func(value any) error {
if err := json.Unmarshal(message, value); err != nil {
rootErr = errors.New("Failed unmarshaling JSON")
err := fmt.Errorf("Error unmarshaling %s: %w", *typeStr, err)
logger.Error(rootErr.Error(), zap.Error(err))
// we're already on the error path anyways
_ = disp.send(ctx, logger, id, api.InvalidMessage{Error: err.Error()})
return err
}
return nil
}
switch *typeStr {
case "UpscaleRequest":
var req api.UpscaleRequest
if err := unmarshal(&req); err != nil {
return err
}
handlers.handleUpscaleRequest(req)
return nil
case "UpscaleConfirmation":
var confirmation api.UpscaleConfirmation
if err := unmarshal(&confirmation); err != nil {
return err
}
return handlers.handleUpscaleConfirmation(confirmation, id)
case "DownscaleResult":
var res api.DownscaleResult
if err := unmarshal(&res); err != nil {
return err
}
return handlers.handleDownscaleResult(res, id)
case "InternalError":
var monitorErr api.InternalError
if err := unmarshal(&monitorErr); err != nil {
return err
}
return handlers.handleMonitorError(monitorErr, id)
case "HealthCheck":
var healthCheck api.HealthCheck
if err := unmarshal(&healthCheck); err != nil {
return err
}
return handlers.handleHealthCheck(healthCheck, id)
case "InvalidMessage":
var warning api.InvalidMessage
if err := unmarshal(&warning); err != nil {
return err
}
logger.Warn("Received notification we sent an invalid message", zap.Any("warning", warning))
return nil
default:
rootErr = errors.New("Received unknown message type")
return disp.send(
ctx,
logger,
id,
api.InvalidMessage{Error: fmt.Sprintf("Received message of unknown type: %q", *typeStr)},
)
}
}
// Long running function that orchestrates all requests/responses.
func (disp *Dispatcher) run(ctx context.Context, logger *zap.Logger, upscaleRequester func(_ api.MoreResources, withLock func())) {
logger.Info("Starting message handler")
// Utility for logging + returning an error when we get a message with an
// id we're unaware of. Note: unknownMessage is not a message type.
handleUnknownMessage := func(messageType string, id uint64) error {
fmtString := "Received %s with id %d but id is unknown or already timed out waiting for a reply"
msg := fmt.Sprintf(fmtString, messageType, id)
logger.Warn(msg, zap.Uint64("id", id))
return disp.send(ctx, logger, id, api.InvalidMessage{Error: msg})
}
// Does not take a message id because we don't know when the agent will
// upscale. The monitor will get the result back as a NotifyUpscale message
// from us, with a new id.
handleUpscaleRequest := func(req api.UpscaleRequest) {
// TODO: it shouldn't be this function's responsibility to update metrics.
defer func() {
disp.runner.global.metrics.monitorRequestsInbound.WithLabelValues("UpscaleRequest", "ok").Inc()
}()
resourceReq := api.MoreResources{
Cpu: false,
Memory: true,
}
upscaleRequester(resourceReq, func() {
logger.Info("Updating requested upscale", zap.Any("requested", resourceReq))
})
}
handleUpscaleConfirmation := func(_ api.UpscaleConfirmation, id uint64) error {
disp.lock.Lock()
defer disp.lock.Unlock()
sender, ok := disp.waiters[id]
if ok {
logger.Info("vm-monitor confirmed upscale", zap.Uint64("id", id))
sender.Send(waiterResult{
err: nil,
res: &MonitorResult{
Confirmation: &api.UpscaleConfirmation{},
Result: nil,
HealthCheck: nil,
},
})
// Don't forget to delete the waiter
delete(disp.waiters, id)
return nil
} else {
return handleUnkownMessage("UpscaleConfirmation", id)
}
}
handleDownscaleResult := func(res api.DownscaleResult, id uint64) error {
disp.lock.Lock()
defer disp.lock.Unlock()
sender, ok := disp.waiters[id]
if ok {
logger.Info("vm-monitor returned downscale result", zap.Uint64("id", id), zap.Any("result", res))
sender.Send(waiterResult{
err: nil,
res: &MonitorResult{
Result: &res,
Confirmation: nil,
HealthCheck: nil,
},
})
// Don't forget to delete the waiter
delete(disp.waiters, id)
return nil
} else {
return handleUnkownMessage("DownscaleResult", id)
}
}
handleMonitorError := func(err api.InternalError, id uint64) error {
disp.lock.Lock()
defer disp.lock.Unlock()
sender, ok := disp.waiters[id]
if ok {
logger.Warn(
"vm-monitor experienced an internal error",
zap.Uint64("id", id),
zap.String("error", err.Error),
)
// Indicate to the receiver that an error occurred
sender.Send(waiterResult{
err: errors.New("vm-monitor internal error"),
res: nil,
})
// Don't forget to delete the waiter
delete(disp.waiters, id)
return nil
} else {
return handleUnkownMessage("MonitorError", id)
}
}
handleHealthCheck := func(confirmation api.HealthCheck, id uint64) error {
disp.lock.Lock()
defer disp.lock.Unlock()
sender, ok := disp.waiters[id]
if ok {
logger.Debug("vm-monitor responded to health check", zap.Uint64("id", id))
// Pass the successful health check response back to the waiting Call
sender.Send(waiterResult{
err: nil,
res: &MonitorResult{
HealthCheck: &api.HealthCheck{},
Result: nil,
Confirmation: nil,
},
})
// Don't forget to delete the waiter
delete(disp.waiters, id)
return nil
} else {
return handleUnkownMessage("HealthCheck", id)
}
}
handlers := messageHandlerFuncs{
handleUpscaleRequest: handleUpscaleRequest,
handleUpscaleConfirmation: handleUpscaleConfirmation,
handleDownscaleResult: handleDownscaleResult,
handleMonitorError: handleMonitorError,
handleHealthCheck: handleHealthCheck,
}
for {
err := disp.HandleMessage(ctx, logger, handlers)
if err != nil {
if ctx.Err() != nil {
// The context is already cancelled, so this error is most likely
// expected. For example, if the context is cancelled because the
// runner exited, we should expect to fail to read off the connection,
// which is closed by the server exit.
logger.Warn("Error handling message", zap.Error(err))
} else {
logger.Error("Error handling message, shutting down connection", zap.Error(err))
err = fmt.Errorf("Error handling message: %w", err)
// note: in theory we *could* be more descriptive with these statuses, but the only
// consumer of this API is the vm-monitor, and it doesn't check those.
//
// Also note: there's a limit on the size of the close frame we're allowed to send,
// so the actual error we use to exit with must be somewhat reduced in size. These
// "Error handling message" errors can get quite long, so we'll only use the root
// cause of the error for the message.
disp.exit(websocket.StatusInternalError, err, util.RootError)
}
return
}
}
}
package agent
// Utilities for dumping internal state
import (
"context"
"errors"
"fmt"
"net"
"net/http"
"runtime"
"slices"
"strings"
"sync"
"time"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/util"
)
type StateDump struct {
Stopped bool `json:"stopped"`
Pods []podStateDump `json:"pods"`
}
func (s *agentState) StartDumpStateServer(shutdownCtx context.Context, logger *zap.Logger, config *DumpStateConfig) error {
// Manually start the TCP listener so we can minimize errors in the background thread.
addr := net.TCPAddr{IP: net.IPv4zero, Port: int(config.Port)}
listener, err := net.ListenTCP("tcp", &addr)
if err != nil {
return fmt.Errorf("Error binding to %v", addr)
}
go func() {
mux := http.NewServeMux()
util.AddHandler(logger, mux, "/", http.MethodGet, "<empty>", func(ctx context.Context, logger *zap.Logger, body *struct{}) (*StateDump, int, error) {
timeout := time.Duration(config.TimeoutSeconds) * time.Second
startTime := time.Now()
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
state, err := s.DumpState(ctx, shutdownCtx.Err() != nil)
if err != nil {
if ctx.Err() != nil && errors.Is(ctx.Err(), context.DeadlineExceeded) {
totalDuration := time.Since(startTime)
return nil, 500, fmt.Errorf("timed out after %s while getting state", totalDuration)
} else {
// some other type of cancel; 400 is a little weird, but there isn't a great
// option here.
return nil, 400, fmt.Errorf("error while getting state: %w", err)
}
}
return state, 200, nil
})
// note: we don't shut down this server. It should be possible to continue fetching the
// internal state after shutdown has started.
server := &http.Server{Handler: mux}
if err := server.Serve(listener); err != nil {
logger.Error("dump-state server exited", zap.Error(err))
}
}()
return nil
}
func (s *agentState) DumpState(ctx context.Context, stopped bool) (*StateDump, error) {
// Copy the high-level state, then process it
podList, err := func() ([]*podState, error) {
if err := s.lock.TryLock(ctx); err != nil {
return nil, err
}
defer s.lock.Unlock()
list := make([]*podState, 0, len(s.pods))
for name := range s.pods {
list = append(list, s.pods[name])
}
return list, nil
}()
if err != nil {
return nil, err
}
state := StateDump{
Stopped: stopped,
Pods: make([]podStateDump, len(podList)),
}
wg := sync.WaitGroup{}
wg.Add(len(podList))
concurrencyLimit := runtime.NumCPU()
sema := make(chan struct{}, concurrencyLimit) // semaphore
for i, pod := range podList {
sema <- struct{}{} // enforce only 'concurrencyLimit' threads running at a time
go func() {
defer func() {
<-sema
wg.Done()
}()
state.Pods[i] = pod.dump(ctx)
}()
}
// note: pod.dump() respects the context, even with locking. When the context expires before we
// acquire a lock, there's still valuable information to return - it's worthwhile to wait for
// that to make it back to state.Pods when the context expires, instead of proactively aborting
// in *this* thread.
wg.Wait()
// Sort the pods by name, so that we produce a deterministic ordering
slices.SortFunc(state.Pods, func(a, b podStateDump) int {
if n := strings.Compare(a.PodName.Namespace, b.PodName.Namespace); n != 0 {
return n
}
return strings.Compare(a.PodName.Name, b.PodName.Name)
})
return &state, nil
}
package agent
import (
"context"
"fmt"
"github.com/tychoish/fun/pubsub"
"go.uber.org/zap"
"k8s.io/client-go/kubernetes"
vmclient "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
"github.com/neondatabase/autoscaling/pkg/agent/billing"
"github.com/neondatabase/autoscaling/pkg/agent/scalingevents"
"github.com/neondatabase/autoscaling/pkg/agent/schedwatch"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/taskgroup"
"github.com/neondatabase/autoscaling/pkg/util/watch"
)
type MainRunner struct {
EnvArgs EnvArgs
Config *Config
KubeClient *kubernetes.Clientset
VMClient *vmclient.Clientset
}
func (r MainRunner) Run(logger *zap.Logger, ctx context.Context) error {
vmEventQueue := pubsub.NewUnlimitedQueue[vmEvent]()
defer vmEventQueue.Close()
pushToQueue := func(ev vmEvent) {
if err := vmEventQueue.Add(ev); err != nil {
logger.Warn("Failed to add vmEvent to queue", zap.Object("event", ev), zap.Error(err))
}
}
globalMetrics, globalPromReg := makeGlobalMetrics()
perVMMetrics, vmPromReg := makePerVMMetrics()
watchMetrics := watch.NewMetrics("autoscaling_agent_watchers", globalPromReg)
logger.Info("Starting VM watcher")
vmWatchStore, err := startVMWatcher(ctx, logger, r.Config, r.VMClient, watchMetrics, perVMMetrics, r.EnvArgs.K8sNodeName, pushToQueue)
if err != nil {
return fmt.Errorf("Error starting VM watcher: %w", err)
}
defer vmWatchStore.Stop()
logger.Info("VM watcher started")
schedTracker, err := schedwatch.StartSchedulerWatcher(ctx, logger, r.KubeClient, watchMetrics, r.Config.Scheduler.SchedulerName)
if err != nil {
return fmt.Errorf("Starting scheduler watch server: %w", err)
}
defer schedTracker.Stop()
scalingEventsMetrics := scalingevents.NewPromMetrics(globalPromReg)
scalingReporter, err := scalingevents.NewReporter(ctx, logger, &r.Config.ScalingEvents, scalingEventsMetrics)
if err != nil {
return fmt.Errorf("Error creating scaling events reporter: %w", err)
}
globalState := r.newAgentState(
logger,
r.EnvArgs.K8sPodIP,
schedTracker,
scalingReporter,
globalMetrics,
perVMMetrics,
)
logger.Info("Starting billing metrics collector")
storeForNode := watch.NewIndexedStore(vmWatchStore, billing.NewVMNodeIndex(r.EnvArgs.K8sNodeName))
billingMetrics := billing.NewPromMetrics(globalPromReg)
promLogger := logger.Named("prometheus")
if err := util.StartPrometheusMetricsServer(ctx, promLogger.Named("global"), 9100, globalPromReg); err != nil {
return fmt.Errorf("Error starting prometheus metrics server: %w", err)
}
if err := util.StartPrometheusMetricsServer(ctx, promLogger.Named("per-vm"), 9101, vmPromReg); err != nil {
return fmt.Errorf("Error starting prometheus metrics server: %w", err)
}
if r.Config.DumpState != nil {
logger.Info("Starting 'dump state' server")
if err := globalState.StartDumpStateServer(ctx, logger.Named("dump-state"), r.Config.DumpState); err != nil {
return fmt.Errorf("Error starting dump state server: %w", err)
}
}
mc, err := billing.NewMetricsCollector(ctx, logger, &r.Config.Billing, billingMetrics)
if err != nil {
return fmt.Errorf("error creating billing metrics collector: %w", err)
}
tg := taskgroup.NewGroup(logger, taskgroup.WithParentContext(ctx))
tg.Go("scalingevents-run", func(logger *zap.Logger) error {
return scalingReporter.Run(tg.Ctx())
})
tg.Go("billing", func(logger *zap.Logger) error {
return mc.Run(tg.Ctx(), logger, storeForNode)
})
tg.Go("main-loop", func(logger *zap.Logger) error {
logger.Info("Entering main loop")
for {
event, err := vmEventQueue.Wait(ctx)
if err != nil {
if ctx.Err() != nil {
// treat context canceled as a "normal" exit (because it is)
return nil
}
logger.Error("vmEventQueue returned error", zap.Error(err))
return err
}
globalState.handleEvent(tg.Ctx(), logger, event)
}
})
return tg.Wait()
}
package agent
// Implementations of the interfaces used by & defined in pkg/agent/executor
//
// This file is essentially the bridge between 'runner.go' and 'executor/',
// connecting the latter to the actual implementations in the former.
import (
"context"
"fmt"
"go.uber.org/zap"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/agent/executor"
"github.com/neondatabase/autoscaling/pkg/api"
)
var (
_ executor.PluginInterface = (*execPluginInterface)(nil)
_ executor.NeonVMInterface = (*execNeonVMInterface)(nil)
_ executor.MonitorInterface = (*execMonitorInterface)(nil)
)
/////////////////////////////////////////////////////////////
// Scheduler Plugin -related interfaces and implementation //
/////////////////////////////////////////////////////////////
type execPluginInterface struct {
runner *Runner
}
func makePluginInterface(r *Runner) *execPluginInterface {
return &execPluginInterface{runner: r}
}
// scalingResponseType indicates type of scaling response from the scheduler plugin
type scalingResponseType string
const (
scalingResponseTypeDenied = "denied"
scalingResponseTypeApproved = "approved"
scalingResponseTypePartiallyApproved = "partiallyApproved"
scalingResponseTypeFailed = "failed"
)
// Request implements executor.PluginInterface
func (iface *execPluginInterface) Request(
ctx context.Context,
logger *zap.Logger,
lastPermit *api.Resources,
target api.Resources,
metrics *api.Metrics,
) (*api.PluginResponse, error) {
if lastPermit != nil {
iface.runner.recordResourceChange(*lastPermit, target, iface.runner.global.metrics.schedulerRequestedChange)
}
resp, err := iface.runner.DoSchedulerRequest(ctx, logger, target, lastPermit, metrics)
if err == nil && lastPermit != nil {
iface.runner.recordResourceChange(*lastPermit, resp.Permit, iface.runner.global.metrics.schedulerApprovedChange)
}
responseType := func() scalingResponseType {
if err != nil { // request is failed
return scalingResponseTypeFailed
}
if resp.Permit == target { // request is fully approved by the scheduler
return scalingResponseTypeApproved
}
if lastPermit != nil && *lastPermit != resp.Permit { // request is partially approved by the scheduler
return scalingResponseTypePartiallyApproved
}
return scalingResponseTypeDenied // scheduler denied the request
}()
// update VM metrics
switch responseType {
case scalingResponseTypePartiallyApproved:
iface.runner.global.metrics.scalingPartialApprovalsTotal.WithLabelValues(directionValueInc).Inc()
case scalingResponseTypeDenied:
iface.runner.global.metrics.scalingFullDeniesTotal.WithLabelValues(directionValueInc).Inc()
default:
}
iface.runner.status.update(iface.runner.global, func(ps podStatus) podStatus {
// update podStatus metrics on failures
switch responseType {
case scalingResponseTypeDenied, scalingResponseTypeFailed:
ps.failedSchedulerRequestCounter.Inc()
default:
}
return ps
})
return resp, err
}
/////////////////////////////////////////////////
// NeonVM-related interface and implementation //
/////////////////////////////////////////////////
type execNeonVMInterface struct {
runner *Runner
}
func makeNeonVMInterface(r *Runner) *execNeonVMInterface {
return &execNeonVMInterface{runner: r}
}
// Request implements executor.NeonVMInterface
func (iface *execNeonVMInterface) Request(
ctx context.Context,
logger *zap.Logger,
current, target api.Resources,
targetRevision vmv1.RevisionWithTime,
) error {
iface.runner.recordResourceChange(current, target, iface.runner.global.metrics.neonvmRequestedChange)
err := iface.runner.doNeonVMRequest(ctx, target, targetRevision)
if err != nil {
iface.runner.status.update(iface.runner.global, func(ps podStatus) podStatus {
ps.failedNeonVMRequestCounter.Inc()
return ps
})
return fmt.Errorf("Error making VM patch request: %w", err)
}
return nil
}
////////////////////////////////////////////////////
// Monitor-related interface and implementation //
////////////////////////////////////////////////////
type execMonitorInterface struct {
runner *Runner
core *executor.ExecutorCore
generation *executor.StoredGenerationNumber
}
func makeMonitorInterface(
r *Runner,
core *executor.ExecutorCore,
generation *executor.StoredGenerationNumber,
) *execMonitorInterface {
return &execMonitorInterface{runner: r, core: core, generation: generation}
}
func (iface *execMonitorInterface) CurrentGeneration() executor.GenerationNumber {
return iface.generation.Get()
}
// GetHandle implements executor.MonitorInterface, and MUST only be called while holding the
// executor's lock.
//
// The locking requirement is why we're able to get away with an "unsynchronized" read of the value
// in the runner. For more, see the documentation on Runner.monitor.
func (iface *execMonitorInterface) GetHandle() executor.MonitorHandle {
monitor := iface.runner.monitor
if monitor == nil /* || monitor.dispatcher.Exited() */ {
// NB: we can't check if dispatcher.Exited() because otherwise we might return nil when the
// executor is told to make a request, because Exited() is not synchronized with changes to
// the executor state.
return nil
}
return &execMonitorHandle{
runner: iface.runner,
monitor: monitor,
}
}
type execMonitorHandle struct {
runner *Runner
monitor *monitorInfo
}
func (h *execMonitorHandle) Generation() executor.GenerationNumber {
return h.monitor.generation
}
func (h *execMonitorHandle) Downscale(
ctx context.Context,
logger *zap.Logger,
current api.Resources,
target api.Resources,
) (*api.DownscaleResult, error) {
// Check validity of the message we're sending
if target.HasFieldGreaterThan(current) {
innerMsg := fmt.Errorf("%+v has field greater than %+v", target, current)
panic(fmt.Errorf("(*execMonitorHandle).Downscale() called with target greater than current: %w", innerMsg))
}
h.runner.recordResourceChange(current, target, h.runner.global.metrics.monitorRequestedChange)
result, err := doMonitorDownscale(ctx, logger, h.monitor.dispatcher, target)
if err == nil {
if result.Ok {
h.runner.recordResourceChange(current, target, h.runner.global.metrics.monitorApprovedChange)
}
} else {
h.runner.status.update(h.runner.global, func(ps podStatus) podStatus {
ps.failedMonitorRequestCounter.Inc()
h.runner.global.metrics.scalingFullDeniesTotal.WithLabelValues(directionValueDec).Inc()
return ps
})
}
return result, err
}
func (h *execMonitorHandle) Upscale(ctx context.Context, logger *zap.Logger, current, target api.Resources) error {
// Check validity of the message we're sending
if target.HasFieldLessThan(current) {
innerMsg := fmt.Errorf("%+v has field less than %+v", target, current)
panic(fmt.Errorf("(*execMonitorHandle).Upscale() called with target less than current: %w", innerMsg))
}
h.runner.recordResourceChange(current, target, h.runner.global.metrics.monitorRequestedChange)
err := doMonitorUpscale(ctx, logger, h.monitor.dispatcher, target)
if err == nil {
h.runner.recordResourceChange(current, target, h.runner.global.metrics.monitorApprovedChange)
} else {
h.runner.status.update(h.runner.global, func(ps podStatus) podStatus {
ps.failedMonitorRequestCounter.Inc()
return ps
})
}
return err
}
package executor
// Consumers of pkg/agent/core, implementing the "executors" for each type of action. These are
// wrapped up into a single ExecutorCore type, which exposes some methods for the various executors.
//
// The executors use various abstract interfaces for the scheduler plugin / NeonVM / vm-monitor, and
// are defined in exec_*.go. The implementations of those interfaces are defined in execbridge.go.
//
// Each of the methods to modify ExecutorCore take 'withLock' as a callback that runs while the lock
// is held. In general, this is used for logging, so that the log output strictly matches the
// ordering of the changes to the underlying core.State, which should help with debugging.
//
// For more, see pkg/agent/ARCHITECTURE.md.
import (
"sync"
"time"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/agent/core"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
type Config struct {
// OnNextActions is called each time the ExecutorCore calls (*core.State).NextActions() on the
// inner state object.
//
// In practice, this value is set to a callback that increments a metric.
OnNextActions func()
Core core.Config
}
type ExecutorCore struct {
mu sync.Mutex
stateLogger *zap.Logger
core *core.State
actions *timedActions
lastActionsID timedActionsID
onNextActions func()
updates *util.Broadcaster
}
type ClientSet struct {
Plugin PluginInterface
NeonVM NeonVMInterface
Monitor MonitorInterface
}
func NewExecutorCore(stateLogger *zap.Logger, vm api.VmInfo, config Config) *ExecutorCore {
return &ExecutorCore{
mu: sync.Mutex{},
stateLogger: stateLogger,
core: core.NewState(vm, config.Core),
actions: nil, // (*ExecutorCore).getActions() checks if this is nil
lastActionsID: -1,
onNextActions: config.OnNextActions,
updates: util.NewBroadcaster(),
}
}
// ExecutorCoreWithClients wraps ExecutorCore with the various client interfaces (scheduler
// plugin, NeonVM, and vm-monitor) that its executors use to actually make requests.
type ExecutorCoreWithClients struct {
*ExecutorCore
clients ClientSet
}
func (c *ExecutorCore) WithClients(clients ClientSet) ExecutorCoreWithClients {
return ExecutorCoreWithClients{
ExecutorCore: c,
clients: clients,
}
}
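// A hedged wiring sketch (mirroring how (*Runner).Run in pkg/agent/runner.go is assumed to use
// this package): construct the core, attach the clients, then run each executor loop:
//
//	core := NewExecutorCore(logger.Named("core"), vmInfo, config)
//	ecwc := core.WithClients(ClientSet{Plugin: pluginIface, NeonVM: neonvmIface, Monitor: monitorIface})
//	go ecwc.DoPluginRequests(ctx, logger)
//	go ecwc.DoNeonVMRequests(ctx, logger)
//	go ecwc.DoMonitorDownscales(ctx, logger)
//	go ecwc.DoMonitorUpscales(ctx, logger)
//	go ecwc.DoSleeper(ctx, logger)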
// timedActions stores the core.ActionSet in ExecutorCore alongside a unique ID
type timedActions struct {
// id stores a unique ID associated with the cached actions, so that we can use optimistic
// locking to make sure we're never taking an action that is not the *current* recommendation,
// because otherwise guaranteeing correctness of core.State is really difficult.
//
// id is exclusively used by (*ExecutorCore).updateIfActionsUnchanged().
id timedActionsID
actions core.ActionSet
}
type timedActionsID int64
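// The intended flow for the optimistic locking described above, as a sketch (the executor loops
// below, e.g. DoNeonVMRequests, follow this shape):
//
//	last := c.getActions() // snapshot the cached actions together with their id
//	if updated := c.updateIfActionsUnchanged(last, func(state *core.State) {
//		// only runs if no other update has invalidated 'last' in the meantime
//	}); !updated {
//		// the state changed since the snapshot; recompute the actions and retry
//	}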
// fetch the currently cached actions, or recalculate if they've since been invalidated
func (c *ExecutorCore) getActions() timedActions {
c.mu.Lock()
defer c.mu.Unlock()
if c.actions == nil {
id := c.lastActionsID + 1
c.onNextActions()
// NOTE: Even though we cache the actions generated using time.Now(), that's *generally* ok,
// because the cache is invalidated (c.actions is set to nil) on every call to update().
now := time.Now()
c.stateLogger.Debug("Recalculating ActionSet", zap.Time("now", now), zap.Any("state", c.core.Dump()))
c.actions = &timedActions{id: id, actions: c.core.NextActions(now)}
c.lastActionsID = id
c.stateLogger.Debug("New ActionSet", zap.Time("now", now), zap.Any("actions", c.actions.actions))
}
return *c.actions
}
func (c *ExecutorCore) update(with func(*core.State)) {
c.mu.Lock()
defer c.mu.Unlock()
c.updates.Broadcast()
c.actions = nil
with(c.core)
}
// updateIfActionsUnchanged is like update, but if the actions have been changed, then the function
// is not called and this returns false.
//
// Otherwise, if the actions are up-to-date, then this is equivalent to c.update(with), and returns true.
func (c *ExecutorCore) updateIfActionsUnchanged(actions timedActions, with func(*core.State)) (updated bool) {
c.mu.Lock()
defer c.mu.Unlock()
if actions.id != c.lastActionsID {
return false
}
c.updates.Broadcast()
c.actions = nil
with(c.core)
return true
}
// StateDump is an alias for core.StateDump; this may change in the future
type StateDump = core.StateDump
// StateDump copies and returns the current state inside the executor
func (c *ExecutorCore) StateDump() StateDump {
c.mu.Lock()
defer c.mu.Unlock()
return c.core.Dump()
}
// Updater returns a handle on the object used for making external changes to the ExecutorCore,
// beyond what's provided by the various client (ish) interfaces
func (c *ExecutorCore) Updater() ExecutorCoreUpdater {
return ExecutorCoreUpdater{c}
}
// ExecutorCoreUpdater provides a common interface for external changes to the ExecutorCore
type ExecutorCoreUpdater struct {
core *ExecutorCore
}
// UpdateSystemMetrics calls (*core.State).UpdateSystemMetrics() on the inner core.State and runs
// withLock while holding the lock.
func (c ExecutorCoreUpdater) UpdateSystemMetrics(metrics core.SystemMetrics, withLock func()) {
c.core.update(func(state *core.State) {
state.UpdateSystemMetrics(metrics)
withLock()
})
}
// UpdateLFCMetrics calls (*core.State).UpdateLFCMetrics() on the inner core.State and runs withLock
// while holding the lock.
func (c ExecutorCoreUpdater) UpdateLFCMetrics(metrics core.LFCMetrics, withLock func()) {
c.core.update(func(state *core.State) {
state.UpdateLFCMetrics(metrics)
withLock()
})
}
// UpdatedVM calls (*core.State).UpdatedVM() on the inner core.State and runs withLock while
// holding the lock.
func (c ExecutorCoreUpdater) UpdatedVM(vm api.VmInfo, withLock func()) {
c.core.update(func(state *core.State) {
state.UpdatedVM(vm)
withLock()
})
}
// ResetMonitor calls (*core.State).Monitor().Reset() on the inner core.State and runs withLock
// while holding the lock.
func (c ExecutorCoreUpdater) ResetMonitor(withLock func()) {
c.core.update(func(state *core.State) {
state.Monitor().Reset()
withLock()
})
}
// UpscaleRequested calls (*core.State).Monitor().UpscaleRequested(...) on the inner core.State and
// runs withLock while holding the lock.
func (c ExecutorCoreUpdater) UpscaleRequested(resources api.MoreResources, withLock func()) {
c.core.update(func(state *core.State) {
state.Monitor().UpscaleRequested(time.Now(), resources)
withLock()
})
}
// MonitorActive calls (*core.State).Monitor().Active(...) on the inner core.State and runs withLock
// while holding the lock.
func (c ExecutorCoreUpdater) MonitorActive(active bool, withLock func()) {
c.core.update(func(state *core.State) {
state.Monitor().Active(active)
withLock()
})
}
package executor
import (
"context"
"errors"
"time"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/agent/core"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
type MonitorInterface interface {
CurrentGeneration() GenerationNumber
// GetHandle fetches a stable handle for the current monitor, or nil if there is not one.
// This method MUST NOT be called unless holding the executor's lock.
GetHandle() MonitorHandle
}
type MonitorHandle interface {
Generation() GenerationNumber
Downscale(_ context.Context, _ *zap.Logger, current, target api.Resources) (*api.DownscaleResult, error)
Upscale(_ context.Context, _ *zap.Logger, current, target api.Resources) error
}
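// A hedged sketch of the intended calling pattern (condensed from the loops below): the handle
// is fetched while holding the executor's lock, the request itself runs outside the lock, and
// generations are compared afterwards to decide whether the result still applies:
//
//	var handle MonitorHandle
//	c.updateIfActionsUnchanged(last, func(state *core.State) {
//		handle = c.clients.Monitor.GetHandle() // called under the executor's lock
//	})
//	result, err := handle.Downscale(ctx, logger, action.Current, action.Target)
//	stillCurrent := handle.Generation() == c.clients.Monitor.CurrentGeneration()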
func (c *ExecutorCoreWithClients) DoMonitorDownscales(ctx context.Context, logger *zap.Logger) {
var (
updates util.BroadcastReceiver = c.updates.NewReceiver()
ifaceLogger *zap.Logger = logger.Named("client")
)
// must be called while holding c's lock
generationUnchanged := func(since MonitorHandle) bool {
return since.Generation() == c.clients.Monitor.CurrentGeneration()
}
for {
// Wait until the state's changed, or we're done.
select {
case <-ctx.Done():
return
case <-updates.Wait():
updates.Awake()
}
last := c.getActions()
if last.actions.MonitorDownscale == nil {
continue // nothing to do; wait until the state changes.
}
var startTime time.Time
var monitorIface MonitorHandle
action := *last.actions.MonitorDownscale
if updated := c.updateIfActionsUnchanged(last, func(state *core.State) {
logger.Info("Starting vm-monitor downscale request", zap.Object("action", action))
startTime = time.Now()
monitorIface = c.clients.Monitor.GetHandle()
state.Monitor().StartingDownscaleRequest(startTime, action.Target)
if monitorIface == nil {
panic(errors.New(
"core.State asked for vm-monitor downscale request, but Monitor.GetHandle() is nil, so it should be disabled",
))
}
}); !updated {
continue // state has changed, retry.
}
result, err := monitorIface.Downscale(ctx, ifaceLogger, action.Current, action.Target)
endTime := time.Now()
c.update(func(state *core.State) {
unchanged := generationUnchanged(monitorIface)
logFields := []zap.Field{
zap.Object("action", action),
zap.Duration("duration", endTime.Sub(startTime)),
zap.Bool("unchanged", unchanged),
}
warnSkipBecauseChanged := func() {
logger.Warn("Skipping state update after vm-monitor downscale request because MonitorHandle changed")
}
if err != nil {
logger.Error("vm-monitor downscale request failed", append(logFields, zap.Error(err))...)
if unchanged {
state.Monitor().DownscaleRequestFailed(endTime)
} else {
warnSkipBecauseChanged()
}
return
}
logFields = append(logFields, zap.Any("response", result))
if !result.Ok {
logger.Warn("vm-monitor denied downscale", logFields...)
if unchanged {
state.Monitor().DownscaleRequestDenied(endTime, action.TargetRevision)
} else {
warnSkipBecauseChanged()
}
} else {
logger.Info("vm-monitor approved downscale", logFields...)
if unchanged {
state.Monitor().DownscaleRequestAllowed(endTime, action.TargetRevision)
} else {
warnSkipBecauseChanged()
}
}
})
}
}
func (c *ExecutorCoreWithClients) DoMonitorUpscales(ctx context.Context, logger *zap.Logger) {
var (
updates util.BroadcastReceiver = c.updates.NewReceiver()
ifaceLogger *zap.Logger = logger.Named("client")
)
// must be called while holding c's lock
generationUnchanged := func(since MonitorHandle) bool {
return since.Generation() == c.clients.Monitor.CurrentGeneration()
}
for {
// Wait until the state's changed, or we're done.
select {
case <-ctx.Done():
return
case <-updates.Wait():
updates.Awake()
}
last := c.getActions()
if last.actions.MonitorUpscale == nil {
continue // nothing to do; wait until the state changes.
}
var startTime time.Time
var monitorIface MonitorHandle
action := *last.actions.MonitorUpscale
if updated := c.updateIfActionsUnchanged(last, func(state *core.State) {
logger.Info("Starting vm-monitor upscale request", zap.Object("action", action))
startTime = time.Now()
monitorIface = c.clients.Monitor.GetHandle()
state.Monitor().StartingUpscaleRequest(startTime, action.Target)
if monitorIface == nil {
panic(errors.New(
"core.State asked for vm-monitor upscale request, but Monitor.GetHandle() is nil, so it should be disabled",
))
}
}); !updated {
continue // state has changed, retry.
}
err := monitorIface.Upscale(ctx, ifaceLogger, action.Current, action.Target)
endTime := time.Now()
c.update(func(state *core.State) {
unchanged := generationUnchanged(monitorIface)
logFields := []zap.Field{
zap.Object("action", action),
zap.Duration("duration", endTime.Sub(startTime)),
zap.Bool("unchanged", unchanged),
}
warnSkipBecauseChanged := func() {
logger.Warn("Skipping state update after vm-monitor upscale request because MonitorHandle changed")
}
if err != nil {
logger.Error("vm-monitor upscale request failed", append(logFields, zap.Error(err))...)
if unchanged {
state.Monitor().UpscaleRequestFailed(endTime)
} else {
warnSkipBecauseChanged()
}
return
}
logger.Info("vm-monitor upscale request successful", logFields...)
if unchanged {
state.Monitor().UpscaleRequestSuccessful(endTime)
} else {
warnSkipBecauseChanged()
}
})
}
}
package executor
import (
"context"
"time"
"go.uber.org/zap"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/agent/core"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
type NeonVMInterface interface {
Request(
_ context.Context,
_ *zap.Logger,
current, target api.Resources,
targetRevision vmv1.RevisionWithTime,
) error
}
func (c *ExecutorCoreWithClients) DoNeonVMRequests(ctx context.Context, logger *zap.Logger) {
var (
updates util.BroadcastReceiver = c.updates.NewReceiver()
ifaceLogger *zap.Logger = logger.Named("client")
)
for {
// Wait until the state's changed, or we're done.
select {
case <-ctx.Done():
return
case <-updates.Wait():
updates.Awake()
}
last := c.getActions()
if last.actions.NeonVMRequest == nil {
continue // nothing to do; wait until the state changes.
}
var startTime time.Time
action := *last.actions.NeonVMRequest
if updated := c.updateIfActionsUnchanged(last, func(state *core.State) {
logger.Info("Starting NeonVM request", zap.Object("action", action))
startTime = time.Now()
state.NeonVM().StartingRequest(startTime, action.Target)
}); !updated {
continue // state has changed, retry.
}
endTime := time.Now()
targetRevision := action.TargetRevision.WithTime(endTime)
err := c.clients.NeonVM.Request(ctx, ifaceLogger, action.Current, action.Target, targetRevision)
logFields := []zap.Field{zap.Object("action", action), zap.Duration("duration", endTime.Sub(startTime))}
c.update(func(state *core.State) {
if err != nil {
logger.Error("NeonVM request failed", append(logFields, zap.Error(err))...)
state.NeonVM().RequestFailed(endTime)
} else /* err == nil */ {
logger.Info("NeonVM request successful", logFields...)
state.NeonVM().RequestSuccessful(endTime)
}
})
}
}
package executor
import (
"context"
"time"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/agent/core"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
type PluginInterface interface {
Request(_ context.Context, _ *zap.Logger, lastPermit *api.Resources, target api.Resources, _ *api.Metrics) (*api.PluginResponse, error)
}
func (c *ExecutorCoreWithClients) DoPluginRequests(ctx context.Context, logger *zap.Logger) {
var (
updates util.BroadcastReceiver = c.updates.NewReceiver()
ifaceLogger *zap.Logger = logger.Named("client")
)
for {
// Wait until the state's changed, or we're done.
select {
case <-ctx.Done():
return
case <-updates.Wait():
updates.Awake()
}
last := c.getActions()
if last.actions.PluginRequest == nil {
continue // nothing to do; wait until the state changes.
}
var startTime time.Time
action := *last.actions.PluginRequest
if updated := c.updateIfActionsUnchanged(last, func(state *core.State) {
logger.Info("Starting plugin request", zap.Object("action", action))
startTime = time.Now()
state.Plugin().StartingRequest(startTime, action.Target)
}); !updated {
continue // state has changed, retry.
}
resp, err := c.clients.Plugin.Request(ctx, ifaceLogger, action.LastPermit, action.Target, action.Metrics)
endTime := time.Now()
c.update(func(state *core.State) {
logFields := []zap.Field{
zap.Object("action", action),
zap.Duration("duration", endTime.Sub(startTime)),
}
if err != nil {
logger.Error("Plugin request failed", append(logFields, zap.Error(err))...)
state.Plugin().RequestFailed(endTime)
} else {
logFields = append(logFields, zap.Any("response", resp))
logger.Info("Plugin request successful", logFields...)
if err := state.Plugin().RequestSuccessful(endTime, action.TargetRevision, *resp); err != nil {
logger.Error("Plugin response validation failed", append(logFields, zap.Error(err))...)
}
}
})
}
}
package executor
import (
"context"
"time"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/agent/core"
)
func (c *ExecutorCore) DoSleeper(ctx context.Context, logger *zap.Logger) {
updates := c.updates.NewReceiver()
// preallocate the timer. We clear it at the top of the loop; the 0 duration is just because we
// need *some* value, so it might as well be zero.
timer := time.NewTimer(0)
defer timer.Stop()
for {
// Ensure the timer is cleared at the top of the loop
if !timer.Stop() {
// Clear timer.C only if we haven't already read from it
select {
case <-timer.C:
default:
}
}
// Wait until the state's changed or we're done
select {
case <-ctx.Done():
return
case <-updates.Wait():
updates.Awake()
}
last := c.getActions()
if last.actions.Wait == nil {
continue // nothing to do; wait until the state changes
}
// NB: It's possible for the cached actions (and their Wait duration) to be somewhat out of
// date. It's *probably* fine, because we'll be given a notification any time the state has
// changed, so we should wake from a select soon enough to get here
timer.Reset(last.actions.Wait.Duration)
select {
case <-ctx.Done():
return
case <-updates.Wait():
// Don't consume the event here; leave it pending so it's handled at the top of the loop
continue
case <-timer.C:
select {
// If there's also an update, then let that take preference:
case <-updates.Wait():
// Same thing as above - don't consume the event here.
continue
// Otherwise, trigger cache invalidation because we've waited for the requested
// amount of time:
default:
c.update(func(*core.State) {})
updates.Awake()
last = c.getActions()
}
}
}
}
package executor
// Generation numbers, for use by implementers of the various interfaces (i.e. pkg/agent/execbridge.go)
import (
"sync/atomic"
)
type StoredGenerationNumber struct {
value atomic.Int64
}
type GenerationNumber struct {
value int64
}
func NewStoredGenerationNumber() *StoredGenerationNumber {
return &StoredGenerationNumber{value: atomic.Int64{}}
}
// Inc increments the stored GenerationNumber, returning the new value
func (n *StoredGenerationNumber) Inc() GenerationNumber {
return GenerationNumber{value: n.value.Add(1)}
}
// Get fetches the current value of the stored GenerationNumber
func (n *StoredGenerationNumber) Get() GenerationNumber {
return GenerationNumber{value: n.value.Load()}
}
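// Hedged usage sketch for implementers (e.g. the vm-monitor bridge in pkg/agent/execbridge.go;
// names here are illustrative): bump the stored generation whenever the underlying connection
// is replaced, and compare against it to detect stale handles:
//
//	gen := NewStoredGenerationNumber()
//	// on (re)connecting to the vm-monitor:
//	handleGeneration := gen.Inc()
//	// later, when deciding whether a handle's result still applies:
//	stale := handleGeneration != gen.Get()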
package agent
import (
"context"
"errors"
"fmt"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"go.uber.org/zap"
"k8s.io/client-go/kubernetes"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
vmclient "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
"github.com/neondatabase/autoscaling/pkg/agent/scalingevents"
"github.com/neondatabase/autoscaling/pkg/agent/schedwatch"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
// agentState is the global state for the autoscaler agent
//
// All fields are immutable, except pods.
type agentState struct {
// lock guards access to pods
lock util.ChanMutex
pods map[util.NamespacedName]*podState
// A base logger to pass around, so we can recreate the logger for a Runner on restart, without
// running the risk of leaking keys.
baseLogger *zap.Logger
podIP string
config *Config
kubeClient *kubernetes.Clientset
vmClient *vmclient.Clientset
schedTracker *schedwatch.SchedulerTracker
metrics GlobalMetrics
vmMetrics *PerVMMetrics
scalingReporter *scalingevents.Reporter
}
func (r MainRunner) newAgentState(
baseLogger *zap.Logger,
podIP string,
schedTracker *schedwatch.SchedulerTracker,
scalingReporter *scalingevents.Reporter,
globalMetrics GlobalMetrics,
perVMMetrics *PerVMMetrics,
) *agentState {
return &agentState{
lock: util.NewChanMutex(),
pods: make(map[util.NamespacedName]*podState),
baseLogger: baseLogger,
config: r.Config,
kubeClient: r.KubeClient,
vmClient: r.VMClient,
podIP: podIP,
schedTracker: schedTracker,
metrics: globalMetrics,
vmMetrics: perVMMetrics,
scalingReporter: scalingReporter,
}
}
func vmIsOurResponsibility(vm *vmv1.VirtualMachine, config *Config, nodeName string) bool {
return vm.Status.Node == nodeName &&
(vm.Status.Phase.IsAlive() && vm.Status.Phase != vmv1.VmMigrating) &&
vm.Status.PodIP != "" &&
api.HasAutoscalingEnabled(vm) &&
vm.Spec.SchedulerName == config.Scheduler.SchedulerName
}
func (s *agentState) Stop() {
s.lock.Lock()
defer s.lock.Unlock()
for _, pod := range s.pods {
pod.stop()
}
}
func (s *agentState) handleEvent(ctx context.Context, logger *zap.Logger, event vmEvent) {
logger = logger.With(
zap.Object("event", event),
zap.Object("virtualmachine", event.vmInfo.NamespacedName()),
zap.Object("pod", util.NamespacedName{Namespace: event.vmInfo.Namespace, Name: event.podName}),
)
logger.Debug("Handling event for VM")
if err := s.lock.TryLock(ctx); err != nil {
logger.Warn("Context canceled while starting to handle event", zap.Error(err))
return
}
defer s.lock.Unlock()
podName := util.NamespacedName{Namespace: event.vmInfo.Namespace, Name: event.podName}
state, hasPod := s.pods[podName]
// nb: we add the "pod" key for uniformity, even though it's derived from the event
if event.kind != vmEventAdded && !hasPod {
logger.Error("Received event for pod that isn't present", zap.Object("pod", podName))
return
} else if event.kind == vmEventAdded && hasPod {
logger.Error("Received add event for pod that's already present", zap.Object("pod", podName))
return
}
switch event.kind {
case vmEventDeleted:
state.stop()
// mark the status as deleted, so that it gets removed from metrics.
state.status.update(s, func(stat podStatus) podStatus {
stat.deleted = true
delete(s.pods, podName) // Do the removal while synchronized, because we can :)
return stat
})
case vmEventUpdated:
state.status.update(s, func(stat podStatus) podStatus {
now := time.Now()
stat.vmInfo = event.vmInfo
stat.endpointID = event.endpointID
stat.endpointAssignedAt = &now
state.vmInfoUpdated.Send()
return stat
})
case vmEventAdded:
s.handleVMEventAdded(ctx, event, podName)
default:
panic(errors.New("bad event: unexpected event kind"))
}
}
func (s *agentState) handleVMEventAdded(
ctx context.Context,
event vmEvent,
podName util.NamespacedName,
) {
runnerCtx, cancelRunnerContext := context.WithCancel(ctx)
now := time.Now()
status := &lockedPodStatus{
mu: sync.Mutex{},
podStatus: podStatus{
deleted: false,
endState: nil,
previousEndStates: nil,
vmInfo: event.vmInfo,
endpointID: event.endpointID,
endpointAssignedAt: &now,
state: "", // Explicitly set state to empty so that the initial state update does no decrement
stateUpdatedAt: now,
startTime: now,
lastSuccessfulMonitorComm: nil,
failedMonitorRequestCounter: util.NewRecentCounter(time.Duration(s.config.Monitor.MaxFailedRequestRate.IntervalSeconds) * time.Second),
failedNeonVMRequestCounter: util.NewRecentCounter(time.Duration(s.config.NeonVM.MaxFailedRequestRate.IntervalSeconds) * time.Second),
failedSchedulerRequestCounter: util.NewRecentCounter(time.Duration(s.config.Scheduler.MaxFailedRequestRate.IntervalSeconds) * time.Second),
},
}
// Empty update to trigger updating metrics and state.
status.update(s, func(s podStatus) podStatus { return s })
runner := s.newRunner(event.vmInfo, podName, event.podIP)
runner.status = status
txVMUpdate, rxVMUpdate := util.NewCondChannelPair()
s.pods[podName] = &podState{
podName: podName,
stop: cancelRunnerContext,
runner: runner,
status: status,
vmInfoUpdated: txVMUpdate,
}
s.metrics.runnerStarts.Inc()
restartCount := 0
logger := s.loggerForRunner(restartCount, event.vmInfo.NamespacedName(), podName)
runner.Spawn(runnerCtx, logger, rxVMUpdate)
}
// FIXME: make these timings configurable.
const (
RunnerRestartMinWaitSeconds = 5
RunnerRestartMaxWaitSeconds = 10
)
// TriggerRestartIfNecessary restarts the Runner for podName, after a delay if necessary.
//
// NB: runnerCtx is the context *passed to the new Runner*. It is only used here to end our restart
// process early if it's already been canceled. logger is not passed to the new Runner, and so
// can be handled a bit more freely.
func (s *agentState) TriggerRestartIfNecessary(runnerCtx context.Context, logger *zap.Logger, podName util.NamespacedName, podIP string) {
// Three steps:
// 1. Check if the Runner needs to restart. If no, we're done.
// 2. Wait for a random amount of time (between RunnerRestartMinWaitSeconds and RunnerRestartMaxWaitSeconds)
// 3. Restart the Runner (if it still should be restarted)
status, ok := func() (*lockedPodStatus, bool) {
s.lock.Lock()
defer s.lock.Unlock()
// note: pod.status has a separate lock, so we're ok to release s.lock
if pod, ok := s.pods[podName]; ok {
return pod.status, true
} else {
return nil, false
}
}()
if !ok {
return
}
status.mu.Lock()
defer status.mu.Unlock()
if status.endState == nil {
logger.Panic("TriggerRestartIfNecessary called with nil endState (should only be called after the pod is finished, when endState != nil)")
}
endTime := status.endState.Time
if endTime.IsZero() {
// If we don't check this, we run the risk of spinning on failures.
logger.Error("TriggerRestartIfNecessary called with zero'd Time for pod")
// Continue on, but with the time overridden, so we guarantee our minimum wait.
endTime = time.Now()
}
// keep this for later.
exitKind := status.endState.ExitKind
switch exitKind {
case podStatusExitCanceled:
logger.Info("Runner's context was canceled; no need to restart")
return // successful exit, no need to restart.
case podStatusExitPanicked:
// Should restart; continue.
logger.Info("Runner had abnormal exit kind; it will restart", zap.String("exitKind", string(exitKind)))
default:
logger.Error("TriggerRestartIfNecessary called with unexpected ExitKind", zap.String("exitKind", string(exitKind)))
// continue on; false positives (restarting when we shouldn't) are much better than the
// alternative here (not restarting when we should)
}
// Begin steps (2) and (3) -- wait, then restart.
var waitDuration time.Duration
totalRuntime := endTime.Sub(status.startTime)
// If the runner was running for a while, restart immediately.
//
// NOTE: this will have incorrect behavior when the system clock is behaving weirdly, but that's
// mostly ok. It's ok to e.g. restart an extra time at the switchover to daylight saving time.
if totalRuntime > time.Second*time.Duration(RunnerRestartMaxWaitSeconds) {
logger.Info("Runner was running for a long time, restarting immediately", zap.Duration("totalRuntime", totalRuntime))
waitDuration = 0
} else /* Otherwise, randomly pick within RunnerRestartMinWait..RunnerRestartMaxWait */ {
r := util.NewTimeRange(time.Second, RunnerRestartMinWaitSeconds, RunnerRestartMaxWaitSeconds)
waitDuration = r.Random()
logger.Info(
"Runner was not running for long, restarting after delay",
zap.Duration("totalRuntime", totalRuntime),
zap.Duration("delay", waitDuration),
)
}
// Run the waiting (if necessary) and restarting in another goroutine, so we're not blocking the
// caller of this function.
go func() {
logCancel := func(logFunc func(string, ...zap.Field), err error) {
logFunc(
"Canceling restart of Runner",
zap.Duration("delay", waitDuration),
zap.Duration("waitTime", time.Since(endTime)),
zap.Error(err),
)
}
if waitDuration != 0 {
select {
case <-time.After(waitDuration):
case <-runnerCtx.Done():
logCancel(logger.Info, runnerCtx.Err())
return
}
}
s.lock.Lock()
defer s.lock.Unlock()
// Need to update pod itself; can't release s.lock. Also, pod *theoretically* may have been
// deleted + restarted since we started, so it's incorrect to hold on to the original
// podStatus.
pod, ok := s.pods[podName]
if !ok {
logCancel(logger.Warn, errors.New("no longer present in pod map"))
return
}
pod.status.update(s, func(status podStatus) podStatus {
// Runner was already restarted
if status.endState == nil {
addedInfo := "this generally shouldn't happen, but could if there's a new pod with the same name"
logCancel(logger.Warn, fmt.Errorf("Runner was already restarted (%s)", addedInfo))
return status
}
logger.Info("Restarting runner", zap.String("exitKind", string(exitKind)), zap.Duration("delay", time.Since(endTime)))
s.metrics.runnerRestarts.Inc()
restartCount := len(status.previousEndStates) + 1
runner := s.newRunner(status.vmInfo, podName, podIP)
runner.status = pod.status
txVMUpdate, rxVMUpdate := util.NewCondChannelPair()
// note: pod is *podState, so we don't need to re-assign to the map.
pod.vmInfoUpdated = txVMUpdate
pod.runner = runner
status.previousEndStates = append(status.previousEndStates, *status.endState)
status.endState = nil
status.startTime = time.Now()
runnerLogger := s.loggerForRunner(restartCount, status.vmInfo.NamespacedName(), podName)
runner.Spawn(runnerCtx, runnerLogger, rxVMUpdate)
return status
})
}()
}
func (s *agentState) loggerForRunner(restartCount int, vmName, podName util.NamespacedName) *zap.Logger {
return s.baseLogger.Named("runner").With(
zap.Int("restarts", restartCount),
zap.Object("virtualmachine", vmName),
zap.Object("pod", podName),
)
}
// NB: caller must set Runner.status after creation
func (s *agentState) newRunner(vmInfo api.VmInfo, podName util.NamespacedName, podIP string) *Runner {
return &Runner{
global: s,
status: nil, // set by caller
shutdown: nil, // set by (*Runner).Run
vmName: vmInfo.NamespacedName(),
podName: podName,
podIP: podIP,
memSlotSize: vmInfo.Mem.SlotSize,
lock: util.NewChanMutex(),
executorStateDump: nil, // set by (*Runner).Run
monitor: nil,
backgroundWorkerCount: atomic.Int64{},
backgroundPanic: make(chan error),
}
}
type podState struct {
podName util.NamespacedName
stop context.CancelFunc
runner *Runner
status *lockedPodStatus
vmInfoUpdated util.CondChannelSender
}
type podStateDump struct {
PodName util.NamespacedName `json:"podName"`
Status podStatusDump `json:"status"`
Runner *RunnerState `json:"runner,omitempty"`
CollectionError error `json:"collectionError,omitempty"`
}
func (p *podState) dump(ctx context.Context) podStateDump {
status := p.status.dump()
runner, collectErr := p.runner.State(ctx)
if collectErr != nil {
collectErr = fmt.Errorf("error reading runner state: %w", collectErr)
}
return podStateDump{
PodName: p.podName,
Status: status,
Runner: runner,
CollectionError: collectErr,
}
}
type lockedPodStatus struct {
mu sync.Mutex
podStatus
}
type podStatus struct {
startTime time.Time
// if true, the corresponding podState is no longer included in the global pod map
deleted bool
// if non-nil, the runner is finished
endState *podStatusEndState
previousEndStates []podStatusEndState
lastSuccessfulMonitorComm *time.Time
failedMonitorRequestCounter *util.RecentCounter
failedNeonVMRequestCounter *util.RecentCounter
failedSchedulerRequestCounter *util.RecentCounter
// vmInfo stores the latest information about the VM, as given by the global VM watcher.
//
// There is also a similar field inside the Runner itself, but it's better to store this out
// here, where we don't have to rely on the Runner being well-behaved w.r.t. locking.
vmInfo api.VmInfo
// endpointID, if non-empty, stores the ID of the endpoint associated with the VM
endpointID string
// NB: this value, once non-nil, is never changed.
endpointAssignedAt *time.Time
state runnerMetricState
stateUpdatedAt time.Time
}
type podStatusDump struct {
StartTime time.Time `json:"startTime"`
EndState *podStatusEndState `json:"endState"`
PreviousEndStates []podStatusEndState `json:"previousEndStates"`
LastSuccessfulMonitorComm *time.Time `json:"lastSuccessfulMonitorComm"`
FailedMonitorRequestCounter uint `json:"failedMonitorRequestCounter"`
FailedNeonVMRequestCounter uint `json:"failedNeonVMRequestCounter"`
FailedSchedulerRequestCounter uint `json:"failedSchedulerRequestCounter"`
VMInfo api.VmInfo `json:"vmInfo"`
EndpointID string `json:"endpointID"`
EndpointAssignedAt *time.Time `json:"endpointAssignedAt"`
State runnerMetricState `json:"state"`
StateUpdatedAt time.Time `json:"stateUpdatedAt"`
}
type podStatusEndState struct {
// The reason the Runner exited.
ExitKind podStatusExitKind `json:"exitKind"`
// If ExitKind is "panicked" or "errored", the error message.
Error error `json:"error"`
Time time.Time `json:"time"`
}
type podStatusExitKind string
const (
podStatusExitPanicked podStatusExitKind = "panicked"
podStatusExitCanceled podStatusExitKind = "canceled" // top-down signal that the Runner should stop.
)
func (s *lockedPodStatus) update(global *agentState, with func(podStatus) podStatus) {
s.mu.Lock()
defer s.mu.Unlock()
newStatus := with(s.podStatus)
now := time.Now()
// Calculate the new state:
var newState runnerMetricState
if s.deleted {
// If deleted, don't change anything.
} else if s.endState != nil {
switch s.endState.ExitKind {
case podStatusExitCanceled:
// If canceled, don't change the state.
newState = s.state
case podStatusExitPanicked:
newState = runnerMetricStatePanicked
}
} else if isStuck, _ := newStatus.isStuck(global, now); isStuck {
newState = runnerMetricStateStuck
} else {
newState = runnerMetricStateOk
}
if !newStatus.deleted {
newStatus.state = newState
newStatus.stateUpdatedAt = now
}
// Update the metrics:
// Note: s.state is initialized to the empty string to signify that it's not yet represented in
// the metrics.
if !s.deleted && s.state != "" {
oldIsEndpoint := strconv.FormatBool(s.endpointID != "")
global.metrics.runnersCount.WithLabelValues(oldIsEndpoint, string(s.state)).Dec()
}
if !newStatus.deleted && newStatus.state != "" {
newIsEndpoint := strconv.FormatBool(newStatus.endpointID != "")
global.metrics.runnersCount.WithLabelValues(newIsEndpoint, string(newStatus.state)).Inc()
}
s.podStatus = newStatus
}
func (s podStatus) isStuck(global *agentState, now time.Time) (bool, []string) {
var reasons []string
if s.monitorStuckAt(global.config).Before(now) {
reasons = append(reasons, "monitor health check failed")
}
if s.failedMonitorRequestCounter.Get() > global.config.Monitor.MaxFailedRequestRate.Threshold {
reasons = append(reasons, "monitor requests failed")
}
if s.failedSchedulerRequestCounter.Get() > global.config.Scheduler.MaxFailedRequestRate.Threshold {
reasons = append(reasons, "scheduler requests failed")
}
if s.failedNeonVMRequestCounter.Get() > global.config.NeonVM.MaxFailedRequestRate.Threshold {
reasons = append(reasons, "neonvm requests failed")
}
return len(reasons) > 0, reasons
}
// monitorStuckAt returns the time at which the Runner will be marked "stuck"
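// For example (hypothetical config values): with UnhealthyStartupGracePeriodSeconds=60 and
// UnhealthyAfterSilenceDurationSeconds=20, a VM that has never had a successful monitor
// communication is considered stuck 60s after startTime (or after endpointAssignedAt, for VMs
// with an endpoint); once a communication has succeeded, the deadline becomes
// lastSuccessfulMonitorComm + 20s.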
func (s podStatus) monitorStuckAt(config *Config) time.Time {
startupGracePeriod := time.Second * time.Duration(config.Monitor.UnhealthyStartupGracePeriodSeconds)
unhealthySilencePeriod := time.Second * time.Duration(config.Monitor.UnhealthyAfterSilenceDurationSeconds)
if s.lastSuccessfulMonitorComm == nil {
start := s.startTime
// For endpoints, we should start the grace period from when the VM was *assigned* the
// endpoint, rather than when the VM was created.
if s.endpointID != "" {
start = *s.endpointAssignedAt
}
return start.Add(startupGracePeriod)
} else {
return s.lastSuccessfulMonitorComm.Add(unhealthySilencePeriod)
}
}
func (s *lockedPodStatus) periodicallyRefreshState(ctx context.Context, logger *zap.Logger, global *agentState) {
ticker := time.NewTicker(time.Second * time.Duration(global.config.RefreshStateIntervalSeconds))
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
}
// use s.update to trigger re-evaluating the metrics on each tick, so that we minimize the
// time between the VM meeting the conditions for being "stuck" and us recognizing it.
s.update(global, func(stat podStatus) podStatus {
isStuck, reasons := stat.isStuck(global, time.Now())
if isStuck && stat.state != runnerMetricStatePanicked {
if stat.endpointID != "" {
logger.Warn("Runner with endpoint is currently stuck",
zap.String("endpointID", stat.endpointID), zap.String("reasons", strings.Join(reasons, ",")))
} else {
logger.Warn("Runner without endpoint is currently stuck",
zap.String("reasons", strings.Join(reasons, ",")))
}
}
return stat
})
}
}
func (s *lockedPodStatus) dump() podStatusDump {
s.mu.Lock()
defer s.mu.Unlock()
var endState *podStatusEndState
if s.endState != nil {
es := *s.endState
endState = &es
}
previousEndStates := make([]podStatusEndState, len(s.previousEndStates))
copy(previousEndStates, s.previousEndStates)
return podStatusDump{
EndState: endState,
PreviousEndStates: previousEndStates,
// FIXME: api.VmInfo contains a resource.Quantity - is that safe to copy by value?
VMInfo: s.vmInfo,
EndpointID: s.endpointID,
EndpointAssignedAt: s.endpointAssignedAt, // ok to share the pointer, because it's not updated
StartTime: s.startTime,
State: s.state,
StateUpdatedAt: s.stateUpdatedAt,
LastSuccessfulMonitorComm: s.lastSuccessfulMonitorComm,
FailedMonitorRequestCounter: s.failedMonitorRequestCounter.Get(),
FailedNeonVMRequestCounter: s.failedNeonVMRequestCounter.Get(),
FailedSchedulerRequestCounter: s.failedSchedulerRequestCounter.Get(),
}
}
package agent
import (
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
"github.com/samber/lo"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/agent/core/revsource"
"github.com/neondatabase/autoscaling/pkg/agent/scalingevents"
"github.com/neondatabase/autoscaling/pkg/util"
)
type GlobalMetrics struct {
schedulerRequests *prometheus.CounterVec
schedulerRequestedChange resourceChangePair
schedulerApprovedChange resourceChangePair
scalingFullDeniesTotal *prometheus.CounterVec
scalingPartialApprovalsTotal *prometheus.CounterVec
monitorRequestsOutbound *prometheus.CounterVec
monitorRequestsInbound *prometheus.CounterVec
monitorRequestedChange resourceChangePair
monitorApprovedChange resourceChangePair
neonvmRequestsOutbound *prometheus.CounterVec
neonvmRequestedChange resourceChangePair
runnersCount *prometheus.GaugeVec
runnerThreadPanics prometheus.Counter
runnerStarts prometheus.Counter
runnerRestarts prometheus.Counter
runnerNextActions prometheus.Counter
scalingLatency prometheus.HistogramVec
pluginLatency prometheus.HistogramVec
monitorLatency prometheus.HistogramVec
neonvmLatency prometheus.HistogramVec
}
func (m *GlobalMetrics) PluginLatency() *prometheus.HistogramVec {
return &m.pluginLatency
}
func (m *GlobalMetrics) MonitorLatency() *prometheus.HistogramVec {
return &m.monitorLatency
}
func (m *GlobalMetrics) NeonVMLatency() *prometheus.HistogramVec {
return &m.neonvmLatency
}
type resourceChangePair struct {
cpu *prometheus.CounterVec
mem *prometheus.CounterVec
}
const (
directionLabel = "direction"
directionValueInc = "inc"
directionValueDec = "dec"
directionValueBoth = "both"
directionValueNone = "none"
)
type runnerMetricState string
const (
runnerMetricStateOk runnerMetricState = "ok"
runnerMetricStateStuck runnerMetricState = "stuck"
runnerMetricStatePanicked runnerMetricState = "panicked"
)
// Copied bucket values from controller runtime latency metric. We can
// adjust them in the future if needed.
var buckets = []float64{
0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60,
}
func makeGlobalMetrics() (GlobalMetrics, *prometheus.Registry) {
reg := prometheus.NewRegistry()
// register stock collectors directly:
// (even though MustRegister is variadic, the function calls
// are cheap and calling it more than once means that when
// it panics, we know exactly which metric caused the error.)
reg.MustRegister(collectors.NewGoCollector())
reg.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
metrics := GlobalMetrics{
// the util.RegisterMetric() function registers the collector and returns
// it so we can set it directly on the output structure.
// ---- SCHEDULER ----
schedulerRequests: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_scheduler_plugin_requests_total",
Help: "Number of attempted HTTP requests to the scheduler plugin by autoscaler-agents",
},
[]string{"code"},
)),
schedulerRequestedChange: resourceChangePair{
cpu: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_scheduler_plugin_requested_cpu_change_total",
Help: "Total change in CPU requested from the scheduler",
},
[]string{directionLabel},
)),
mem: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_scheduler_plugin_requested_mem_change_total",
Help: "Total change in memory (in MiB) requested from the scheduler",
},
[]string{directionLabel},
)),
},
schedulerApprovedChange: resourceChangePair{
cpu: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_scheduler_plugin_accepted_cpu_change_total",
Help: "Total change in CPU approved by the scheduler",
},
[]string{directionLabel},
)),
mem: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_scheduler_plugin_accepted_mem_change_total",
Help: "Total change in memory (in MiB) approved by the scheduler",
},
[]string{directionLabel},
)),
},
// ---- scaling denies related metrics ----
scalingFullDeniesTotal: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_scaling_full_denials_total",
Help: "Number of the scheduler or vmmon full denials responses",
},
[]string{directionLabel},
)),
scalingPartialApprovalsTotal: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_scaling_partial_approvals_total",
Help: "Number of the scheduler partially approved responses",
},
[]string{directionLabel},
)),
// ---- MONITOR ----
monitorRequestsOutbound: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_monitor_outbound_requests_total",
Help: "Number of attempted HTTP requests to vm-monitors by autoscaler-agents",
},
[]string{"endpoint", "code"},
)),
monitorRequestsInbound: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_monitor_inbound_requests_total",
Help: "Number of HTTP requests from vm-monitors received by autoscaler-agents",
},
[]string{"endpoint", "code"},
)),
monitorRequestedChange: resourceChangePair{
cpu: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_monitor_requested_cpu_change_total",
Help: "Total change in CPU requested from the vm-monitor(s)",
},
[]string{directionLabel},
)),
mem: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_monitor_requested_mem_change_total",
Help: "Total change in memory (in MiB) requested from the vm-monitor(s)",
},
[]string{directionLabel},
)),
},
monitorApprovedChange: resourceChangePair{
cpu: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_monitor_approved_cpu_change_total",
Help: "Total change in CPU approved by the vm-monitor(s)",
},
[]string{directionLabel},
)),
mem: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_monitor_approved_mem_change_total",
Help: "Total change in memory (in MiB) approved by the vm-monitor(s)",
},
[]string{directionLabel},
)),
},
// ---- NEONVM ----
neonvmRequestsOutbound: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_neonvm_outbound_requests_total",
Help: "Number of k8s patch requests to NeonVM objects",
},
// NOTE: "result" is either "ok" or "[error: $CAUSE]", with $CAUSE as the root cause of
// the request error.
[]string{"result"},
)),
neonvmRequestedChange: resourceChangePair{
cpu: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_neonvm_requested_cpu_change_total",
Help: "Total change in CPU requested for VMs",
},
[]string{directionLabel},
)),
mem: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_agent_neonvm_requested_mem_changed_total",
Help: "Total change in memory (in MiB) requested for VMs",
},
[]string{directionLabel},
)),
},
// ---- RUNNER LIFECYCLE ----
runnersCount: util.RegisterMetric(reg, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "autoscaling_agent_runners_current",
Help: "Number of per-VM runners, with associated metadata",
},
// NB: is_endpoint ∈ ("true", "false"), state ∈ runnerMetricState = ("ok", "stuck", "panicked")
[]string{"is_endpoint", "state"},
)),
runnerThreadPanics: util.RegisterMetric(reg, prometheus.NewCounter(
prometheus.CounterOpts{
Name: "autoscaling_agent_runner_thread_panics_total",
Help: "Number of panics from autoscaler-agent per-VM runner threads",
},
)),
runnerStarts: util.RegisterMetric(reg, prometheus.NewCounter(
prometheus.CounterOpts{
Name: "autoscaling_agent_runner_starts",
Help: "Number of new per-VM Runners started",
},
)),
runnerRestarts: util.RegisterMetric(reg, prometheus.NewCounter(
prometheus.CounterOpts{
Name: "autoscaling_agent_runner_restarts",
Help: "Number of existing per-VM Runners restarted due to failure",
},
)),
runnerNextActions: util.RegisterMetric(reg, prometheus.NewCounter(
prometheus.CounterOpts{
Name: "autoscaling_agent_runner_next_actions_total",
Help: "Number of times (*core.State).NextActions() has been called",
},
)),
scalingLatency: *util.RegisterMetric(reg, prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "autoscaling_agent_scaling_latency_seconds",
Help: "End-to-end scaling latency",
Buckets: buckets,
},
[]string{directionLabel},
)),
pluginLatency: *util.RegisterMetric(reg, prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "autoscaling_agent_plugin_latency_seconds",
Help: "Plugin request latency",
Buckets: buckets,
},
[]string{directionLabel},
)),
monitorLatency: *util.RegisterMetric(reg, prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "autoscaling_agent_monitor_latency_seconds",
Help: "Monitor request latency",
Buckets: buckets,
},
[]string{directionLabel},
)),
neonvmLatency: *util.RegisterMetric(reg, prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "autoscaling_agent_neonvm_latency_seconds",
Help: "NeonVM request latency",
Buckets: buckets,
},
[]string{directionLabel},
)),
}
// Some of the metrics should have default keys set to zero. Otherwise, these won't be filled
// until the value is non-zero (because something's happened), which makes it harder to
// distinguish between "valid signal of nothing" vs "no signal".
metricsWithDirection := []resourceChangePair{
// scheduler:
metrics.schedulerRequestedChange,
metrics.schedulerApprovedChange,
// monitor:
metrics.monitorRequestedChange,
metrics.monitorApprovedChange,
// neonvm:
metrics.neonvmRequestedChange,
}
for _, p := range metricsWithDirection {
for _, m := range []*prometheus.CounterVec{p.cpu, p.mem} {
m.WithLabelValues(directionValueInc).Add(0.0)
m.WithLabelValues(directionValueDec).Add(0.0)
}
}
runnerStates := []runnerMetricState{
runnerMetricStateOk,
runnerMetricStateStuck,
runnerMetricStatePanicked,
}
for _, s := range runnerStates {
metrics.runnersCount.WithLabelValues("true", string(s)).Set(0.0)
metrics.runnersCount.WithLabelValues("false", string(s)).Set(0.0)
}
return metrics, reg
}
func flagsToDirection(flags vmv1.Flag) string {
if flags.Has(revsource.Upscale) && flags.Has(revsource.Downscale) {
return directionValueBoth
}
if flags.Has(revsource.Upscale) {
return directionValueInc
}
if flags.Has(revsource.Downscale) {
return directionValueDec
}
return directionValueNone
}
func WrapHistogramVec(hist *prometheus.HistogramVec) revsource.ObserveCallback {
return func(dur time.Duration, flags vmv1.Flag) {
hist.WithLabelValues(flagsToDirection(flags)).Observe(dur.Seconds())
}
}
type PerVMMetrics struct {
// activeMu and activeVMs exist to track the set of VMs currently represented in the metrics, so
// that when we set the desired CU from internal information, we can check whether the VM still
// exists.
// Otherwise it's not possible to prevent data races that would result in leaking metric labels.
activeMu sync.Mutex
activeVMs map[util.NamespacedName]vmMetadata
cpu *prometheus.GaugeVec
memory *prometheus.GaugeVec
restartCount *prometheus.GaugeVec
desiredCU *prometheus.GaugeVec
}
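// Hedged usage sketch (callers assumed to be the VM watch event handlers and the desired-CU
// reporting path): keeping activeVMs in sync with the watch stream makes updateDesiredCU a safe
// no-op for VMs that have already been deleted:
//
//	perVMMetrics.updateActive(vm)  // on VM added/updated events
//	perVMMetrics.updateDesiredCU(vmName, cuMultiplier, totalCU, parts)
//	perVMMetrics.deleteActive(vm)  // on VM deleted; also clears its desired-CU series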
type vmMetadata struct {
endpointID string
projectID string
}
type vmResourceValueType string
const (
vmResourceValueSpecMin vmResourceValueType = "spec_min"
vmResourceValueAutoscalingMin vmResourceValueType = "autoscaling_min"
vmResourceValueSpecUse vmResourceValueType = "spec_use"
vmResourceValueStatusUse vmResourceValueType = "status_use"
vmResourceValueSpecMax vmResourceValueType = "spec_max"
vmResourceValueAutoscalingMax vmResourceValueType = "autoscaling_max"
)
func makePerVMMetrics() (*PerVMMetrics, *prometheus.Registry) {
reg := prometheus.NewRegistry()
metrics := &PerVMMetrics{
activeMu: sync.Mutex{},
activeVMs: make(map[util.NamespacedName]vmMetadata),
cpu: util.RegisterMetric(reg, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "autoscaling_vm_cpu_cores",
Help: "Number of CPUs for a VM: min, max, spec using, or status using",
},
[]string{
"vm_namespace", // .metadata.namespace
"vm_name", // .metadata.name
"endpoint_id", // .metadata.labels["neon/endpoint-id"]
"project_id", // .metadata.labels["neon/project-id"]
"value", // vmResourceValue: min, spec_use, status_use, max
},
)),
memory: util.RegisterMetric(reg, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "autoscaling_vm_memory_bytes",
Help: "Amount of memory in bytes for a VM: min, max, spec using, or status using",
},
[]string{
"vm_namespace", // .metadata.namespace
"vm_name", // .metadata.name
"endpoint_id", // .metadata.labels["neon/endpoint-id"]
"project_id", // .metadata.labels["neon/project-id"]
"value", // vmResourceValue: min, spec_use, status_use, max
},
)),
restartCount: util.RegisterMetric(reg, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "autoscaling_vm_restart_count",
Help: "Number of times that the VM has restarted",
},
[]string{
"vm_namespace", // .metadata.namespace
"vm_name", // .metadata.name
"endpoint_id", // .metadata.labels["neon/endpoint-id"]
"project_id", // .metadata.labels["neon/project-id"]
},
)),
desiredCU: util.RegisterMetric(reg, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "autoscaling_vm_desired_cu",
Help: "Amount of Compute Units desired for a VM: the total, and the components for cpu, memory, and LFC",
},
[]string{
"vm_namespace", // .metadata.namespace
"vm_name", // .metadata.name
"endpoint_id", // .metadata.labels["neon/endpoint-id"]
"project_id", // .metadata.labels["neon/project-id"]
"component", // desired CU component: total, cpu, mem, lfc
},
)),
}
return metrics, reg
}
func makePerVMMetricsLabels(namespace string, vmName string, endpointID string, projectID string, valueType vmResourceValueType) prometheus.Labels {
labels := prometheus.Labels{
"vm_namespace": namespace,
"vm_name": vmName,
"endpoint_id": endpointID,
"project_id": projectID,
}
if len(valueType) > 0 {
labels["value"] = string(valueType)
}
return labels
}
func (m *PerVMMetrics) updateActive(vm *vmv1.VirtualMachine) {
m.activeMu.Lock()
defer m.activeMu.Unlock()
m.activeVMs[util.GetNamespacedName(vm)] = vmMetadata{
endpointID: vm.Labels[endpointLabel],
projectID: vm.Labels[projectLabel],
}
}
func (m *PerVMMetrics) deleteActive(vm *vmv1.VirtualMachine) {
m.activeMu.Lock()
defer m.activeMu.Unlock()
delete(m.activeVMs, util.GetNamespacedName(vm))
// ... and any metrics that were associated with it:
m.desiredCU.DeletePartialMatch(prometheus.Labels{
"vm_namespace": vm.Namespace,
"vm_name": vm.Name,
})
}
// vmMetric is a data object that represents a single metric
// (either CPU or memory) for a VM.
type vmMetric struct {
labels prometheus.Labels
value float64
}
func (m *PerVMMetrics) updateDesiredCU(
vm util.NamespacedName,
cuMultiplier float64,
total uint32,
parts scalingevents.GoalCUComponents,
) {
m.activeMu.Lock()
defer m.activeMu.Unlock()
// Don't do anything if this VM is not known. Either the relevant watch event hasn't been
// processed yet (unlikely, maybe impossible?) or it has since been deleted (in which case we
// don't want to leak metrics that won't get cleaned up)
info, ok := m.activeVMs[vm]
if !ok {
return
}
pairs := []struct {
component string
value *float64
}{
{"total", lo.ToPtr(float64(total))},
{"cpu", parts.CPU},
{"mem", parts.Mem},
{"lfc", parts.LFC},
}
for _, p := range pairs {
labels := prometheus.Labels{
"vm_namespace": vm.Namespace,
"vm_name": vm.Name,
"endpoint_id": info.endpointID,
"project_id": info.projectID,
"component": p.component,
}
if p.value == nil {
m.desiredCU.Delete(labels)
} else {
m.desiredCU.With(labels).Set(*p.value * cuMultiplier /* multiply to allow fractional CU in metrics */)
}
}
}
package agent
// Core glue and logic for a single VM
//
// The primary object in this file is the Runner. We create a new Runner for each VM, and the Runner
// spawns a handful of long-running tasks that share state via the Runner object itself.
//
// Each of these tasks is created by (*Runner).spawnBackgroundWorker(), which gracefully handles
// panics so that it terminates (and restarts) the Runner itself, instead of e.g. taking down the
// entire autoscaler-agent.
//
// The main entrypoint is (*Runner).Spawn(), which in turn calls (*Runner).Run(), etc.
//
// For more information, refer to ARCHITECTURE.md.
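//
// A hedged sketch of the lifecycle described above, as driven from pod.go (names as used there):
//
//	runner := s.newRunner(vmInfo, podName, podIP)
//	runner.status = status                      // NB: the caller must set status after creation
//	runner.Spawn(runnerCtx, logger, rxVMUpdate) // recovers panics, records the end state, and then
//	                                            // TriggerRestartIfNecessary may restart the Runner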
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"math"
"net/http"
"runtime/debug"
"strconv"
"strings"
"sync/atomic"
"time"
"go.uber.org/zap"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
ktypes "k8s.io/apimachinery/pkg/types"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/agent/core"
"github.com/neondatabase/autoscaling/pkg/agent/core/revsource"
"github.com/neondatabase/autoscaling/pkg/agent/executor"
"github.com/neondatabase/autoscaling/pkg/agent/scalingevents"
"github.com/neondatabase/autoscaling/pkg/agent/schedwatch"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/patch"
)
// PluginProtocolVersion is the current version of the agent<->scheduler plugin in use by this
// autoscaler-agent.
//
// Currently, each autoscaler-agent supports only one version at a time. In the future, this may
// change.
const PluginProtocolVersion api.PluginProtoVersion = api.PluginProtoV5_0
// Runner is the per-VM Pod god object, responsible for handling everything
//
// It primarily operates as a source of shared data for a number of long-running tasks. For
// additional general information, refer to the comment at the top of this file.
type Runner struct {
global *agentState
// status provides the high-level status of the Runner. Reading or updating the status requires
// holding the status's lock (lockedPodStatus.mu). Updates are typically done through its update method.
status *lockedPodStatus
// shutdown provides a clean way to trigger all background Runner threads to shut down. shutdown
// is set exactly once, by (*Runner).Run
shutdown context.CancelFunc
vmName util.NamespacedName
podName util.NamespacedName
podIP string
memSlotSize api.Bytes
// lock guards the values of all mutable fields - namely, monitor (which may be read without
// the lock, but the lock must be acquired to update it).
lock util.ChanMutex
// executorStateDump is set by (*Runner).Run and provides a way to get the state of the
// "executor"
executorStateDump func() executor.StateDump
// monitor, if non-nil, stores the current Dispatcher in use for communicating with the
// vm-monitor, alongside a generation number.
//
// Additionally, this field MAY ONLY be updated while holding both lock AND the executor's lock,
// which means that it may be read when EITHER holding lock OR the executor's lock.
monitor *monitorInfo
// backgroundWorkerCount tracks the current number of background workers. It is exclusively
// updated by r.spawnBackgroundWorker
backgroundWorkerCount atomic.Int64
backgroundPanic chan error
}
// RunnerState is the serializable state of the Runner, extracted by its State method
type RunnerState struct {
PodIP string `json:"podIP"`
ExecutorState executor.StateDump `json:"executorState"`
Monitor *MonitorState `json:"monitor"`
BackgroundWorkerCount int64 `json:"backgroundWorkerCount"`
}
// SchedulerState is the state of a Scheduler, constructed as part of a Runner's State Method
type SchedulerState struct {
Info schedwatch.SchedulerInfo `json:"info"`
}
// Temporary type, to hopefully help with debugging https://github.com/neondatabase/autoscaling/issues/503
type MonitorState struct {
WaitersSize int `json:"waitersSize"`
}
func (r *Runner) State(ctx context.Context) (*RunnerState, error) {
if err := r.lock.TryLock(ctx); err != nil {
return nil, err
}
defer r.lock.Unlock()
var monitorState *MonitorState
if r.monitor != nil {
monitorState = &MonitorState{
WaitersSize: r.monitor.dispatcher.lenWaiters(),
}
}
var executorState *executor.StateDump
if r.executorStateDump != nil /* may be nil if r.Run() hasn't fully started yet */ {
s := r.executorStateDump()
executorState = &s
}
return &RunnerState{
PodIP: r.podIP,
ExecutorState: *executorState,
Monitor: monitorState,
BackgroundWorkerCount: r.backgroundWorkerCount.Load(),
}, nil
}
func (r *Runner) Spawn(ctx context.Context, logger *zap.Logger, vmInfoUpdated util.CondChannelReceiver) {
go func() {
// Gracefully handle panics, plus trigger restart
defer func() {
if err := recover(); err != nil {
now := time.Now()
r.status.update(r.global, func(stat podStatus) podStatus {
stat.endState = &podStatusEndState{
ExitKind: podStatusExitPanicked,
Error: fmt.Errorf("Runner %v panicked: %v", stat.vmInfo.NamespacedName(), err),
Time: now,
}
return stat
})
}
r.global.TriggerRestartIfNecessary(ctx, logger, r.podName, r.podIP)
}()
r.Run(ctx, logger, vmInfoUpdated)
endTime := time.Now()
exitKind := podStatusExitCanceled // normal exit, only by context being canceled.
r.status.update(r.global, func(stat podStatus) podStatus {
stat.endState = &podStatusEndState{
ExitKind: exitKind,
Error: nil,
Time: endTime,
}
return stat
})
logger.Info("Ended without error")
}()
}
// Run is the main entrypoint to the long-running per-VM pod tasks
func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util.CondChannelReceiver) {
ctx, r.shutdown = context.WithCancel(ctx)
defer r.shutdown()
getVmInfo := func() api.VmInfo {
r.status.mu.Lock()
defer r.status.mu.Unlock()
return r.status.vmInfo
}
execLogger := logger.Named("exec")
// Subtract a small random amount from core.Config.PluginRequestTick so that periodic requests
// tend to become distributed randomly over time.
pluginRequestJitter := util.NewTimeRange(time.Millisecond, 0, 100).Random()
coreExecLogger := execLogger.Named("core")
vmInfo := getVmInfo()
var initialRevision int64
if vmInfo.CurrentRevision != nil {
initialRevision = vmInfo.CurrentRevision.Value
}
// "dsrl" stands for "desired scaling report limiter" -- helper to avoid spamming events.
dsrl := &desiredScalingReportLimiter{lastEvent: nil}
revisionSource := revsource.NewRevisionSource(initialRevision, WrapHistogramVec(&r.global.metrics.scalingLatency))
executorCore := executor.NewExecutorCore(coreExecLogger, vmInfo, executor.Config{
OnNextActions: r.global.metrics.runnerNextActions.Inc,
Core: core.Config{
ComputeUnit: r.global.config.Scaling.ComputeUnit,
DefaultScalingConfig: r.global.config.Scaling.DefaultConfig,
NeonVMRetryWait: time.Second * time.Duration(r.global.config.NeonVM.RetryFailedRequestSeconds),
PluginRequestTick: time.Second*time.Duration(r.global.config.Scheduler.RequestAtLeastEverySeconds) - pluginRequestJitter,
PluginRetryWait: time.Second * time.Duration(r.global.config.Scheduler.RetryFailedRequestSeconds),
PluginDeniedRetryWait: time.Second * time.Duration(r.global.config.Scheduler.RetryDeniedUpscaleSeconds),
MonitorDeniedDownscaleCooldown: time.Second * time.Duration(r.global.config.Monitor.RetryDeniedDownscaleSeconds),
MonitorRequestedUpscaleValidPeriod: time.Second * time.Duration(r.global.config.Monitor.RequestedUpscaleValidSeconds),
MonitorRetryWait: time.Second * time.Duration(r.global.config.Monitor.RetryFailedRequestSeconds),
Log: core.LogConfig{
Info: coreExecLogger.Info,
Warn: coreExecLogger.Warn,
},
RevisionSource: revisionSource,
ObservabilityCallbacks: core.ObservabilityCallbacks{
PluginLatency: WrapHistogramVec(&r.global.metrics.pluginLatency),
MonitorLatency: WrapHistogramVec(&r.global.metrics.monitorLatency),
NeonVMLatency: WrapHistogramVec(&r.global.metrics.neonvmLatency),
ActualScaling: r.reportScalingEvent,
HypotheticalScaling: func(ts time.Time, current, target uint32, parts core.ScalingGoalParts) {
r.reportDesiredScaling(dsrl, ts, current, target, scalingevents.GoalCUComponents{
CPU: parts.CPU,
Mem: parts.Mem,
LFC: parts.LFC,
})
},
},
},
})
r.executorStateDump = executorCore.StateDump
monitorGeneration := executor.NewStoredGenerationNumber()
pluginIface := makePluginInterface(r)
neonvmIface := makeNeonVMInterface(r)
monitorIface := makeMonitorInterface(r, executorCore, monitorGeneration)
// "ecwc" stands for "ExecutorCoreWithClients"
ecwc := executorCore.WithClients(executor.ClientSet{
Plugin: pluginIface,
NeonVM: neonvmIface,
Monitor: monitorIface,
})
logger.Info("Starting background workers")
// FIXME: make this timeout/delay a separately defined constant, or configurable
mainDeadlockChecker := r.lock.DeadlockChecker(250*time.Millisecond, time.Second)
r.spawnBackgroundWorker(ctx, logger, "deadlock checker", ignoreLogger(mainDeadlockChecker))
r.spawnBackgroundWorker(ctx, logger, "podStatus updater", func(ctx2 context.Context, logger2 *zap.Logger) {
r.status.periodicallyRefreshState(ctx2, logger2, r.global)
})
r.spawnBackgroundWorker(ctx, logger, "VmInfo updater", func(ctx2 context.Context, logger2 *zap.Logger) {
for {
select {
case <-ctx2.Done():
return
case <-vmInfoUpdated.Recv():
vm := getVmInfo()
ecwc.Updater().UpdatedVM(vm, func() {
logger2.Info("VmInfo updated", zap.Any("vmInfo", vm))
})
}
}
})
r.spawnBackgroundWorker(ctx, logger, "get system metrics", func(ctx2 context.Context, logger2 *zap.Logger) {
getMetricsLoop(
r,
ctx2,
logger2,
r.global.config.Metrics.System,
metricsMgr[*core.SystemMetrics]{
kind: "system",
emptyMetrics: func() *core.SystemMetrics { return new(core.SystemMetrics) },
isActive: func() bool { return true },
updateMetrics: func(metrics *core.SystemMetrics, withLock func()) {
ecwc.Updater().UpdateSystemMetrics(*metrics, withLock)
},
},
)
})
r.spawnBackgroundWorker(ctx, logger, "get LFC metrics", func(ctx2 context.Context, logger2 *zap.Logger) {
getMetricsLoop(
r,
ctx2,
logger2,
r.global.config.Metrics.LFC,
metricsMgr[*core.LFCMetrics]{
kind: "LFC",
emptyMetrics: func() *core.LFCMetrics { return new(core.LFCMetrics) },
isActive: func() bool {
scalingConfig := r.global.config.Scaling.DefaultConfig.WithOverrides(getVmInfo().Config.ScalingConfig)
return *scalingConfig.EnableLFCMetrics // guaranteed non-nil as a required field.
},
updateMetrics: func(metrics *core.LFCMetrics, withLock func()) {
ecwc.Updater().UpdateLFCMetrics(*metrics, withLock)
},
},
)
})
r.spawnBackgroundWorker(ctx, logger.Named("vm-monitor"), "vm-monitor reconnection loop", func(ctx2 context.Context, logger2 *zap.Logger) {
r.connectToMonitorLoop(ctx2, logger2, monitorGeneration, monitorStateCallbacks{
reset: func(withLock func()) {
ecwc.Updater().ResetMonitor(withLock)
},
upscaleRequested: func(request api.MoreResources, withLock func()) {
ecwc.Updater().UpscaleRequested(request, withLock)
},
setActive: func(active bool, withLock func()) {
ecwc.Updater().MonitorActive(active, withLock)
},
})
})
r.spawnBackgroundWorker(ctx, execLogger.Named("sleeper"), "executor: sleeper", ecwc.DoSleeper)
r.spawnBackgroundWorker(ctx, execLogger.Named("plugin"), "executor: plugin", ecwc.DoPluginRequests)
r.spawnBackgroundWorker(ctx, execLogger.Named("neonvm"), "executor: neonvm", ecwc.DoNeonVMRequests)
r.spawnBackgroundWorker(ctx, execLogger.Named("vm-monitor-downscale"), "executor: vm-monitor downscale", ecwc.DoMonitorDownscales)
r.spawnBackgroundWorker(ctx, execLogger.Named("vm-monitor-upscale"), "executor: vm-monitor upscale", ecwc.DoMonitorUpscales)
// Note: Run doesn't terminate unless the parent context is cancelled - either because the VM
// pod was deleted, or the autoscaler-agent is exiting.
select {
case <-ctx.Done():
return
case err := <-r.backgroundPanic:
panic(err)
}
}
func (r *Runner) reportScalingEvent(timestamp time.Time, currentCU, targetCU uint32) {
endpointID := func() string {
r.status.mu.Lock()
defer r.status.mu.Unlock()
return r.status.endpointID
}()
reporter := r.global.scalingReporter
reporter.Submit(reporter.NewActualEvent(
timestamp,
endpointID,
currentCU,
targetCU,
))
}
func (r *Runner) reportDesiredScaling(
rl *desiredScalingReportLimiter,
timestamp time.Time,
currentCU uint32,
targetCU uint32,
parts scalingevents.GoalCUComponents,
) {
endpointID := func() string {
r.status.mu.Lock()
defer r.status.mu.Unlock()
return r.status.endpointID
}()
r.global.vmMetrics.updateDesiredCU(
r.vmName,
r.global.config.ScalingEvents.CUMultiplier, // have to multiply before exposing as metrics here.
targetCU,
parts,
)
rl.report(r.global.scalingReporter, r.global.scalingReporter.NewHypotheticalEvent(
timestamp,
endpointID,
currentCU,
targetCU,
parts,
))
}
type desiredScalingReportLimiter struct {
lastEvent *scalingevents.ScalingEvent
}
func (rl *desiredScalingReportLimiter) report(
reporter *scalingevents.Reporter,
event scalingevents.ScalingEvent,
) {
closeEnough := func(x *float64, y *float64) bool {
if (x != nil) != (y != nil) {
return false
}
if x == nil /* && y == nil */ {
return true
}
// true iff x and y are within the threshold of each other
return math.Abs(*x-*y) < 0.25
}
// Check if we should skip this time.
if rl.lastEvent != nil {
skip := rl.lastEvent.TargetMilliCU == event.TargetMilliCU &&
closeEnough(rl.lastEvent.GoalComponents.CPU, event.GoalComponents.CPU) &&
closeEnough(rl.lastEvent.GoalComponents.Mem, event.GoalComponents.Mem) &&
closeEnough(rl.lastEvent.GoalComponents.LFC, event.GoalComponents.LFC)
if skip {
return
}
}
// Not skipping.
rl.lastEvent = &event
reporter.Submit(event)
}
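// Illustrative behavior of the limiter above (hypothetical numbers): if the previous event had
// TargetMilliCU = 2000 and GoalComponents.CPU = 1.9, then a new event with the same TargetMilliCU,
// CPU = 2.1, and otherwise-identical components is skipped (the difference 0.2 is below the 0.25
// threshold). Bump CPU to 2.2 or change TargetMilliCU and the event is submitted, becoming the new
// lastEvent.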
//////////////////////
// Background tasks //
//////////////////////
func ignoreLogger(f func(context.Context)) func(context.Context, *zap.Logger) {
return func(c context.Context, _ *zap.Logger) {
f(c)
}
}
// spawnBackgroundWorker is a helper function to appropriately handle panics in the various goroutines
// spawned by `(Runner) Run`, sending them back on r.backgroundPanic
//
// This method is essentially equivalent to 'go f(ctx, logger)' but with appropriate panic handling,
// start/stop logging, and updating of r.backgroundWorkerCount
func (r *Runner) spawnBackgroundWorker(ctx context.Context, logger *zap.Logger, name string, f func(context.Context, *zap.Logger)) {
// Increment the background worker count
r.backgroundWorkerCount.Add(1)
logger = logger.With(zap.String("taskName", name))
go func() {
defer func() {
// Decrement the background worker count
r.backgroundWorkerCount.Add(-1)
if v := recover(); v != nil {
r.global.metrics.runnerThreadPanics.Inc()
err := fmt.Errorf("background worker %q panicked: %v", name, v)
// note: In Go, the stack doesn't "unwind" on panic. Instead, a panic will traverse up
// the callstack, and each deferred function, when called, will be *added* to the stack
// as if the original panic() is calling them. So the output of runtime/debug.Stack()
// has a couple of frames to do with debug.Stack() and this deferred function, and then the
// rest of the callstack starts from where the panic occurred.
//
// FIXME: we should handle the stack ourselves to remove the stack frames from
// debug.Stack() and co. -- it's ok to have nice things!
logger.Error(
"background worker panicked",
zap.String("error", fmt.Sprint(v)),
zap.String("stack", string(debug.Stack())),
)
// send to r.backgroundPanic if we can; otherwise, don't worry about it.
select {
case r.backgroundPanic <- err:
default:
}
} else {
logger.Info("background worker ended normally")
}
}()
logger.Info("background worker started")
f(ctx, logger)
}()
}
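// Illustrative usage (the worker name and body here are hypothetical; see the calls in Run above
// for the real ones). The worker function just needs to return once its context is canceled:
//
//	r.spawnBackgroundWorker(ctx, logger, "example worker", func(ctx context.Context, logger *zap.Logger) {
//		<-ctx.Done()
//	})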
type metricsMgr[M core.FromPrometheus] struct {
// kind is the human-readable name representing this type of metrics.
// It's either "system" or "LFC".
kind string
// emptyMetrics returns a new M
//
// Typically this is required because M is itself a pointer, so if we just initialized it with a
// zero value, we'd end up with nil pointer derefs. There *are* ways around this with generics,
// but at the time, we decided this was the least convoluted way.
emptyMetrics func() M
// isActive returns whether these metrics should currently be collected for the VM.
//
// For example, with LFC metrics, we return false if they are not enabled for the VM.
isActive func() bool
// updateMetrics is a callback to update the internal state with new values for these metrics.
updateMetrics func(metrics M, withLock func())
}
// getMetricsLoop repeatedly attempts to fetch metrics from the VM
//
// Every time metrics are successfully fetched, the value is recorded with mgr.updateMetrics().
func getMetricsLoop[M core.FromPrometheus](
r *Runner,
ctx context.Context,
logger *zap.Logger,
config MetricsSourceConfig,
mgr metricsMgr[M],
) {
waitBetweenDuration := time.Second * time.Duration(config.SecondsBetweenRequests)
randomStartWait := util.NewTimeRange(time.Second, 0, int(config.SecondsBetweenRequests)).Random()
lastActive := mgr.isActive()
// Don't log anything if we're not making this type of metrics request currently.
//
// The idea is that isActive() can/should be used for gradual rollout of new metrics, and we
// don't want to log every time we *don't* do the new thing.
if lastActive {
logger.Info(
fmt.Sprintf("Sleeping for random delay before making first %s metrics request", mgr.kind),
zap.Duration("delay", randomStartWait),
)
}
select {
case <-ctx.Done():
return
case <-time.After(randomStartWait):
}
for {
if !mgr.isActive() {
if lastActive {
logger.Info(fmt.Sprintf("VM is no longer active for %s metrics requests", mgr.kind))
}
lastActive = false
} else {
if !lastActive {
logger.Info(fmt.Sprintf("VM is now active for %s metrics requests", mgr.kind))
}
lastActive = true
metrics := mgr.emptyMetrics()
err := doMetricsRequest(r, ctx, logger, metrics, config)
if err != nil {
logger.Error("Error making metrics request", zap.Error(err))
goto next
}
mgr.updateMetrics(metrics, func() {
logger.Info("Updated metrics", zap.Any("metrics", metrics))
})
}
next:
select {
case <-ctx.Done():
return
case <-time.After(waitBetweenDuration):
}
}
}
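// Timing sketch for the loop above, assuming a hypothetical SecondsBetweenRequests = 5: the first
// request is delayed by a random 0-5s (so that many Runners don't fire at once), and each later
// iteration waits a fixed 5s before the next attempt, regardless of whether the previous request
// succeeded.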
type monitorInfo struct {
generation executor.GenerationNumber
dispatcher *Dispatcher
}
type monitorStateCallbacks struct {
reset func(withLock func())
upscaleRequested func(request api.MoreResources, withLock func())
setActive func(active bool, withLock func())
}
// connectToMonitorLoop does lifecycle management of the (re)connection to the vm-monitor
func (r *Runner) connectToMonitorLoop(
ctx context.Context,
logger *zap.Logger,
generation *executor.StoredGenerationNumber,
callbacks monitorStateCallbacks,
) {
addr := fmt.Sprintf("ws://%s:%d/monitor", r.podIP, r.global.config.Monitor.ServerPort)
minWait := time.Second * time.Duration(r.global.config.Monitor.ConnectionRetryMinWaitSeconds)
var lastStart time.Time
for i := 0; ; i += 1 {
// Remove any prior Dispatcher from the Runner
if i != 0 {
func() {
r.lock.Lock()
defer r.lock.Unlock()
callbacks.reset(func() {
generation.Inc()
r.monitor = nil
logger.Info("Reset previous vm-monitor connection")
})
}()
}
// If the context was canceled, don't restart
if err := ctx.Err(); err != nil {
action := "attempt"
if i != 0 {
action = "retry "
}
logger.Info(
fmt.Sprintf("Aborting vm-monitor connection %s because context is already canceled", action),
zap.Error(err),
)
return
}
// Delayed restart management, long because of friendly logging:
if i != 0 {
endTime := time.Now()
runtime := endTime.Sub(lastStart)
if runtime > minWait {
logger.Info(
"Immediately retrying connection to vm-monitor",
zap.String("addr", addr),
zap.Duration("totalRuntime", runtime),
)
} else {
delay := minWait - runtime
logger.Info(
"Connection to vm-monitor was not live for long, retrying after delay",
zap.Duration("delay", delay),
zap.Duration("totalRuntime", runtime),
)
select {
case <-time.After(delay):
logger.Info(
"Retrying connection to vm-monitor",
zap.Duration("delay", delay),
zap.Duration("waitTime", time.Since(endTime)),
zap.String("addr", addr),
)
case <-ctx.Done():
logger.Info(
"Canceling retrying connection to vm-monitor",
zap.Duration("delay", delay),
zap.Duration("waitTime", time.Since(endTime)),
zap.Error(ctx.Err()),
)
return
}
}
} else {
logger.Info("Connecting to vm-monitor", zap.String("addr", addr))
}
lastStart = time.Now()
dispatcher, err := NewDispatcher(ctx, logger, addr, r, callbacks.upscaleRequested)
if err != nil {
logger.Error("Failed to connect to vm-monitor", zap.String("addr", addr), zap.Error(err))
continue
}
// Update runner to the new dispatcher
func() {
r.lock.Lock()
defer r.lock.Unlock()
callbacks.setActive(true, func() {
r.monitor = &monitorInfo{
generation: generation.Inc(),
dispatcher: dispatcher,
}
logger.Info("Connected to vm-monitor")
})
}()
// Wait until the dispatcher is no longer running, either due to error or because the
// root-level Runner context was canceled.
<-dispatcher.ExitSignal()
if err := dispatcher.ExitError(); err != nil {
logger.Error("Dispatcher for vm-monitor connection exited due to error", zap.Error(err))
}
}
}
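// Retry-timing sketch for the loop above, assuming a hypothetical ConnectionRetryMinWaitSeconds = 5:
// if a connection lived for 2s before exiting, the next attempt is delayed by the remaining 3s; if
// it lived for 10s (longer than the minimum), the next attempt happens immediately.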
//////////////////////////////////////////
// Lower-level implementation functions //
//////////////////////////////////////////
// doMetricsRequest makes a single metrics request to the VM, writing the result into 'metrics'
func doMetricsRequest(
r *Runner,
ctx context.Context,
logger *zap.Logger,
metrics core.FromPrometheus,
config MetricsSourceConfig,
) error {
url := fmt.Sprintf("http://%s:%d/metrics", r.podIP, config.Port)
timeout := time.Second * time.Duration(config.RequestTimeoutSeconds)
reqCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
req, err := http.NewRequestWithContext(reqCtx, http.MethodGet, url, bytes.NewReader(nil))
if err != nil {
panic(fmt.Errorf("Error constructing metrics request to %q: %w", url, err))
}
logger.Debug("Making metrics request to VM", zap.String("url", url))
resp, err := http.DefaultClient.Do(req)
if ctx.Err() != nil {
return ctx.Err()
} else if err != nil {
return fmt.Errorf("Error making request to %q: %w", url, err)
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
return fmt.Errorf("Unsuccessful response status %d", resp.StatusCode)
}
if err := core.ParseMetrics(resp.Body, metrics); err != nil {
return fmt.Errorf("Error parsing metrics from prometheus output: %w", err)
}
return nil
}
func (r *Runner) doNeonVMRequest(
ctx context.Context,
target api.Resources,
targetRevision vmv1.RevisionWithTime,
) error {
patches := []patch.Operation{{
Op: patch.OpReplace,
Path: "/spec/guest/cpus/use",
Value: target.VCPU.ToResourceQuantity(),
}, {
Op: patch.OpReplace,
Path: "/spec/guest/memorySlots/use",
Value: uint32(target.Mem / r.memSlotSize),
}, {
Op: patch.OpReplace,
Path: "/spec/targetRevision",
Value: targetRevision,
}}
patchPayload, err := json.Marshal(patches)
if err != nil {
panic(fmt.Errorf("Error marshalling JSON patch: %w", err))
}
timeout := time.Second * time.Duration(r.global.config.NeonVM.RequestTimeoutSeconds)
requestCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
// FIXME: We should check the returned VM object here, in case the values are different.
//
// Also relevant: <https://github.com/neondatabase/autoscaling/issues/23>
_, err = r.global.vmClient.NeonvmV1().VirtualMachines(r.vmName.Namespace).
Patch(requestCtx, r.vmName.Name, ktypes.JSONPatchType, patchPayload, metav1.PatchOptions{})
if err != nil {
errMsg := util.RootError(err).Error()
// Some error messages contain the object name. Rather than trying to filter out only the
// messages that do, it's more maintainable to just strip the name from all of them.
errMsg = strings.ReplaceAll(errMsg, r.vmName.Name, "<name>")
r.global.metrics.neonvmRequestsOutbound.WithLabelValues(fmt.Sprintf("[error: %s]", errMsg)).Inc()
return err
}
r.global.metrics.neonvmRequestsOutbound.WithLabelValues("ok").Inc()
return nil
}
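// Because the payload above is sent with ktypes.JSONPatchType, it is a standard JSON Patch
// (RFC 6902). The marshalled form looks roughly like this (illustrative values):
//
//	[{"op":"replace","path":"/spec/guest/cpus/use","value":"500m"},
//	 {"op":"replace","path":"/spec/guest/memorySlots/use","value":4},
//	 {"op":"replace","path":"/spec/targetRevision","value":{...}}]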
func (r *Runner) recordResourceChange(current, target api.Resources, metrics resourceChangePair) {
getDirection := func(targetIsGreater bool) string {
if targetIsGreater {
return directionValueInc
} else {
return directionValueDec
}
}
abs := current.AbsDiff(target)
// Add CPU
if abs.VCPU != 0 {
direction := getDirection(target.VCPU > current.VCPU)
metrics.cpu.WithLabelValues(direction).Add(abs.VCPU.AsFloat64())
}
// Add memory
if abs.Mem != 0 {
direction := getDirection(target.Mem > current.Mem)
// Avoid floating-point inaccuracy from converting a large byte count directly: split it into
// whole MiB plus a fractional remainder.
byteTotal := abs.Mem
mib := api.Bytes(1 << 20)
floatMiB := float64(byteTotal/mib) + float64(byteTotal%mib)/float64(mib)
metrics.mem.WithLabelValues(direction).Add(floatMiB)
}
}
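// Worked example for the memory conversion above (illustrative numbers): for abs.Mem of
// 3,670,016 bytes, byteTotal/mib = 3 and byteTotal%mib = 524,288, so the recorded value is
// 3 + 524288/1048576 = 3.5 MiB, without ever converting the full byte count to a float.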
func doMonitorDownscale(
ctx context.Context,
logger *zap.Logger,
dispatcher *Dispatcher,
target api.Resources,
) (*api.DownscaleResult, error) {
r := dispatcher.runner
rawResources := target.ConvertToAllocation()
timeout := time.Second * time.Duration(r.global.config.Monitor.ResponseTimeoutSeconds)
res, err := dispatcher.Call(ctx, logger, timeout, "DownscaleRequest", api.DownscaleRequest{
Target: rawResources,
})
if err != nil {
return nil, err
}
return res.Result, nil
}
func doMonitorUpscale(
ctx context.Context,
logger *zap.Logger,
dispatcher *Dispatcher,
target api.Resources,
) error {
r := dispatcher.runner
rawResources := target.ConvertToAllocation()
timeout := time.Second * time.Duration(r.global.config.Monitor.ResponseTimeoutSeconds)
_, err := dispatcher.Call(ctx, logger, timeout, "UpscaleNotification", api.UpscaleNotification{
Granted: rawResources,
})
return err
}
// DoSchedulerRequest sends a request to the scheduler and does not validate the response.
func (r *Runner) DoSchedulerRequest(
ctx context.Context,
logger *zap.Logger,
resources api.Resources,
lastPermit *api.Resources,
metrics *api.Metrics,
) (_ *api.PluginResponse, err error) {
reqData := &api.AgentRequest{
ProtoVersion: PluginProtocolVersion,
Pod: r.podName,
ComputeUnit: r.global.config.Scaling.ComputeUnit,
Resources: resources,
LastPermit: lastPermit,
Metrics: metrics,
}
// make sure we log any error we're returning:
defer func() {
if err != nil {
logger.Error("Scheduler request failed", zap.Error(err))
}
}()
sched := r.global.schedTracker.Get()
if sched == nil {
err := errors.New("no known ready scheduler to send request to")
description := fmt.Sprintf("[error doing request: %s]", err)
r.global.metrics.schedulerRequests.WithLabelValues(description).Inc()
return nil, err
}
reqBody, err := json.Marshal(reqData)
if err != nil {
return nil, fmt.Errorf("Error encoding request JSON: %w", err)
}
timeout := time.Second * time.Duration(r.global.config.Scheduler.RequestTimeoutSeconds)
reqCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
url := fmt.Sprintf("http://%s:%d/", sched.IP, r.global.config.Scheduler.RequestPort)
request, err := http.NewRequestWithContext(reqCtx, http.MethodPost, url, bytes.NewReader(reqBody))
if err != nil {
return nil, fmt.Errorf("Error building request to %q: %w", url, err)
}
request.Header.Set("content-type", "application/json")
logger.Debug("Sending request to scheduler", zap.Any("request", reqData))
response, err := http.DefaultClient.Do(request)
if err != nil {
description := fmt.Sprintf("[error doing request: %s]", util.RootError(err))
r.global.metrics.schedulerRequests.WithLabelValues(description).Inc()
return nil, fmt.Errorf("Error doing request: %w", err)
}
defer response.Body.Close()
r.global.metrics.schedulerRequests.WithLabelValues(strconv.Itoa(response.StatusCode)).Inc()
respBody, err := io.ReadAll(response.Body)
if err != nil {
return nil, fmt.Errorf("Error reading body for response: %w", err)
}
if response.StatusCode != 200 {
// Fatal because 4XX implies our state doesn't match theirs, 5XX means we can't assume
// current contents of the state, and anything other than 200, 4XX, or 5XX shouldn't happen
return nil, fmt.Errorf("Received response status %d body %q", response.StatusCode, string(respBody))
}
var respData api.PluginResponse
if err := json.Unmarshal(respBody, &respData); err != nil {
// Fatal because invalid JSON might also be semantically invalid
return nil, fmt.Errorf("Bad JSON response: %w", err)
}
level := zap.DebugLevel
if respData.Permit.HasFieldLessThan(resources) {
level = zap.WarnLevel
}
logger.Log(level, "Received response from scheduler", zap.Any("response", respData), zap.Any("requested", resources))
return &respData, nil
}
package scalingevents
import (
"context"
"fmt"
"time"
"github.com/lithammer/shortuuid"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/reporting"
)
type ClientsConfig struct {
AzureBlob *AzureBlobStorageClientConfig `json:"azureBlob"`
S3 *S3ClientConfig `json:"s3"`
}
type S3ClientConfig struct {
reporting.BaseClientConfig
reporting.S3ClientConfig
PrefixInBucket string `json:"prefixInBucket"`
}
type AzureBlobStorageClientConfig struct {
reporting.BaseClientConfig
reporting.AzureBlobStorageClientConfig
PrefixInContainer string `json:"prefixInContainer"`
}
type eventsClient = reporting.Client[ScalingEvent]
func createClients(ctx context.Context, logger *zap.Logger, cfg ClientsConfig) ([]eventsClient, error) {
var clients []eventsClient
if c := cfg.AzureBlob; c != nil {
generateKey := newBlobStorageKeyGenerator(c.PrefixInContainer)
client, err := reporting.NewAzureBlobStorageClient(c.AzureBlobStorageClientConfig, generateKey)
if err != nil {
return nil, fmt.Errorf("error creating Azure Blob Storage client: %w", err)
}
logger.Info("Created Azure Blob Storage client for scaling events", zap.Any("config", c))
clients = append(clients, eventsClient{
Name: "azureblob",
Base: client,
BaseConfig: c.BaseClientConfig,
NewBatchBuilder: jsonLinesBatch(reporting.NewGZIPBuffer),
})
}
if c := cfg.S3; c != nil {
generateKey := newBlobStorageKeyGenerator(c.PrefixInBucket)
client, err := reporting.NewS3Client(ctx, c.S3ClientConfig, generateKey)
if err != nil {
return nil, fmt.Errorf("error creating S3 client: %w", err)
}
logger.Info("Created S3 client for scaling events", zap.Any("config", c))
clients = append(clients, eventsClient{
Name: "s3",
Base: client,
BaseConfig: c.BaseClientConfig,
NewBatchBuilder: jsonLinesBatch(reporting.NewGZIPBuffer),
})
}
return clients, nil
}
func jsonLinesBatch[B reporting.IOBuffer](buf func() B) func() reporting.BatchBuilder[ScalingEvent] {
return func() reporting.BatchBuilder[ScalingEvent] {
return reporting.NewJSONLinesBuilder[ScalingEvent](buf())
}
}
// Returns a function to generate keys for the placement of scaling events data into blob storage.
//
// Example: prefix/2024/10/31/23/events_{uuid}.ndjson.gz (11pm on halloween, UTC)
//
// NOTE: This key format is different from the one we use for billing, but similar to the one proxy
// uses for its reporting.
func newBlobStorageKeyGenerator(prefix string) func() string {
return func() string {
now := time.Now().UTC()
id := shortuuid.New()
return fmt.Sprintf(
"%s/%d/%02d/%02d/%02d/events_%s.ndjson.gz",
prefix,
now.Year(), now.Month(), now.Day(), now.Hour(),
id,
)
}
}
package scalingevents
// Prometheus metrics for the agent's scaling event reporting subsystem
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/neondatabase/autoscaling/pkg/reporting"
"github.com/neondatabase/autoscaling/pkg/util"
)
type PromMetrics struct {
reporting *reporting.EventSinkMetrics
totalCount *prometheus.GaugeVec
}
func NewPromMetrics(reg prometheus.Registerer) PromMetrics {
return PromMetrics{
reporting: reporting.NewEventSinkMetrics("autoscaling_agent_scalingevents", reg),
totalCount: util.RegisterMetric(reg, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "autoscaling_agent_scaling_events_total",
Help: "Total number of scaling events generated",
},
[]string{"kind"},
)),
}
}
func (m PromMetrics) recordSubmitted(event ScalingEvent) {
var eventKind string
switch event.Kind {
case scalingEventActual, scalingEventHypothetical:
eventKind = string(event.Kind)
default:
eventKind = "unknown"
}
m.totalCount.WithLabelValues(eventKind).Inc()
}
package scalingevents
import (
"context"
"fmt"
"math"
"time"
"github.com/samber/lo"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/reporting"
)
type Config struct {
// CUMultiplier sets the ratio between our internal compute unit and the one that should be
// reported.
//
// This exists because Neon allows fractional compute units, while the autoscaler-agent acts on
// integer multiples of a smaller compute unit.
CUMultiplier float64 `json:"cuMultiplier"`
// RereportThreshold sets the minimum amount of change in desired compute units required for us to
// re-report the desired scaling.
RereportThreshold float64 `json:"rereportThreshold"`
// RegionName is the name of the region that the reporting autoscaler-agent is in.
RegionName string `json:"regionName"`
Clients ClientsConfig `json:"clients"`
}
type Reporter struct {
conf *Config
sink *reporting.EventSink[ScalingEvent]
metrics PromMetrics
}
type ScalingEvent struct {
Timestamp time.Time `json:"timestamp"`
Region string `json:"region"`
EndpointID string `json:"endpoint_id"`
Kind scalingEventKind `json:"kind"`
CurrentMilliCU uint32 `json:"current_cu"`
TargetMilliCU uint32 `json:"target_cu"`
GoalComponents *GoalCUComponents `json:"goalComponents,omitempty"`
}
type GoalCUComponents struct {
CPU *float64 `json:"cpu,omitempty"`
Mem *float64 `json:"mem,omitempty"`
LFC *float64 `json:"lfc,omitempty"`
}
type scalingEventKind string
const (
scalingEventActual = "actual"
scalingEventHypothetical = "hypothetical"
)
func NewReporter(
ctx context.Context,
parentLogger *zap.Logger,
conf *Config,
metrics PromMetrics,
) (*Reporter, error) {
logger := parentLogger.Named("scalingevents")
clients, err := createClients(ctx, logger, conf.Clients)
if err != nil {
return nil, err
}
sink := reporting.NewEventSink(logger, metrics.reporting, clients...)
return &Reporter{
conf: conf,
sink: sink,
metrics: metrics,
}, nil
}
// Run calls the underlying reporting.EventSink's Run() method, periodically pushing events to the
// clients specified in Config until the context expires.
//
// Refer there for more information.
func (r *Reporter) Run(ctx context.Context) error {
if err := r.sink.Run(ctx); err != nil {
return fmt.Errorf("scaling events sink failed: %w", err)
}
return nil
}
// Submit adds the ScalingEvent to the sender queue(s), returning without waiting for it to be sent.
func (r *Reporter) Submit(event ScalingEvent) {
r.metrics.recordSubmitted(event)
r.sink.Enqueue(event)
}
func convertToMilliCU(cu uint32, multiplier float64) uint32 {
return uint32(math.Round(1000 * float64(cu) * multiplier))
}
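// For example (hypothetical multiplier): with CUMultiplier = 0.25, an internal value of 8 CU is
// reported as round(1000 * 8 * 0.25) = 2000 milli-CU.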
// NewActualEvent is a helper function to create a ScalingEvent for actual scaling that has
// occurred.
//
// This method also handles compute unit translation.
func (r *Reporter) NewActualEvent(
timestamp time.Time,
endpointID string,
currentCU uint32,
targetCU uint32,
) ScalingEvent {
return ScalingEvent{
Timestamp: timestamp,
Region: r.conf.RegionName,
EndpointID: endpointID,
Kind: scalingEventActual,
CurrentMilliCU: convertToMilliCU(currentCU, r.conf.CUMultiplier),
TargetMilliCU: convertToMilliCU(targetCU, r.conf.CUMultiplier),
GoalComponents: nil,
}
}
func (r *Reporter) NewHypotheticalEvent(
timestamp time.Time,
endpointID string,
currentCU uint32,
targetCU uint32,
goalCUs GoalCUComponents,
) ScalingEvent {
convertFloat := func(cu *float64) *float64 {
if cu != nil {
return lo.ToPtr(*cu * r.conf.CUMultiplier)
}
return nil
}
return ScalingEvent{
Timestamp: timestamp,
Region: r.conf.RegionName,
EndpointID: endpointID,
Kind: scalingEventHypothetical,
CurrentMilliCU: convertToMilliCU(currentCU, r.conf.CUMultiplier),
TargetMilliCU: convertToMilliCU(targetCU, r.conf.CUMultiplier),
GoalComponents: &GoalCUComponents{
CPU: convertFloat(goalCUs.CPU),
Mem: convertFloat(goalCUs.Mem),
LFC: convertFloat(goalCUs.LFC),
},
}
}
package schedwatch
import (
"time"
"go.uber.org/zap/zapcore"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"github.com/neondatabase/autoscaling/pkg/util"
)
type SchedulerInfo struct {
PodName util.NamespacedName
UID types.UID
IP string
CreationTimestamp time.Time
}
// MarshalLogObject implements zapcore.ObjectMarshaler
func (s SchedulerInfo) MarshalLogObject(enc zapcore.ObjectEncoder) error {
if err := enc.AddObject("pod", s.PodName); err != nil {
return err
}
enc.AddString("uid", string(s.UID))
enc.AddString("ip", string(s.IP))
enc.AddTime("creationTimestamp", s.CreationTimestamp)
return nil
}
func newSchedulerInfo(pod *corev1.Pod) SchedulerInfo {
return SchedulerInfo{
PodName: util.NamespacedName{Name: pod.Name, Namespace: pod.Namespace},
UID: pod.UID,
IP: pod.Status.PodIP,
CreationTimestamp: pod.CreationTimestamp.Time,
}
}
package schedwatch
import (
"context"
"fmt"
"sync"
"time"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/watch"
)
func isActivePod(pod *corev1.Pod) bool {
return pod.Status.PodIP != "" && util.PodReady(pod)
}
type SchedulerTracker struct {
sp *schedPods
Stop func()
}
func (s SchedulerTracker) Get() *SchedulerInfo {
s.sp.mu.RLock()
defer s.sp.mu.RUnlock()
return s.sp.current
}
type schedPods struct {
mu sync.RWMutex
current *SchedulerInfo
pods map[types.UID]*SchedulerInfo
}
const schedulerNamespace string = "kube-system"
func schedulerLabelSelector(schedulerName string) string {
return fmt.Sprintf("name=%s", schedulerName)
}
func StartSchedulerWatcher(
ctx context.Context,
parentLogger *zap.Logger,
kubeClient *kubernetes.Clientset,
metrics watch.Metrics,
schedulerName string,
) (*SchedulerTracker, error) {
logger := parentLogger.Named("watch-schedulers")
sp := &schedPods{
mu: sync.RWMutex{},
current: nil,
pods: make(map[types.UID]*SchedulerInfo),
}
store, err := watch.Watch(
ctx,
logger.Named("watch"),
kubeClient.CoreV1().Pods(schedulerNamespace),
watch.Config{
ObjectNameLogField: "pod",
Metrics: watch.MetricsConfig{
Metrics: metrics,
Instance: "Scheduler Pod",
},
// We don't need to be super responsive to scheduler changes.
//
// FIXME: make these configurable.
RetryRelistAfter: util.NewTimeRange(time.Second, 4, 5),
RetryWatchAfter: util.NewTimeRange(time.Second, 4, 5),
},
watch.Accessors[*corev1.PodList, corev1.Pod]{
Items: func(list *corev1.PodList) []corev1.Pod { return list.Items },
},
watch.InitModeSync,
metav1.ListOptions{LabelSelector: schedulerLabelSelector(schedulerName)},
watch.HandlerFuncs[*corev1.Pod]{
AddFunc: func(pod *corev1.Pod, preexisting bool) {
if isActivePod(pod) {
info := newSchedulerInfo(pod)
logger.Info("New scheduler, already ready", zap.Object("scheduler", info))
sp.add(logger, &info)
}
},
UpdateFunc: func(oldPod, newPod *corev1.Pod) {
oldReady := isActivePod(oldPod)
newReady := isActivePod(newPod)
if !oldReady && newReady {
info := newSchedulerInfo(newPod)
logger.Info("Existing scheduler became ready", zap.Object("scheduler", info))
sp.add(logger, &info)
} else if oldReady && !newReady {
info := newSchedulerInfo(newPod)
logger.Info("Existing scheduler no longer ready", zap.Object("scheduler", info))
sp.remove(logger, &info)
}
},
DeleteFunc: func(pod *corev1.Pod, mayBeStale bool) {
wasReady := isActivePod(pod)
if wasReady {
info := newSchedulerInfo(pod)
logger.Info("Previously-ready scheduler deleted", zap.Object("scheduler", info))
sp.remove(logger, &info)
}
},
},
)
if err != nil {
return nil, err
}
return &SchedulerTracker{
sp: sp,
Stop: store.Stop,
}, nil
}
func (s *schedPods) add(logger *zap.Logger, pod *SchedulerInfo) {
s.mu.Lock()
defer s.mu.Unlock()
s.pods[pod.UID] = pod
s.reconcile(logger)
}
func (s *schedPods) remove(logger *zap.Logger, pod *SchedulerInfo) {
s.mu.Lock()
defer s.mu.Unlock()
delete(s.pods, pod.UID)
s.reconcile(logger)
}
// reconcile refreshes the value of s.current based on s.pods.
// s.mu MUST be exclusively locked while calling reconcile.
func (s *schedPods) reconcile(logger *zap.Logger) {
var newCurrent *SchedulerInfo
// There's *basically* guaranteed to be ≤ 2 scheduler pods because the scheduler deployment has
// replicas=1, so "just" looping here is fine; it's not worth a more complex data structure.
for _, pod := range s.pods {
// Use the pod if we don't already have one, or if it was created more recently than
// whatever we've seen so far.
// The ordering isn't *too* important here, but we need to pick one to be consistent, and
// preferring a newer scheduler (remember: the pod is 'Ready') is likely to be more correct.
if newCurrent == nil || newCurrent.CreationTimestamp.Before(pod.CreationTimestamp) {
newCurrent = pod
}
}
if s.current != nil && newCurrent != nil {
count := len(s.pods)
if s.current.UID != newCurrent.UID {
logger.Info("Scheduler pod selection changed", zap.Int("count", count), zap.Object("scheduler", newCurrent))
} else {
logger.Info("Scheduler pod selection is unchanged", zap.Int("count", count), zap.Object("scheduler", newCurrent))
}
} else if newCurrent == nil && s.current != nil {
logger.Warn("No scheduler pod available anymore")
} else if newCurrent != nil && s.current == nil {
logger.Info("Scheduler pod now available (there was none before)", zap.Object("scheduler", newCurrent))
} else /* newCurrent == nil && s.current == nil */ {
logger.Warn("No scheduler pod available (still)")
}
s.current = newCurrent
}
package agent
import (
"context"
"encoding/json"
"fmt"
"maps"
"slices"
"time"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
vmclient "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/watch"
)
type vmEvent struct {
kind vmEventKind
vmInfo api.VmInfo
podName string
podIP string
// if present, the ID of the endpoint associated with the VM. May be empty.
endpointID string
}
const (
endpointLabel = "neon/endpoint-id"
projectLabel = "neon/project-id"
)
// MarshalLogObject implements zapcore.ObjectMarshaler
func (ev vmEvent) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddString("kind", string(ev.kind))
enc.AddString("podName", ev.podName)
enc.AddString("podIP", ev.podIP)
enc.AddString("endpointID", ev.endpointID)
if err := enc.AddReflected("vmInfo", ev.vmInfo); err != nil {
return err
}
return nil
}
type vmEventKind string
const (
vmEventAdded vmEventKind = "added"
vmEventUpdated vmEventKind = "updated"
vmEventDeleted vmEventKind = "deleted"
)
// note: unlike startPodWatcher, we aren't able to use a field selector on VM status.node (currently; NeonVM v0.4.6)
func startVMWatcher(
ctx context.Context,
parentLogger *zap.Logger,
config *Config,
vmClient *vmclient.Clientset,
metrics watch.Metrics,
perVMMetrics *PerVMMetrics,
nodeName string,
submitEvent func(vmEvent),
) (*watch.Store[vmv1.VirtualMachine], error) {
logger := parentLogger.Named("vm-watch")
return watch.Watch(
ctx,
logger.Named("watch"),
vmClient.NeonvmV1().VirtualMachines(corev1.NamespaceAll),
watch.Config{
ObjectNameLogField: "virtualmachine",
Metrics: watch.MetricsConfig{
Metrics: metrics,
Instance: "VirtualMachines",
},
// We want to be relatively snappy; don't wait for too long before retrying.
RetryRelistAfter: util.NewTimeRange(time.Millisecond, 500, 1000),
RetryWatchAfter: util.NewTimeRange(time.Millisecond, 500, 1000),
},
watch.Accessors[*vmv1.VirtualMachineList, vmv1.VirtualMachine]{
Items: func(list *vmv1.VirtualMachineList) []vmv1.VirtualMachine { return list.Items },
},
watch.InitModeDefer,
metav1.ListOptions{},
watch.HandlerFuncs[*vmv1.VirtualMachine]{
AddFunc: func(vm *vmv1.VirtualMachine, preexisting bool) {
setVMMetrics(perVMMetrics, vm, nodeName)
if vmIsOurResponsibility(vm, config, nodeName) {
event, err := makeVMEvent(logger, vm, vmEventAdded)
if err != nil {
logger.Error(
"Failed to create vmEvent for added VM",
util.VMNameFields(vm), zap.Error(err),
)
return
}
submitEvent(event)
}
},
UpdateFunc: func(oldVM, newVM *vmv1.VirtualMachine) {
updateVMMetrics(perVMMetrics, oldVM, newVM, nodeName)
oldIsOurs := vmIsOurResponsibility(oldVM, config, nodeName)
newIsOurs := vmIsOurResponsibility(newVM, config, nodeName)
if !oldIsOurs && !newIsOurs {
return
}
var vmForEvent *vmv1.VirtualMachine
var eventKind vmEventKind
if !oldIsOurs && newIsOurs {
vmForEvent = newVM
eventKind = vmEventAdded
} else if oldIsOurs && !newIsOurs {
vmForEvent = oldVM
eventKind = vmEventDeleted
} else {
vmForEvent = newVM
eventKind = vmEventUpdated
}
event, err := makeVMEvent(logger, vmForEvent, eventKind)
if err != nil {
logger.Error(
"Failed to create vmEvent for updated VM",
util.VMNameFields(vmForEvent), zap.Error(err),
)
return
}
submitEvent(event)
},
DeleteFunc: func(vm *vmv1.VirtualMachine, maybeStale bool) {
deleteVMMetrics(perVMMetrics, vm, nodeName)
if vmIsOurResponsibility(vm, config, nodeName) {
event, err := makeVMEvent(logger, vm, vmEventDeleted)
if err != nil {
logger.Error(
"Failed to create vmEvent for deleted VM",
util.VMNameFields(vm), zap.Error(err),
)
return
}
submitEvent(event)
}
},
},
)
}
func makeVMEvent(logger *zap.Logger, vm *vmv1.VirtualMachine, kind vmEventKind) (vmEvent, error) {
info, err := api.ExtractVmInfo(logger, vm)
if err != nil {
return vmEvent{}, fmt.Errorf("Error extracting VM info: %w", err)
}
endpointID := ""
if vm.Labels != nil {
endpointID = vm.Labels[endpointLabel]
}
return vmEvent{
kind: kind,
vmInfo: *info,
podName: vm.Status.PodName,
podIP: vm.Status.PodIP,
endpointID: endpointID,
}, nil
}
// extractAutoscalingBounds extracts the ScalingBounds from a VM's autoscaling
// annotation, for the purpose of exposing it in per-VM metrics.
//
// We're not reusing api.ExtractVmInfo even though it also looks at the bounds
// annotation, because its data is less precise - CPU and memory values might
// come from the VM spec without us knowing.
func extractAutoscalingBounds(vm *vmv1.VirtualMachine) *api.ScalingBounds {
boundsJSON, ok := vm.Annotations[api.AnnotationAutoscalingBounds]
if !ok {
return nil
}
var bounds api.ScalingBounds
if err := json.Unmarshal([]byte(boundsJSON), &bounds); err != nil {
return nil
}
return &bounds
}
type pair[T1 any, T2 any] struct {
first T1
second T2
}
func makeVMMetric(vm *vmv1.VirtualMachine, valType vmResourceValueType, val float64) vmMetric {
endpointID := vm.Labels[endpointLabel]
projectID := vm.Labels[projectLabel]
labels := makePerVMMetricsLabels(vm.Namespace, vm.Name, endpointID, projectID, valType)
return vmMetric{
labels: labels,
value: val,
}
}
func makeVMCPUMetrics(vm *vmv1.VirtualMachine) []vmMetric {
var metrics []vmMetric
// metrics from spec
specPairs := []pair[vmResourceValueType, vmv1.MilliCPU]{
{vmResourceValueSpecMin, vm.Spec.Guest.CPUs.Min},
{vmResourceValueSpecMax, vm.Spec.Guest.CPUs.Max},
{vmResourceValueSpecUse, vm.Spec.Guest.CPUs.Use},
}
for _, p := range specPairs {
m := makeVMMetric(vm, p.first, p.second.AsFloat64())
metrics = append(metrics, m)
}
// metrics from status
if vm.Status.CPUs != nil {
m := makeVMMetric(vm, vmResourceValueStatusUse, vm.Status.CPUs.AsFloat64())
metrics = append(metrics, m)
}
// metrics from autoscaling bounds annotation
if bounds := extractAutoscalingBounds(vm); bounds != nil {
boundPairs := []pair[vmResourceValueType, resource.Quantity]{
{vmResourceValueAutoscalingMin, bounds.Min.CPU},
{vmResourceValueAutoscalingMax, bounds.Max.CPU},
}
for _, p := range boundPairs {
// avoid using resource.Quantity.AsApproximateFloat64() since it's quite inaccurate
m := makeVMMetric(vm, p.first, vmv1.MilliCPUFromResourceQuantity(p.second).AsFloat64())
metrics = append(metrics, m)
}
}
return metrics
}
func makeVMMemMetrics(vm *vmv1.VirtualMachine) []vmMetric {
var metrics []vmMetric
memorySlotsToBytes := func(m int32) int64 {
return vm.Spec.Guest.MemorySlotSize.Value() * int64(m)
}
// metrics from spec
specPairs := []pair[vmResourceValueType, int32]{
{vmResourceValueSpecMin, vm.Spec.Guest.MemorySlots.Min},
{vmResourceValueSpecMax, vm.Spec.Guest.MemorySlots.Max},
{vmResourceValueSpecUse, vm.Spec.Guest.MemorySlots.Use},
}
for _, p := range specPairs {
m := makeVMMetric(vm, p.first, float64(memorySlotsToBytes(p.second)))
metrics = append(metrics, m)
}
// metrics from status
if vm.Status.MemorySize != nil {
m := makeVMMetric(vm, vmResourceValueStatusUse, float64(vm.Status.MemorySize.Value()))
metrics = append(metrics, m)
}
// metrics from autoscaling bounds annotation
if bounds := extractAutoscalingBounds(vm); bounds != nil {
boundPairs := []pair[vmResourceValueType, resource.Quantity]{
{vmResourceValueAutoscalingMin, bounds.Min.Mem},
{vmResourceValueAutoscalingMax, bounds.Max.Mem},
}
for _, p := range boundPairs {
m := makeVMMetric(vm, p.first, float64(p.second.Value()))
metrics = append(metrics, m)
}
}
return metrics
}
// makeVMRestartMetrics makes metrics related to VM restarts. Currently, it
// only includes one metric: restartCount.
func makeVMRestartMetrics(vm *vmv1.VirtualMachine) []vmMetric {
endpointID := vm.Labels[endpointLabel]
projectID := vm.Labels[projectLabel]
labels := makePerVMMetricsLabels(vm.Namespace, vm.Name, endpointID, projectID, "")
return []vmMetric{
{
labels: labels,
value: float64(vm.Status.RestartCount),
},
}
}
func setVMMetrics(perVMMetrics *PerVMMetrics, vm *vmv1.VirtualMachine, nodeName string) {
if vm.Status.Node != nodeName {
return
}
cpuMetrics := makeVMCPUMetrics(vm)
for _, m := range cpuMetrics {
perVMMetrics.cpu.With(m.labels).Set(m.value)
}
memMetrics := makeVMMemMetrics(vm)
for _, m := range memMetrics {
perVMMetrics.memory.With(m.labels).Set(m.value)
}
restartCountMetrics := makeVMRestartMetrics(vm)
for _, m := range restartCountMetrics {
perVMMetrics.restartCount.With(m.labels).Set(m.value)
}
// Add the VM to the internal tracker:
perVMMetrics.updateActive(vm)
}
func updateVMMetrics(perVMMetrics *PerVMMetrics, oldVM, newVM *vmv1.VirtualMachine, nodeName string) {
if newVM.Status.Node != nodeName || oldVM.Status.Node != nodeName {
// In this case we don't need an in-place metric update. Either we just have
// to add the new metrics, or delete the old ones, or do nothing at all!
deleteVMMetrics(perVMMetrics, oldVM, nodeName)
setVMMetrics(perVMMetrics, newVM, nodeName)
return
}
updateMetrics := func(gauge *prometheus.GaugeVec, oldMetrics, newMetrics []vmMetric) {
for _, m := range oldMetrics {
// This is a linear search, but since we have a small number (~10) of
// different metrics for each VM, this should be fine.
ok := slices.ContainsFunc(newMetrics, func(vm vmMetric) bool {
return maps.Equal(m.labels, vm.labels)
})
if !ok {
gauge.Delete(m.labels)
}
}
for _, m := range newMetrics {
gauge.With(m.labels).Set(m.value)
}
}
oldCPUMetrics := makeVMCPUMetrics(oldVM)
newCPUMetrics := makeVMCPUMetrics(newVM)
updateMetrics(perVMMetrics.cpu, oldCPUMetrics, newCPUMetrics)
oldMemMetrics := makeVMMemMetrics(oldVM)
newMemMetrics := makeVMMemMetrics(newVM)
updateMetrics(perVMMetrics.memory, oldMemMetrics, newMemMetrics)
oldRestartCountMetrics := makeVMRestartMetrics(oldVM)
newRestartCountMetrics := makeVMRestartMetrics(newVM)
updateMetrics(perVMMetrics.restartCount, oldRestartCountMetrics, newRestartCountMetrics)
// Update the VM in the internal tracker:
perVMMetrics.updateActive(newVM) // note: don't need to clean up old one, because it's keyed by name
}
func deleteVMMetrics(perVMMetrics *PerVMMetrics, vm *vmv1.VirtualMachine, nodeName string) {
if vm.Status.Node != nodeName {
return
}
cpuMetrics := makeVMCPUMetrics(vm)
for _, m := range cpuMetrics {
perVMMetrics.cpu.Delete(m.labels)
}
memMetrics := makeVMMemMetrics(vm)
for _, m := range memMetrics {
perVMMetrics.memory.Delete(m.labels)
}
restartCountMetrics := makeVMRestartMetrics(vm)
for _, m := range restartCountMetrics {
perVMMetrics.restartCount.Delete(m.labels)
}
// Remove the VM from the internal tracker:
perVMMetrics.deleteActive(vm)
}
package api
import (
"encoding/json"
"errors"
"fmt"
"reflect"
"go.uber.org/zap/zapcore"
"k8s.io/apimachinery/pkg/api/resource"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/util"
)
/////////////////////////////////
// (Autoscaler) Agent Messages //
/////////////////////////////////
// PluginProtoVersion represents a single version of the agent<->scheduler plugin protocol
//
// Each version of the agent<->scheduler plugin protocol is named independently from releases of the
// repository containing this code. Names follow semver, although this does not necessarily
// guarantee support - for example, the plugin may only support a single version, even though others
// may appear to be semver-compatible.
type PluginProtoVersion uint32
const (
// PluginProtoV1_0 represents v1.0 of the agent<->scheduler plugin protocol - the initial
// version.
//
// Last used in release version v0.1.8.
PluginProtoV1_0 PluginProtoVersion = iota + 1 // start from 1 so that the zero value stays invalid (pre-versioned messages decode to 0)
// PluginProtoV1_1 represents v1.1 of the agent<->scheduler plugin protocol.
//
// Changes from v1.0:
//
// * Allows a nil value of the AgentRequest.Metrics field.
//
// Last used in release version v0.6.0.
PluginProtoV1_1
// PluginProtoV2_0 represents v2.0 of the agent<->scheduler plugin protocol.
//
// Changes from v1.1:
//
// * Supports fractional CPU
//
// Last used in release version v0.19.x.
PluginProtoV2_0
// PluginProtoV2_1 represents v2.1 of the agent<->scheduler plugin protocol.
//
// Changes from v2.0:
//
// * added AgentRequest.LastPermit
//
// Last used in release version v0.21.0.
PluginProtoV2_1
// PluginProtoV3_0 represents v3.0 of the agent<->scheduler plugin protocol.
//
// Changes from v2.1:
//
// * Removes PluginResponse.ComputeUnit (agent is now responsible for source of truth)
//
// Last used in release version v0.22.0.
PluginProtoV3_0
// PluginProtoV4_0 represents v4.0 of the agent<->scheduler plugin protocol.
//
// Changes from v3.0:
//
// * Memory quantities now use "number of bytes" instead of "number of memory slots"
// * Adds AgentRequest.ComputeUnit
//
// Last used in release version v0.27.0.
PluginProtoV4_0
// PluginProtoV5_0 represents v5.0 of the agent<->scheduler plugin protocol.
//
// Changes from v4.0:
//
// * Removed AgentRequest.metrics fields loadAvg5M and memoryUsageBytes
//
// Currently the latest version.
PluginProtoV5_0
// latestPluginProtoVersion represents the latest version of the agent<->scheduler plugin
// protocol
//
// This value is kept private because it should not be used externally; any desired
// functionality that could be implemented with it should instead be a method on
// PluginProtoVersion.
latestPluginProtoVersion PluginProtoVersion = iota // excluding +1 makes it equal to previous
)
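// With the iota arithmetic above, the wire values are PluginProtoV1_0 = 1 through
// PluginProtoV5_0 = 7, and latestPluginProtoVersion is equal to PluginProtoV5_0.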
func (v PluginProtoVersion) String() string {
var zero PluginProtoVersion
switch v {
case zero:
return "<invalid: zero>"
case PluginProtoV1_0:
return "v1.0"
case PluginProtoV1_1:
return "v1.1"
case PluginProtoV2_0:
return "v2.0"
case PluginProtoV2_1:
return "v2.1"
case PluginProtoV3_0:
return "v3.0"
case PluginProtoV4_0:
return "v4.0"
case PluginProtoV5_0:
return "v5.0"
default:
diff := v - latestPluginProtoVersion
return fmt.Sprintf("<unknown = %v + %d>", latestPluginProtoVersion, diff)
}
}
// IsValid returns whether the protocol version is valid. The zero value is not valid.
func (v PluginProtoVersion) IsValid() bool {
return uint(v) != 0
}
// AllowsNilMetrics returns whether this version of the protocol allows the autoscaler-agent to send
// a nil metrics field.
//
// This is true for version v1.1 and greater.
func (v PluginProtoVersion) AllowsNilMetrics() bool {
return v >= PluginProtoV1_1
}
func (v PluginProtoVersion) SupportsFractionalCPU() bool {
return v >= PluginProtoV2_0
}
// PluginSendsComputeUnit returns whether this version of the protocol expects the scheduler plugin
// to send the value of the Compute Unit in its PluginResponse.
//
// This is true for all versions below v3.0.
func (v PluginProtoVersion) PluginSendsComputeUnit() bool {
return v < PluginProtoV3_0
}
// AgentSendsComputeUnit returns whether this version of the protocol expects the autoscaler-agent
// to send the value of its configured Compute Unit in its AgentRequest.
//
// This is true for version v4.0 and greater.
func (v PluginProtoVersion) AgentSendsComputeUnit() bool {
return v >= PluginProtoV4_0
}
// RepresentsMemoryAsBytes returns whether this version of the protocol uses byte quantities to
// refer to memory amounts, rather than a number of memory slots.
//
// This is true for version v4.0 and greater.
func (v PluginProtoVersion) RepresentsMemoryAsBytes() bool {
return v >= PluginProtoV4_0
}
// IncludesExtendedMetrics returns whether this version of the protocol includes the AgentRequest's
// metrics loadAvg5M and memoryUsageBytes.
//
// This is true for all versions below v5.0.
func (v PluginProtoVersion) IncludesExtendedMetrics() bool {
return v < PluginProtoV5_0
}
// AgentRequest is the type of message sent from an autoscaler-agent to the scheduler plugin on
// behalf of a Pod on the agent's node.
//
// All AgentRequests expect a PluginResponse.
type AgentRequest struct {
// ProtoVersion is the version of the protocol that the autoscaler-agent is expecting to use
//
// If the scheduler does not support this version, then it will respond with a 400 status.
ProtoVersion PluginProtoVersion `json:"protoVersion"`
// Pod is the namespaced name of the Pod that the autoscaler-agent is making the request on
// behalf of.
Pod util.NamespacedName `json:"pod"`
// ComputeUnit gives the value of the agent's configured compute unit to use for the VM.
//
// If the requested resources are not a multiple of ComputeUnit, the scheduler plugin will make
// a best-effort attempt to return a value satisfying the request. Any approved increases will
// be a multiple of ComputeUnit, but otherwise the plugin does not check.
ComputeUnit Resources `json:"computeUnit"`
// Resources gives a requested or notified change in resources allocated to the VM.
//
// The requested amount MAY be equal to the current amount, in which case it serves as a
// notification that the VM should no longer be contributing to resource pressure.
//
// TODO: allow passing nil here if nothing's changed (i.e., the request would be the same as the
// previous request)
Resources Resources `json:"resources"`
// LastPermit indicates the last permit that the agent has received from the scheduler plugin.
// In case of a failure, the new running scheduler uses LastPermit to recover the previous state.
// LastPermit may be nil.
LastPermit *Resources `json:"lastPermit"`
// Metrics provides information about the VM's current load, so that the scheduler may
// prioritize which pods to migrate
//
// In some protocol versions, this field may be nil.
Metrics *Metrics `json:"metrics"`
}
// Metrics gives the information pulled from vector.dev that the scheduler may use to prioritize
// which pods it should migrate.
type Metrics struct {
LoadAverage1Min float32 `json:"loadAvg1M"`
// DEPRECATED. Will be removed in an upcoming release.
LoadAverage5Min *float32 `json:"loadAvg5M,omitempty"`
// DEPRECATED. Will be removed in an upcoming release.
MemoryUsageBytes *float32 `json:"memoryUsageBytes,omitempty"`
}
// ProtocolRange returns a VersionRange exactly equal to r.ProtoVersion
func (r AgentRequest) ProtocolRange() VersionRange[PluginProtoVersion] {
return VersionRange[PluginProtoVersion]{
Min: r.ProtoVersion,
Max: r.ProtoVersion,
}
}
// Bytes represents a number of bytes, with custom marshaling / unmarshaling that goes through
// resource.Quantity in order to have simplified values over wire
type Bytes uint64
// BytesFromResourceQuantity converts resource.Quantity into Bytes
func BytesFromResourceQuantity(r resource.Quantity) Bytes {
return Bytes(uint64(r.Value()))
}
// ToResourceQuantity converts a Bytes to resource.Quantity - typically used for formatting and/or
// serialization
func (b Bytes) ToResourceQuantity() *resource.Quantity {
return resource.NewQuantity(int64(b), resource.BinarySI)
}
// AsFloat64 converts a Bytes into float64 of the same amount
func (b Bytes) AsFloat64() float64 {
return float64(b)
}
func (b *Bytes) UnmarshalJSON(data []byte) error {
var quantity resource.Quantity
err := json.Unmarshal(data, &quantity)
if err != nil {
return err
}
*b = BytesFromResourceQuantity(quantity)
return nil
}
func (b Bytes) MarshalJSON() ([]byte, error) {
// To (temporarily) support multiple API versions, we should output smaller values as integers.
// Otherwise, resource.Quantity will always format as a string, which is incompatible with
// earlier versions of the agent<->scheduler plugin API.
if b < 1024 {
return json.Marshal(uint64(b))
}
return json.Marshal(b.ToResourceQuantity())
}
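// Illustrative round-trips: Bytes(512) marshals as the bare integer 512, while Bytes(2147483648)
// goes through resource.Quantity and marshals as "2Gi"; both forms unmarshal back to the same
// value via UnmarshalJSON above.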
func (b Bytes) Format(state fmt.State, verb rune) {
switch {
case verb == 'v' && state.Flag('#'):
//nolint:errcheck // can't do anything about the write error
state.Write([]byte(fmt.Sprintf("%v", uint64(b))))
default:
//nolint:errcheck // can't do anything about the write error
state.Write([]byte(b.ToResourceQuantity().String()))
}
}
// Resources represents an amount of CPU and memory
//
// When used in an AgentRequest, it represents the desired total amount of resources. When
// a resource is increasing, the autoscaler-agent "requests" the change to confirm that the
// resources are available. When decreasing, the autoscaler-agent is expected to use Resources to
// "notify" the scheduler -- i.e., the resource amount should have already been decreased. When
// a resource stays at the same amount, the associated AgentRequest serves to indicate that the
// autoscaler-agent is "satisfied" with its current resources, and should no longer contribute to
// any existing resource pressure.
//
// When used in a PluginResponse (as a Permit), the Resources serves to inform the
// autoscaler-agent of the amount it has been permitted to use, subject to node resource limits.
//
// In all cases, each resource type is considered separately from the others.
type Resources struct {
VCPU vmv1.MilliCPU `json:"vCPUs"`
// Mem gives the number of bytes of memory requested
Mem Bytes `json:"mem"`
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that Resources can be used with zap.Object
func (r Resources) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddString("vCPU", fmt.Sprintf("%v", r.VCPU))
enc.AddString("mem", fmt.Sprintf("%v", r.Mem))
return nil
}
// ValidateNonZero checks that neither of the Resources fields are equal to zero, returning an error
// if either is.
func (r Resources) ValidateNonZero() error {
if r.VCPU == 0 {
return errors.New("vCPUs must be non-zero")
} else if r.Mem == 0 {
return errors.New("mem must be non-zero")
}
return nil
}
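// CheckValuesAreReasonablySized checks that the resource values fall within sane bounds.
// Currently this only checks vCPU, requiring it to be between 0.05 and 512 cores (inclusive).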
func (r Resources) CheckValuesAreReasonablySized() error {
if r.VCPU < 50 {
return errors.New("VCPU is smaller than 0.05")
}
if r.VCPU > 512*1000 {
return errors.New("VCPU is bigger than 512")
}
return nil
}
// HasFieldGreaterThan returns true if and only if there is a field F where r.F > cmp.F
func (r Resources) HasFieldGreaterThan(cmp Resources) bool {
return r.VCPU > cmp.VCPU || r.Mem > cmp.Mem
}
// HasFieldLessThan returns true if and only if there is a field F where r.F < cmp.F
func (r Resources) HasFieldLessThan(cmp Resources) bool {
return cmp.HasFieldGreaterThan(r)
}
// Min returns a new Resources value with each field F as the minimum of r.F and cmp.F
func (r Resources) Min(cmp Resources) Resources {
return Resources{
VCPU: min(r.VCPU, cmp.VCPU),
Mem: min(r.Mem, cmp.Mem),
}
}
// Max returns a new Resources value with each field F as the maximum of r.F and cmp.F
func (r Resources) Max(cmp Resources) Resources {
return Resources{
VCPU: max(r.VCPU, cmp.VCPU),
Mem: max(r.Mem, cmp.Mem),
}
}
// Add returns the result of adding the two Resources
func (r Resources) Add(other Resources) Resources {
return Resources{
VCPU: r.VCPU + other.VCPU,
Mem: r.Mem + other.Mem,
}
}
// SaturatingSub returns the result of subtracting r - other, with values that *would* underflow
// instead set to zero.
func (r Resources) SaturatingSub(other Resources) Resources {
return Resources{
VCPU: util.SaturatingSub(r.VCPU, other.VCPU),
Mem: util.SaturatingSub(r.Mem, other.Mem),
}
}
// Mul returns the result of multiplying each resource by factor
func (r Resources) Mul(factor uint16) Resources {
return Resources{
VCPU: vmv1.MilliCPU(factor) * r.VCPU,
Mem: Bytes(factor) * r.Mem,
}
}
// DivResources divides r by other, returning the uint16 factor such that
// other.Mul(factor) is equal to the original resources.
//
// If r is not an integer multiple of other, then (0, false) will be returned.
func (r Resources) DivResources(other Resources) (uint16, bool) {
cpuFactor := uint16(r.VCPU / other.VCPU)
cpuOk := r.VCPU%other.VCPU == 0
memFactor := uint16(r.Mem / other.Mem)
memOk := r.Mem%other.Mem == 0
if !cpuOk || !memOk || cpuFactor != memFactor {
return 0, false
}
return cpuFactor, true // already known equal to memFactor
}
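// exampleDivResources is an illustrative sketch (values assumed, not used anywhere)
// of the intended semantics: a VM using 2 vCPU and 8 GiB, divided by a compute unit
// of 0.25 vCPU and 1 GiB, is exactly 8 compute units.
func exampleDivResources() (uint16, bool) {
	total := Resources{VCPU: 2000, Mem: 8 << 30} // 2 vCPU (in milli-CPU), 8 GiB
	unit := Resources{VCPU: 250, Mem: 1 << 30}   // 0.25 vCPU, 1 GiB
	return total.DivResources(unit)              // (8, true); non-integer multiples give (0, false)
}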
// AbsDiff returns a new Resources with each field F as the absolute value of the difference between
// r.F and cmp.F
func (r Resources) AbsDiff(cmp Resources) Resources {
return Resources{
VCPU: util.AbsDiff(r.VCPU, cmp.VCPU),
Mem: util.AbsDiff(r.Mem, cmp.Mem),
}
}
// IncreaseFrom returns a MoreResources with each field F true when r.F > old.F.
func (r Resources) IncreaseFrom(old Resources) MoreResources {
return MoreResources{
Cpu: r.VCPU > old.VCPU,
Memory: r.Mem > old.Mem,
}
}
// ConvertToAllocation produces the Allocation equivalent to these Resources
func (r Resources) ConvertToAllocation() Allocation {
return Allocation{
Cpu: r.VCPU.ToResourceQuantity().AsApproximateFloat64(),
Mem: uint64(r.Mem),
}
}
/////////////////////////////////
// (Scheduler) Plugin Messages //
/////////////////////////////////
type PluginResponse struct {
// Permit provides an upper bound on the resources that the VM is now allowed to consume
//
// If the request's Resources were less than or equal its current resources, then the Permit
// will exactly equal those resources. Otherwise, it may contain resource allocations anywhere
// between the current and requested resources, inclusive.
Permit Resources `json:"permit"`
// Migrate, if present, notifies the autoscaler-agent that its VM will be migrated away,
// alongside whatever other information may be useful.
Migrate *MigrateResponse `json:"migrate,omitempty"`
}
// MigrateResponse, when provided, is a notification to the autoscaler-agent that it will migrate
//
// After receiving a MigrateResponse, the autoscaler-agent MUST NOT change its resource allocation.
//
// TODO: fill this with more information as required
type MigrateResponse struct{}
// MoreResources holds the data associated with a MoreResourcesRequest
type MoreResources struct {
// Cpu is true if the vm-monitor is requesting more CPU
Cpu bool `json:"cpu"`
// Memory is true if the vm-monitor is requesting more memory
Memory bool `json:"memory"`
}
// Not returns the field-wise logical "not" of m
func (m MoreResources) Not() MoreResources {
return MoreResources{
Cpu: !m.Cpu,
Memory: !m.Memory,
}
}
// And returns the field-wise logical "and" of m and cmp
func (m MoreResources) And(cmp MoreResources) MoreResources {
return MoreResources{
Cpu: m.Cpu && cmp.Cpu,
Memory: m.Memory && cmp.Memory,
}
}
////////////////////////////////////
// Controller <-> Runner Messages //
////////////////////////////////////
// VCPUChange is used to notify the runner that its vCPU allocation has changed;
// the runner uses this info to adjust the QEMU cgroup.
type VCPUChange struct {
VCPUs vmv1.MilliCPU
}
// VCPUCgroup is used by the runner to reply to the controller;
// it represents the vCPU usage as controlled by the cgroup.
type VCPUCgroup struct {
VCPUs vmv1.MilliCPU
}
// this is a similar version type for controller <-> runner communications
// see PluginProtoVersion comment for details
type RunnerProtoVersion uint32
const (
RunnerProtoV1 RunnerProtoVersion = iota + 1
)
func (v RunnerProtoVersion) SupportsCgroupFractionalCPU() bool {
return v >= RunnerProtoV1
}
////////////////////////////////////
// Agent <-> Monitor Messages //
////////////////////////////////////
// Represents the resources that a VM has been granted
type Allocation struct {
// Number of vCPUs
Cpu float64 `json:"cpu"`
// Number of bytes
Mem uint64 `json:"mem"`
}
// ** Types sent by monitor **
// This type is sent to the agent as a way to request immediate upscale.
// Since the monitor cannot control whether the agent will choose to upscale the VM,
// it does not return anything. If an upscale is granted, the agent will notify
// the monitor via an UpscaleNotification.
type UpscaleRequest struct{}
// This type is sent to the agent to confirm that the monitor successfully upscaled, meaning
// it increased its filecache and/or cgroup memory limits. The agent does not
// need to respond.
type UpscaleConfirmation struct{}
// This type is sent to the agent to indicate if downscaling was successful. The
// agent does not need to respond.
type DownscaleResult struct {
Ok bool
Status string
}
// ** Types sent by agent **
// This type is sent to the monitor to inform it that it has been granted a greater
// allocation. Once the monitor is done applying this new allocation (i.e., increasing
// file cache size, cgroup memory limits) it should reply with an UpscaleConfirmation.
type UpscaleNotification struct {
Granted Allocation `json:"granted"`
}
// This type is sent to the monitor as a request to downscale its resource usage.
// Once the monitor has downscaled or failed to do so, it should respond with a
// DownscaleResult.
type DownscaleRequest struct {
Target Allocation `json:"target"`
}
// ** Types shared by agent and monitor **
// This type can be sent by either party whenever they receive a message they
// cannot deserialize properly.
type InvalidMessage struct {
Error string `json:"error"`
}
// This type can be sent by either party to signal that an error occurred carrying
// out the other party's request, for example, the monitor erroring while trying
// to downscale. The receiving party can then log the error or propagate it as they
// see fit.
type InternalError struct {
Error string `json:"error"`
}
// This type is sent as part of a bidirectional heartbeat between the monitor and
// agent. The check is initiated by the agent.
type HealthCheck struct{}
// This function is used to prepare a message for serialization. Any data passed
// to the monitor should be serialized with this function. As of protocol v1.0,
// the following types may be sent to the monitor, and thus passed in:
// - DownscaleRequest
// - UpscaleNotification
// - InvalidMessage
// - InternalError
// - HealthCheck
func SerializeMonitorMessage(content any, id uint64) ([]byte, error) {
// The final type that gets sent over the wire
type Bundle struct {
Content any `json:"content"`
Type string `json:"type"`
Id uint64 `json:"id"`
}
var typeStr string
switch content.(type) {
case DownscaleRequest:
typeStr = "DownscaleRequest"
case UpscaleNotification:
typeStr = "UpscaleNotification"
case InvalidMessage:
typeStr = "InvalidMessage"
case InternalError:
typeStr = "InternalError"
case HealthCheck:
typeStr = "HealthCheck"
default:
return nil, fmt.Errorf("unknown message type \"%s\"", reflect.TypeOf(content))
}
return json.Marshal(Bundle{
Content: content,
Type: typeStr,
Id: id,
})
}
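// exampleSerializeUpscale is an illustrative sketch (values assumed, not used
// anywhere) of preparing an agent->monitor message. The wire format produced is
// roughly: {"content":{"granted":{"cpu":1,"mem":4294967296}},"type":"UpscaleNotification","id":17}
func exampleSerializeUpscale() ([]byte, error) {
	msg := UpscaleNotification{Granted: Allocation{Cpu: 1, Mem: 4 << 30}}
	return SerializeMonitorMessage(msg, 17)
}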
// MonitorProtoVersion represents a single version of the agent<->monitor protocol
//
// Each version of the agent<->monitor protocol is named independently from releases of the
// repository containing this code. Names follow semver, although this does not necessarily
// guarantee support - for example, the monitor may only support versions above v1.1.
type MonitorProtoVersion uint32
const (
// MonitorProtoV1_0 represents v1.0 of the agent<->monitor protocol - the initial version.
//
// Currently the latest version.
MonitorProtoV1_0 = iota + 1
// latestMonitorProtoVersion represents the latest version of the agent<->Monitor protocol
//
// This value is kept private because it should not be used externally; any desired
// functionality that could be implemented with it should instead be a method on
// MonitorProtoVersion.
latestMonitorProtoVersion MonitorProtoVersion = iota // excluding +1 makes it equal to previous
)
func (v MonitorProtoVersion) String() string {
var zero MonitorProtoVersion
switch v {
case zero:
return "<invalid: zero>"
case MonitorProtoV1_0:
return "v1.0"
default:
diff := v - latestMonitorProtoVersion
return fmt.Sprintf("<unknown = %v + %d>", latestMonitorProtoVersion, diff)
}
}
// Sent back by the monitor after figuring out what protocol version we should use
type MonitorProtocolResponse struct {
// If `Error` is nil, contains the settled-on protocol version.
// Otherwise, will be set to 0 (MonitorProtoVersion's zero value).
Version MonitorProtoVersion `json:"version,omitempty"`
// Will be nil if no error occurred.
Error *string `json:"error,omitempty"`
}
package api
// Generic version handling
import (
"fmt"
"golang.org/x/exp/constraints"
)
// VersionRange is a helper type to represent a range of versions.
//
// The bounds are inclusive, representing all versions v with Min <= v <= Max.
//
// This type is sent directly to the monitor during the creation of a new
// Dispatcher as part of figuring out which protocol to use.
type VersionRange[V constraints.Ordered] struct {
Min V `json:"min"`
Max V `json:"max"`
}
func (r VersionRange[V]) String() string {
if r.Min == r.Max {
return fmt.Sprintf("%v", r.Min)
} else {
return fmt.Sprintf("%v to %v", r.Min, r.Max)
}
}
// LatestSharedVersion returns the latest version covered by both VersionRanges, if there is one.
//
// If either range is invalid, or no such version exists (i.e. the ranges are disjoint), then the
// returned values will be (0, false).
func (r VersionRange[V]) LatestSharedVersion(cmp VersionRange[V]) (_ V, ok bool) {
maxVersion := min(r.Max, cmp.Max)
minVersion := max(r.Min, cmp.Min)
if maxVersion >= minVersion {
return maxVersion, true
} else {
var v V
return v, false
}
}
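// exampleVersionNegotiation is an illustrative sketch (values assumed, not used
// anywhere): each side advertises its supported range and the newest version in
// the overlap wins.
func exampleVersionNegotiation() (uint32, bool) {
	ours := VersionRange[uint32]{Min: 1, Max: 3}
	theirs := VersionRange[uint32]{Min: 2, Max: 5}
	return ours.LatestSharedVersion(theirs) // (3, true); disjoint ranges yield (0, false)
}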
// API-relevant types extracted from NeonVM VMs
package api
import (
"encoding/json"
"errors"
"fmt"
"github.com/samber/lo"
"github.com/tychoish/fun/erc"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/util"
)
const (
LabelEnableAutoMigration = "autoscaling.neon.tech/auto-migration-enabled"
LabelTestingOnlyAlwaysMigrate = "autoscaling.neon.tech/testing-only-always-migrate"
LabelEnableAutoscaling = "autoscaling.neon.tech/enabled"
AnnotationAutoscalingBounds = "autoscaling.neon.tech/bounds"
AnnotationAutoscalingConfig = "autoscaling.neon.tech/config"
AnnotationAutoscalingUnit = "autoscaling.neon.tech/scaling-unit"
AnnotationBillingEndpointID = "autoscaling.neon.tech/billing-endpoint-id"
// For internal use only, between the autoscaler-agent and scheduler plugin:
InternalAnnotationResourcesRequested = "internal.autoscaling.neon.tech/resources-requested"
InternalAnnotationResourcesApproved = "internal.autoscaling.neon.tech/resources-approved"
)
func hasTrueLabel(obj metav1.ObjectMetaAccessor, labelName string) bool {
labels := obj.GetObjectMeta().GetLabels()
value, ok := labels[labelName]
return ok && value == "true"
}
// HasAutoscalingEnabled returns true iff the object has the label that enables autoscaling
func HasAutoscalingEnabled(obj metav1.ObjectMetaAccessor) bool {
return hasTrueLabel(obj, LabelEnableAutoscaling)
}
// HasAutoMigrationEnabled returns true iff the object has the label that enables "automatic"
// scheduler-triggered migration, and it's set to "true"
func HasAutoMigrationEnabled(obj metav1.ObjectMetaAccessor) bool {
return hasTrueLabel(obj, LabelEnableAutoMigration)
}
func HasAlwaysMigrateLabel(obj metav1.ObjectMetaAccessor) bool {
return hasTrueLabel(obj, LabelTestingOnlyAlwaysMigrate)
}
func extractAnnotationJSON[T any](obj metav1.ObjectMetaAccessor, annotation string) (*T, error) {
jsonString, ok := obj.GetObjectMeta().GetAnnotations()[annotation]
if !ok {
return nil, nil
}
var value T
if err := json.Unmarshal([]byte(jsonString), &value); err != nil {
return nil, fmt.Errorf("could not unmarshal %s annotation: %w", annotation, err)
}
return &value, nil
}
// ExtractScalingUnit returns the configured scaling unit (aka the "compute unit") for the object,
// based on the AnnotationAutoscalingUnit annotation.
func ExtractScalingUnit(obj metav1.ObjectMetaAccessor) (*Resources, error) {
return extractAnnotationJSON[Resources](obj, AnnotationAutoscalingUnit)
}
func ExtractRequestedScaling(obj metav1.ObjectMetaAccessor) (*Resources, error) {
return extractAnnotationJSON[Resources](obj, InternalAnnotationResourcesRequested)
}
func ExtractApprovedScaling(obj metav1.ObjectMetaAccessor) (*Resources, error) {
return extractAnnotationJSON[Resources](obj, InternalAnnotationResourcesApproved)
}
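// Illustrative usage sketch (names assumed, not used anywhere): given an object vm
// (e.g. a vmv1.VirtualMachine) carrying the scaling-unit annotation, the configured
// compute unit can be read back as:
//
//	unit, err := ExtractScalingUnit(&vm) // returns (nil, nil) if the annotation is absent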
// VmInfo is the subset of vmv1.VirtualMachineSpec that the scheduler plugin and autoscaler agent
// care about. It takes various labels and annotations into account, so certain fields might be
// different from what's strictly in the VirtualMachine object.
type VmInfo struct {
Name string `json:"name"`
Namespace string `json:"namespace"`
Cpu VmCpuInfo `json:"cpu"`
Mem VmMemInfo `json:"mem"`
Config VmConfig `json:"config"`
CurrentRevision *vmv1.RevisionWithTime `json:"currentRevision,omitempty"`
}
type VmCpuInfo struct {
Min vmv1.MilliCPU `json:"min"`
Max vmv1.MilliCPU `json:"max"`
Use vmv1.MilliCPU `json:"use"`
}
func NewVmCpuInfo(cpus vmv1.CPUs) VmCpuInfo {
return VmCpuInfo{
Min: cpus.Min,
Max: cpus.Max,
Use: cpus.Use,
}
}
type VmMemInfo struct {
// Min is the minimum number of memory slots available
Min uint16 `json:"min"`
// Max is the maximum number of memory slots available
Max uint16 `json:"max"`
// Use is the number of memory slots currently plugged in the VM
Use uint16 `json:"use"`
SlotSize Bytes `json:"slotSize"`
}
func NewVmMemInfo(memSlots vmv1.MemorySlots, memSlotSize resource.Quantity) VmMemInfo {
return VmMemInfo{
Min: uint16(memSlots.Min),
Max: uint16(memSlots.Max),
Use: uint16(memSlots.Use),
SlotSize: Bytes(memSlotSize.Value()),
}
}
// VmConfig stores the autoscaling-specific "extra" configuration derived from labels and
// annotations on the VM object.
//
// This is separate from the bounds information stored in VmInfo (even though that's also derived
// from annotations), because VmConfig is meant to store values that either qualitatively change the
// handling for a VM (e.g., AutoMigrationEnabled) or are expected to largely be the same for most VMs
// (e.g., ScalingConfig).
type VmConfig struct {
// AutoMigrationEnabled indicates to the scheduler plugin that it's allowed to trigger migration
// for this VM. This defaults to false because otherwise we might disrupt VMs that don't have
// adequate networking support to preserve connections across live migration.
AutoMigrationEnabled bool `json:"autoMigrationEnabled"`
// AlwaysMigrate is a test-only debugging flag that, if present in the VM's labels, will always
// prompt it to migrate, regardless of whether the VM actually *needs* to.
AlwaysMigrate bool `json:"alwaysMigrate"`
ScalingEnabled bool `json:"scalingEnabled"`
ScalingConfig *ScalingConfig `json:"scalingConfig,omitempty"`
}
// Using returns the Resources that this VmInfo says the VM is using
func (vm VmInfo) Using() Resources {
return Resources{
VCPU: vm.Cpu.Use,
Mem: vm.Mem.SlotSize * Bytes(vm.Mem.Use),
}
}
// SetUsing sets the values of vm.{Cpu,Mem}.Use to those provided by r
func (vm *VmInfo) SetUsing(r Resources) {
vm.Cpu.Use = r.VCPU
vm.Mem.Use = uint16(r.Mem / vm.Mem.SlotSize)
}
// Min returns the Resources representing the minimum amount this VmInfo says the VM must reserve
func (vm VmInfo) Min() Resources {
return Resources{
VCPU: vm.Cpu.Min,
Mem: vm.Mem.SlotSize * Bytes(vm.Mem.Min),
}
}
// Max returns the Resources representing the maximum amount this VmInfo says the VM may reserve
func (vm VmInfo) Max() Resources {
return Resources{
VCPU: vm.Cpu.Max,
Mem: vm.Mem.SlotSize * Bytes(vm.Mem.Max),
}
}
func (vm VmInfo) NamespacedName() util.NamespacedName {
return util.NamespacedName{Namespace: vm.Namespace, Name: vm.Name}
}
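// exampleVmInfoResources is an illustrative sketch (values assumed, not used
// anywhere) of how the slot-based memory fields translate into Resources: with a
// 1 GiB slot size and 4 slots in use, Using() reports 1 vCPU and 4 GiB of memory.
func exampleVmInfoResources() Resources {
	vm := VmInfo{
		Name:      "example-vm",
		Namespace: "default",
		Cpu:       VmCpuInfo{Min: 250, Max: 2000, Use: 1000},
		Mem:       VmMemInfo{Min: 1, Max: 8, Use: 4, SlotSize: Bytes(1 << 30)},
		Config: VmConfig{
			AutoMigrationEnabled: false,
			AlwaysMigrate:        false,
			ScalingEnabled:       true,
			ScalingConfig:        nil,
		},
		CurrentRevision: nil,
	}
	return vm.Using() // Resources{VCPU: 1000, Mem: 4 << 30}
}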
func ExtractVmInfo(logger *zap.Logger, vm *vmv1.VirtualMachine) (*VmInfo, error) {
logger = logger.With(util.VMNameFields(vm))
info, err := extractVmInfoGeneric(logger, vm.Name, vm, vm.Spec.Resources())
if err != nil {
return nil, fmt.Errorf("error extracting VM info: %w", err)
}
info.CurrentRevision = vm.Status.CurrentRevision
return info, nil
}
func ExtractVmInfoFromPod(logger *zap.Logger, pod *corev1.Pod) (*VmInfo, error) {
logger = logger.With(util.PodNameFields(pod))
resources, err := vmv1.VirtualMachineResourcesFromPod(pod)
if err != nil {
return nil, err
}
vmName := pod.Labels[vmv1.VirtualMachineNameLabel]
return extractVmInfoGeneric(logger, vmName, pod, *resources)
}
func extractVmInfoGeneric(
logger *zap.Logger,
vmName string,
obj metav1.ObjectMetaAccessor,
resources vmv1.VirtualMachineResources,
) (*VmInfo, error) {
cpuInfo := NewVmCpuInfo(resources.CPUs)
memInfo := NewVmMemInfo(resources.MemorySlots, resources.MemorySlotSize)
autoMigrationEnabled := HasAutoMigrationEnabled(obj)
scalingEnabled := HasAutoscalingEnabled(obj)
alwaysMigrate := HasAlwaysMigrateLabel(obj)
info := VmInfo{
Name: vmName,
Namespace: obj.GetObjectMeta().GetNamespace(),
Cpu: cpuInfo,
Mem: memInfo,
Config: VmConfig{
AutoMigrationEnabled: autoMigrationEnabled,
AlwaysMigrate: alwaysMigrate,
ScalingEnabled: scalingEnabled,
ScalingConfig: nil, // set below, maybe
},
CurrentRevision: nil, // set later, maybe
}
if boundsJSON, ok := obj.GetObjectMeta().GetAnnotations()[AnnotationAutoscalingBounds]; ok {
var bounds ScalingBounds
if err := json.Unmarshal([]byte(boundsJSON), &bounds); err != nil {
return nil, fmt.Errorf("Error unmarshaling annotation %q: %w", AnnotationAutoscalingBounds, err)
}
if err := bounds.Validate(&resources.MemorySlotSize); err != nil {
return nil, fmt.Errorf("Bad scaling bounds in annotation %q: %w", AnnotationAutoscalingBounds, err)
}
info.applyBounds(bounds)
}
if configJSON, ok := obj.GetObjectMeta().GetAnnotations()[AnnotationAutoscalingConfig]; ok {
var config ScalingConfig
if err := json.Unmarshal([]byte(configJSON), &config); err != nil {
return nil, fmt.Errorf("Error unmarshaling annotation %q: %w", AnnotationAutoscalingConfig, err)
}
if err := config.ValidateOverrides(); err != nil {
return nil, fmt.Errorf("Bad scaling config in annotation %q: %w", AnnotationAutoscalingConfig, err)
}
info.Config.ScalingConfig = &config
}
minResources := info.Min()
using := info.Using()
maxResources := info.Max()
// we can't do validation for resource.Quantity with kubebuilder
// so do it here
if err := minResources.CheckValuesAreReasonablySized(); err != nil {
return nil, fmt.Errorf("min resources are invalid: %w", err)
}
if err := maxResources.CheckValuesAreReasonablySized(); err != nil {
return nil, fmt.Errorf("max resources are invalid: %w", err)
}
// check: min <= max
if minResources.HasFieldGreaterThan(maxResources) {
return nil, fmt.Errorf("min resources %+v has field greater than maximum %+v", minResources, maxResources)
}
// check: min <= using <= max
if using.HasFieldLessThan(minResources) {
logger.Warn(
"Current usage has field less than minimum",
zap.Object("using", using), zap.Object("min", minResources),
)
} else if using.HasFieldGreaterThan(maxResources) {
logger.Warn(
"Current usage has field greater than maximum",
zap.Object("using", using), zap.Object("max", maxResources),
)
}
return &info, nil
}
func (vm VmInfo) EqualScalingBounds(cmp VmInfo) bool {
return vm.Min() == cmp.Min() && vm.Max() == cmp.Max()
}
func (vm *VmInfo) applyBounds(b ScalingBounds) {
vm.Cpu.Min = vmv1.MilliCPUFromResourceQuantity(b.Min.CPU)
vm.Cpu.Max = vmv1.MilliCPUFromResourceQuantity(b.Max.CPU)
// FIXME: this will be incorrect if b.{Min,Max}.Mem.Value() is greater than
// (2^16-1) * info.Mem.SlotSize.Value().
vm.Mem.Min = uint16(BytesFromResourceQuantity(b.Min.Mem) / vm.Mem.SlotSize)
vm.Mem.Max = uint16(BytesFromResourceQuantity(b.Max.Mem) / vm.Mem.SlotSize)
}
// ScalingBounds is the type that we deserialize from the "autoscaling.neon.tech/bounds" annotation
//
// Note that all fields (and sub-fields) are required to be set and non-zero; Validate checks this,
// since we can't express that requirement through kubebuilder markers alone.
type ScalingBounds struct {
Min ResourceBounds `json:"min"`
Max ResourceBounds `json:"max"`
}
type ResourceBounds struct {
CPU resource.Quantity `json:"cpu"`
Mem resource.Quantity `json:"mem"`
}
// Validate checks that the ScalingBounds are all reasonable values - all fields initialized and
// non-zero.
func (b ScalingBounds) Validate(memSlotSize *resource.Quantity) error {
ec := &erc.Collector{}
b.Min.validate(ec, ".min", memSlotSize)
b.Max.validate(ec, ".max", memSlotSize)
return ec.Resolve()
}
// TODO: This could be made better - see:
// https://github.com/neondatabase/autoscaling/pull/190#discussion_r1169405645
func (b ResourceBounds) validate(ec *erc.Collector, path string, memSlotSize *resource.Quantity) {
errAt := func(field string, err error) error {
return fmt.Errorf("error at %s%s: %w", path, field, err)
}
if b.CPU.IsZero() {
ec.Add(errAt(".cpu", errors.New("must be set to a non-zero value")))
}
if b.Mem.IsZero() || b.Mem.Value() < 0 {
ec.Add(errAt(".mem", errors.New("must be set to a value greater than zero")))
} else if b.Mem.Value()%memSlotSize.Value() != 0 {
ec.Add(errAt(".mem", fmt.Errorf("must be divisible by VM memory slot size %s", memSlotSize)))
}
}
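// As an illustrative sketch (values assumed), the bounds annotation is expected to look
// roughly like
//
//	autoscaling.neon.tech/bounds: '{"min": {"cpu": "250m", "mem": "1Gi"}, "max": {"cpu": "4", "mem": "16Gi"}}'
//
// where both mem values must be divisible by the VM's memory slot size.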
// ScalingConfig provides bits of configuration for how the autoscaler-agent makes scaling decisions
type ScalingConfig struct {
// LoadAverageFractionTarget sets the desired fraction of current CPU that the load average
// should be. For example, with a value of 0.7, we'd want load average to sit at 0.7 × CPU,
// scaling CPU to make this happen.
//
// When specifying the autoscaler-agent config, this field is required. For an individual VM, if
// this field is left out the settings will fall back on the global default.
LoadAverageFractionTarget *float64 `json:"loadAverageFractionTarget,omitempty"`
// MemoryUsageFractionTarget sets the maximum fraction of total memory that postgres allocations
// (MemoryUsage) must fit into. This doesn't count the LFC memory.
// This memory may also be viewed as "unreclaimable" (in contrast to, e.g., the page cache).
//
// For example, with a value of 0.75 on a 4GiB VM, we will try to upscale if the unreclaimable
// memory usage exceeds 3GiB.
//
// When specifying the autoscaler-agent config, this field is required. For an individual VM, if
// this field is left out the settings will fall back on the global default.
MemoryUsageFractionTarget *float64 `json:"memoryUsageFractionTarget,omitempty"`
// MemoryTotalFractionTarget sets the maximum fraction of total memory that postgres allocations
// PLUS LFC memory (MemoryUsage + MemoryCached) must fit into.
//
// Compared with MemoryUsageFractionTarget, this value can be set higher (e.g. 0.9 vs 0.75),
// because we can tolerate a higher fraction of consumption across both in-VM memory consumers.
MemoryTotalFractionTarget *float64 `json:"memoryTotalFractionTarget,omitempty"`
// EnableLFCMetrics, if true, enables fetching additional metrics about the Local File Cache
// (LFC) to provide as input to the scaling algorithm.
//
// When specifying the autoscaler-agent config, this field is required. False is a safe default.
// For an individual VM, if this field is left out the settings will fall back on the global
// default.
EnableLFCMetrics *bool `json:"enableLFCMetrics,omitempty"`
// LFCUseLargestWindow, if true, calculates goal LFC size only based on the largest available
// working set size window, instead of trying to allow downscaling at earlier opportunities.
//
// This is not fit for general use. It's meant as a temporary escape hatch to let us assess the
// upper bound of potential improvements to our LFC goal size heuristics.
LFCUseLargestWindow *bool `json:"lfcUseLargestWindow,omitempty"`
// LFCToMemoryRatio dictates the amount of memory in any given Compute Unit that will be
// allocated to the LFC. For example, if the LFC is sized at 75% of memory, then this value
// would be 0.75.
LFCToMemoryRatio *float64 `json:"lfcToMemoryRatio,omitempty"`
// LFCMinWaitBeforeDownscaleMinutes dictates the minimum duration we must wait before lowering
// the goal CU based on LFC working set size.
// For example, a value of 15 means we will not allow downscaling below the working set size
// over the past 15 minutes. This allows us to accommodate spiky workloads without flushing the
// cache every time.
LFCMinWaitBeforeDownscaleMinutes *int `json:"lfcMinWaitBeforeDownscaleMinutes,omitempty"`
// LFCWindowSizeMinutes dictates the minimum duration we must use during internal calculations
// of the rate of increase in LFC working set size.
LFCWindowSizeMinutes *int `json:"lfcWindowSizeMinutes,omitempty"`
// CPUStableZoneRatio is the ratio of the stable load zone size relative to load5.
// For example, a value of 0.25 means that stable zone will be load5±25%.
CPUStableZoneRatio *float64 `json:"cpuStableZoneRatio,omitempty"`
// CPUMixedZoneRatio is the ratio of the mixed load zone size relative to load5.
// Since the mixed zone starts after the stable zone, values CPUStableZoneRatio=0.25 and CPUMixedZoneRatio=0.15
// mean that the stable zone will be from 0.75*load5 to 1.25*load5, and the mixed zone will be
// from 0.6*load5 to 0.75*load5, and from 1.25*load5 to 1.4*load5.
CPUMixedZoneRatio *float64 `json:"cpuMixedZoneRatio,omitempty"`
}
// WithOverrides returns a new copy of defaults, where fields set in overrides replace the ones in
// defaults but all others remain the same.
//
// overrides may be nil; if so, this method just returns defaults.
func (defaults ScalingConfig) WithOverrides(overrides *ScalingConfig) ScalingConfig {
if overrides == nil {
return defaults
}
if overrides.LoadAverageFractionTarget != nil {
defaults.LoadAverageFractionTarget = lo.ToPtr(*overrides.LoadAverageFractionTarget)
}
if overrides.MemoryUsageFractionTarget != nil {
defaults.MemoryUsageFractionTarget = lo.ToPtr(*overrides.MemoryUsageFractionTarget)
}
if overrides.MemoryTotalFractionTarget != nil {
defaults.MemoryTotalFractionTarget = lo.ToPtr(*overrides.MemoryTotalFractionTarget)
}
if overrides.EnableLFCMetrics != nil {
defaults.EnableLFCMetrics = lo.ToPtr(*overrides.EnableLFCMetrics)
}
if overrides.LFCUseLargestWindow != nil {
defaults.LFCUseLargestWindow = lo.ToPtr(*overrides.LFCUseLargestWindow)
}
if overrides.LFCToMemoryRatio != nil {
defaults.LFCToMemoryRatio = lo.ToPtr(*overrides.LFCToMemoryRatio)
}
if overrides.LFCWindowSizeMinutes != nil {
defaults.LFCWindowSizeMinutes = lo.ToPtr(*overrides.LFCWindowSizeMinutes)
}
if overrides.LFCMinWaitBeforeDownscaleMinutes != nil {
defaults.LFCMinWaitBeforeDownscaleMinutes = lo.ToPtr(*overrides.LFCMinWaitBeforeDownscaleMinutes)
}
if overrides.CPUStableZoneRatio != nil {
defaults.CPUStableZoneRatio = lo.ToPtr(*overrides.CPUStableZoneRatio)
}
if overrides.CPUMixedZoneRatio != nil {
defaults.CPUMixedZoneRatio = lo.ToPtr(*overrides.CPUMixedZoneRatio)
}
return defaults
}
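// exampleScalingConfigOverride is an illustrative sketch (values assumed, not used
// anywhere) of the override semantics: only the fields set on the per-VM config
// replace the global defaults; all other fields keep their default values.
func exampleScalingConfigOverride(defaults ScalingConfig) ScalingConfig {
	perVM := &ScalingConfig{
		LoadAverageFractionTarget: lo.ToPtr(0.9), // override just this one field
	}
	return defaults.WithOverrides(perVM)
}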
// ValidateDefaults checks that the ScalingConfig is safe to use as default settings.
//
// This is more strict than ValidateOverrides, where some fields need not be specified.
// Refer to the comments on ScalingConfig for more - each field specifies whether it is required,
// and when.
func (c *ScalingConfig) ValidateDefaults() error {
return c.validate(true)
}
// ValidateOverrides checks that the ScalingConfig is safe to use to override preexisting settings.
//
// This is less strict than ValidateDefaults, because with ValidateOverrides even required fields
// are optional.
func (c *ScalingConfig) ValidateOverrides() error {
return c.validate(false)
}
func (c *ScalingConfig) validate(requireAll bool) error {
ec := &erc.Collector{}
// Check c.LoadAverageFractionTarget is between 0 and 2. We don't *strictly* need the upper
// bound, but it's a good safety check.
if c.LoadAverageFractionTarget != nil {
erc.Whenf(ec, *c.LoadAverageFractionTarget < 0.0, "%s must be set to value >= 0", ".loadAverageFractionTarget")
erc.Whenf(ec, *c.LoadAverageFractionTarget >= 2.0, "%s must be set to value < 2", ".loadAverageFractionTarget")
} else if requireAll {
ec.Add(fmt.Errorf("%s is a required field", ".loadAverageFractionTarget"))
}
// Make sure c.MemoryUsageFractionTarget is between 0 and 1
if c.MemoryUsageFractionTarget != nil {
erc.Whenf(ec, *c.MemoryUsageFractionTarget < 0.0, "%s must be set to value >= 0", ".memoryUsageFractionTarget")
erc.Whenf(ec, *c.MemoryUsageFractionTarget >= 1.0, "%s must be set to value < 1", ".memoryUsageFractionTarget")
} else if requireAll {
ec.Add(fmt.Errorf("%s is a required field", ".memoryUsageFractionTarget"))
}
// Make sure c.MemoryTotalFractionTarget is between 0 and 1
if c.MemoryTotalFractionTarget != nil {
erc.Whenf(ec, *c.MemoryTotalFractionTarget < 0.0, "%s must be set to value >= 0", ".memoryTotalFractionTarget")
erc.Whenf(ec, *c.MemoryTotalFractionTarget >= 1.0, "%s must be set to value < 1", ".memoryTotalFractionTarget")
} else if requireAll {
ec.Add(fmt.Errorf("%s is a required field", ".memoryTotalFractionTarget"))
}
if requireAll {
erc.Whenf(ec, c.EnableLFCMetrics == nil, "%s is a required field", ".enableLFCMetrics")
erc.Whenf(ec, c.LFCToMemoryRatio == nil, "%s is a required field", ".lfcToMemoryRatio")
erc.Whenf(ec, c.LFCWindowSizeMinutes == nil, "%s is a required field", ".lfcWindowSizeMinutes")
erc.Whenf(ec, c.LFCMinWaitBeforeDownscaleMinutes == nil, "%s is a required field", ".lfcMinWaitBeforeDownscaleMinutes")
erc.Whenf(ec, c.CPUStableZoneRatio == nil, "%s is a required field", ".cpuStableZoneRatio")
erc.Whenf(ec, c.CPUMixedZoneRatio == nil, "%s is a required field", ".cpuMixedZoneRatio")
}
// heads-up! some functions elsewhere depend on the concrete return type of this function.
return ec.Resolve()
}
package controllers
import (
"context"
"fmt"
"runtime/debug"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)
type catchPanicReconciler struct {
inner reconcile.Reconciler
}
func withCatchPanic(r reconcile.Reconciler) reconcile.Reconciler {
return &catchPanicReconciler{inner: r}
}
func (r *catchPanicReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) {
log := log.FromContext(ctx)
defer func() {
if v := recover(); v != nil {
err = fmt.Errorf("panicked with: %v", v)
log.Error(err, "Reconcile panicked", "stack", string(debug.Stack()))
}
}()
result, err = r.inner.Reconcile(ctx, req)
return
}
package controllers
import (
"bytes"
"context"
"crypto"
"crypto/ecdsa"
"crypto/elliptic"
"crypto/rand"
"crypto/x509"
"encoding/pem"
"fmt"
"reflect"
"time"
"github.com/cert-manager/cert-manager/pkg/apis/certmanager"
certv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1"
cmmeta "github.com/cert-manager/cert-manager/pkg/apis/meta/v1"
"github.com/cert-manager/cert-manager/pkg/util/pki"
"sigs.k8s.io/controller-runtime/pkg/log"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
func (r *VMReconciler) reconcileCertificateSecret(ctx context.Context, vm *vmv1.VirtualMachine) (*corev1.Secret, error) {
log := log.FromContext(ctx)
certSecret := &corev1.Secret{}
// Check if the TLS secret exists, if not start the creation routine.
err := r.Get(ctx, types.NamespacedName{Name: vm.Status.TLSSecretName, Namespace: vm.Namespace}, certSecret)
if err != nil && !apierrors.IsNotFound(err) {
log.Error(err, "Failed to get vm TLS secret")
return nil, err
}
certNotFound := false
if err != nil /* not found */ {
msg := fmt.Sprintf("VirtualMachine %s TLS secret %s not found", vm.Name, vm.Status.TLSSecretName)
r.Recorder.Event(vm, "Normal", "SigningCertificate", msg)
certNotFound = true
} else {
// check the certificate expiration
certs, err := pki.DecodeX509CertificateChainBytes(certSecret.Data[corev1.TLSCertKey])
if err != nil {
log.Error(err, "Failed to parse VM certificate")
return nil, err
}
renewAt := certs[0].NotAfter.Add(-vm.Spec.TLS.RenewBefore.Duration)
// if not yet due for renewal
if time.Now().Before(renewAt) {
// just in case they were left around due to a transient issue.
if err := r.cleanupTmpSecrets(ctx, vm); err != nil {
return nil, err
}
return certSecret, nil
}
msg := fmt.Sprintf("VirtualMachine %s TLS secret %s is due for renewal", vm.Name, vm.Status.TLSSecretName)
r.Recorder.Event(vm, "Normal", "SigningCertificate", msg)
}
// Check if the TLS private key temporary secret exists, if not create a new one
tmpKeySecret := &corev1.Secret{}
err = r.Get(ctx, types.NamespacedName{Name: fmt.Sprintf("%s-tmp", vm.Status.TLSSecretName), Namespace: vm.Namespace}, tmpKeySecret)
if err != nil && !apierrors.IsNotFound(err) {
log.Error(err, "Failed to get vm TLS secret")
return nil, err
} else if err != nil /* not found */ {
tmpKeySecret, err = r.createTlsTmpSecret(ctx, vm)
if err != nil {
return nil, err
}
}
key, err := pki.DecodePrivateKeyBytes(tmpKeySecret.Data[corev1.TLSPrivateKeyKey])
if err != nil {
log.Error(err, "Failed to decode TLS private key")
return nil, err
}
// Check if the TLS certificate already exists, if not create a new one
certificateReq := &certv1.CertificateRequest{}
err = r.Get(ctx, types.NamespacedName{Name: vm.Name, Namespace: vm.Namespace}, certificateReq)
if err != nil && !apierrors.IsNotFound(err) {
log.Error(err, "Failed to get vm CertificateRequest")
return nil, err
} else if err != nil /* not found */ {
certificateReq, err = r.createCertificateRequest(ctx, vm, key)
if err != nil {
return nil, err
}
}
if len(certificateReq.Status.Certificate) == 0 {
// we cannot yet update the cert secret.
// return it untouched.
return certSecret, nil
}
// we have a certificate and the corresponding private key
// create/update the proper TLS secret and delete the tmp secret
if certNotFound {
if err := r.createTlsSecret(ctx, vm, key, certificateReq); err != nil {
return nil, err
}
} else if !reflect.DeepEqual(certificateReq.Status.Certificate, certSecret.Data[corev1.TLSCertKey]) {
if err := r.updateTlsSecret(ctx, vm, key, certificateReq, certSecret); err != nil {
return nil, err
}
}
// we made a lot of changes to state.
// nil signals that we should re-schedule reconciliation with a refreshed state.
return nil, nil
}
func (r *VMReconciler) cleanupTmpSecrets(ctx context.Context, vm *vmv1.VirtualMachine) error {
log := log.FromContext(ctx)
// Check if the TLS private key temporary secret exists, if so, delete it.
tmpKeySecret := &corev1.Secret{}
err := r.Get(ctx, types.NamespacedName{Name: fmt.Sprintf("%s-tmp", vm.Status.TLSSecretName), Namespace: vm.Namespace}, tmpKeySecret)
if err != nil && !apierrors.IsNotFound(err) {
log.Error(err, "Failed to get vm TLS secret")
return err
} else if err == nil /* found */ {
if err := r.deleteTmpSecret(ctx, vm, tmpKeySecret); err != nil {
return err
}
}
// Check if the TLS certificate already exists, if so, delete it.
certificateReq := &certv1.CertificateRequest{}
err = r.Get(ctx, types.NamespacedName{Name: vm.Name, Namespace: vm.Namespace}, certificateReq)
if err != nil && !apierrors.IsNotFound(err) {
log.Error(err, "Failed to get vm CertificateRequest")
return err
} else if err == nil /* found */ {
if err := r.deleteCertRequest(ctx, vm, certificateReq); err != nil {
return err
}
}
return nil
}
func (r *VMReconciler) createTlsTmpSecret(ctx context.Context, vm *vmv1.VirtualMachine) (*corev1.Secret, error) {
log := log.FromContext(ctx)
// create a new key for this VM
key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
if err != nil {
log.Error(err, "Failed to generate TLS private key for VirtualMachine")
return nil, err
}
// Define the secret
tmpKeySecret, err := r.tmpKeySecretForVirtualMachine(vm, key)
if err != nil {
log.Error(err, "Failed to define new temporary TLS private key secret resource for VirtualMachine")
return nil, err
}
if err = r.Create(ctx, tmpKeySecret); err != nil {
log.Error(err, "Failed to create new temporary TLS private key secret", "Secret.Namespace", tmpKeySecret.Namespace, "Secret.Name", tmpKeySecret.Name)
return nil, err
}
log.Info("Virtual Machine temporary TLS private key secret was created", "Secret.Namespace", tmpKeySecret.Namespace, "Secret.Name", tmpKeySecret.Name)
msg := fmt.Sprintf("VirtualMachine %s created temporary TLS private key secret %s", vm.Name, tmpKeySecret.Name)
r.Recorder.Event(vm, "Normal", "Created", msg)
return tmpKeySecret, nil
}
func (r *VMReconciler) createCertificateRequest(ctx context.Context, vm *vmv1.VirtualMachine, key crypto.Signer) (*certv1.CertificateRequest, error) {
log := log.FromContext(ctx)
// Define a new cert req
certificateReq, err := r.certReqForVirtualMachine(vm, key)
if err != nil {
log.Error(err, "Failed to define new Certificate resource for VirtualMachine")
return nil, err
}
log.Info("Creating a new CertificateRequest", "CertificateRequest.Namespace", certificateReq.Namespace, "CertificateRequest.Name", certificateReq.Name)
if err = r.Create(ctx, certificateReq); err != nil {
log.Error(err, "Failed to create new Certificate", "CertificateRequest.Namespace", certificateReq.Namespace, "CertificateRequest.Name", certificateReq.Name)
return nil, err
}
log.Info("Runner CertificateRequest was created", "CertificateRequest.Namespace", certificateReq.Namespace, "CertificateRequest.Name", certificateReq.Name)
msg := fmt.Sprintf("VirtualMachine %s created CertificateRequest %s", vm.Name, certificateReq.Name)
r.Recorder.Event(vm, "Normal", "Created", msg)
return certificateReq, nil
}
func (r *VMReconciler) createTlsSecret(ctx context.Context, vm *vmv1.VirtualMachine, key crypto.Signer, certificateReq *certv1.CertificateRequest) error {
log := log.FromContext(ctx)
certSecret, err := r.certSecretForVirtualMachine(vm, key, certificateReq.Status.Certificate)
if err != nil {
log.Error(err, "Failed to define new TLS secret resource for VirtualMachine")
return err
}
if err = r.Create(ctx, certSecret); err != nil {
log.Error(err, "Failed to create new TLS secret", "Secret.Namespace", certSecret.Namespace, "Secret.Name", certSecret.Name)
return err
}
log.Info("Virtual Machine TLS secret was created", "Secret.Namespace", certSecret.Namespace, "Secret.Name", certSecret.Name)
msg := fmt.Sprintf("VirtualMachine %s created TLS secret %s", vm.Name, certSecret.Name)
r.Recorder.Event(vm, "Normal", "Created", msg)
return nil
}
func (r *VMReconciler) updateTlsSecret(ctx context.Context, vm *vmv1.VirtualMachine, key crypto.Signer, certificateReq *certv1.CertificateRequest, certSecret *corev1.Secret) error {
log := log.FromContext(ctx)
encodedKey, err := pki.EncodePrivateKey(key, certv1.PKCS1)
if err != nil {
return err
}
certSecret.Data[corev1.TLSPrivateKeyKey] = encodedKey
certSecret.Data[corev1.TLSCertKey] = certificateReq.Status.Certificate
if err = r.Update(ctx, certSecret); err != nil {
log.Error(err, "Failed to update new TLS secret", "Secret.Namespace", certSecret.Namespace, "Secret.Name", certSecret.Name)
return err
}
log.Info("Virtual Machine TLS secret was updated", "Secret.Namespace", certSecret.Namespace, "Secret.Name", certSecret.Name)
msg := fmt.Sprintf("VirtualMachine %s updated TLS secret %s", vm.Name, certSecret.Name)
r.Recorder.Event(vm, "Normal", "Updated", msg)
return nil
}
func (r *VMReconciler) deleteTmpSecret(ctx context.Context, vm *vmv1.VirtualMachine, tmpKeySecret *corev1.Secret) error {
log := log.FromContext(ctx)
err := r.Delete(ctx, tmpKeySecret)
if err != nil {
log.Info("Virtual Machine temporary TLS private key secret could not be deleted", "Secret.Namespace", tmpKeySecret.Namespace, "Secret.Name", tmpKeySecret.Name)
return err
}
msg := fmt.Sprintf("VirtualMachine %s temporary TLS private key secret %s was deleted", vm.Name, tmpKeySecret.Name)
r.Recorder.Event(vm, "Normal", "Deleted", msg)
return nil
}
func (r *VMReconciler) deleteCertRequest(ctx context.Context, vm *vmv1.VirtualMachine, certificateReq *certv1.CertificateRequest) error {
log := log.FromContext(ctx)
err := r.Delete(ctx, certificateReq)
if err != nil {
log.Info("Virtual Machine CertificateRequest could not be deleted", "CertificateRequest.Namespace", certificateReq.Namespace, "CertificateRequest.Name", certificateReq.Name)
return err
}
msg := fmt.Sprintf("VirtualMachine %s CertificateRequest %s was deleted", vm.Name, certificateReq.Name)
r.Recorder.Event(vm, "Normal", "Deleted", msg)
return nil
}
func certSpecCSR(vm *vmv1.VirtualMachine) (*x509.CertificateRequest, error) {
certSpec := certv1.CertificateSpec{
CommonName: vm.Spec.TLS.ServerName,
DNSNames: []string{vm.Spec.TLS.ServerName},
PrivateKey: &certv1.CertificatePrivateKey{
Algorithm: certv1.ECDSAKeyAlgorithm,
Encoding: certv1.PKCS1,
RotationPolicy: certv1.RotationPolicyAlways,
Size: 256,
},
Usages: certv1.DefaultKeyUsages(),
IsCA: false,
Duration: &metav1.Duration{Duration: vm.Spec.TLS.ExpireAfter.Duration},
RenewBefore: &metav1.Duration{Duration: vm.Spec.TLS.RenewBefore.Duration},
}
cert := &certv1.Certificate{
Spec: certSpec,
}
return pki.GenerateCSR(cert)
}
func tmpKeySecretSpec(
vm *vmv1.VirtualMachine,
key crypto.PrivateKey,
) (*corev1.Secret, error) {
encodedKey, err := pki.EncodePrivateKey(key, certv1.PKCS1)
if err != nil {
return nil, err
}
name := fmt.Sprintf("%s-tmp", vm.Status.TLSSecretName)
return &corev1.Secret{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: vm.Namespace,
},
Data: map[string][]byte{
corev1.TLSPrivateKeyKey: encodedKey,
},
}, nil
}
func certSecretSpec(
vm *vmv1.VirtualMachine,
key crypto.PrivateKey,
cert []byte,
) (*corev1.Secret, error) {
encodedKey, err := pki.EncodePrivateKey(key, certv1.PKCS1)
if err != nil {
return nil, err
}
return &corev1.Secret{
ObjectMeta: metav1.ObjectMeta{
Name: vm.Status.TLSSecretName,
Namespace: vm.Namespace,
},
Data: map[string][]byte{
corev1.TLSPrivateKeyKey: encodedKey,
corev1.TLSCertKey: cert,
},
Type: corev1.SecretTypeTLS,
}, nil
}
func certReqSpec(
vm *vmv1.VirtualMachine,
key crypto.Signer,
) (*certv1.CertificateRequest, error) {
issuer := cmmeta.ObjectReference{
Name: vm.Spec.TLS.CertificateIssuer,
Kind: "ClusterIssuer",
Group: certmanager.GroupName,
}
cr, err := certSpecCSR(vm)
if err != nil {
return nil, err
}
csrDER, err := x509.CreateCertificateRequest(rand.Reader, cr, key)
if err != nil {
return nil, err
}
csrPEM := bytes.NewBuffer([]byte{})
err = pem.Encode(csrPEM, &pem.Block{Type: "CERTIFICATE REQUEST", Bytes: csrDER, Headers: map[string]string{}})
if err != nil {
return nil, err
}
certSpec := certv1.CertificateRequestSpec{
Duration: &metav1.Duration{Duration: vm.Spec.TLS.ExpireAfter.Duration},
IssuerRef: issuer,
Request: csrPEM.Bytes(),
IsCA: false,
Usages: certv1.DefaultKeyUsages(),
}
return &certv1.CertificateRequest{
ObjectMeta: metav1.ObjectMeta{
Name: vm.Name,
Namespace: vm.Namespace,
},
Spec: certSpec,
}, nil
}
package failurelag
import (
"sync"
"time"
)
// Tracker accumulates failure events for a given key and determines if
// the key is degraded. The key becomes degraded if it receives only failures
// over a configurable pending period. Once a success event is received, the key
// is no longer considered degraded, and the pending period is reset.
type Tracker[T comparable] struct {
period time.Duration
pendingSince map[T]time.Time
degraded map[T]struct{}
degradeAt []degradeAt[T]
lock sync.Mutex
Now func() time.Time
}
type degradeAt[T comparable] struct {
ts time.Time
key T
}
func NewTracker[T comparable](period time.Duration) *Tracker[T] {
return &Tracker[T]{
period: period,
pendingSince: make(map[T]time.Time),
degraded: make(map[T]struct{}),
degradeAt: []degradeAt[T]{},
lock: sync.Mutex{},
Now: time.Now,
}
}
// forward processes all the degradeAt events that are now in the past.
func (t *Tracker[T]) forward(now time.Time) {
i := 0
for ; i < len(t.degradeAt); i++ {
event := t.degradeAt[i]
if event.ts.After(now) {
break
}
pendingSince, ok := t.pendingSince[event.key]
if !ok {
// There was a success event in between
continue
}
if event.ts.Sub(pendingSince) < t.period {
// There was a success, and another failure in between
// We will have another degradeAt event for this key in the future
continue
}
t.degraded[event.key] = struct{}{}
}
t.degradeAt = t.degradeAt[i:]
}
func (t *Tracker[T]) RecordSuccess(key T) {
t.lock.Lock()
defer t.lock.Unlock()
delete(t.degraded, key)
delete(t.pendingSince, key)
t.forward(t.Now())
}
func (t *Tracker[T]) RecordFailure(key T) {
t.lock.Lock()
defer t.lock.Unlock()
now := t.Now()
if _, ok := t.pendingSince[key]; !ok {
t.pendingSince[key] = now
}
t.degradeAt = append(t.degradeAt, degradeAt[T]{
ts: now.Add(t.period),
key: key,
})
t.forward(now)
}
func (t *Tracker[T]) DegradedCount() int {
t.lock.Lock()
defer t.lock.Unlock()
t.forward(t.Now())
return len(t.degraded)
}
func (t *Tracker[T]) Degraded() []T {
t.lock.Lock()
defer t.lock.Unlock()
t.forward(t.Now())
keys := make([]T, 0, len(t.degraded))
for k := range t.degraded {
keys = append(keys, k)
}
return keys
}
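// exampleTrackerUsage is an illustrative sketch (values assumed, not used anywhere)
// of the intended lifecycle: a key only becomes degraded after seeing nothing but
// failures for a full pending period, and a single success clears it again.
func exampleTrackerUsage() {
	t := NewTracker[string](5 * time.Minute)
	t.RecordFailure("vm-1") // starts the pending period for "vm-1"
	_ = t.DegradedCount()   // 0 for now: the pending period hasn't elapsed yet
	// If 5+ minutes pass with no RecordSuccess("vm-1") in between, DegradedCount()
	// becomes 1 and Degraded() includes "vm-1".
	t.RecordSuccess("vm-1") // clears both the degraded and pending state
}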
package controllers
import (
"context"
"fmt"
"time"
"github.com/go-logr/logr"
"github.com/prometheus/client_golang/prometheus"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/metrics"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"k8s.io/apimachinery/pkg/api/errors"
"github.com/neondatabase/autoscaling/pkg/neonvm/controllers/failurelag"
"github.com/neondatabase/autoscaling/pkg/util"
)
type ReconcilerMetrics struct {
failing *prometheus.GaugeVec
vmCreationToRunnerCreationTime prometheus.Histogram
runnerCreationToVMRunningTime prometheus.Histogram
vmCreationToVMRunningTime prometheus.Histogram
vmRestartCounts prometheus.Counter
reconcileDuration prometheus.HistogramVec
}
const OutcomeLabel = "outcome"
func MakeReconcilerMetrics() ReconcilerMetrics {
// Copied bucket values from controller runtime latency metric. We can
// adjust them in the future if needed.
buckets := []float64{
0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60,
}
m := ReconcilerMetrics{
failing: util.RegisterMetric(metrics.Registry, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "reconcile_failing_objects",
Help: "Number of objects that are failing to reconcile for each specific controller",
},
[]string{"controller", OutcomeLabel},
)),
vmCreationToRunnerCreationTime: util.RegisterMetric(metrics.Registry, prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "vm_creation_to_runner_creation_duration_seconds",
Help: "Time duration from VirtualMachine.CreationTimestamp to runner Pod.CreationTimestamp",
Buckets: buckets,
},
)),
runnerCreationToVMRunningTime: util.RegisterMetric(metrics.Registry, prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "vm_runner_creation_to_vm_running_duration_seconds",
Help: "Time duration from runner Pod.CreationTimestamp to the moment when VirtualMachine.Status.Phase becomes Running",
Buckets: buckets,
},
)),
vmCreationToVMRunningTime: util.RegisterMetric(metrics.Registry, prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "vm_creation_to_vm_running_duration_seconds",
Help: "Time duration from VirtualMachine.CreationTimeStamp to the moment when VirtualMachine.Status.Phase becomes Running",
Buckets: buckets,
},
)),
vmRestartCounts: util.RegisterMetric(metrics.Registry, prometheus.NewCounter(
prometheus.CounterOpts{
Name: "vm_restarts_count",
Help: "Total number of VM restarts across the cluster captured by VirtualMachine reconciler",
},
)),
reconcileDuration: *util.RegisterMetric(metrics.Registry, prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "reconcile_duration_seconds",
Help: "Time duration of reconciles",
Buckets: buckets,
}, []string{OutcomeLabel},
)),
}
return m
}
type ReconcileOutcome string
const (
SuccessOutcome ReconcileOutcome = "success"
FailureOutcome ReconcileOutcome = "failure"
ConflictOutcome ReconcileOutcome = "conflict"
)
func (m ReconcilerMetrics) ObserveReconcileDuration(
outcome ReconcileOutcome,
duration time.Duration,
) {
m.reconcileDuration.WithLabelValues(string(outcome)).Observe(duration.Seconds())
}
type wrappedReconciler struct {
ControllerName string
Reconciler reconcile.Reconciler
Metrics ReconcilerMetrics
refreshFailingInterval time.Duration
failing *failurelag.Tracker[client.ObjectKey]
conflicting *failurelag.Tracker[client.ObjectKey]
}
// ReconcilerWithMetrics is a Reconciler produced by WithMetrics that can return a snapshot of the
// state backing the metrics.
type ReconcilerWithMetrics interface {
reconcile.Reconciler
Snapshot() ReconcileSnapshot
FailingRefresher() FailingRefresher
}
// ReconcileSnapshot provides a glimpse into the current state of ongoing reconciles
//
// This type is (transitively) returned by the controller's "dump state" HTTP endpoint, and exists
// to allow us to get deeper information on the metrics - we can't expose information for every
// VirtualMachine into the metrics (it'd be too high cardinality), but we *can* make it available
// when requested.
type ReconcileSnapshot struct {
// ControllerName is the name of the controller: virtualmachine or virtualmachinemigration.
ControllerName string `json:"controllerName"`
// Failing is the list of objects currently failing to reconcile
Failing []string `json:"failing"`
// Conflicting is the list of objects currently failing to reconcile
// due to a conflict
Conflicting []string `json:"conflicting"`
}
// WithMetrics wraps a given Reconciler with metrics capabilities.
//
// The returned reconciler also provides a way to get a snapshot of the state of ongoing reconciles,
// to see the data backing the metrics.
func WithMetrics(
reconciler reconcile.Reconciler,
rm ReconcilerMetrics,
cntrlName string,
failurePendingPeriod time.Duration,
refreshFailingInterval time.Duration,
) ReconcilerWithMetrics {
return &wrappedReconciler{
Reconciler: reconciler,
Metrics: rm,
ControllerName: cntrlName,
failing: failurelag.NewTracker[client.ObjectKey](failurePendingPeriod),
conflicting: failurelag.NewTracker[client.ObjectKey](failurePendingPeriod),
refreshFailingInterval: refreshFailingInterval,
}
}
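// As an illustrative wiring sketch (names assumed), a controller would typically be
// wrapped and registered roughly like this:
//
//	wrapped := WithMetrics(withCatchPanic(reconciler), metrics, "virtualmachine",
//		failurePendingPeriod, refreshFailingInterval)
//	_ = mgr.Add(wrapped.FailingRefresher()) // mgr is the controller-runtime manager
//
// FailingRefresher satisfies manager.Runnable via its Start method below, so the
// failing-objects gauge keeps refreshing even when no reconciles are happening.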
func (d *wrappedReconciler) refreshFailing(
log logr.Logger,
outcome ReconcileOutcome,
tracker *failurelag.Tracker[client.ObjectKey],
) {
degraded := tracker.Degraded()
d.Metrics.failing.WithLabelValues(d.ControllerName, string(outcome)).
Set(float64(len(degraded)))
// Log each object on a separate line (even though we could just put them all on the same line)
// so that:
// 1. we avoid super long log lines (which can make log storage / querying unhappy), and
// 2. we can process it with Grafana Loki, which can't handle arrays
for _, obj := range degraded {
log.Info(
fmt.Sprintf("Currently failing to reconcile %v object", d.ControllerName),
"outcome", outcome,
"object", obj,
)
}
}
func (d *wrappedReconciler) runRefreshFailing(ctx context.Context) {
log := log.FromContext(ctx)
for {
select {
case <-ctx.Done():
return
case <-time.After(d.refreshFailingInterval):
d.refreshFailing(log, FailureOutcome, d.failing)
d.refreshFailing(log, ConflictOutcome, d.conflicting)
}
}
}
func (d *wrappedReconciler) FailingRefresher() FailingRefresher {
return FailingRefresher{r: d}
}
// FailingRefresher is a wrapper, which implements manager.Runnable
type FailingRefresher struct {
r *wrappedReconciler
}
func (f FailingRefresher) Start(ctx context.Context) error {
go f.r.runRefreshFailing(ctx)
return nil
}
func (d *wrappedReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log := log.FromContext(ctx)
now := time.Now()
res, err := d.Reconciler.Reconcile(ctx, req)
duration := time.Since(now)
outcome := SuccessOutcome
if err != nil {
if errors.IsConflict(err) {
outcome = ConflictOutcome
d.conflicting.RecordFailure(req.NamespacedName)
} else {
outcome = FailureOutcome
d.failing.RecordFailure(req.NamespacedName)
// If the VM is now getting non-conflict errors, it probably
// means transient conflicts have been resolved.
//
// Notably, the other way around is not true:
// if a VM is getting conflict errors, it doesn't mean
// non-conflict errors are resolved, as they are more
// likely to be persistent.
d.conflicting.RecordSuccess(req.NamespacedName)
}
log.Error(err, "Failed to reconcile VirtualMachine",
"duration", duration.String(), "outcome", outcome)
} else {
d.failing.RecordSuccess(req.NamespacedName)
d.conflicting.RecordSuccess(req.NamespacedName)
log.Info("Successful reconciliation", "duration", duration.String(), "requeueAfter", res.RequeueAfter)
}
d.Metrics.ObserveReconcileDuration(outcome, duration)
d.Metrics.failing.WithLabelValues(d.ControllerName,
string(FailureOutcome)).Set(float64(d.failing.DegradedCount()))
d.Metrics.failing.WithLabelValues(d.ControllerName,
string(ConflictOutcome)).Set(float64(d.conflicting.DegradedCount()))
return res, err
}
func toStringSlice(s []client.ObjectKey) []string {
keys := make([]string, 0, len(s))
for _, k := range s {
keys = append(keys, k.String())
}
return keys
}
func (r *wrappedReconciler) Snapshot() ReconcileSnapshot {
failing := toStringSlice(r.failing.Degraded())
conflicting := toStringSlice(r.conflicting.Degraded())
return ReconcileSnapshot{
ControllerName: r.ControllerName,
Failing: failing,
Conflicting: conflicting,
}
}
package controllers
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"time"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
)
func setRunnerCPULimits(ctx context.Context, vm *vmv1.VirtualMachine, cpu vmv1.MilliCPU) error {
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
url := fmt.Sprintf("http://%s:%d/cpu_change", vm.Status.PodIP, vm.Spec.RunnerPort)
update := api.VCPUChange{VCPUs: cpu}
data, err := json.Marshal(update)
if err != nil {
return err
}
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(data))
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
return fmt.Errorf("setRunnerCPULimits: unexpected status %s", resp.Status)
}
return nil
}
func getRunnerCPULimits(ctx context.Context, vm *vmv1.VirtualMachine) (*api.VCPUCgroup, error) {
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
url := fmt.Sprintf("http://%s:%d/cpu_current", vm.Status.PodIP, vm.Spec.RunnerPort)
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return nil, err
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
return nil, err
}
if resp.StatusCode != 200 {
return nil, fmt.Errorf("getRunnerCPULimits: unexpected status %s", resp.Status)
}
body, err := io.ReadAll(resp.Body)
defer resp.Body.Close()
if err != nil {
return nil, err
}
var result api.VCPUCgroup
err = json.Unmarshal(body, &result)
if err != nil {
return nil, err
}
return &result, nil
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controllers
import (
"context"
"crypto"
"crypto/ed25519"
"crypto/rand"
"encoding/base64"
"encoding/json"
"encoding/pem"
"errors"
"fmt"
"os"
"reflect"
"strconv"
"time"
certv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1"
nadapiv1 "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/apis/k8s.cni.cncf.io/v1"
"github.com/samber/lo"
"golang.org/x/crypto/ssh"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/log"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apiserver/pkg/storage/names"
"k8s.io/client-go/tools/record"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/neonvm/controllers/buildtag"
"github.com/neondatabase/autoscaling/pkg/neonvm/ipam"
"github.com/neondatabase/autoscaling/pkg/util/patch"
)
const (
virtualmachineFinalizer = "vm.neon.tech/finalizer"
)
// Definitions to manage status conditions
const (
// typeAvailableVirtualMachine represents the status of the Deployment reconciliation
typeAvailableVirtualMachine = "Available"
// typeDegradedVirtualMachine represents the status used when the custom resource is deleted and the finalizer operations must occur.
typeDegradedVirtualMachine = "Degraded"
)
const (
minSupportedRunnerVersion api.RunnerProtoVersion = api.RunnerProtoV1
maxSupportedRunnerVersion api.RunnerProtoVersion = api.RunnerProtoV1
)
// VMReconciler reconciles a VirtualMachine object
type VMReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
Config *ReconcilerConfig
IPAM *ipam.IPAM
Metrics ReconcilerMetrics `exhaustruct:"optional"`
}
// The following markers are used to generate the rules permissions (RBAC) on config/rbac using controller-gen
// when controller-gen (used by 'make generate') is executed.
// To know more about markers see: https://book.kubebuilder.io/reference/markers.html
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachines,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachines/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachines/finalizers,verbs=update
//+kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=pods/status,verbs=get;list;watch
//+kubebuilder:rbac:groups=vm.neon.tech,resources=ippools,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=vm.neon.tech,resources=ippools/finalizers,verbs=update
//+kubebuilder:rbac:groups=k8s.cni.cncf.io,resources=network-attachment-definitions,verbs=get;list;watch
//+kubebuilder:rbac:groups=cert-manager.io,resources=certificaterequests,verbs=get;list;watch;create;update;patch;delete
// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
// It is essential for the controller's reconciliation loop to be idempotent. By following the Operator
// pattern you will create Controllers which provide a reconcile function
// responsible for synchronizing resources until the desired state is reached on the cluster.
// Breaking this recommendation goes against the design principles of controller-runtime.
// and may lead to unforeseen consequences such as resources becoming stuck and requiring manual intervention.
// For further info:
// - About Operator Pattern: https://kubernetes.io/docs/concepts/extend-kubernetes/operator/
// - About Controllers: https://kubernetes.io/docs/concepts/architecture/controller/
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.13.0/pkg/reconcile
func (r *VMReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log := log.FromContext(ctx)
var vm vmv1.VirtualMachine
if err := r.Get(ctx, req.NamespacedName, &vm); err != nil {
// Error reading the object - requeue the request.
if notfound := client.IgnoreNotFound(err); notfound == nil {
log.Info("virtualmachine resource not found. Ignoring since object must be deleted")
return ctrl.Result{}, nil
}
log.Error(err, "Unable to fetch VirtualMachine")
return ctrl.Result{}, client.IgnoreNotFound(err)
}
// examine DeletionTimestamp to determine if object is under deletion
if vm.ObjectMeta.DeletionTimestamp.IsZero() {
// The object is not being deleted, so if it does not have our finalizer,
// then let's add the finalizer and update the object. This is equivalent to
// registering our finalizer.
if !controllerutil.ContainsFinalizer(&vm, virtualmachineFinalizer) {
log.Info("Adding Finalizer for VirtualMachine")
if ok := controllerutil.AddFinalizer(&vm, virtualmachineFinalizer); !ok {
log.Info("Failed to add finalizer from VirtualMachine")
return ctrl.Result{Requeue: true}, nil
}
if err := r.tryUpdateVM(ctx, &vm); err != nil {
log.Error(err, "Failed to update status about adding finalizer to VirtualMachine")
return ctrl.Result{}, err
}
return ctrl.Result{Requeue: true}, nil
}
} else {
// The object is being deleted
if controllerutil.ContainsFinalizer(&vm, virtualmachineFinalizer) {
// our finalizer is present, so lets handle any external dependency
log.Info("Performing Finalizer Operations for VirtualMachine before delete it")
if err := r.doFinalizerOperationsForVirtualMachine(ctx, &vm); err != nil {
log.Error(err, "Failed to perform finalizer operations for VirtualMachine")
return ctrl.Result{}, err
}
// remove our finalizer from the list and update it.
log.Info("Removing Finalizer for VirtualMachine after successfully perform the operations")
if ok := controllerutil.RemoveFinalizer(&vm, virtualmachineFinalizer); !ok {
log.Info("Failed to remove finalizer from VirtualMachine")
return ctrl.Result{Requeue: true}, nil
}
if err := r.tryUpdateVM(ctx, &vm); err != nil {
log.Error(err, "Failed to update status about removing finalizer from VirtualMachine")
return ctrl.Result{}, err
}
}
// Stop reconciliation as the item is being deleted
return ctrl.Result{}, nil
}
// examine for nil values that should be defaulted
// this is done for values that we eventually want to explicitly override to a default value
// in the kube-api storage.
{
changed := false
// examine targetArchitecture and set it to the default value if it is not set
if vm.Spec.TargetArchitecture == nil {
log.Info("Setting default target architecture", "default", vmv1.CPUArchitectureAMD64)
vm.Spec.TargetArchitecture = lo.ToPtr(vmv1.CPUArchitectureAMD64)
changed = true
}
// examine cpuScalingMode and set it to the default value if it is not set
if vm.Spec.CpuScalingMode == nil {
log.Info("Setting default CPU scaling mode", "default", r.Config.DefaultCPUScalingMode)
vm.Spec.CpuScalingMode = lo.ToPtr(r.Config.DefaultCPUScalingMode)
changed = true
}
if changed {
if err := r.tryUpdateVM(ctx, &vm); err != nil {
log.Error(err, "Failed to set default values for VirtualMachine")
return ctrl.Result{}, err
}
return ctrl.Result{Requeue: true}, nil
}
}
statusBefore := vm.Status.DeepCopy()
if err := r.doReconcile(ctx, &vm); err != nil {
r.Recorder.Eventf(&vm, corev1.EventTypeWarning, "Failed",
"Failed to reconcile (%s): %s", vm.Name, err)
if errors.Is(err, ipam.ErrAgain) {
return ctrl.Result{RequeueAfter: time.Second}, nil
}
return ctrl.Result{}, err
}
// If the status changed, try to update the object
if !DeepEqual(statusBefore, vm.Status) {
if err := r.Status().Update(ctx, &vm); err != nil {
log.Error(err, "Failed to update VirtualMachine status after reconcile loop",
"virtualmachine", vm.Name)
return ctrl.Result{}, err
}
}
// Only quickly requeue if we're scaling or migrating. Otherwise, we aren't expecting any
// changes from QEMU, and it's wasteful to repeatedly check.
requeueAfter := time.Second
if vm.Status.Phase == vmv1.VmPending || vm.Status.Phase == vmv1.VmRunning {
requeueAfter = 15 * time.Second
}
return ctrl.Result{RequeueAfter: requeueAfter}, nil
}
// doFinalizerOperationsForVirtualMachine will perform the required operations before deleting the CR.
func (r *VMReconciler) doFinalizerOperationsForVirtualMachine(ctx context.Context, vm *vmv1.VirtualMachine) error {
// Note: It is not recommended to use finalizers for the purpose of deleting resources which are
// created and managed during reconciliation. Such resources, like the Pod created by this reconcile,
// are defined as dependents of the custom resource. Note that we use ctrl.SetControllerReference
// to set the ownerRef, which means that the dependent Pod will be deleted by the Kubernetes API.
// More info: https://kubernetes.io/docs/tasks/administer-cluster/use-cascading-deletion/
log := log.FromContext(ctx)
// The following implementation will raise an event
r.Recorder.Event(vm, "Warning", "Deleting",
fmt.Sprintf("Custom Resource %s is being deleted from the namespace %s",
vm.Name,
vm.Namespace))
// Release overlay IP address
if vm.Spec.ExtraNetwork != nil {
ip, err := r.IPAM.ReleaseIP(ctx, types.NamespacedName{Name: vm.Name, Namespace: vm.Namespace})
if err != nil {
return fmt.Errorf("fail to release IP: %w", err)
}
message := fmt.Sprintf("Released IP %s", ip.String())
log.Info(message)
r.Recorder.Event(vm, "Normal", "OverlayNet", message)
}
return nil
}
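// getRunnerVersion extracts the runner protocol version from the pod's version label,
// returning 0 if the label is not present.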
func getRunnerVersion(pod *corev1.Pod) (api.RunnerProtoVersion, error) {
val, ok := pod.Labels[vmv1.RunnerPodVersionLabel]
if !ok {
return api.RunnerProtoVersion(0), nil
}
uintVal, err := strconv.ParseUint(val, 10, 32)
if err != nil {
return 0, fmt.Errorf("failed to parse label value as integer: %w", err)
}
return api.RunnerProtoVersion(uintVal), nil
}
func runnerVersionIsSupported(version api.RunnerProtoVersion) bool {
return version >= minSupportedRunnerVersion && version <= maxSupportedRunnerVersion
}
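// updateVMStatusCPU sets vm.Status.CPUs from the number of plugged CPUs and the runner pod's
// cgroup usage, logging any mismatch between the two and emitting an event whenever the status
// value changes.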
func (r *VMReconciler) updateVMStatusCPU(
ctx context.Context,
vm *vmv1.VirtualMachine,
vmRunner *corev1.Pod,
activeCPUs uint32,
cgroupUsage *api.VCPUCgroup,
) {
log := log.FromContext(ctx)
// We expect:
// - vm.Status.CPUs = cgroupUsage.VCPUs
// - vm.Status.CPUs.RoundUp() == activeCPUs
// Otherwise, we update the status.
var currentCPUUsage vmv1.MilliCPU
if cgroupUsage != nil {
if cgroupUsage.VCPUs.RoundedUp() != activeCPUs {
// This is not expected but it's fine. We only report the
// mismatch here and will resolve it in the next reconcile
// iterations by comparing these values to the spec CPU use
// and moving to the scaling phase.
log.Error(nil, "Mismatch in the number of VM's plugged CPUs and runner pod's cgroup vCPUs",
"VirtualMachine", vm.Name,
"Runner Pod", vmRunner.Name,
"plugged CPUs", activeCPUs,
"cgroup vCPUs", cgroupUsage.VCPUs)
}
currentCPUUsage = min(cgroupUsage.VCPUs, vmv1.MilliCPU(1000*activeCPUs))
} else {
currentCPUUsage = vmv1.MilliCPU(1000 * activeCPUs)
}
if vm.Status.CPUs == nil || *vm.Status.CPUs != currentCPUUsage {
vm.Status.CPUs = &currentCPUUsage
r.Recorder.Event(vm, "Normal", "CpuInfo",
fmt.Sprintf("VirtualMachine %s uses %v cpu cores",
vm.Name,
vm.Status.CPUs))
}
}
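// updateVMStatusMemory sets vm.Status.MemorySize from the memory size reported by QEMU,
// emitting an event whenever the status value changes.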
func (r *VMReconciler) updateVMStatusMemory(
vm *vmv1.VirtualMachine,
qmpMemorySize *resource.Quantity,
) {
if vm.Status.MemorySize == nil || !qmpMemorySize.Equal(*vm.Status.MemorySize) {
vm.Status.MemorySize = qmpMemorySize
r.Recorder.Event(vm, "Normal", "MemoryInfo",
fmt.Sprintf("VirtualMachine %s uses %v memory",
vm.Name,
vm.Status.MemorySize))
}
}
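// acquireOverlayIP allocates an overlay network IP for the VM through IPAM and records it in the
// VM status. It is a no-op if the extra network is disabled or an IP is already assigned.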
func (r *VMReconciler) acquireOverlayIP(ctx context.Context, vm *vmv1.VirtualMachine) error {
if vm.Spec.ExtraNetwork == nil || !vm.Spec.ExtraNetwork.Enable || len(vm.Status.ExtraNetIP) != 0 {
// If the VM has extra network disabled or already has an IP, do nothing.
return nil
}
log := log.FromContext(ctx)
ip, err := r.IPAM.AcquireIP(ctx, types.NamespacedName{Name: vm.Name, Namespace: vm.Namespace})
if err != nil {
return err
}
message := fmt.Sprintf("Acquired IP %s for overlay network interface", ip.String())
log.Info(message)
vm.Status.ExtraNetIP = ip.IP.String()
vm.Status.ExtraNetMask = fmt.Sprintf("%d.%d.%d.%d", ip.Mask[0], ip.Mask[1], ip.Mask[2], ip.Mask[3])
r.Recorder.Event(vm, "Normal", "OverlayNet", message)
return nil
}
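// doReconcile drives the VM state machine: it creates the runner pod (and SSH/TLS secrets) while
// Pending, keeps CPU/memory status in sync while Running, performs hotplug/cgroup changes while
// Scaling, and cleans up (optionally restarting) once the runner pod has Succeeded or Failed.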
func (r *VMReconciler) doReconcile(ctx context.Context, vm *vmv1.VirtualMachine) error {
log := log.FromContext(ctx)
// Let's check and just set the condition status as Unknown when no conditions are available
if len(vm.Status.Conditions) == 0 {
// set Unknown condition status for AvailableVirtualMachine
meta.SetStatusCondition(&vm.Status.Conditions, metav1.Condition{Type: typeAvailableVirtualMachine, Status: metav1.ConditionUnknown, Reason: "Reconciling", Message: "Starting reconciliation"})
}
// NB: .Spec.EnableSSH guaranteed non-nil because the k8s API server sets the default for us.
enableSSH := *vm.Spec.EnableSSH
// Generate ssh secret name
if enableSSH && len(vm.Status.SSHSecretName) == 0 {
vm.Status.SSHSecretName = fmt.Sprintf("ssh-neonvm-%s", vm.Name)
}
enableTLS := vm.Spec.TLS != nil
// Generate tls secret name
if enableTLS && len(vm.Status.TLSSecretName) == 0 {
vm.Status.TLSSecretName = fmt.Sprintf("tls-neonvm-%s", vm.Name)
}
// check if the certificate needs renewal for this running VM.
if enableTLS {
certSecret, err := r.reconcileCertificateSecret(ctx, vm)
if err != nil {
return err
}
// VM is not ready to start yet.
if certSecret == nil {
return nil
}
}
switch vm.Status.Phase {
case "":
if err := r.acquireOverlayIP(ctx, vm); err != nil {
if errors.Is(err, ipam.ErrAgain) {
// We are being rate limited by IPAM, let's try again later.
return err
}
log.Error(err, "Failed to acquire overlay IP", "VirtualMachine", vm.Name)
r.Recorder.Event(vm, "Warning", "OverlayNet", "Failed to acquire overlay IP")
return err
}
// VirtualMachine just created, change Phase to "Pending"
vm.Status.Phase = vmv1.VmPending
case vmv1.VmPending:
// Generate the runner pod name and validate the memory size.
if len(vm.Status.PodName) == 0 {
vm.Status.PodName = names.SimpleNameGenerator.GenerateName(fmt.Sprintf("%s-", vm.Name))
if err := vm.Spec.Guest.ValidateMemorySize(); err != nil {
return fmt.Errorf("Failed to validate memory size for VM: %w", err)
}
// Update the .Status on API Server to avoid creating multiple pods for a single VM
// See https://github.com/neondatabase/autoscaling/issues/794 for the context
if err := r.Status().Update(ctx, vm); err != nil {
return fmt.Errorf("Failed to update VirtualMachine status: %w", err)
}
}
// Check if the runner pod already exists, if not create a new one
vmRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: vm.Status.PodName, Namespace: vm.Namespace}, vmRunner)
if err != nil && apierrors.IsNotFound(err) {
var sshSecret *corev1.Secret
if enableSSH {
// Check if the ssh secret already exists, if not create a new one
sshSecret = &corev1.Secret{}
err := r.Get(ctx, types.NamespacedName{
Name: vm.Status.SSHSecretName,
Namespace: vm.Namespace,
}, sshSecret)
if err != nil && apierrors.IsNotFound(err) {
// Define a new ssh secret
sshSecret, err = r.sshSecretForVirtualMachine(vm)
if err != nil {
log.Error(err, "Failed to define new SSH Secret for VirtualMachine")
return err
}
log.Info("Creating a new SSH Secret", "Secret.Namespace", sshSecret.Namespace, "Secret.Name", sshSecret.Name)
if err = r.Create(ctx, sshSecret); err != nil {
log.Error(err, "Failed to create new SSH secret", "Secret.Namespace", sshSecret.Namespace, "Secret.Name", sshSecret.Name)
return err
}
log.Info("SSH Secret was created", "Secret.Namespace", sshSecret.Namespace, "Secret.Name", sshSecret.Name)
} else if err != nil {
log.Error(err, "Failed to get SSH Secret")
return err
}
}
// Define a new pod
pod, err := r.podForVirtualMachine(vm, sshSecret)
if err != nil {
log.Error(err, "Failed to define new Pod resource for VirtualMachine")
return err
}
log.Info("Creating a new Pod", "Pod.Namespace", pod.Namespace, "Pod.Name", pod.Name)
if err = r.Create(ctx, pod); err != nil {
log.Error(err, "Failed to create new Pod", "Pod.Namespace", pod.Namespace, "Pod.Name", pod.Name)
return err
}
log.Info("Runner Pod was created", "Pod.Namespace", pod.Namespace, "Pod.Name", pod.Name)
msg := fmt.Sprintf("VirtualMachine %s created, Pod %s", vm.Name, pod.Name)
if sshSecret != nil {
msg = fmt.Sprintf("%s, SSH Secret %s", msg, sshSecret.Name)
}
r.Recorder.Event(vm, "Normal", "Created", msg)
if !vm.HasRestarted() {
d := pod.CreationTimestamp.Time.Sub(vm.CreationTimestamp.Time)
r.Metrics.vmCreationToRunnerCreationTime.Observe(d.Seconds())
}
} else if err != nil {
log.Error(err, "Failed to get vm-runner Pod")
return err
}
// Update the metadata (including "usage" annotation) before anything else, so that it
// will be correctly set even if the rest of the reconcile operation fails.
if err := updatePodMetadataIfNecessary(ctx, r.Client, vm, vmRunner); err != nil {
log.Error(err, "Failed to sync pod labels and annotations", "VirtualMachine", vm.Name)
}
// runner pod found, check phase
switch runnerStatus(vmRunner) {
case runnerRunning:
vm.Status.PodIP = vmRunner.Status.PodIP
vm.Status.Phase = vmv1.VmRunning
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{
Type: typeAvailableVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) created successfully", vm.Status.PodName, vm.Name),
})
{
// Calculating VM startup latency metrics
now := time.Now()
d := now.Sub(vmRunner.CreationTimestamp.Time)
r.Metrics.runnerCreationToVMRunningTime.Observe(d.Seconds())
if !vm.HasRestarted() {
d := now.Sub(vm.CreationTimestamp.Time)
r.Metrics.vmCreationToVMRunningTime.Observe(d.Seconds())
log.Info("VM creation to VM running time", "duration(sec)", d.Seconds())
}
}
case runnerSucceeded:
vm.Status.Phase = vmv1.VmSucceeded
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{
Type: typeAvailableVirtualMachine,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) succeeded", vm.Status.PodName, vm.Name),
})
case runnerFailed:
vm.Status.Phase = vmv1.VmFailed
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{
Type: typeDegradedVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) failed", vm.Status.PodName, vm.Name),
})
default:
// do nothing
}
case vmv1.VmRunning:
// Check if the runner pod exists
vmRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: vm.Status.PodName, Namespace: vm.Namespace}, vmRunner)
if err != nil && apierrors.IsNotFound(err) {
// lost runner pod for running VirtualMachine ?
r.Recorder.Event(vm, "Warning", "NotFound",
fmt.Sprintf("runner pod %s not found",
vm.Status.PodName))
vm.Status.Phase = vmv1.VmFailed
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{
Type: typeDegradedVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) not found", vm.Status.PodName, vm.Name),
})
} else if err != nil {
log.Error(err, "Failed to get runner Pod")
return err
}
// Update the metadata (including "usage" annotation) before anything else, so that it
// will be correctly set even if the rest of the reconcile operation fails.
if err := updatePodMetadataIfNecessary(ctx, r.Client, vm, vmRunner); err != nil {
log.Error(err, "Failed to sync pod labels and annotations", "VirtualMachine", vm.Name)
}
// runner pod found, check/update phase now
switch runnerStatus(vmRunner) {
case runnerRunning:
// update status by IP of runner pod
vm.Status.PodIP = vmRunner.Status.PodIP
// update phase
vm.Status.Phase = vmv1.VmRunning
// update Node name where runner working
vm.Status.Node = vmRunner.Spec.NodeName
runnerVersion, err := getRunnerVersion(vmRunner)
if err != nil {
log.Error(err, "Failed to get runner version of VM runner pod", "VirtualMachine", vm.Name)
return err
}
if !runnerVersionIsSupported(runnerVersion) {
err := fmt.Errorf("runner version %v is not supported", runnerVersion)
log.Error(err, "VM runner pod has unsupported version", "VirtualMachine", vm.Name)
return err
}
// get cgroups CPU details from runner pod
cgroupUsage, err := getRunnerCPULimits(ctx, vm)
if err != nil {
log.Error(err, "Failed to get CPU details from runner", "VirtualMachine", vm.Name)
return err
}
var pluggedCPU uint32
if vm.Spec.CpuScalingMode == nil { // should not happen
err := fmt.Errorf("CPU scaling mode is not set")
log.Error(err, "Unknown CPU scaling mode", "VirtualMachine", vm.Name)
return err
}
switch *vm.Spec.CpuScalingMode {
case vmv1.CpuScalingModeSysfs:
pluggedCPU = cgroupUsage.VCPUs.RoundedUp()
case vmv1.CpuScalingModeQMP:
cpuSlotsPlugged, _, err := QmpGetCpus(QmpAddr(vm))
if err != nil {
log.Error(err, "Failed to get CPU details from VirtualMachine", "VirtualMachine", vm.Name)
return err
}
pluggedCPU = uint32(len(cpuSlotsPlugged))
default:
err := fmt.Errorf("unsupported CPU scaling mode: %s", *vm.Spec.CpuScalingMode)
log.Error(err, "Unknown CPU scaling mode", "VirtualMachine", vm.Name, "CPU scaling mode", *vm.Spec.CpuScalingMode)
return err
}
// update status by CPUs used in the VM
r.updateVMStatusCPU(ctx, vm, vmRunner, pluggedCPU, cgroupUsage)
// get Memory details from hypervisor and update VM status
memorySize, err := QmpGetMemorySize(QmpAddr(vm))
if err != nil {
log.Error(err, "Failed to get Memory details from VirtualMachine", "VirtualMachine", vm.Name)
return err
}
// update status by memory sizes used in the VM
r.updateVMStatusMemory(vm, memorySize)
// check if need hotplug/unplug CPU or memory
// compare guest spec and count of plugged
specUseCPU := vm.Spec.Guest.CPUs.Use
scaleCgroupCPU := specUseCPU != cgroupUsage.VCPUs
scaleQemuCPU := specUseCPU.RoundedUp() != pluggedCPU
if scaleCgroupCPU || scaleQemuCPU {
log.Info("VM goes into scaling mode, CPU count needs to be changed",
"CPUs on runner pod cgroup", cgroupUsage.VCPUs,
"CPUs on board", pluggedCPU,
"CPUs in spec", vm.Spec.Guest.CPUs.Use)
vm.Status.Phase = vmv1.VmScaling
}
memorySizeFromSpec := resource.NewQuantity(int64(vm.Spec.Guest.MemorySlots.Use)*vm.Spec.Guest.MemorySlotSize.Value(), resource.BinarySI)
if !memorySize.Equal(*memorySizeFromSpec) {
log.Info("VM goes into scale mode, need to resize Memory",
"Memory on board", memorySize,
"Memory in spec", memorySizeFromSpec)
vm.Status.Phase = vmv1.VmScaling
}
case runnerSucceeded:
vm.Status.Phase = vmv1.VmSucceeded
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{
Type: typeAvailableVirtualMachine,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) succeeded", vm.Status.PodName, vm.Name),
})
case runnerFailed:
vm.Status.Phase = vmv1.VmFailed
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{
Type: typeDegradedVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) failed", vm.Status.PodName, vm.Name),
})
default:
// do nothing
}
case vmv1.VmScaling:
// Check that runner pod is still ok
vmRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: vm.Status.PodName, Namespace: vm.Namespace}, vmRunner)
if err != nil && apierrors.IsNotFound(err) {
// lost runner pod for running VirtualMachine ?
r.Recorder.Event(vm, "Warning", "NotFound",
fmt.Sprintf("runner pod %s not found",
vm.Status.PodName))
vm.Status.Phase = vmv1.VmFailed
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{
Type: typeDegradedVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) not found", vm.Status.PodName, vm.Name),
})
} else if err != nil {
log.Error(err, "Failed to get runner Pod")
return err
}
// Update the metadata (including "usage" annotation) before anything else, so that it
// will be correctly set even if the rest of the reconcile operation fails.
if err := updatePodMetadataIfNecessary(ctx, r.Client, vm, vmRunner); err != nil {
log.Error(err, "Failed to sync pod labels and annotations", "VirtualMachine", vm.Name)
}
// runner pod found, check that it's still up:
switch runnerStatus(vmRunner) {
case runnerSucceeded:
vm.Status.Phase = vmv1.VmSucceeded
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{
Type: typeAvailableVirtualMachine,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) succeeded", vm.Status.PodName, vm.Name),
})
return nil
case runnerFailed:
vm.Status.Phase = vmv1.VmFailed
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{
Type: typeDegradedVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) failed", vm.Status.PodName, vm.Name),
})
return nil
default:
// do nothing
}
runnerVersion, err := getRunnerVersion(vmRunner)
if err != nil {
log.Error(err, "Failed to get runner version of VM runner pod", "VirtualMachine", vm.Name)
return err
}
if !runnerVersionIsSupported(runnerVersion) {
err := fmt.Errorf("runner version %v is not supported", runnerVersion)
log.Error(err, "VM runner pod has unsupported version", "VirtualMachine", vm.Name)
return err
}
cpuScaled, err := r.handleCPUScaling(ctx, vm, vmRunner)
if err != nil {
log.Error(err, "failed to handle CPU scaling")
return err
}
ramScaled := false
// do hotplug/unplug Memory
ramScaled, err = r.doVirtioMemScaling(vm)
if err != nil {
return err
}
// set VM phase to running if everything scaled
if cpuScaled && ramScaled {
vm.Status.Phase = vmv1.VmRunning
}
case vmv1.VmSucceeded, vmv1.VmFailed:
// Always delete runner pod. Otherwise, we could end up with one container succeeded/failed
// but the other one still running (meaning that the pod still ends up Running).
vmRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: vm.Status.PodName, Namespace: vm.Namespace}, vmRunner)
if err == nil {
// delete current runner
if err := r.deleteRunnerPodIfEnabled(ctx, vm, vmRunner); err != nil {
return err
}
} else if !apierrors.IsNotFound(err) {
return err
}
// By default, we cleanup the VM, even if previous pod still exists. This behavior is for the case
// where the pod is stuck deleting, and we want to progress without waiting for it.
//
// However, this opens up a possibility for cascading failures where the pods would be constantly
// recreated, and then stuck deleting. That's why we have AtMostOnePod.
if !r.Config.AtMostOnePod || apierrors.IsNotFound(err) {
// NB: Cleanup() leaves status .Phase and .RestartCount (+ some others) but unsets other fields.
vm.Cleanup()
var shouldRestart bool
switch vm.Spec.RestartPolicy {
case vmv1.RestartPolicyAlways:
shouldRestart = true
case vmv1.RestartPolicyOnFailure:
shouldRestart = vm.Status.Phase == vmv1.VmFailed
case vmv1.RestartPolicyNever:
shouldRestart = false
}
if shouldRestart {
log.Info("Restarting VM runner pod", "VM.Phase", vm.Status.Phase, "RestartPolicy", vm.Spec.RestartPolicy)
vm.Status.Phase = vmv1.VmPending // reset to trigger restart
vm.Status.RestartCount += 1 // increment restart count
r.Metrics.vmRestartCounts.Inc()
}
// TODO for RestartPolicyNever: implement TTL or do nothing
}
default:
// do nothing
}
// Propagate TargetRevision to CurrentRevision. This is done only if the VM is fully
// reconciled and running.
if vm.Status.Phase == vmv1.VmRunning {
propagateRevision(vm)
}
return nil
}
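// propagateRevision copies Spec.TargetRevision into Status.CurrentRevision (with the current
// timestamp) if it hasn't been propagated yet.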
func propagateRevision(vm *vmv1.VirtualMachine) {
if vm.Spec.TargetRevision == nil {
return
}
if vm.Status.CurrentRevision != nil &&
vm.Status.CurrentRevision.Revision == vm.Spec.TargetRevision.Revision {
return
}
rev := vm.Spec.TargetRevision.WithTime(time.Now())
vm.Status.CurrentRevision = &rev
}
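// doVirtioMemScaling requests the virtio-mem size implied by the spec's memory slots via QMP,
// updates the memory status, and reports whether the VM's total memory already matches the goal.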
func (r *VMReconciler) doVirtioMemScaling(vm *vmv1.VirtualMachine) (done bool, _ error) {
targetSlotCount := int(vm.Spec.Guest.MemorySlots.Use - vm.Spec.Guest.MemorySlots.Min)
targetVirtioMemSize := int64(targetSlotCount) * vm.Spec.Guest.MemorySlotSize.Value()
previousTarget, err := QmpSetVirtioMem(vm, targetVirtioMemSize)
if err != nil {
return false, err
}
goalTotalSize := resource.NewQuantity(
int64(vm.Spec.Guest.MemorySlots.Use)*vm.Spec.Guest.MemorySlotSize.Value(),
resource.BinarySI,
)
if previousTarget != targetVirtioMemSize {
// We changed the requested size. Make an event for it.
reason := "ScaleUp"
if targetVirtioMemSize < previousTarget {
reason = "ScaleDown"
}
r.Recorder.Eventf(vm, "Normal", reason, "Set virtio-mem size for %v total memory", goalTotalSize)
}
// Maybe we're already using the amount we want?
// Update the status to reflect the current size - and if it matches goalTotalSize, ram
// scaling is done.
currentTotalSize, err := QmpGetMemorySize(QmpAddr(vm))
if err != nil {
return false, err
}
done = currentTotalSize.Value() == goalTotalSize.Value()
r.updateVMStatusMemory(vm, currentTotalSize)
return done, nil
}
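// runnerStatusKind is the controller's coarse-grained view of the runner pod's state, as
// determined by runnerStatus below.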
type runnerStatusKind string
const (
runnerPending runnerStatusKind = "Pending"
runnerRunning runnerStatusKind = "Running"
runnerFailed runnerStatusKind = "Failed"
runnerSucceeded runnerStatusKind = "Succeeded"
)
// runnerStatus returns a description of the status of the VM inside the runner pod.
//
// This is *similar* to the value of pod.Status.Phase, but we'd like to retain our own abstraction
// to have more control over the semantics.
// We handle PodRunning phase differently during VM Migration phase.
func runnerStatus(pod *corev1.Pod) runnerStatusKind {
// Allow 5 seconds of slack to account for clock skew and k8s lagging behind.
deadline := metav1.NewTime(metav1.Now().Add(-5 * time.Second))
// If the pod has been marked for deletion for longer than that, we consider it failed. The
// deletion might be stalled because the node is shutting down, or the pod is stuck pulling an image.
if pod.DeletionTimestamp != nil && pod.DeletionTimestamp.Before(&deadline) {
return runnerFailed
}
switch pod.Status.Phase {
case "", corev1.PodPending:
return runnerPending
case corev1.PodSucceeded:
return runnerSucceeded
case corev1.PodFailed:
return runnerFailed
case corev1.PodRunning:
return runnerContainerStatus(pod)
default:
panic(fmt.Errorf("unknown pod phase: %q", pod.Status.Phase))
}
}
const (
runnerContainerName = "neonvm-runner"
)
// runnerContainerStatus returns status of the runner container.
func runnerContainerStatus(pod *corev1.Pod) runnerStatusKind {
// if the pod has no container statuses, we consider it pending
if len(pod.Status.ContainerStatuses) == 0 {
return runnerPending
}
_, role, ownedByMigration := vmv1.MigrationOwnerForPod(pod)
// if this is the target pod of a migration, we consider the pod running
// because qemu is started in incoming migration mode
// and neonvm-daemon, which is used by the readiness probe, is not available
if ownedByMigration && role == vmv1.MigrationRoleTarget {
return runnerRunning
}
// normal case: the pod is not owned by a migration,
// so we check the neonvm-runner container for readiness
for _, c := range pod.Status.ContainerStatuses {
// we only care about the neonvm-runner container
if c.Name == runnerContainerName && !c.Ready {
return runnerPending
}
}
return runnerRunning
}
// deleteRunnerPodIfEnabled deletes the runner pod if buildtag.NeverDeleteRunnerPods is false, and
// then emits an event and log line describing what it did, i.e. whether it actually deleted the runner pod.
func (r *VMReconciler) deleteRunnerPodIfEnabled(
ctx context.Context,
vm *vmv1.VirtualMachine,
runner *corev1.Pod,
) error {
log := log.FromContext(ctx)
var msg, eventReason string
if buildtag.NeverDeleteRunnerPods {
msg = fmt.Sprintf("VM runner pod deletion was skipped due to '%s' build tag", buildtag.TagnameNeverDeleteRunnerPods)
eventReason = "DeleteSkipped"
} else {
// delete current runner
if err := r.Delete(ctx, runner); err != nil {
return err
}
msg = "VM runner pod was deleted"
eventReason = "Deleted"
}
log.Info(msg, "Pod.Namespace", runner.Namespace, "Pod.Name", runner.Name)
r.Recorder.Event(vm, "Normal", eventReason, fmt.Sprintf("%s: %s", msg, runner.Name))
return nil
}
// updates the values of the runner pod's labels and annotations so that they are exactly equal to
// the set of labels/annotations we expect - minus some that are ignored.
//
// The reason we also need to delete unrecognized labels/annotations is so that if a
// label/annotation on the VM itself is deleted, we can accurately reflect that in the pod.
func updatePodMetadataIfNecessary(ctx context.Context, c client.Client, vm *vmv1.VirtualMachine, runnerPod *corev1.Pod) error {
log := log.FromContext(ctx)
var patches []patch.Operation
metaSpecs := []struct {
metaField string
expected map[string]string
actual map[string]string
ignoreExtra map[string]bool // use bool here so `if ignoreExtra[key] { ... }` works
}{
{
metaField: "labels",
expected: labelsForVirtualMachine(vm, nil), // don't include runner version
actual: runnerPod.Labels,
ignoreExtra: map[string]bool{
// Don't override the runner pod version - we need to keep it around without
// changing it; otherwise it's not useful!
vmv1.RunnerPodVersionLabel: true,
},
},
{
metaField: "annotations",
expected: annotationsForVirtualMachine(vm),
actual: runnerPod.Annotations,
ignoreExtra: map[string]bool{
"k8s.v1.cni.cncf.io/networks": true,
"k8s.v1.cni.cncf.io/network-status": true,
"k8s.v1.cni.cncf.io/networks-status": true,
},
},
}
var removedMessageParts []string
for _, spec := range metaSpecs {
// Add/update the entries we're expecting to be there
for k, e := range spec.expected {
if a, ok := spec.actual[k]; !ok || e != a {
patches = append(patches, patch.Operation{
// From RFC 6902 (JSON patch):
//
// > The "add" operation performs one of the following functions, depending upon
// > what the target location references:
// >
// > [ ... ]
// >
// > * If the target location specifies an object member that does not already
// > exist, a new member is added to the object.
// > * If the target location specifies an object member that does exist, that
// > member's value is replaced.
//
// So: if the value is missing we'll add it. And if it's different, we'll replace it.
Op: patch.OpAdd,
Path: fmt.Sprintf("/metadata/%s/%s", spec.metaField, patch.PathEscape(k)),
Value: e,
})
}
}
// Remove the entries we aren't expecting to be there
var removed []string
for k := range spec.actual {
if _, expected := spec.expected[k]; !expected && !spec.ignoreExtra[k] {
removed = append(removed, k)
patches = append(patches, patch.Operation{
Op: patch.OpRemove,
Path: fmt.Sprintf("/metadata/%s/%s", spec.metaField, patch.PathEscape(k)),
})
}
}
if len(removed) != 0 {
// note: formatting with %q for a []string will print the array normally, but escape the
// strings inside. For example:
//
// fmt.Printf("%q\n", []string{"foo", "bar", "escaped\nstring"})
//
// outputs:
//
// ["foo" "bar" "escaped\nstring"]
//
// So the "message part" might look like `labels ["foo" "test-label"]`
removedMessageParts = append(removedMessageParts, fmt.Sprintf("%s %q", spec.metaField, removed))
}
}
if len(patches) == 0 {
return nil
}
patchData, err := json.Marshal(patches)
if err != nil {
panic(fmt.Errorf("error marshalling JSON patch: %w", err))
}
if len(removedMessageParts) != 0 {
var msg string
if len(removedMessageParts) == 1 {
msg = fmt.Sprintf("removing runner pod %s", removedMessageParts[0])
} else /* len = 2 */ {
msg = fmt.Sprintf("removing runner pod %s and %s", removedMessageParts[0], removedMessageParts[1])
}
// We want to log something when labels/annotations are removed, because the ignoreExtra
// values above might be incomplete, and it'd be hard to debug without any logs for the
// change.
log.Info(msg, "VirtualMachine", vm.Name, "Pod", runnerPod.Name)
}
// NOTE: We don't need to update the data in runnerPod ourselves because c.Patch will update it
// with what we get back from the k8s API after the patch completes.
return c.Patch(ctx, runnerPod, client.RawPatch(types.JSONPatchType, patchData))
}
func extractVirtualMachineUsageJSON(spec vmv1.VirtualMachineSpec) string {
cpu := spec.Guest.CPUs.Use
memorySlots := spec.Guest.MemorySlots.Use
usage := vmv1.VirtualMachineUsage{
CPU: cpu.ToResourceQuantity(),
Memory: resource.NewQuantity(spec.Guest.MemorySlotSize.Value()*int64(memorySlots), resource.BinarySI),
}
usageJSON, err := json.Marshal(usage)
if err != nil {
panic(fmt.Errorf("error marshalling JSON: %w", err))
}
return string(usageJSON)
}
func extractVirtualMachineResourcesJSON(spec vmv1.VirtualMachineSpec) string {
resourcesJSON, err := json.Marshal(spec.Resources())
if err != nil {
panic(fmt.Errorf("error marshalling JSON: %w", err))
}
return string(resourcesJSON)
}
// podForVirtualMachine returns a VirtualMachine Pod object
func (r *VMReconciler) podForVirtualMachine(
vm *vmv1.VirtualMachine,
sshSecret *corev1.Secret,
) (*corev1.Pod, error) {
pod, err := podSpec(vm, sshSecret, r.Config)
if err != nil {
return nil, err
}
// Set the ownerRef for the Pod
// More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/owners-dependents/
if err := ctrl.SetControllerReference(vm, pod, r.Scheme); err != nil {
return nil, err
}
return pod, nil
}
func (r *VMReconciler) sshSecretForVirtualMachine(vm *vmv1.VirtualMachine) (*corev1.Secret, error) {
secret, err := sshSecretSpec(vm)
if err != nil {
return nil, err
}
// Set the ownerRef for the Secret
// More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/owners-dependents/
if err := ctrl.SetControllerReference(vm, secret, r.Scheme); err != nil {
return nil, err
}
return secret, nil
}
func sshSecretSpec(vm *vmv1.VirtualMachine) (*corev1.Secret, error) {
// using ed25519 signatures it takes ~16us to finish
publicKey, privateKey, err := sshKeygen()
if err != nil {
return nil, err
}
secret := &corev1.Secret{
ObjectMeta: metav1.ObjectMeta{
Name: vm.Status.SSHSecretName,
Namespace: vm.Namespace,
},
Immutable: lo.ToPtr(true),
Type: corev1.SecretTypeSSHAuth,
Data: map[string][]byte{
"ssh-publickey": publicKey,
"ssh-privatekey": privateKey,
},
}
return secret, nil
}
// certReqForVirtualMachine returns a VirtualMachine CertificateRequest object
func (r *VMReconciler) certReqForVirtualMachine(
vm *vmv1.VirtualMachine,
key crypto.Signer,
) (*certv1.CertificateRequest, error) {
cert, err := certReqSpec(vm, key)
if err != nil {
return nil, err
}
// Set the ownerRef for the Certificate
// More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/owners-dependents/
if err := ctrl.SetControllerReference(vm, cert, r.Scheme); err != nil {
return nil, err
}
return cert, nil
}
// tmpKeySecretForVirtualMachine returns a VirtualMachine Secret object for temporarily storing the key
func (r *VMReconciler) tmpKeySecretForVirtualMachine(
vm *vmv1.VirtualMachine,
key crypto.Signer,
) (*corev1.Secret, error) {
secret, err := tmpKeySecretSpec(vm, key)
if err != nil {
return nil, err
}
// Set the ownerRef for the Certificate
// More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/owners-dependents/
if err := ctrl.SetControllerReference(vm, secret, r.Scheme); err != nil {
return nil, err
}
return secret, nil
}
// certSecretForVirtualMachine returns a VirtualMachine Secret object for storing the key+cert
func (r *VMReconciler) certSecretForVirtualMachine(
vm *vmv1.VirtualMachine,
key crypto.Signer,
cert []byte,
) (*corev1.Secret, error) {
secret, err := certSecretSpec(vm, key, cert)
if err != nil {
return nil, err
}
// Set the ownerRef for the Certificate
// More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/owners-dependents/
if err := ctrl.SetControllerReference(vm, secret, r.Scheme); err != nil {
return nil, err
}
return secret, nil
}
// labelsForVirtualMachine returns the labels for selecting the resources
// More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/common-labels/
func labelsForVirtualMachine(vm *vmv1.VirtualMachine, runnerVersion *api.RunnerProtoVersion) map[string]string {
l := make(map[string]string, len(vm.Labels)+3)
for k, v := range vm.Labels {
l[k] = v
}
l["app.kubernetes.io/name"] = "NeonVM"
l[vmv1.VirtualMachineNameLabel] = vm.Name
if runnerVersion != nil {
l[vmv1.RunnerPodVersionLabel] = fmt.Sprintf("%d", *runnerVersion)
}
return l
}
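// annotationsForVirtualMachine returns the annotations expected on the runner pod: the VM's own
// annotations (minus ignored ones), plus the default-container and usage/resources annotations.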
func annotationsForVirtualMachine(vm *vmv1.VirtualMachine) map[string]string {
// use bool here so `if ignored[key] { ... }` works
ignored := map[string]bool{
"kubectl.kubernetes.io/last-applied-configuration": true,
}
a := make(map[string]string, len(vm.Annotations)+2)
for k, v := range vm.Annotations {
if !ignored[k] {
a[k] = v
}
}
a["kubectl.kubernetes.io/default-container"] = "neonvm-runner"
a[vmv1.VirtualMachineUsageAnnotation] = extractVirtualMachineUsageJSON(vm.Spec)
a[vmv1.VirtualMachineResourcesAnnotation] = extractVirtualMachineResourcesJSON(vm.Spec)
return a
}
func affinityForVirtualMachine(vm *vmv1.VirtualMachine) *corev1.Affinity {
a := vm.Spec.Affinity
if a == nil {
a = &corev1.Affinity{}
}
if a.NodeAffinity == nil {
a.NodeAffinity = &corev1.NodeAffinity{}
}
if a.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
a.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution = &corev1.NodeSelector{}
}
nodeSelector := a.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution
// always add the default requirements (os==linux and arch==vm.Spec.TargetArchitecture) even if there are already some values
nodeSelector.NodeSelectorTerms = append(
nodeSelector.NodeSelectorTerms,
corev1.NodeSelectorTerm{
MatchExpressions: []corev1.NodeSelectorRequirement{
{
Key: "kubernetes.io/os",
Operator: "In",
Values: []string{"linux"},
},
{
Key: "kubernetes.io/arch",
Operator: "In",
// vm.Spec.TargetArchitecture is guaranteed to be set by reconciler loop
Values: []string{string(*vm.Spec.TargetArchitecture)},
},
},
},
)
return a
}
// imageForVmRunner gets the Operand image which is managed by this controller
// from the VM_RUNNER_IMAGE environment variable defined in the config/manager/manager.yaml
func imageForVmRunner() (string, error) {
imageEnvVar := "VM_RUNNER_IMAGE"
image, found := os.LookupEnv(imageEnvVar)
if !found {
return "", fmt.Errorf("unable to find %s environment variable with the image", imageEnvVar)
}
return image, nil
}
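// podSpec builds the runner Pod for the VM: an init container that copies the root disk image
// (plus an optional kernel image), the neonvm-runner container with its QMP/runner ports and
// command-line flags, and volumes for images, cgroups, SSH keys, TLS, swap, and extra disks.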
func podSpec(
vm *vmv1.VirtualMachine,
sshSecret *corev1.Secret,
config *ReconcilerConfig,
) (*corev1.Pod, error) {
runnerVersion := api.RunnerProtoV1
labels := labelsForVirtualMachine(vm, &runnerVersion)
annotations := annotationsForVirtualMachine(vm)
affinity := affinityForVirtualMachine(vm)
// Get the Operand image
image, err := imageForVmRunner()
if err != nil {
return nil, err
}
vmSpecJson, err := json.Marshal(vm.Spec)
if err != nil {
return nil, fmt.Errorf("marshal VM Spec: %w", err)
}
vmStatusJson, err := json.Marshal(vm.Status)
if err != nil {
return nil, fmt.Errorf("marshal VM Status: %w", err)
}
// We have to add tolerations explicitly here.
// Otherwise, if the k8s node becomes unavailable, the default
// tolerations will be added, which are 300s (5m) long, which is
// not acceptable for us.
tolerations := append([]corev1.Toleration{}, vm.Spec.Tolerations...)
tolerations = append(tolerations,
corev1.Toleration{
Key: "node.kubernetes.io/not-ready",
TolerationSeconds: lo.ToPtr(int64(30)),
Effect: "NoExecute",
},
corev1.Toleration{
Key: "node.kubernetes.io/unreachable",
TolerationSeconds: lo.ToPtr(int64(30)),
Effect: "NoExecute",
},
)
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: vm.Status.PodName,
Namespace: vm.Namespace,
Labels: labels,
Annotations: annotations,
},
Spec: corev1.PodSpec{
EnableServiceLinks: vm.Spec.ServiceLinks,
AutomountServiceAccountToken: lo.ToPtr(false),
RestartPolicy: corev1.RestartPolicyNever,
TerminationGracePeriodSeconds: vm.Spec.TerminationGracePeriodSeconds,
NodeSelector: vm.Spec.NodeSelector,
ImagePullSecrets: vm.Spec.ImagePullSecrets,
Tolerations: tolerations,
ServiceAccountName: vm.Spec.ServiceAccountName,
SchedulerName: vm.Spec.SchedulerName,
Affinity: affinity,
InitContainers: []corev1.Container{
{
Image: vm.Spec.Guest.RootDisk.Image,
Name: "init",
ImagePullPolicy: vm.Spec.Guest.RootDisk.ImagePullPolicy,
VolumeMounts: []corev1.VolumeMount{{
Name: "virtualmachineimages",
MountPath: "/vm/images",
}},
Command: []string{
"sh", "-c",
"mv /disk.qcow2 /vm/images/rootdisk.qcow2 && " +
/* uid=36(qemu) gid=34(kvm) groups=34(kvm) */
"chown 36:34 /vm/images/rootdisk.qcow2 && " +
"sysctl -w net.ipv4.ip_forward=1",
},
SecurityContext: &corev1.SecurityContext{
Privileged: lo.ToPtr(true),
},
},
},
// generate containers as an inline function so the context isn't isolated
Containers: func() []corev1.Container {
runner := corev1.Container{
Image: image,
Name: "neonvm-runner",
ImagePullPolicy: corev1.PullIfNotPresent,
// Ensure restrictive context for the container
// More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
SecurityContext: &corev1.SecurityContext{
Privileged: lo.ToPtr(false),
Capabilities: &corev1.Capabilities{
Add: []corev1.Capability{
"NET_ADMIN",
"SYS_ADMIN",
"SYS_RESOURCE",
},
},
},
Ports: []corev1.ContainerPort{
{
ContainerPort: vm.Spec.QMP,
Name: "qmp",
},
{
ContainerPort: vm.Spec.QMPManual,
Name: "qmp-manual",
},
{
ContainerPort: vm.Spec.RunnerPort,
Name: "runner",
},
},
Command: func() []string {
cmd := []string{"runner"}
if config.DisableRunnerCgroup {
cmd = append(cmd, "-skip-cgroup-management")
}
memhpAutoMovableRatio := config.MemhpAutoMovableRatio
if specValue := vm.Spec.Guest.MemhpAutoMovableRatio; specValue != nil {
memhpAutoMovableRatio = *specValue
}
cmd = append(
cmd,
"-qemu-disk-cache-settings", config.QEMUDiskCacheSettings,
"-memhp-auto-movable-ratio", memhpAutoMovableRatio,
)
// put these last, so that the earlier args are easier to see (because these
// can get quite large)
cmd = append(
cmd,
"-vmspec", base64.StdEncoding.EncodeToString(vmSpecJson),
"-vmstatus", base64.StdEncoding.EncodeToString(vmStatusJson),
)
// NB: We don't need to check if the value is nil because the default value
// was set in Reconcile
cmd = append(cmd, "-cpu-scaling-mode", string(*vm.Spec.CpuScalingMode))
return cmd
}(),
Env: []corev1.EnvVar{{
Name: "K8S_POD_NAME",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.name",
},
},
}},
VolumeMounts: func() []corev1.VolumeMount {
images := corev1.VolumeMount{
Name: "virtualmachineimages",
MountPath: "/vm/images",
}
cgroups := corev1.VolumeMount{
Name: "sysfscgroup",
MountPath: "/sys/fs/cgroup",
// MountPropagationNone means that the volume in a container will
// not receive new mounts from the host or other containers, and filesystems
// mounted inside the container won't be propagated to the host or other
// containers.
// Note that this mode corresponds to "private" in Linux terminology.
MountPropagation: lo.ToPtr(corev1.MountPropagationNone),
}
if config.DisableRunnerCgroup {
return []corev1.VolumeMount{images}
} else {
// the /sys/fs/cgroup mount is only necessary if neonvm-runner has to
// do its own CPU limiting
return []corev1.VolumeMount{images, cgroups}
}
}(),
Resources: vm.Spec.PodResources,
ReadinessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/ready",
Port: intstr.FromInt32(vm.Spec.RunnerPort),
Scheme: corev1.URISchemeHTTP,
},
},
InitialDelaySeconds: 5,
PeriodSeconds: 5,
FailureThreshold: 3,
},
}
return []corev1.Container{runner}
}(),
Volumes: func() []corev1.Volume {
images := corev1.Volume{
Name: "virtualmachineimages",
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{},
},
}
cgroup := corev1.Volume{
Name: "sysfscgroup",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/sys/fs/cgroup",
Type: lo.ToPtr(corev1.HostPathDirectory),
},
},
}
if config.DisableRunnerCgroup {
return []corev1.Volume{images}
} else {
return []corev1.Volume{images, cgroup}
}
}(),
},
}
if sshSecret != nil {
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts,
corev1.VolumeMount{
Name: "ssh-privatekey",
MountPath: "/mnt/ssh",
},
corev1.VolumeMount{
Name: "ssh-publickey",
MountPath: "/vm/ssh",
},
)
pod.Spec.Volumes = append(pod.Spec.Volumes,
corev1.Volume{
Name: "ssh-privatekey",
VolumeSource: corev1.VolumeSource{
Secret: &corev1.SecretVolumeSource{
SecretName: sshSecret.Name,
Items: []corev1.KeyToPath{
{
Key: "ssh-privatekey",
Path: "id_ed25519",
Mode: lo.ToPtr[int32](0o600),
},
},
},
},
},
corev1.Volume{
Name: "ssh-publickey",
VolumeSource: corev1.VolumeSource{
Secret: &corev1.SecretVolumeSource{
SecretName: sshSecret.Name,
Items: []corev1.KeyToPath{
{
Key: "ssh-publickey",
Path: "authorized_keys",
Mode: lo.ToPtr[int32](0o644),
},
},
},
},
},
)
}
// If a custom neonvm-runner image is requested, use that instead:
if vm.Spec.RunnerImage != nil {
pod.Spec.Containers[0].Image = *vm.Spec.RunnerImage
}
// If a custom kernel is used, add that image:
if vm.Spec.Guest.KernelImage != nil {
pod.Spec.Containers[0].Args = append(pod.Spec.Containers[0].Args, "-kernelpath=/vm/images/vmlinuz")
pod.Spec.InitContainers = append(pod.Spec.InitContainers, corev1.Container{
Image: *vm.Spec.Guest.KernelImage,
Name: "init-kernel",
ImagePullPolicy: vm.Spec.Guest.RootDisk.ImagePullPolicy,
Args: []string{"cp", "/vmlinuz", "/vm/images/vmlinuz"},
VolumeMounts: []corev1.VolumeMount{{
Name: "virtualmachineimages",
MountPath: "/vm/images",
}},
SecurityContext: &corev1.SecurityContext{
// uid=36(qemu) gid=34(kvm) groups=34(kvm)
RunAsUser: lo.ToPtr[int64](36),
RunAsGroup: lo.ToPtr[int64](34),
},
})
}
if vm.Spec.Guest.AppendKernelCmdline != nil {
pod.Spec.Containers[0].Args = append(pod.Spec.Containers[0].Args, fmt.Sprintf("-appendKernelCmdline=%s", *vm.Spec.Guest.AppendKernelCmdline))
}
// Add any InitContainers that were specified by the spec
pod.Spec.InitContainers = append(pod.Spec.InitContainers, vm.Spec.ExtraInitContainers...)
// allow access to /dev/kvm and /dev/vhost-net devices by generic-device-plugin for kubelet
if pod.Spec.Containers[0].Resources.Limits == nil {
pod.Spec.Containers[0].Resources.Limits = corev1.ResourceList{}
}
pod.Spec.Containers[0].Resources.Limits["neonvm/vhost-net"] = resource.MustParse("1")
// NB: EnableAcceleration guaranteed non-nil because the k8s API server sets the default for us.
if *vm.Spec.EnableAcceleration {
pod.Spec.Containers[0].Resources.Limits["neonvm/kvm"] = resource.MustParse("1")
}
for _, port := range vm.Spec.Guest.Ports {
cPort := corev1.ContainerPort{
ContainerPort: int32(port.Port),
}
if len(port.Name) != 0 {
cPort.Name = port.Name
}
if len(port.Protocol) != 0 {
cPort.Protocol = corev1.Protocol(port.Protocol)
}
pod.Spec.Containers[0].Ports = append(pod.Spec.Containers[0].Ports, cPort)
}
if settings := vm.Spec.Guest.Settings; settings != nil {
if swapSize := settings.Swap; swapSize != nil {
diskName := "swapdisk"
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts, corev1.VolumeMount{
Name: diskName,
MountPath: fmt.Sprintf("/vm/mounts/%s", diskName),
})
pod.Spec.Volumes = append(pod.Spec.Volumes, corev1.Volume{
Name: diskName,
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{
SizeLimit: swapSize,
},
},
})
}
}
for _, disk := range vm.Spec.Disks {
mnt := corev1.VolumeMount{
Name: disk.Name,
MountPath: fmt.Sprintf("/vm/mounts%s", disk.MountPath),
}
if disk.ReadOnly != nil {
mnt.ReadOnly = *disk.ReadOnly
}
switch {
case disk.ConfigMap != nil:
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts, mnt)
pod.Spec.Volumes = append(pod.Spec.Volumes, corev1.Volume{
Name: disk.Name,
VolumeSource: corev1.VolumeSource{
ConfigMap: &corev1.ConfigMapVolumeSource{
LocalObjectReference: corev1.LocalObjectReference{
Name: disk.ConfigMap.Name,
},
Items: disk.ConfigMap.Items,
},
},
})
case disk.Secret != nil:
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts, mnt)
pod.Spec.Volumes = append(pod.Spec.Volumes, corev1.Volume{
Name: disk.Name,
VolumeSource: corev1.VolumeSource{
Secret: &corev1.SecretVolumeSource{
SecretName: disk.Secret.SecretName,
Items: disk.Secret.Items,
},
},
})
case disk.EmptyDisk != nil:
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts, mnt)
pod.Spec.Volumes = append(pod.Spec.Volumes, corev1.Volume{
Name: disk.Name,
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{
SizeLimit: &disk.EmptyDisk.Size,
},
},
})
default:
// do nothing
}
}
if vm.Spec.TLS != nil {
// Add TLS secret
mnt := corev1.VolumeMount{
Name: "tls",
MountPath: fmt.Sprintf("/vm/mounts%s", vm.Spec.TLS.MountPath),
}
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts, mnt)
pod.Spec.Volumes = append(pod.Spec.Volumes, corev1.Volume{
Name: "tls",
VolumeSource: corev1.VolumeSource{
Secret: &corev1.SecretVolumeSource{
SecretName: vm.Status.TLSSecretName,
},
},
})
}
// use multus network to add extra network interface
if vm.Spec.ExtraNetwork != nil && vm.Spec.ExtraNetwork.Enable {
var nadNetwork string
if len(vm.Spec.ExtraNetwork.MultusNetwork) > 0 { // network specified in spec
nadNetwork = vm.Spec.ExtraNetwork.MultusNetwork
} else { // get network from env variables
nadNetwork = fmt.Sprintf("%s/%s", config.NADConfig.RunnerNamespace, config.NADConfig.RunnerName)
}
pod.ObjectMeta.Annotations[nadapiv1.NetworkAttachmentAnnot] = fmt.Sprintf("%s@%s", nadNetwork, vm.Spec.ExtraNetwork.Interface)
}
return pod, nil
}
// SetupWithManager sets up the controller with the Manager.
// Note that the Runner Pod will also be watched in order to ensure its
// desired state on the cluster.
func (r *VMReconciler) SetupWithManager(mgr ctrl.Manager) (ReconcilerWithMetrics, error) {
cntrlName := "virtualmachine"
reconciler := WithMetrics(
withCatchPanic(r),
r.Metrics,
cntrlName,
r.Config.FailurePendingPeriod,
r.Config.FailingRefreshInterval,
)
err := ctrl.NewControllerManagedBy(mgr).
For(&vmv1.VirtualMachine{}).
Owns(&certv1.CertificateRequest{}).
Owns(&corev1.Secret{}).
Owns(&corev1.Pod{}).
WithOptions(controller.Options{MaxConcurrentReconciles: r.Config.MaxConcurrentReconciles}).
Named(cntrlName).
Complete(reconciler)
return reconciler, err
}
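// DeepEqual first tries reflect.DeepEqual and, failing that, compares JSON round-tripped copies
// of both values, so that values which only differ in representation (e.g. numeric types, which
// all become float64 after unmarshalling) still compare equal.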
func DeepEqual(v1, v2 interface{}) bool {
if reflect.DeepEqual(v1, v2) {
return true
}
var x1 interface{}
bytesA, _ := json.Marshal(v1)
_ = json.Unmarshal(bytesA, &x1)
var x2 interface{}
bytesB, _ := json.Marshal(v2)
_ = json.Unmarshal(bytesB, &x2)
return reflect.DeepEqual(x1, x2)
}
// TODO: reimplement to r.Patch()
func (r *VMReconciler) tryUpdateVM(ctx context.Context, vm *vmv1.VirtualMachine) error {
return r.Update(ctx, vm)
}
type NADConfig struct {
IPAMName string
IPAMNamespace string
RunnerName string
RunnerNamespace string
}
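// GetNADConfig reads the network-attachment-definition configuration from the NAD_* environment
// variables, panicking if any of them are missing.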
func GetNADConfig() *NADConfig {
getVar := func(envVarName string) string {
value, ok := os.LookupEnv(envVarName)
if !ok {
panic(fmt.Errorf("unable to find %s environment variable", envVarName))
}
return value
}
return &NADConfig{
IPAMName: getVar("NAD_IPAM_NAME"),
IPAMNamespace: getVar("NAD_IPAM_NAMESPACE"),
RunnerName: getVar("NAD_RUNNER_NAME"),
RunnerNamespace: getVar("NAD_RUNNER_NAMESPACE"),
}
}
// sshKeygen generates a pair of public and private keys using the ed25519
// algorithm. It returns the generated public key and private key as byte
// slices. If an error occurs during key generation or encoding, it returns nil
// for both keys and the error.
func sshKeygen() (publicKeyBytes []byte, privateKeyBytes []byte, err error) {
publicKey, privateKey, err := ed25519.GenerateKey(rand.Reader)
if err != nil {
return nil, nil, err
}
publicKeyBytes, err = encodePublicKey(publicKey)
if err != nil {
return nil, nil, err
}
privateKeyBytes, err = encodePrivateKey(privateKey)
if err != nil {
return nil, nil, err
}
return
}
func encodePrivateKey(privateKey ed25519.PrivateKey) ([]byte, error) {
privBlock, err := ssh.MarshalPrivateKey(privateKey, "")
if err != nil {
return nil, err
}
privatePEM := pem.EncodeToMemory(privBlock)
return privatePEM, nil
}
func encodePublicKey(publicKey ed25519.PublicKey) ([]byte, error) {
sshPublicKey, err := ssh.NewPublicKey(publicKey)
if err != nil {
return nil, err
}
pubKeyBytes := ssh.MarshalAuthorizedKey(sshPublicKey)
return pubKeyBytes, nil
}
package controllers
import (
"context"
"fmt"
"sigs.k8s.io/controller-runtime/pkg/log"
corev1 "k8s.io/api/core/v1"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
)
// handleCPUScaling encapsulates the logic to handle CPU scaling.
// If the VM's scaling mode is set to CpuScalingModeSysfs, the scaling is delegated to neonvm-daemon, which adjusts the sysfs state of the CPU cores.
// Otherwise, the scaling is first done by changing the number of cores in the VM via QMP and then by updating the cgroups through neonvm-daemon.
// At the moment the cgroup update is not implemented in the daemon, so effectively the scaling is done either via QMP or via the sysfs state of the CPU cores.
func (r *VMReconciler) handleCPUScaling(ctx context.Context, vm *vmv1.VirtualMachine, vmRunner *corev1.Pod) (bool, error) {
log := log.FromContext(ctx)
useCpuSysfsStateScaling := *vm.Spec.CpuScalingMode == vmv1.CpuScalingModeSysfs
var scaled bool
var err error
if !useCpuSysfsStateScaling {
scaled, err = r.handleCPUScalingQMP(ctx, vm, vmRunner)
} else {
scaled, err = r.handleCPUScalingSysfs(ctx, vm, vmRunner)
}
if err != nil {
log.Error(err, "Failed to scale CPU", "VirtualMachine", vm.Name, "CpuScalingMode", vm.Spec.CpuScalingMode)
return false, err
}
return scaled, nil
}
// handleCPUScalingQMP handles CPU scaling using QEMU's CPU hotplug/unplug feature.
func (r *VMReconciler) handleCPUScalingQMP(ctx context.Context, vm *vmv1.VirtualMachine, vmRunner *corev1.Pod) (bool, error) {
log := log.FromContext(ctx)
specCPU := vm.Spec.Guest.CPUs.Use
// get cgroups CPU details from runner pod
cgroupUsage, err := getRunnerCPULimits(ctx, vm)
if err != nil {
log.Error(err, "Failed to get CPU details from runner", "VirtualMachine", vm.Name)
return false, err
}
// get CPU details from QEMU
var pluggedCPU uint32
cpuSlotsPlugged, _, err := QmpGetCpus(QmpAddr(vm))
if err != nil {
log.Error(err, "Failed to get CPU details from VirtualMachine", "VirtualMachine", vm.Name)
return false, err
}
pluggedCPU = uint32(len(cpuSlotsPlugged))
// start scaling CPU
log.Info("Scaling using QMP CPU control")
var hotPlugCPUScaled bool
if specCPU.RoundedUp() > pluggedCPU {
// going to plug one CPU
log.Info("Plug one more CPU into VM")
if err := QmpPlugCpu(QmpAddr(vm)); err != nil {
return false, err
}
r.Recorder.Event(vm, "Normal", "ScaleUp",
fmt.Sprintf("One more CPU was plugged into VM %s",
vm.Name))
} else if specCPU.RoundedUp() < pluggedCPU {
// going to unplug one CPU
log.Info("Unplug one CPU from VM")
if err := QmpUnplugCpu(QmpAddr(vm)); err != nil {
return false, err
}
r.Recorder.Event(vm, "Normal", "ScaleDown",
fmt.Sprintf("One CPU was unplugged from VM %s",
vm.Name))
return false, nil
} else if specCPU != cgroupUsage.VCPUs {
_, err := r.handleCgroupCPUUpdate(ctx, vm, cgroupUsage)
if err != nil {
log.Error(err, "Failed to update cgroup CPU", "VirtualMachine", vm.Name)
return false, err
}
} else {
hotPlugCPUScaled = true
}
// update status by CPUs used in the VM
r.updateVMStatusCPU(ctx, vm, vmRunner, pluggedCPU, cgroupUsage)
return hotPlugCPUScaled, nil
}
func (r *VMReconciler) handleCPUScalingSysfs(ctx context.Context, vm *vmv1.VirtualMachine, vmRunner *corev1.Pod) (bool, error) {
log := log.FromContext(ctx)
specCPU := vm.Spec.Guest.CPUs.Use
cgroupUsage, err := getRunnerCPULimits(ctx, vm)
if err != nil {
log.Error(err, "Failed to get CPU details from runner", "VirtualMachine", vm.Name)
return false, err
}
if specCPU != cgroupUsage.VCPUs {
return r.handleCgroupCPUUpdate(ctx, vm, cgroupUsage)
}
r.updateVMStatusCPU(ctx, vm, vmRunner, cgroupUsage.VCPUs.RoundedUp(), cgroupUsage)
return true, nil
}
func (r *VMReconciler) handleCgroupCPUUpdate(ctx context.Context, vm *vmv1.VirtualMachine, cgroupUsage *api.VCPUCgroup) (bool, error) {
specCPU := vm.Spec.Guest.CPUs.Use
if err := setRunnerCPULimits(ctx, vm, specCPU); err != nil {
return false, err
}
reason := "ScaleDown"
if specCPU > cgroupUsage.VCPUs {
reason = "ScaleUp"
}
r.Recorder.Event(vm, "Normal", reason,
fmt.Sprintf("Runner pod cgroups was updated on VM %s %s",
vm.Name, specCPU))
return true, nil
}
package controllers
import (
"encoding/json"
"errors"
"fmt"
"strings"
"time"
"github.com/digitalocean/go-qemu/qmp"
"k8s.io/apimachinery/pkg/api/resource"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
type QmpCpus struct {
Return []struct {
Props struct {
CoreId int32 `json:"core-id"`
ThreadId int32 `json:"thread-id"`
SocketId int32 `json:"socket-id"`
} `json:"props"`
VcpusCount int32 `json:"vcpus-count"`
QomPath *string `json:"qom-path"`
Type string `json:"type"`
} `json:"return"`
}
type QmpMemorySize struct {
Return struct {
BaseMemory int64 `json:"base-memory"`
PluggedMemory int64 `json:"plugged-memory"`
} `json:"return"`
}
type QmpCpuSlot struct {
Core int32 `json:"core"`
QOM string `json:"qom"`
Type string `json:"type"`
}
type QmpMemoryDevices struct {
Return []QmpMemoryDevice `json:"return"`
}
type QmpMemoryDevice struct {
Type string `json:"type"`
Data struct {
Memdev string `json:"memdev"`
Hotplugged bool `json:"hotplugged"`
Addr int64 `json:"addr"`
Hotpluggable bool `json:"hotpluggable"`
Size int64 `json:"size"`
Slot int64 `json:"slot"`
Node int64 `json:"node"`
Id string `json:"id"`
} `json:"data"`
}
type QmpObjects struct {
Return []QmpObject `json:"return"`
}
type QmpObject struct {
Name string `json:"name"`
Type string `json:"type"`
}
type QmpMigrationInfo struct {
Return MigrationInfo `json:"return"`
}
type MigrationInfo struct {
Status string `json:"status"`
TotalTimeMs int64 `json:"total-time"`
SetupTimeMs int64 `json:"setup-time"`
DowntimeMs int64 `json:"downtime"`
Ram struct {
Transferred int64 `json:"transferred"`
Remaining int64 `json:"remaining"`
Total int64 `json:"total"`
Duplicate int64 `json:"duplicate"`
Normal int64 `json:"normal"`
NormalBytes int64 `json:"normal-bytes"`
DirtySyncCount int64 `json:"dirty-sync-count"`
} `json:"ram"`
Compression struct {
CompressedSize int64 `json:"compressed-size"`
CompressionRate float64 `json:"compression-rate"`
} `json:"compression"`
}
func QmpAddr(vm *vmv1.VirtualMachine) (ip string, port int32) {
return vm.Status.PodIP, vm.Spec.QMP
}
func QmpConnect(ip string, port int32) (*qmp.SocketMonitor, error) {
mon, err := qmp.NewSocketMonitor("tcp", fmt.Sprintf("%s:%d", ip, port), 2*time.Second)
if err != nil {
return nil, err
}
if err := mon.Connect(); err != nil {
return nil, err
}
return mon, nil
}
func QmpGetCpus(ip string, port int32) ([]QmpCpuSlot, []QmpCpuSlot, error) {
mon, err := QmpConnect(ip, port)
if err != nil {
return nil, nil, err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
qmpcmd := []byte(`{"execute": "query-hotpluggable-cpus"}`)
raw, err := mon.Run(qmpcmd)
if err != nil {
return nil, nil, err
}
var result QmpCpus
if err := json.Unmarshal(raw, &result); err != nil {
return nil, nil, fmt.Errorf("error unmarshaling json: %w", err)
}
plugged := []QmpCpuSlot{}
empty := []QmpCpuSlot{}
for _, entry := range result.Return {
if entry.QomPath != nil {
plugged = append(plugged, QmpCpuSlot{Core: entry.Props.CoreId, QOM: *entry.QomPath, Type: entry.Type})
} else {
empty = append(empty, QmpCpuSlot{Core: entry.Props.CoreId, QOM: "", Type: entry.Type})
}
}
return plugged, empty, nil
}
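// For illustration only (editor's sketch): interpreting the QmpGetCpus result. Slots with a
// non-nil qom-path are plugged; the rest are empty and available for hotplug.
//
//	plugged, empty, err := QmpGetCpus(QmpAddr(vm))
//	if err != nil {
//		return err
//	}
//	// len(plugged) == number of currently plugged vCPUs; QmpPlugCpu picks its slot from `empty`.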
func QmpPlugCpu(ip string, port int32) error {
_, empty, err := QmpGetCpus(ip, port)
if err != nil {
return err
}
if len(empty) == 0 {
return errors.New("no empty slots for CPU hotplug")
}
mon, err := QmpConnect(ip, port)
if err != nil {
return err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
// the empty list is in reverse order: the first CPU slot is at the end of the list and the last slot is at the beginning
slot := empty[len(empty)-1]
qmpcmd := []byte(fmt.Sprintf(`{
"execute": "device_add",
"arguments": {
"id": "cpu%d",
"driver": %q,
"core-id": %d,
"socket-id": 0,
"thread-id": 0
}
}`, slot.Core, slot.Type, slot.Core))
_, err = mon.Run(qmpcmd)
if err != nil {
return err
}
return nil
}
func QmpUnplugCpu(ip string, port int32) error {
plugged, _, err := QmpGetCpus(ip, port)
if err != nil {
return err
}
slot := -1
found := false
for i, s := range plugged {
if strings.Contains(s.QOM, "machine/peripheral/cpu") {
found = true
slot = i
break
}
}
if !found {
return errors.New("there are no unpluggable CPUs")
}
mon, err := QmpConnect(ip, port)
if err != nil {
return err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
cmd := []byte(fmt.Sprintf(`{"execute": "device_del", "arguments": {"id": %q}}`, plugged[slot].QOM))
_, err = mon.Run(cmd)
if err != nil {
return err
}
// small pause to let hypervisor do unplug
time.Sleep(500 * time.Millisecond)
return nil
}
func QmpSyncCpuToTarget(vm *vmv1.VirtualMachine, migration *vmv1.VirtualMachineMigration) error {
plugged, _, err := QmpGetCpus(QmpAddr(vm))
if err != nil {
return err
}
pluggedInTarget, _, err := QmpGetCpus(migration.Status.TargetPodIP, vm.Spec.QMP)
if err != nil {
return err
}
if len(plugged) == len(pluggedInTarget) {
// no need to plug anything
return nil
}
target, err := QmpConnect(migration.Status.TargetPodIP, vm.Spec.QMP)
if err != nil {
return err
}
defer target.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
searchForEmpty:
for _, slot := range plugged {
// first, check if the slot is already occupied
// run over Target CPUs and compare with source
for _, tslot := range pluggedInTarget {
if slot == tslot {
// this CPU is already present in the target, skip it
continue searchForEmpty
}
}
qmpcmd := []byte(fmt.Sprintf(`{
"execute": "device_add",
"arguments": {
"id": "cpu%d",
"driver": %q,
"core-id": %d,
"socket-id": 0,
"thread-id": 0
}
}`, slot.Core, slot.Type, slot.Core))
_, err = target.Run(qmpcmd)
if err != nil {
return err
}
}
return nil
}
// QmpSetVirtioMem updates virtio-mem to the new target size, returning the previous target.
//
// If the new target size is equal to the previous one, this function does nothing but query the
// target.
func QmpSetVirtioMem(vm *vmv1.VirtualMachine, targetVirtioMemSize int64) (previous int64, _ error) {
// Note: The virtio-mem device only exists when max mem != min mem.
// So if min == max, we should just short-cut, skip the queries, and say it's all good.
// Refer to the instantiation in neonvm-runner for more.
if vm.Spec.Guest.MemorySlots.Min == vm.Spec.Guest.MemorySlots.Max {
// if target size is non-zero even though min == max, something went very wrong
if targetVirtioMemSize != 0 {
panic(fmt.Sprintf(
"VM min mem slots == max mem slots, but target virtio-mem size %d != 0",
targetVirtioMemSize,
))
}
// Otherwise, we're all good, just pretend like we talked to the VM.
return 0, nil
}
mon, err := QmpConnect(QmpAddr(vm))
if err != nil {
return 0, err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
// First, fetch current desired virtio-mem size. If it's the same as targetVirtioMemSize, then
// we can report that it was already the same.
cmd := []byte(`{"execute": "qom-get", "arguments": {"path": "vm0", "property": "requested-size"}}`)
raw, err := mon.Run(cmd)
if err != nil {
return 0, err
}
result := struct {
Return int64 `json:"return"`
}{Return: 0}
if err := json.Unmarshal(raw, &result); err != nil {
return 0, fmt.Errorf("error unmarshaling json: %w", err)
}
previous = result.Return
if previous == targetVirtioMemSize {
return previous, nil
}
// The current requested size is not equal to the new desired size. Let's change that.
cmd = []byte(fmt.Sprintf(
`{"execute": "qom-set", "arguments": {"path": "vm0", "property": "requested-size", "value": %d}}`,
targetVirtioMemSize,
))
_, err = mon.Run(cmd)
if err != nil {
return 0, err
}
return previous, nil
}
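// For illustration only (editor's sketch): the exchange above reduces to two QMP commands against
// the virtio-mem device at QOM path "vm0" (the sizes shown are hypothetical):
//
//	{"execute": "qom-get", "arguments": {"path": "vm0", "property": "requested-size"}}
//	// -> {"return": 1073741824}
//	{"execute": "qom-set", "arguments": {"path": "vm0", "property": "requested-size", "value": 2147483648}}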
func QmpGetMemorySize(ip string, port int32) (*resource.Quantity, error) {
mon, err := QmpConnect(ip, port)
if err != nil {
return nil, err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
qmpcmd := []byte(`{"execute": "query-memory-size-summary"}`)
raw, err := mon.Run(qmpcmd)
if err != nil {
return nil, err
}
var result QmpMemorySize
if err := json.Unmarshal(raw, &result); err != nil {
return nil, fmt.Errorf("error unmarshaling json: %w", err)
}
return resource.NewQuantity(result.Return.BaseMemory+result.Return.PluggedMemory, resource.BinarySI), nil
}
func QmpStartMigration(virtualmachine *vmv1.VirtualMachine, virtualmachinemigration *vmv1.VirtualMachineMigration) error {
// QMP port
port := virtualmachine.Spec.QMP
// connect to source runner QMP
s_ip := virtualmachinemigration.Status.SourcePodIP
smon, err := qmp.NewSocketMonitor("tcp", fmt.Sprintf("%s:%d", s_ip, port), 2*time.Second)
if err != nil {
return err
}
if err := smon.Connect(); err != nil {
return err
}
defer smon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
// connect to target runner QMP
t_ip := virtualmachinemigration.Status.TargetPodIP
tmon, err := qmp.NewSocketMonitor("tcp", fmt.Sprintf("%s:%d", t_ip, port), 2*time.Second)
if err != nil {
return err
}
if err := tmon.Connect(); err != nil {
return err
}
defer tmon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
cache := resource.MustParse("256Mi")
var qmpcmd []byte
// setup migration on source runner
qmpcmd = []byte(fmt.Sprintf(`{
"execute": "migrate-set-capabilities",
"arguments":
{
"capabilities": [
{"capability": "postcopy-ram", "state": %t},
{"capability": "xbzrle", "state": true},
{"capability": "compress", "state": true},
{"capability": "auto-converge", "state": %t},
{"capability": "zero-blocks", "state": true}
]
}
}`, virtualmachinemigration.Spec.AllowPostCopy, virtualmachinemigration.Spec.AutoConverge))
_, err = smon.Run(qmpcmd)
if err != nil {
return err
}
qmpcmd = []byte(fmt.Sprintf(`{
"execute": "migrate-set-parameters",
"arguments":
{
"xbzrle-cache-size": %d,
"max-bandwidth": %d,
"multifd-compression": "zstd"
}
}`, cache.Value(), virtualmachinemigration.Spec.MaxBandwidth.Value()))
_, err = smon.Run(qmpcmd)
if err != nil {
return err
}
// setup migration on target runner
qmpcmd = []byte(fmt.Sprintf(`{
"execute": "migrate-set-capabilities",
"arguments":
{
"capabilities": [
{"capability": "postcopy-ram", "state": %t},
{"capability": "xbzrle", "state": true},
{"capability": "compress", "state": true},
{"capability": "auto-converge", "state": %t},
{"capability": "zero-blocks", "state": true}
]
}
}`, virtualmachinemigration.Spec.AllowPostCopy, virtualmachinemigration.Spec.AutoConverge))
_, err = tmon.Run(qmpcmd)
if err != nil {
return err
}
qmpcmd = []byte(fmt.Sprintf(`{
"execute": "migrate-set-parameters",
"arguments":
{
"xbzrle-cache-size": %d,
"max-bandwidth": %d,
"multifd-compression": "zstd"
}
}`, cache.Value(), virtualmachinemigration.Spec.MaxBandwidth.Value()))
_, err = tmon.Run(qmpcmd)
if err != nil {
return err
}
// trigger migration
qmpcmd = []byte(fmt.Sprintf(`{
"execute": "migrate",
"arguments":
{
"uri": "tcp:%s:%d",
"inc": %t,
"blk": %t
}
}`, t_ip, vmv1.MigrationPort, virtualmachinemigration.Spec.Incremental, !virtualmachinemigration.Spec.Incremental))
_, err = smon.Run(qmpcmd)
if err != nil {
return err
}
if virtualmachinemigration.Spec.AllowPostCopy {
qmpcmd = []byte(`{"execute": "migrate-start-postcopy"}`)
_, err = smon.Run(qmpcmd)
if err != nil {
return err
}
}
return nil
}
func QmpGetMigrationInfo(ip string, port int32) (*MigrationInfo, error) {
mon, err := QmpConnect(ip, port)
if err != nil {
return nil, err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
qmpcmd := []byte(`{"execute": "query-migrate"}`)
raw, err := mon.Run(qmpcmd)
if err != nil {
return nil, err
}
var result QmpMigrationInfo
if err := json.Unmarshal(raw, &result); err != nil {
return nil, fmt.Errorf("error unmarshaling json: %w", err)
}
return &result.Return, nil
}
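// For illustration only (editor's sketch): the migration reconciler polls QmpGetMigrationInfo
// until the reported status reaches a terminal value (see the VmmRunning branch below).
//
//	info, err := QmpGetMigrationInfo(QmpAddr(vm))
//	if err != nil {
//		return err
//	}
//	switch info.Status {
//	case "completed": // switch the VM over to the target pod
//	case "failed": // roll back to the source pod
//	default: // still in progress; record Ram.Transferred / Ram.Remaining and requeue
//	}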
func QmpCancelMigration(ip string, port int32) error {
mon, err := QmpConnect(ip, port)
if err != nil {
return err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
qmpcmd := []byte(`{"execute": "migrate_cancel"}`)
_, err = mon.Run(qmpcmd)
if err != nil {
return err
}
return nil
}
func QmpQuit(ip string, port int32) error {
mon, err := QmpConnect(ip, port)
if err != nil {
return err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
qmpcmd := []byte(`{"execute": "quit"}`)
_, err = mon.Run(qmpcmd)
if err != nil {
return err
}
return nil
}
/*
Copyright 2023.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controllers
import (
"context"
"errors"
"fmt"
"math"
"time"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/log"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apiserver/pkg/storage/names"
"k8s.io/client-go/tools/record"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/neonvm/controllers/buildtag"
)
const virtualmachinemigrationFinalizer = "vm.neon.tech/finalizer"
// Definitions to manage status conditions
const (
// typeAvailableVirtualMachineMigration represents the status of the Deployment reconciliation
typeAvailableVirtualMachineMigration = "Available"
// typeDegradedVirtualMachineMigration represents the status used when the custom resource is deleted and the finalizer operations are must to occur.
typeDegradedVirtualMachineMigration = "Degraded"
)
// VirtualMachineMigrationReconciler reconciles a VirtualMachineMigration object
type VirtualMachineMigrationReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
Config *ReconcilerConfig
Metrics ReconcilerMetrics
}
func (r *VirtualMachineMigrationReconciler) createTargetPod(
ctx context.Context,
migration *vmv1.VirtualMachineMigration,
vm *vmv1.VirtualMachine,
) (ctrl.Result, error) {
logger := log.FromContext(ctx)
// NB: .Spec.EnableSSH guaranteed non-nil because the k8s API server sets the default for us.
enableSSH := *vm.Spec.EnableSSH
var sshSecret *corev1.Secret
if enableSSH {
// We require the SSH secret to exist because we cannot unmount and
// mount the new secret into the VM after the live migration. If a
// VM's SSH secret is deleted accidentally then live migration is
// not possible.
if len(vm.Status.SSHSecretName) == 0 {
err := errors.New("VM has .Spec.EnableSSH but its .Status.SSHSecretName is empty")
logger.Error(err, "Failed to get VM's SSH Secret")
r.Recorder.Event(migration, "Warning", "Failed", err.Error())
return ctrl.Result{}, err
}
sshSecret = &corev1.Secret{}
err := r.Get(ctx, types.NamespacedName{Name: vm.Status.SSHSecretName, Namespace: vm.Namespace}, sshSecret)
if err != nil {
logger.Error(err, "Failed to get VM's SSH Secret")
r.Recorder.Event(migration, "Warning", "Failed", fmt.Sprintf("Failed to get VM's SSH Secret: %v", err))
return ctrl.Result{}, err
}
}
// Define a new target pod
tpod, err := r.targetPodForVirtualMachine(vm, migration, sshSecret)
if err != nil {
logger.Error(err, "Failed to generate Target Pod spec")
return ctrl.Result{}, err
}
logger.Info("Creating a Target Pod", "Pod.Namespace", tpod.Namespace, "Pod.Name", tpod.Name)
if err = r.Create(ctx, tpod); err != nil {
logger.Error(err, "Failed to create Target Pod", "Pod.Namespace", tpod.Namespace, "Pod.Name", tpod.Name)
return ctrl.Result{}, err
}
logger.Info("Target runner Pod was created", "Pod.Namespace", tpod.Namespace, "Pod.Name", tpod.Name)
// add event with some info
r.Recorder.Event(migration, "Normal", "Created",
fmt.Sprintf("VM (%s) ready migrate to target pod (%s)",
vm.Name, tpod.Name))
// target pod was just created, so requeue reconcile
return ctrl.Result{RequeueAfter: time.Second}, nil
}
// The following markers are used to generate the rules permissions (RBAC) on config/rbac using controller-gen
// when controller-gen (used by 'make generate') is executed.
// To know more about markers see: https://book.kubebuilder.io/reference/markers.html
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachinemigrations,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachinemigrations/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachinemigrations/finalizers,verbs=update
//+kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=pods/status,verbs=get;list;watch
// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
// It is essential for the controller's reconciliation loop to be idempotent. By following the Operator
// pattern you will create Controllers which provide a reconcile function
// responsible for synchronizing resources until the desired state is reached on the cluster.
// Breaking this recommendation goes against the design principles of controller-runtime.
// and may lead to unforeseen consequences such as resources becoming stuck and requiring manual intervention.
// For further info:
// - About Operator Pattern: https://kubernetes.io/docs/concepts/extend-kubernetes/operator/
// - About Controllers: https://kubernetes.io/docs/concepts/architecture/controller/
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.13.0/pkg/reconcile
func (r *VirtualMachineMigrationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log := log.FromContext(ctx)
// Fetch the VirtualMachineMigration instance
// The purpose is check if the Custom Resource for the Kind VirtualMachineMigration
// is applied on the cluster if not we return nil to stop the reconciliation
migration := new(vmv1.VirtualMachineMigration)
if err := r.Get(ctx, req.NamespacedName, migration); err != nil {
// ignore error and stop reconcile loop if object not found (already deleted?)
if apierrors.IsNotFound(err) {
return ctrl.Result{}, nil
}
log.Error(err, "Unable to fetch Migration")
return ctrl.Result{}, err
}
getVM := func() (*vmv1.VirtualMachine, error) {
var vm vmv1.VirtualMachine
err := r.Get(ctx, types.NamespacedName{Name: migration.Spec.VmName, Namespace: migration.Namespace}, &vm)
if err != nil {
log.Error(err, "Failed to get VM", "VmName", migration.Spec.VmName)
return nil, err
}
return &vm, nil
}
if !migration.ObjectMeta.DeletionTimestamp.IsZero() {
// The object is being deleted
if controllerutil.ContainsFinalizer(migration, virtualmachinemigrationFinalizer) {
// our finalizer is present, so lets handle any external dependency
log.Info("Performing Finalizer Operations for Migration")
vm, err := getVM()
if err != nil {
return ctrl.Result{}, err
}
if err := r.doFinalizerOperationsForVirtualMachineMigration(ctx, migration, vm); err != nil {
// if fail to delete the external dependency here, return with error
// so that it can be retried
return ctrl.Result{}, err
}
// remove our finalizer from the list and update it.
log.Info("Removing Finalizer from Migration")
if !controllerutil.RemoveFinalizer(migration, virtualmachinemigrationFinalizer) {
return ctrl.Result{}, errors.New("Failed to remove finalizer from Migration")
}
if err := r.Update(ctx, migration); err != nil {
return ctrl.Result{}, err
}
}
// Stop reconciliation as the item is being deleted
return ctrl.Result{}, nil
}
// The object is not being deleted, so if it does not have our finalizer,
// then let's add the finalizer and update the object. This is equivalent to
// registering our finalizer.
if !controllerutil.ContainsFinalizer(migration, virtualmachinemigrationFinalizer) {
log.Info("Adding Finalizer to Migration")
if !controllerutil.AddFinalizer(migration, virtualmachinemigrationFinalizer) {
return ctrl.Result{}, errors.New("Failed to add finalizer to Migration")
}
if err := r.Update(ctx, migration); err != nil {
return ctrl.Result{}, err
}
// stop this reconciliation cycle; a new one will be triggered since the Migration was updated
return ctrl.Result{}, nil
}
// Fetch the corresponding VirtualMachine instance
vm, err := getVM()
if err != nil {
log.Error(err, "Failed to get VM", "VmName", migration.Spec.VmName)
if apierrors.IsNotFound(err) {
// stop reconcile loop if vm not found (already deleted?)
message := fmt.Sprintf("VM (%s) not found", migration.Spec.VmName)
r.Recorder.Event(migration, "Warning", "Failed", message)
meta.SetStatusCondition(&migration.Status.Conditions,
metav1.Condition{
Type: typeDegradedVirtualMachineMigration,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: message,
})
migration.Status.Phase = vmv1.VmmFailed
return r.updateMigrationStatus(ctx, migration)
}
// return err and try reconcile again
return ctrl.Result{}, err
}
// Set owner for VM migration object
if !metav1.IsControlledBy(migration, vm) {
log.Info("Set VM as owner for Migration", "vm.Name", vm.Name)
if err := ctrl.SetControllerReference(vm, migration, r.Scheme); err != nil {
return ctrl.Result{}, err
}
if err := r.Update(ctx, migration); err != nil {
log.Info("Failed to add owner to Migration", "error", err)
return ctrl.Result{}, err
}
// stop this reconciliation cycle; a new one will be triggered since the Migration was updated
return ctrl.Result{}, nil
}
// MAIN RECONCILE LOOP START
// Let's check and just set the condition status as Unknown when no status are available
if len(migration.Status.Conditions) == 0 {
log.Info("Set initial Unknown condition status")
meta.SetStatusCondition(&migration.Status.Conditions, metav1.Condition{Type: typeAvailableVirtualMachineMigration, Status: metav1.ConditionUnknown, Reason: "Reconciling", Message: "Starting reconciliation"})
return r.updateMigrationStatus(ctx, migration)
}
// target runner pod details - generate name
if len(migration.Status.TargetPodName) == 0 {
targetPodName := names.SimpleNameGenerator.GenerateName(fmt.Sprintf("%s-", vm.Name))
log.Info("Set Target Pod Name", "TargetPod.Name", targetPodName)
migration.Status.TargetPodName = targetPodName
return r.updateMigrationStatus(ctx, migration)
}
if migration.Status.Phase == "" {
// need to change the VM status ASAP to prevent the autoscaler from changing CPU/RAM in the VM,
// but only if the VM is running
if vm.Status.Phase == vmv1.VmRunning {
vm.Status.Phase = vmv1.VmPreMigrating
if err := r.Status().Update(ctx, vm); err != nil {
log.Error(err, "Failed to update VM status to PreMigrating", "Status", vm.Status.Phase)
return ctrl.Result{}, err
}
// Migration just created, change Phase to "Pending"
migration.Status.Phase = vmv1.VmmPending
return r.updateMigrationStatus(ctx, migration)
}
// some other VM status (maybe Scaling); requeue after a second
return ctrl.Result{RequeueAfter: time.Second}, nil
}
switch migration.Status.Phase {
case vmv1.VmmPending:
// Check if the target runner pod already exists,
// if not create a new one using source pod as template
targetRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: migration.Status.TargetPodName, Namespace: vm.Namespace}, targetRunner)
if err != nil && apierrors.IsNotFound(err) {
return r.createTargetPod(ctx, migration, vm)
}
if err != nil {
log.Error(err, "Failed to get Target Pod")
return ctrl.Result{}, err
}
// Update the metadata (including "usage" annotation) before anything else, so that it
// will be correctly set even if the rest of the reconcile operation fails.
if err := updatePodMetadataIfNecessary(ctx, r.Client, vm, targetRunner); err != nil {
log.Error(err, "Failed to sync pod labels and annotations", "TargetPod.Name", targetRunner.Name)
}
// If not already, set an additional (non-controller) owner reference for the source pod:
sourceRunner := &corev1.Pod{}
err = r.Get(ctx, types.NamespacedName{Name: vm.Status.PodName, Namespace: vm.Namespace}, sourceRunner)
if err != nil {
log.Error(err, "Failed to get migration source pod")
return ctrl.Result{}, err
}
ownedByMigration := false
for _, ref := range sourceRunner.OwnerReferences {
if ref.UID == migration.UID {
ownedByMigration = true
break
}
}
if !ownedByMigration {
if err = controllerutil.SetOwnerReference(migration, sourceRunner, r.Scheme); err != nil {
log.Error(err, "Failed to set owner reference for source pod")
return ctrl.Result{}, err
}
if err = r.Update(ctx, sourceRunner); err != nil {
log.Error(err, "Failed to update owner of source runner")
// Requeue so that we try again, even though we're not an owner of the source runner
return ctrl.Result{RequeueAfter: time.Second}, err
}
}
// now inspect target pod status and update migration
switch runnerStatus(targetRunner) {
case runnerRunning:
// update migration status
migration.Status.SourcePodName = vm.Status.PodName
migration.Status.SourcePodIP = vm.Status.PodIP
migration.Status.TargetPodIP = targetRunner.Status.PodIP
if *vm.Spec.CpuScalingMode == vmv1.CpuScalingModeQMP {
// do hotplugCPU in targetRunner before migration
log.Info("Syncing CPUs in Target runner", "TargetPod.Name", migration.Status.TargetPodName)
if err := QmpSyncCpuToTarget(vm, migration); err != nil {
return ctrl.Result{}, err
}
log.Info("CPUs in Target runner synced", "TargetPod.Name", migration.Status.TargetPodName)
}
// Migrate only running VMs to target with plugged devices
if vm.Status.Phase == vmv1.VmPreMigrating {
// update VM status
vm.Status.Phase = vmv1.VmMigrating
if err := r.Status().Update(ctx, vm); err != nil {
log.Error(err, "Failed to update VirtualMachine status to 'Migrating'")
return ctrl.Result{}, err
}
// trigger migration
if err := QmpStartMigration(vm, migration); err != nil {
migration.Status.Phase = vmv1.VmmFailed
return ctrl.Result{}, err
}
message := fmt.Sprintf("Migration was started to target runner (%s)", targetRunner.Name)
log.Info(message)
r.Recorder.Event(migration, "Normal", "Started", message)
meta.SetStatusCondition(&migration.Status.Conditions,
metav1.Condition{
Type: typeAvailableVirtualMachineMigration,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: message,
})
// finally update migration phase to Running
migration.Status.Phase = vmv1.VmmRunning
return r.updateMigrationStatus(ctx, migration)
}
case runnerSucceeded:
// the target runner pod finished without error, but it shouldn't have finished at all
message := fmt.Sprintf("Target Pod (%s) completed suddenly", targetRunner.Name)
log.Info(message)
r.Recorder.Event(migration, "Warning", "Failed", message)
meta.SetStatusCondition(&migration.Status.Conditions,
metav1.Condition{
Type: typeDegradedVirtualMachineMigration,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: message,
})
migration.Status.Phase = vmv1.VmmFailed
return r.updateMigrationStatus(ctx, migration)
case runnerFailed:
message := fmt.Sprintf("Target Pod (%s) failed", targetRunner.Name)
log.Info(message)
r.Recorder.Event(migration, "Warning", "Failed", message)
meta.SetStatusCondition(&migration.Status.Conditions,
metav1.Condition{
Type: typeDegradedVirtualMachineMigration,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: message,
})
migration.Status.Phase = vmv1.VmmFailed
return r.updateMigrationStatus(ctx, migration)
default:
// not sure what to do, so requeue
return ctrl.Result{RequeueAfter: time.Second}, nil
}
case vmv1.VmmRunning:
// retrieve target pod details
targetRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: migration.Status.TargetPodName, Namespace: migration.Namespace}, targetRunner)
if err != nil && apierrors.IsNotFound(err) {
// lost the target pod for a running Migration?
message := fmt.Sprintf("Target Pod (%s) disappeared", migration.Status.TargetPodName)
r.Recorder.Event(migration, "Error", "NotFound", message)
meta.SetStatusCondition(&migration.Status.Conditions,
metav1.Condition{
Type: typeDegradedVirtualMachineMigration,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: message,
})
migration.Status.Phase = vmv1.VmmFailed
return r.updateMigrationStatus(ctx, migration)
} else if err != nil {
log.Error(err, "Failed to get target runner Pod")
return ctrl.Result{}, err
}
// Update the metadata (including "usage" annotation) before anything else, so that it
// will be correctly set even if the rest of the reconcile operation fails.
if err := updatePodMetadataIfNecessary(ctx, r.Client, vm, targetRunner); err != nil {
log.Error(err, "Failed to sync pod labels and annotations", "TargetPod.Name", targetRunner.Name)
}
// retrieve migration statistics
migrationInfo, err := QmpGetMigrationInfo(QmpAddr(vm))
if err != nil {
log.Error(err, "Failed to get migration info")
return ctrl.Result{}, err
}
// check if migration done
if migrationInfo.Status == "completed" {
message := fmt.Sprintf("Migration finished with success to target pod (%s)",
targetRunner.Name)
log.Info(message)
r.Recorder.Event(migration, "Normal", "Finished", message)
// re-fetch the vm
err := r.Get(ctx, types.NamespacedName{Name: migration.Spec.VmName, Namespace: migration.Namespace}, vm)
if err != nil {
log.Error(err, "Failed to re-fetch VM", "VmName", migration.Spec.VmName)
return ctrl.Result{}, err
}
// Redefine runner Pod for VM
vm.Status.PodName = migration.Status.TargetPodName
vm.Status.PodIP = migration.Status.TargetPodIP
vm.Status.Phase = vmv1.VmRunning
// update VM status
if err := r.Status().Update(ctx, vm); err != nil {
log.Error(err, "Failed to redefine runner pod in VM")
return ctrl.Result{}, err
}
// Redefine owner references for the source and target pod:
//
// For the source, we change it from "VM object controlling + migration owning" to
// just "VM object owning + migration owning" (and nothing controlling).
//
// For the target, we change it from "migration controlling" to "VM object controlling".
targetRunner.OwnerReferences = []metav1.OwnerReference{}
if err := ctrl.SetControllerReference(vm, targetRunner, r.Scheme); err != nil {
return ctrl.Result{}, err
}
if err := r.Update(ctx, targetRunner); err != nil {
log.Error(err, "Failed to update ownerRef for target runner pod")
return ctrl.Result{}, err
}
// ... and change the source runner:
sourceRunner := &corev1.Pod{}
err = r.Get(ctx, types.NamespacedName{Name: migration.Status.SourcePodName, Namespace: migration.Namespace}, sourceRunner)
if err == nil {
sourceRunner.OwnerReferences = []metav1.OwnerReference{}
if err := controllerutil.SetOwnerReference(migration, sourceRunner, r.Scheme); err != nil {
return ctrl.Result{}, err
}
if err := controllerutil.SetOwnerReference(vm, sourceRunner, r.Scheme); err != nil {
return ctrl.Result{}, err
}
if err := r.Update(ctx, sourceRunner); err != nil {
log.Error(err, "Failed to update ownerRef for source runner pod")
return ctrl.Result{}, err
}
} else if !apierrors.IsNotFound(err) {
return ctrl.Result{}, err
}
// try to stop the hypervisor in the source runner if it is still running
if sourceRunner.Status.Phase == corev1.PodRunning {
if err := QmpQuit(migration.Status.SourcePodIP, vm.Spec.QMP); err != nil {
log.Error(err, "Failed stop hypervisor in source runner pod")
} else {
log.Info("Hypervisor in source runner pod stopped")
}
} else {
log.Info("Skip stopping hypervisor in source runner pod", "pod.Status.Phase", sourceRunner.Status.Phase)
}
// finally update migration phase to Succeeded
migration.Status.Phase = vmv1.VmmSucceeded
migration.Status.Info.Status = migrationInfo.Status
return r.updateMigrationStatus(ctx, migration)
}
// check if migration failed
if migrationInfo.Status == "failed" {
// oops, migration failed
message := fmt.Sprintf("Migration to target pod (%s) was failed",
targetRunner.Name)
log.Info(message)
r.Recorder.Event(migration, "Warning", "Failed", message)
// try to stop hypervisor in target runner
if targetRunner.Status.Phase == corev1.PodRunning {
if err := QmpQuit(migration.Status.TargetPodIP, vm.Spec.QMP); err != nil {
log.Error(err, "Failed stop hypervisor in target runner pod")
} else {
log.Info("Hypervisor in target runner pod stopped")
}
} else {
log.Info("Skip stopping hypervisor in target runner pod", "pod.Status.Phase", targetRunner.Status.Phase)
}
// change VM status to Running
vm.Status.Phase = vmv1.VmRunning
if err := r.Status().Update(ctx, vm); err != nil {
log.Error(err, "Failed to update VM status from Migrating back to Running as Migration was failed")
return ctrl.Result{}, err
}
// finally update migration phase to Failed
migration.Status.Phase = vmv1.VmmFailed
migration.Status.Info.Status = migrationInfo.Status
return r.updateMigrationStatus(ctx, migration)
}
// the migration still seems to be in progress; just update the status with migration progress once per second
time.Sleep(time.Second)
// re-retrieve migration statistics
migrationInfo, err = QmpGetMigrationInfo(QmpAddr(vm))
if err != nil {
log.Error(err, "Failed to re-get migration info")
return ctrl.Result{}, err
}
// re-fetch the vm
err = r.Get(ctx, types.NamespacedName{Name: migration.Spec.VmName, Namespace: migration.Namespace}, vm)
if err != nil {
log.Error(err, "Failed to re-fetch VM before Mgration progress update", "VmName", migration.Spec.VmName)
return ctrl.Result{}, err
}
migration.Status.Info.Status = migrationInfo.Status
migration.Status.Info.TotalTimeMs = migrationInfo.TotalTimeMs
migration.Status.Info.SetupTimeMs = migrationInfo.SetupTimeMs
migration.Status.Info.DowntimeMs = migrationInfo.DowntimeMs
migration.Status.Info.Ram.Transferred = migrationInfo.Ram.Transferred
migration.Status.Info.Ram.Remaining = migrationInfo.Ram.Remaining
migration.Status.Info.Ram.Total = migrationInfo.Ram.Total
migration.Status.Info.Compression.CompressedSize = migrationInfo.Compression.CompressedSize
migration.Status.Info.Compression.CompressionRate = int64(math.Round(migrationInfo.Compression.CompressionRate))
return r.updateMigrationStatus(ctx, migration)
case vmv1.VmmSucceeded:
// do additional VM status checks
if vm.Status.Phase == vmv1.VmMigrating {
// the migration succeeded, so the VM should have status Running
vm.Status.Phase = vmv1.VmRunning
// update VM status
if err := r.Status().Update(ctx, vm); err != nil {
log.Error(err, "Failed to update VM status from Migrating to Running as Migration succeeded")
return ctrl.Result{}, err
}
}
if len(migration.Status.SourcePodName) > 0 {
// try to find and remove source runner Pod
sourceRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: migration.Status.SourcePodName, Namespace: migration.Namespace}, sourceRunner)
if err != nil && !apierrors.IsNotFound(err) {
log.Error(err, "Failed to get source runner Pod for deletion")
return ctrl.Result{}, err
}
var msg, eventReason string
if buildtag.NeverDeleteRunnerPods {
msg = fmt.Sprintf("Source runner pod deletion was skipped due to '%s' build tag", buildtag.TagnameNeverDeleteRunnerPods)
eventReason = "DeleteSkipped"
} else {
if err := r.Delete(ctx, sourceRunner); err != nil {
log.Error(err, "Failed to delete source runner Pod")
return ctrl.Result{}, err
}
msg = "Source runner was deleted"
eventReason = "Deleted"
}
log.Info(msg, "Pod.Namespace", sourceRunner.Namespace, "Pod.Name", sourceRunner.Name)
r.Recorder.Event(migration, "Normal", eventReason, fmt.Sprintf("%s: %s", msg, sourceRunner.Name))
migration.Status.SourcePodName = ""
migration.Status.SourcePodIP = ""
return r.updateMigrationStatus(ctx, migration)
}
// all done, stop reconciliation
return ctrl.Result{}, nil
case vmv1.VmmFailed:
// do additional VM status checks
if vm.Status.Phase == vmv1.VmMigrating {
// the migration failed, so the VM should go back to the Running state
vm.Status.Phase = vmv1.VmRunning
if err := r.Status().Update(ctx, vm); err != nil {
log.Error(err, "Failed to update VM status from Migrating back to Running as Migration was failed")
return ctrl.Result{}, err
}
}
// all done, stop reconciliation
return ctrl.Result{}, nil
default:
// not sure what to do, so requeue
log.Info("Requeuing current request")
return ctrl.Result{RequeueAfter: time.Second}, nil
}
// MAIN RECONCILE LOOP END
return ctrl.Result{}, nil
}
func (r *VirtualMachineMigrationReconciler) updateMigrationStatus(ctx context.Context, migration *vmv1.VirtualMachineMigration) (ctrl.Result, error) {
log := log.FromContext(ctx)
if err := r.Status().Update(ctx, migration); err != nil {
log.Error(err, "Failed update Migration status")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
// doFinalizerOperationsForVirtualMachineMigration performs the required operations before the CR is deleted.
func (r *VirtualMachineMigrationReconciler) doFinalizerOperationsForVirtualMachineMigration(ctx context.Context, migration *vmv1.VirtualMachineMigration, vm *vmv1.VirtualMachine) error {
log := log.FromContext(ctx)
if migration.Status.Phase == vmv1.VmmRunning || vm.Status.Phase == vmv1.VmPreMigrating {
message := fmt.Sprintf("Running Migration (%s) is being deleted", migration.Name)
log.Info(message)
r.Recorder.Event(migration, "Warning", "Deleting", message)
// try to cancel migration
log.Info("Canceling migration")
if err := QmpCancelMigration(QmpAddr(vm)); err != nil {
// log the error but don't return it, to avoid getting stuck in the reconciliation cycle
log.Error(err, "Migration canceling failed")
}
if vm.Status.Phase == vmv1.VmMigrating || vm.Status.Phase == vmv1.VmPreMigrating {
// the migration is being deleted, so the VM should go back to status Running
vm.Status.Phase = vmv1.VmRunning
// update VM status
if err := r.Status().Update(ctx, vm); err != nil {
log.Error(err, "Failed to update VM status from Migrating to Running on Migration deletion")
return err
}
}
// try to remove target runner pod
if len(migration.Status.TargetPodName) > 0 {
pod := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: migration.Status.TargetPodName, Namespace: migration.Namespace}, pod)
if err != nil && !apierrors.IsNotFound(err) {
log.Error(err, "Failed to get target runner Pod for deletion")
return err
}
if apierrors.IsNotFound(err) {
// pod already deleted ?
return nil
}
// NB: here, we ignore buildtag.NeverDeleteRunnerPods because we delete runner pods on
// VM object deletion with the tag anyways, so it's more consistent to keep the same
// behavior for VMMs.
if err := r.Delete(ctx, pod); err != nil {
log.Error(err, "Failed to delete target runner Pod")
return err
}
message := fmt.Sprintf("Target runner (%s) was deleted", pod.Name)
log.Info(message)
r.Recorder.Event(migration, "Normal", "Deleted", message)
}
}
return nil
}
// SetupWithManager sets up the controller with the Manager.
// Note that the Pods will also be watched in order to ensure their
// desired state on the cluster.
func (r *VirtualMachineMigrationReconciler) SetupWithManager(mgr ctrl.Manager) (ReconcilerWithMetrics, error) {
cntrlName := "virtualmachinemigration"
reconciler := WithMetrics(
withCatchPanic(r),
r.Metrics,
cntrlName,
r.Config.FailurePendingPeriod,
r.Config.FailingRefreshInterval,
)
err := ctrl.NewControllerManagedBy(mgr).
For(&vmv1.VirtualMachineMigration{}).
Owns(&corev1.Pod{}).
WithOptions(controller.Options{MaxConcurrentReconciles: r.Config.MaxConcurrentReconciles}).
Named(cntrlName).
Complete(reconciler)
return reconciler, err
}
// targetPodForVirtualMachine returns a VirtualMachine Pod object
func (r *VirtualMachineMigrationReconciler) targetPodForVirtualMachine(
vm *vmv1.VirtualMachine,
migration *vmv1.VirtualMachineMigration,
sshSecret *corev1.Secret,
) (*corev1.Pod, error) {
if err := vm.Spec.Guest.ValidateMemorySize(); err != nil {
return nil, fmt.Errorf("cannot create target pod because memory is invalid: %w", err)
}
pod, err := podSpec(vm, sshSecret, r.Config)
if err != nil {
return nil, err
}
// override pod name
pod.Name = migration.Status.TargetPodName
// add env variable to turn on migration receiver
// TODO: make it false or empty after the migration is done to enable correct readiness probe
pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].Env, corev1.EnvVar{Name: "RECEIVE_MIGRATION", Value: "true"})
// add podAntiAffinity to schedule target pod to another k8s node
if migration.Spec.PreventMigrationToSameHost {
if pod.Spec.Affinity == nil {
pod.Spec.Affinity = &corev1.Affinity{}
}
if pod.Spec.Affinity.PodAntiAffinity == nil {
pod.Spec.Affinity.PodAntiAffinity = &corev1.PodAntiAffinity{}
}
if pod.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
pod.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution = []corev1.PodAffinityTerm{}
}
pod.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution = append(pod.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution, corev1.PodAffinityTerm{
LabelSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{
vmv1.VirtualMachineNameLabel: migration.Spec.VmName,
},
},
TopologyKey: "kubernetes.io/hostname",
})
}
// Set the ownerRef for the Pod
if err := ctrl.SetControllerReference(migration, pod, r.Scheme); err != nil {
return nil, err
}
if err := controllerutil.SetOwnerReference(vm, pod, r.Scheme); err != nil {
return nil, err
}
return pod, nil
}
package controllers
// Wrapper around the default VirtualMachine/VirtualMachineMigration webhook interfaces so that the
// controller has a bit more control over them, without needing to actually implement that control
// inside of the apis package.
import (
"context"
"fmt"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/webhook"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/util/stack"
)
func validateUpdate(
ctx context.Context,
cfg *ReconcilerConfig,
recorder record.EventRecorder,
oldObj runtime.Object,
newObj interface {
webhook.Validator
metav1.Object
},
) (admission.Warnings, error) {
log := log.FromContext(ctx)
namespacedName := client.ObjectKeyFromObject(newObj)
_, skipValidation := cfg.SkipUpdateValidationFor[namespacedName]
warnings, err := func() (w admission.Warnings, e error) {
// if we plan to skip validation, catch any panics so that they can be ignored.
if skipValidation {
defer func() {
if err := recover(); err != nil {
e = fmt.Errorf("validation panicked with: %v", err)
st := stack.GetStackTrace(nil, 1).String()
log.Error(e, "webhook update validation panicked", "stack", st)
}
}()
}
return newObj.ValidateUpdate(oldObj)
}()
if err != nil && skipValidation {
recorder.Event(
newObj,
"Warning",
"SkippedValidation",
"Ignoring failed webhook validation because of controller's '--skip-update-validation-for' flag",
)
log.Error(err, "Ignoring failed webhook validation")
return warnings, nil
}
return warnings, err
}
type VMWebhook struct {
Recorder record.EventRecorder
Config *ReconcilerConfig
}
func (w *VMWebhook) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewWebhookManagedBy(mgr).
For(&vmv1.VirtualMachine{}).
WithDefaulter(w).
WithValidator(w).
Complete()
}
var _ webhook.CustomDefaulter = (*VMWebhook)(nil)
// Default implements webhook.CustomDefaulter
func (w *VMWebhook) Default(ctx context.Context, obj runtime.Object) error {
vm := obj.(*vmv1.VirtualMachine)
vm.Default()
return nil
}
var _ webhook.CustomValidator = (*VMWebhook)(nil)
// ValidateCreate implements webhook.CustomValidator
func (w *VMWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
vm := obj.(*vmv1.VirtualMachine)
return vm.ValidateCreate()
}
// ValidateUpdate implements webhook.CustomValidator
func (w *VMWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
newVM := newObj.(*vmv1.VirtualMachine)
return validateUpdate(ctx, w.Config, w.Recorder, oldObj, newVM)
}
// ValidateDelete implements webhook.CustomValidator
func (w *VMWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
vm := obj.(*vmv1.VirtualMachine)
return vm.ValidateDelete()
}
type VMMigrationWebhook struct {
Recorder record.EventRecorder
Config *ReconcilerConfig
}
func (w *VMMigrationWebhook) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewWebhookManagedBy(mgr).
For(&vmv1.VirtualMachineMigration{}).
WithDefaulter(w).
WithValidator(w).
Complete()
}
var _ webhook.CustomDefaulter = (*VMMigrationWebhook)(nil)
// Default implements webhook.CustomDefaulter
func (w *VMMigrationWebhook) Default(ctx context.Context, obj runtime.Object) error {
vmm := obj.(*vmv1.VirtualMachineMigration)
vmm.Default()
return nil
}
var _ webhook.CustomValidator = (*VMMigrationWebhook)(nil)
// ValidateCreate implements webhook.CustomValidator
func (w *VMMigrationWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
vmm := obj.(*vmv1.VirtualMachineMigration)
return vmm.ValidateCreate()
}
// ValidateUpdate implements webhook.CustomValidator
func (w *VMMigrationWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
newVMM := newObj.(*vmv1.VirtualMachineMigration)
return validateUpdate(ctx, w.Config, w.Recorder, oldObj, newVMM)
}
// ValidateDelete implements webhook.CustomValidator
func (w *VMMigrationWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
vmm := obj.(*vmv1.VirtualMachineMigration)
return vmm.ValidateDelete()
}
package cpuscaling
import (
"errors"
"slices"
)
type CPUStater interface {
OnlineCPUs() ([]int, error)
OfflineCPUs() ([]int, error)
SetState(cpuID int, cpuState cpuState) error
}
type cpuState string
const (
cpuOnline cpuState = "online"
cpuOffline cpuState = "offline"
)
type CPUScaler struct {
cpuState CPUStater
}
func NewCPUScaler() *CPUScaler {
return &CPUScaler{
cpuState: &cpuSysfsState{},
}
}
func (c *CPUScaler) ReconcileOnlineCPU(targetCount int) error {
online, err := c.cpuState.OnlineCPUs()
if err != nil {
return err
}
if len(online) == targetCount {
return nil
}
if len(online) > targetCount {
diff := len(online) - targetCount
// offline 'diff' CPUs that are currently online
// reverse online slice so that we offline in the reverse order of onlining.
slices.Reverse(online)
return c.setStateTo(cpuOffline, diff, online)
} else if len(online) < targetCount {
offline, err := c.cpuState.OfflineCPUs()
if err != nil {
return err
}
diff := targetCount - len(online)
// online 'diff' CPUs that are currently offline
return c.setStateTo(cpuOnline, diff, offline)
}
return nil
}
func (c *CPUScaler) setStateTo(state cpuState, count int, candidateCPUs []int) error {
for _, cpuID := range candidateCPUs {
if cpuID == 0 {
// Not allowed to change the status of CPU 0
continue
}
if err := c.cpuState.SetState(cpuID, state); err != nil {
return err
}
count -= 1
// nothing left to do
if count <= 0 {
return nil
}
}
// Got through the entire list but didn't change the state of enough CPUs
return errors.New("could not change the state of enough CPUs")
}
// ActiveCPUsCount returns the count of online CPUs.
func (c *CPUScaler) ActiveCPUsCount() (int, error) {
onlineCPUs, err := c.cpuState.OnlineCPUs()
if err != nil {
return 0, err
}
return len(onlineCPUs), nil
}
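// For illustration only (editor's sketch): driving the scaler from neonvm-daemon, assuming the
// desired vCPU count arrives from elsewhere (the `target` variable here is hypothetical).
//
//	scaler := NewCPUScaler()
//	if err := scaler.ReconcileOnlineCPU(target); err != nil {
//		return err
//	}
//	online, _ := scaler.ActiveCPUsCount() // should now equal target (CPU 0 always stays online)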
package cpuscaling
import (
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
)
// CPU directory path
const cpuPath = "/sys/devices/system/cpu/"
type cpuSysfsState struct{}
func (cs *cpuSysfsState) SetState(cpuNum int, cpuState cpuState) error {
var state string
switch cpuState {
case cpuOnline:
state = "1"
case cpuOffline:
state = "0"
}
err := os.WriteFile(filepath.Join(cpuPath, fmt.Sprintf("cpu%d/online", cpuNum)), []byte(state), 0o644)
if err != nil {
return fmt.Errorf("failed to set CPU %d online status: %w", cpuNum, err)
}
return nil
}
func (cs *cpuSysfsState) OnlineCPUs() ([]int, error) {
data, err := os.ReadFile(filepath.Join(cpuPath, "online"))
if err != nil {
return nil, fmt.Errorf("failed to read online CPUs: %w", err)
}
cpuIDs, err := cs.parseMultipleCPURange(string(data))
if err != nil {
// log value of the file in case we can't parse to help debugging
return nil, fmt.Errorf("failed to parse online CPUs %q: %w", string(data), err)
}
return cpuIDs, nil
}
func (cs *cpuSysfsState) OfflineCPUs() ([]int, error) {
data, err := os.ReadFile(filepath.Join(cpuPath, "offline"))
if err != nil {
return nil, fmt.Errorf("failed to read offline CPUs: %w", err)
}
cpuIDs, err := cs.parseMultipleCPURange(string(data))
if err != nil {
// log value of the file in case we can't parse to help debugging
return nil, fmt.Errorf("failed to parse offline CPUs %q: %w", string(data), err)
}
return cpuIDs, nil
}
func (cs *cpuSysfsState) parseCPURange(cpuRange string) (int, int, error) {
cpuRange = strings.TrimSpace(cpuRange)
parts := strings.Split(cpuRange, "-")
// Single CPU case, e.g., "0"
if len(parts) == 1 {
cpu, err := strconv.Atoi(parts[0])
if err != nil {
return -1, -1, err
}
return cpu, cpu, nil
}
// Range case, e.g., "0-3"
start, err := strconv.Atoi(parts[0])
if err != nil {
return -1, -1, err
}
end, err := strconv.Atoi(parts[1])
if err != nil {
return -1, -1, err
}
return start, end, nil
}
// parseMultipleCPURange parses the multiple CPU range string (e.g., "0-3,5-7") and returns a list of CPUs.
func (cs *cpuSysfsState) parseMultipleCPURange(cpuRanges string) ([]int, error) {
cpuRanges = strings.TrimSpace(cpuRanges)
parts := strings.Split(cpuRanges, ",")
var cpus []int
for _, part := range parts {
start, end, err := cs.parseCPURange(part)
if err != nil {
return nil, err
}
for cpu := start; cpu <= end; cpu++ {
cpus = append(cpus, cpu)
}
}
return cpus, nil
}
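// For illustration only (editor's sketch): parseMultipleCPURange expands sysfs-style range
// strings, for example:
//
//	cs.parseMultipleCPURange("0-3,5-7") // -> []int{0, 1, 2, 3, 5, 6, 7}, nil
//	cs.parseMultipleCPURange("0")       // -> []int{0}, nil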
package ipam
import (
"context"
"net"
whereaboutsallocate "github.com/k8snetworkplumbingwg/whereabouts/pkg/allocate"
whereaboutslogging "github.com/k8snetworkplumbingwg/whereabouts/pkg/logging"
whereaboutstypes "github.com/k8snetworkplumbingwg/whereabouts/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/log"
"k8s.io/apimachinery/pkg/types"
)
type ipamAction = func(
ipRange RangeConfiguration,
reservation []whereaboutstypes.IPReservation,
) (net.IPNet, []whereaboutstypes.IPReservation, error)
// makeAcquireAction creates a callback which changes IPPool state to include a new IP reservation.
func makeAcquireAction(ctx context.Context, vmName types.NamespacedName) ipamAction {
return func(ipRange RangeConfiguration, reservation []whereaboutstypes.IPReservation) (net.IPNet, []whereaboutstypes.IPReservation, error) {
return doAcquire(ctx, ipRange, reservation, vmName)
}
}
// makeReleaseAction creates a callback which changes IPPool state to deallocate an IP reservation.
func makeReleaseAction(ctx context.Context, vmName types.NamespacedName) ipamAction {
return func(ipRange RangeConfiguration, reservation []whereaboutstypes.IPReservation) (net.IPNet, []whereaboutstypes.IPReservation, error) {
return doRelease(ctx, ipRange, reservation, vmName)
}
}
func doAcquire(
_ context.Context,
ipRange RangeConfiguration,
reservation []whereaboutstypes.IPReservation,
vmName types.NamespacedName,
) (net.IPNet, []whereaboutstypes.IPReservation, error) {
// reduce whereabouts logging
whereaboutslogging.SetLogLevel("error")
_, ipnet, _ := net.ParseCIDR(ipRange.Range)
// check if an IP is already reserved for this VM
foundidx := getMatchingIPReservationIndex(reservation, vmName.String())
if foundidx >= 0 {
return net.IPNet{IP: reservation[foundidx].IP, Mask: ipnet.Mask}, reservation, nil
}
// try to reserve new IP gor given VM
ip, newReservation, err := whereaboutsallocate.IterateForAssignment(*ipnet,
ipRange.RangeStart, ipRange.RangeEnd,
reservation, ipRange.OmitRanges, vmName.String(), "")
if err != nil {
return net.IPNet{}, nil, err
}
return net.IPNet{IP: ip, Mask: ipnet.Mask}, newReservation, nil
}
func doRelease(
ctx context.Context,
ipRange RangeConfiguration,
reservation []whereaboutstypes.IPReservation,
vmName types.NamespacedName,
) (net.IPNet, []whereaboutstypes.IPReservation, error) {
// reduce whereabouts logging
whereaboutslogging.SetLogLevel("error")
log := log.FromContext(ctx)
_, ipnet, _ := net.ParseCIDR(ipRange.Range)
// try to release IP for given VM
newReservation, ip, err := whereaboutsallocate.IterateForDeallocation(reservation, vmName.String(), getMatchingIPReservationIndex)
if err != nil {
// The only reason to get an error here is if we are trying
// to deallocate the same IP twice.
log.Info("Failed to deallocate IP", "error", err)
// Ignore the error.
return net.IPNet{IP: ip, Mask: ipnet.Mask}, newReservation, nil
}
return net.IPNet{IP: ip, Mask: ipnet.Mask}, newReservation, nil
}
func getMatchingIPReservationIndex(reservation []whereaboutstypes.IPReservation, id string) int {
foundidx := -1
for idx, v := range reservation {
if v.ContainerID == id {
foundidx = idx
break
}
}
return foundidx
}
package ipam
import (
nad "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/clientset/versioned"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
neonvm "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
)
// Client is a set of Kubernetes clients.
type Client struct {
KubeClient kubernetes.Interface
VMClient neonvm.Interface
NADClient nad.Interface
}
func NewKubeClient(cfg *rest.Config) (*Client, error) {
kubeClient, err := kubernetes.NewForConfig(cfg)
if err != nil {
return nil, err
}
vmClient, err := neonvm.NewForConfig(cfg)
if err != nil {
return nil, err
}
nadClient, err := nad.NewForConfig(cfg)
if err != nil {
return nil, err
}
return &Client{
KubeClient: kubeClient,
VMClient: vmClient,
NADClient: nadClient,
}, nil
}
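// For illustration only (editor's sketch): constructing the client set when running in-cluster,
// assuming the standard client-go rest.InClusterConfig path.
//
//	cfg, err := rest.InClusterConfig()
//	if err != nil {
//		return err
//	}
//	clients, err := NewKubeClient(cfg)
//	if err != nil {
//		return err
//	}
//	_ = clients.NADClient // used by the IPAM code to read the NetworkAttachmentDefinition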
package main
import (
"context"
"flag"
"fmt"
"os"
"sync"
"time"
"go.uber.org/zap/zapcore"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog/v2"
"github.com/neondatabase/autoscaling/pkg/neonvm/ipam"
)
var (
nadName = flag.String("nad-name", "ipam-demo", "Network Attachment Definition name")
nadNs = flag.String("nad-namespace", "default", "Network Attachment Definition namespace")
demoLoggerName = "ipam-demo"
demoNamespace = "default"
demoCount = 10
)
func main() {
opts := zap.Options{ //nolint:exhaustruct // typical options struct; not all fields expected to be filled.
Development: true,
StacktraceLevel: zapcore.Level(zapcore.PanicLevel),
TimeEncoder: zapcore.ISO8601TimeEncoder,
}
opts.BindFlags(flag.CommandLine)
flag.Parse()
// define logger
logger := zap.New(zap.UseFlagOptions(&opts)).WithName(demoLoggerName)
// define klog settings (used in LeaderElector)
klog.SetLogger(logger.V(2))
// define context with logger
ctx := log.IntoContext(context.Background(), logger)
// Create IPAM object
ipam, err := ipam.New(*nadName, *nadNs, 1)
if err != nil {
logger.Error(err, "failed to create IPAM")
os.Exit(1)
}
defer ipam.Close()
var wg sync.WaitGroup
// acquire IPs in parallel
for i := 1; i <= demoCount; i++ {
wg.Add(1)
go func(i int) {
defer wg.Done()
startTime := time.Now()
id := fmt.Sprintf("demo-ipam-%d", i)
logger.Info("try to lease", "id", id)
if ip, err := ipam.AcquireIP(ctx, types.NamespacedName{Name: id, Namespace: demoNamespace}); err != nil {
logger.Error(err, "lease failed", "id", id)
} else {
logger.Info("acquired", "id", id, "ip", ip.String(), "acquired in", time.Since(startTime))
}
}(i)
time.Sleep(time.Millisecond * 200)
}
wg.Wait()
// release IPs in parallel
for i := 1; i <= demoCount; i++ {
wg.Add(1)
go func(i int) {
defer wg.Done()
startTime := time.Now()
id := fmt.Sprintf("demo-ipam-%d", i)
logger.Info("try to release", "id", id)
if ip, err := ipam.ReleaseIP(ctx, types.NamespacedName{Name: id, Namespace: demoNamespace}); err != nil {
logger.Error(err, "release failed", "id", id)
} else {
logger.Info("released", "id", id, "ip", ip.String(), "released in", time.Since(startTime))
}
}(i)
time.Sleep(time.Millisecond * 200)
}
wg.Wait()
}
package ipam
import (
"context"
"encoding/json"
"errors"
"fmt"
"net"
"strconv"
"strings"
"sync"
"time"
whereaboutsallocate "github.com/k8snetworkplumbingwg/whereabouts/pkg/allocate"
whereaboutstypes "github.com/k8snetworkplumbingwg/whereabouts/pkg/types"
"golang.org/x/sync/semaphore"
"sigs.k8s.io/controller-runtime/pkg/client/config"
"sigs.k8s.io/controller-runtime/pkg/log"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
neonvm "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
)
const (
UnnamedNetwork string = ""
// kubernetes client-go rate limiter settings
// https://pkg.go.dev/k8s.io/client-go@v0.27.2/rest#Config
KubernetesClientQPS = 100
KubernetesClientBurst = 200
// IpamRequestTimeout is the timeout for IPAM queries
IpamRequestTimeout = 10 * time.Second
// DatastoreRetries defines how many retries are attempted when reading/updating the IP Pool
DatastoreRetries = 5
DatastoreRetriesDelay = 100 * time.Millisecond
)
var ErrAgain = errors.New("IPAM concurrency limit reached. Try again later.")
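// Temporary is implemented by errors that indicate a transient failure; runIPAMRange retries
// IPPool reads/updates whose errors implement it (see temporaryError below).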
type Temporary interface {
Temporary() bool
}
type IPAM struct {
Client
Config IPAMConfig
mu sync.Mutex
concurrencyLimiter *semaphore.Weighted
}
func (i *IPAM) AcquireIP(ctx context.Context, vmName types.NamespacedName) (net.IPNet, error) {
ip, err := i.runIPAM(ctx, makeAcquireAction(ctx, vmName))
if err != nil {
return net.IPNet{}, fmt.Errorf("failed to acquire IP: %w", err)
}
return ip, nil
}
func (i *IPAM) ReleaseIP(ctx context.Context, vmName types.NamespacedName) (net.IPNet, error) {
ip, err := i.runIPAM(ctx, makeReleaseAction(ctx, vmName))
if err != nil {
return net.IPNet{}, fmt.Errorf("failed to release IP: %w", err)
}
return ip, nil
}
// New returns a new IPAM object with ipam config and k8s/crd clients
func New(nadName string, nadNamespace string, concurrencyLimit int) (*IPAM, error) {
// get Kubernetes client config
cfg, err := config.GetConfig()
if err != nil {
return nil, fmt.Errorf("error building kubernetes configuration: %w", err)
}
// tune Kubernetes client performance
cfg.QPS = KubernetesClientQPS
cfg.Burst = KubernetesClientBurst
kClient, err := NewKubeClient(cfg)
if err != nil {
return nil, fmt.Errorf("error creating kubernetes client: %w", err)
}
return NewWithClient(kClient, nadName, nadNamespace, concurrencyLimit)
}
func NewWithClient(kClient *Client, nadName string, nadNamespace string, concurrencyLimit int) (*IPAM, error) {
ctx, cancel := context.WithTimeout(context.Background(), IpamRequestTimeout)
defer cancel()
// read network-attachment-definition from Kubernetes
nad, err := kClient.NADClient.K8sCniCncfIoV1().NetworkAttachmentDefinitions(nadNamespace).Get(ctx, nadName, metav1.GetOptions{})
if err != nil {
return nil, err
}
if len(nad.Spec.Config) == 0 {
return nil, fmt.Errorf("network-attachment-definition %s hasn't IPAM config section", nad.Name)
}
ipamConfig, err := LoadFromNad(nad.Spec.Config, nadNamespace)
if err != nil {
return nil, fmt.Errorf("network-attachment-definition IPAM config parse error: %w", err)
}
if len(ipamConfig.IPRanges) == 0 {
return nil, fmt.Errorf("network-attachment-definition %s has not IP ranges", nad.Name)
}
return &IPAM{
Config: *ipamConfig,
Client: *kClient,
mu: sync.Mutex{},
concurrencyLimiter: semaphore.NewWeighted(int64(concurrencyLimit)),
}, nil
}
// LoadFromNad parses the Network Attachment Definition config and fills the IPAM config
func LoadFromNad(nadConfig string, nadNamespace string) (*IPAMConfig, error) {
var n Nad
if err := json.Unmarshal([]byte(nadConfig), &n); err != nil {
return nil, fmt.Errorf("json parsing error: %w", err)
}
if n.IPAM == nil {
return nil, fmt.Errorf("missing 'ipam' key")
}
// process old-style Range to Ranges array
if n.IPAM.Range != "" {
oldRange := RangeConfiguration{
OmitRanges: n.IPAM.OmitRanges,
Range: n.IPAM.Range,
RangeStart: n.IPAM.RangeStart,
RangeEnd: n.IPAM.RangeEnd,
}
n.IPAM.IPRanges = append([]RangeConfiguration{oldRange}, n.IPAM.IPRanges...)
}
// check IP ranges
for idx, rangeConfig := range n.IPAM.IPRanges {
firstip, ipNet, err := net.ParseCIDR(rangeConfig.Range)
if err != nil {
return nil, fmt.Errorf("invalid CIDR %s: %w", rangeConfig.Range, err)
}
rangeConfig.Range = ipNet.String()
if rangeConfig.RangeStart == nil {
firstip = net.ParseIP(firstip.Mask(ipNet.Mask).String()) // get real first IP from cidr
rangeConfig.RangeStart = firstip
}
if rangeConfig.RangeStart != nil && !ipNet.Contains(rangeConfig.RangeStart) {
return nil, fmt.Errorf("range_start IP %s not in IP Range %s",
rangeConfig.RangeStart.String(),
rangeConfig.Range)
}
if rangeConfig.RangeEnd != nil && !ipNet.Contains(rangeConfig.RangeEnd) {
return nil, fmt.Errorf("range_end IP %s not in IP Range %s",
rangeConfig.RangeEnd.String(),
rangeConfig.Range)
}
n.IPAM.IPRanges[idx] = rangeConfig
}
// check excluded IP ranges before dropping the old-style settings
for idx := range n.IPAM.OmitRanges {
_, _, err := net.ParseCIDR(n.IPAM.OmitRanges[idx])
if err != nil {
return nil, fmt.Errorf("invalid exclude CIDR %s: %w", n.IPAM.OmitRanges[idx], err)
}
}
// delete old-style settings now that they have been validated and folded into IPRanges
n.IPAM.OmitRanges = nil
n.IPAM.Range = ""
n.IPAM.RangeStart = nil
n.IPAM.RangeEnd = nil
// set network namespace
n.IPAM.NetworkNamespace = nadNamespace
return n.IPAM, nil
}
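// For reference, a minimal network-attachment-definition config that LoadFromNad accepts could
// look roughly like the following (illustrative values only; the old-style "range"/"exclude"
// keys are also accepted and get folded into "ipRanges" as described above):
//
// {
// "ipam": {
// "ipRanges": [
// {"range": "10.11.22.0/24", "range_start": "10.11.22.10", "range_end": "10.11.22.200"}
// ],
// "network_name": "samplenet"
// }
// }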
// runIPAM performs the given IPAM action, trying each configured IP range in turn
func (i *IPAM) runIPAM(ctx context.Context, action ipamAction) (net.IPNet, error) {
var err error
var ip net.IPNet
log := log.FromContext(ctx)
// We have a semaphore to limit the number of concurrent IPAM requests.
// Note that we use TryAcquire(), so the current reconciliation worker isn't left
// waiting on the mutex below.
ok := i.concurrencyLimiter.TryAcquire(1)
if !ok {
return net.IPNet{}, ErrAgain
}
defer i.concurrencyLimiter.Release(1)
// We still want to access the IPPool one VM at a time.
i.mu.Lock()
defer i.mu.Unlock()
ctx, ctxCancel := context.WithTimeout(ctx, IpamRequestTimeout)
defer ctxCancel()
// handle the ip add/del until successful
for _, ipRange := range i.Config.IPRanges {
// retry loop used to retry CRUD operations against Kubernetes;
// if we hit some issue, just make another attempt
ip, err = i.runIPAMRange(ctx, ipRange, action)
// break ipRanges loop if ip was acquired/released
if err == nil {
return ip, nil
}
log.Error(err, "error acquiring/releasing IP from range", ipRange.Range)
}
return net.IPNet{}, err
}
func (i *IPAM) runIPAMRange(ctx context.Context, ipRange RangeConfiguration, action ipamAction) (net.IPNet, error) {
var ip net.IPNet
for retry := 0; retry < DatastoreRetries; retry++ {
select {
case <-ctx.Done():
return net.IPNet{}, ctx.Err()
default:
// stay in the retry loop until the context is cancelled
}
// read IPPool from the ippools.vm.neon.tech custom resource
pool, err := i.getNeonvmIPPool(ctx, ipRange.Range)
if err != nil {
if e, ok := err.(Temporary); ok && e.Temporary() {
// retry attempt to read IPPool
time.Sleep(DatastoreRetriesDelay)
continue
}
return net.IPNet{}, fmt.Errorf("error reading IP pool: %w", err)
}
currentReservation := pool.Allocations(ctx)
var newReservation []whereaboutstypes.IPReservation
ip, newReservation, err = action(ipRange, currentReservation)
if err != nil {
return net.IPNet{}, err
}
// update IPPool with newReservation
err = pool.Update(ctx, newReservation)
if err != nil {
if e, ok := err.(Temporary); ok && e.Temporary() {
// retry attempt to update IPPool
time.Sleep(DatastoreRetriesDelay)
continue
}
return net.IPNet{}, fmt.Errorf("error updating IP pool: %w", err)
}
return ip, nil
}
return ip, errors.New("IPAMretries limit reached")
}
// Status does a List() request to check NeonVM client connectivity
func (i *IPAM) Status(ctx context.Context) error {
_, err := i.VMClient.NeonvmV1().IPPools(i.Config.NetworkNamespace).List(ctx, metav1.ListOptions{})
return err
}
// TODO: think about
func (i *IPAM) Close() error {
return nil
}
// NeonvmIPPool represents an IPPool resource and its parsed set of allocations
type NeonvmIPPool struct {
vmClient neonvm.Interface
pool *vmv1.IPPool
firstip net.IP
}
// Allocations returns the initially retrieved set of allocations for this pool
func (p *NeonvmIPPool) Allocations(ctx context.Context) []whereaboutstypes.IPReservation {
return toIPReservation(ctx, p.pool.Spec.Allocations, p.firstip)
}
// getNeonvmIPPool returns a NeonVM IPPool for the given IP range
func (i *IPAM) getNeonvmIPPool(ctx context.Context, ipRange string) (*NeonvmIPPool, error) {
// for IP range 10.11.22.0/24 the pool name will be
// "10.11.22.0-24" if there is no network name in the ipam spec, or
// "samplenet-10.11.22.0-24" if the network name is `samplenet`
var poolName string
if i.Config.NetworkName == UnnamedNetwork {
poolName = strings.ReplaceAll(ipRange, "/", "-")
} else {
poolName = fmt.Sprintf("%s-%s", i.Config.NetworkName, strings.ReplaceAll(ipRange, "/", "-"))
}
pool, err := i.VMClient.NeonvmV1().IPPools(i.Config.NetworkNamespace).Get(ctx, poolName, metav1.GetOptions{})
if err != nil && apierrors.IsNotFound(err) {
// pool does not exist, create it
newPool := &vmv1.IPPool{
ObjectMeta: metav1.ObjectMeta{
Name: poolName,
Namespace: i.Config.NetworkNamespace,
},
Spec: vmv1.IPPoolSpec{
Range: ipRange,
Allocations: make(map[string]vmv1.IPAllocation),
},
}
_, err = i.VMClient.NeonvmV1().IPPools(i.Config.NetworkNamespace).Create(ctx, newPool, metav1.CreateOptions{})
if err != nil && apierrors.IsAlreadyExists(err) {
// the pool was just created -- allow retry
return nil, &temporaryError{err}
} else if err != nil {
return nil, err
}
// if the pool was created for the first time, trigger another retry of the allocation loop
return nil, &temporaryError{errors.New("NeonvmIPPool was initialized")}
} else if err != nil {
return nil, err
}
// get first IP in the pool
ip, _, err := net.ParseCIDR(pool.Spec.Range)
if err != nil {
return nil, err
}
return &NeonvmIPPool{
vmClient: i.Client.VMClient,
pool: pool,
firstip: ip,
}, nil
}
// Update NeonvmIPPool with new IP reservation
func (p *NeonvmIPPool) Update(ctx context.Context, reservation []whereaboutstypes.IPReservation) error {
p.pool.Spec.Allocations = toAllocations(reservation, p.firstip)
_, err := p.vmClient.NeonvmV1().IPPools(p.pool.Namespace).Update(ctx, p.pool, metav1.UpdateOptions{})
if err != nil {
if apierrors.IsConflict(err) {
return &temporaryError{err}
}
return err
}
return nil
}
// taken from the whereabouts code, as it is not exported
func toIPReservation(ctx context.Context, allocations map[string]vmv1.IPAllocation, firstip net.IP) []whereaboutstypes.IPReservation {
log := log.FromContext(ctx)
reservelist := []whereaboutstypes.IPReservation{}
for offset, a := range allocations {
numOffset, err := strconv.ParseInt(offset, 10, 64)
if err != nil {
// allocations that are invalid int64s should be ignored
// toAllocations should be the only writer of offsets, via `fmt.Sprintf("%d", ...)`
log.Error(err, "error decoding ip offset")
continue
}
ip := whereaboutsallocate.IPAddOffset(firstip, uint64(numOffset))
reservelist = append(reservelist, whereaboutstypes.IPReservation{
IP: ip,
ContainerID: a.ContainerID,
PodRef: a.PodRef,
IsAllocated: false,
})
}
return reservelist
}
// taken from the whereabouts code, as it is not exported
func toAllocations(reservelist []whereaboutstypes.IPReservation, firstip net.IP) map[string]vmv1.IPAllocation {
allocations := make(map[string]vmv1.IPAllocation)
for _, r := range reservelist {
index := whereaboutsallocate.IPGetOffset(r.IP, firstip)
allocations[fmt.Sprintf("%d", index)] = vmv1.IPAllocation{ContainerID: r.ContainerID, PodRef: r.PodRef}
}
return allocations
}
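// As an illustration (hypothetical values): with firstip 10.11.22.0, a reservation for
// 10.11.22.5 is stored in the allocations map under the key "5", and toIPReservation recovers
// the IP by adding that offset back onto firstip.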
package ipam
import (
"net"
cnitypes "github.com/containernetworking/cni/pkg/types"
)
type temporaryError struct {
error
}
func (t *temporaryError) Temporary() bool {
return true
}
type RangeConfiguration struct {
OmitRanges []string `json:"exclude,omitempty"`
Range string `json:"range"`
RangeStart net.IP `json:"range_start,omitempty"`
RangeEnd net.IP `json:"range_end,omitempty"`
}
type Nad struct {
IPAM *IPAMConfig `json:"ipam"`
}
// IPAMConfig describes the expected json configuration for this plugin
type IPAMConfig struct {
Routes []*cnitypes.Route `json:"routes"`
IPRanges []RangeConfiguration `json:"ipRanges"`
OmitRanges []string `json:"exclude,omitempty"`
DNS cnitypes.DNS `json:"dns"`
Range string `json:"range"`
RangeStart net.IP `json:"range_start,omitempty"`
RangeEnd net.IP `json:"range_end,omitempty"`
NetworkNamespace string
NetworkName string `json:"network_name,omitempty"`
}
package plugin
import (
"encoding/json"
"errors"
"fmt"
"os"
"slices"
)
//////////////////
// CONFIG TYPES //
//////////////////
// Config stores the global configuration for the scheduler plugin.
//
// It is parsed from a JSON file in a separate ConfigMap.
type Config struct {
// Scoring defines our policies around how to weight where Pods should be scheduled.
Scoring ScoringConfig `json:"scoring"`
// Watermark is the fraction of total resources allocated above which we should be migrating VMs
// away to reduce usage.
Watermark float64 `json:"watermark"`
// SchedulerName informs the scheduler of its name, so that it can identify pods that a previous
// version handled.
SchedulerName string `json:"schedulerName"`
// ReconcileWorkers sets the number of parallel workers to use for the global reconcile queue.
ReconcileWorkers int `json:"reconcileWorkers"`
// LogSuccessiveFailuresThreshold is the threshold for number of failures in a row at which
// we'll start logging that an object is failing to be reconciled.
//
// This is to help make it easier to go from metrics saying "N objects are failing" to actually
// finding the relevant objects.
LogSuccessiveFailuresThreshold int `json:"logSuccessiveFailuresThreshold"`
// StartupEventHandlingTimeoutSeconds gives the maximum duration, in seconds, that we are
// allowed to wait to finish handling all of the initial events generated by reading the cluster
// state on startup.
//
// If event processing takes longer than this time, then plugin creation will fail, and the
// scheduler pod will retry.
StartupEventHandlingTimeoutSeconds int `json:"startupEventHandlingTimeoutSeconds"`
// K8sCRUDTimeoutSeconds sets the timeout to use for creating, updating, or deleting singular
// kubernetes objects.
K8sCRUDTimeoutSeconds int `json:"k8sCRUDTimeoutSeconds"`
// PatchRetryWaitSeconds sets the minimum duration, in seconds, that we must wait between
// successive patch operations on a VirtualMachine object.
PatchRetryWaitSeconds int `json:"patchRetryWaitSeconds"`
// NodeMetricLabels gives additional labels to annotate node metrics with.
// The map is keyed by the metric name, and gives the kubernetes label that should be used to
// populate it.
//
// For example, we might use the following:
//
// {
// "availability_zone": "topology.kubernetes.io/zone",
// "node_group": "eks.amazonaws.com/nodegroup"
// }
NodeMetricLabels map[string]string `json:"nodeMetricLabels"`
// IgnoredNamespaces, if provided, gives a list of namespaces that the plugin should completely
// ignore, as if pods from those namespaces do not exist.
//
// This is specifically designed for our "overprovisioning" namespace, which creates paused pods
// to trigger cluster-autoscaler.
//
// The only exception to this rule is during Filter method calls, where we do still count the
// resources from such pods. The reason to do that is so that these overprovisioning pods can be
// evicted, which will allow cluster-autoscaler to trigger scale-up.
IgnoredNamespaces []string `json:"ignoredNamespaces"`
}
type ScoringConfig struct {
// Details about node scoring:
// See also: https://www.desmos.com/calculator/wg8s0yn63s
// In the desmos, the value f(x,s) gives the score (from 0 to 1) of a node that's x amount full
// (where x is a fraction from 0 to 1), with a total size that is equal to the maximum size node
// times s (i.e. s (or: "scale") gives the ratio between this node's size and the biggest one).
// MinUsageScore gives the ratio of the score at the minimum usage (i.e. 0) relative to the
// score at the midpoint, which will have the maximum.
//
// This corresponds to y₀ in the desmos link above.
MinUsageScore float64 `json:"minUsageScore"`
// MaxUsageScore gives the ratio of the score at the maximum usage (i.e. full) relative to the
// score at the midpoint, which will have the maximum.
//
// This corresponds to y₁ in the desmos link above.
MaxUsageScore float64 `json:"maxUsageScore"`
// ScorePeak gives the fraction at which the "target" or highest score should be, with the score
// sloping down on either side towards MinUsageScore at 0 and MaxUsageScore at 1.
//
// This corresponds to xₚ in the desmos link.
ScorePeak float64 `json:"scorePeak"`
// Randomize, if true, will cause the scheduler to score a node with a random number in the
// range [framework.MinNodeScore + 1, trueScore], instead of the trueScore.
Randomize bool
}
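// For reference, a config file for this plugin could look roughly like the following.
// The values are illustrative only (not defaults from any real deployment), but they do
// satisfy the validation rules below:
//
// {
// "scoring": {"minUsageScore": 0.5, "maxUsageScore": 0, "scorePeak": 0.8},
// "watermark": 0.9,
// "schedulerName": "autoscale-scheduler",
// "reconcileWorkers": 16,
// "logSuccessiveFailuresThreshold": 10,
// "startupEventHandlingTimeoutSeconds": 15,
// "k8sCRUDTimeoutSeconds": 5,
// "patchRetryWaitSeconds": 1,
// "nodeMetricLabels": {"availability_zone": "topology.kubernetes.io/zone"},
// "ignoredNamespaces": ["overprovisioning"]
// }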
///////////////////////
// CONFIG VALIDATION //
///////////////////////
// if the returned error is not nil, the string is a JSON path to the invalid value
func (c *Config) validate() (string, error) {
if path, err := c.Scoring.validate(); err != nil {
return fmt.Sprintf("nodeConfig.%s", path), err
}
if c.SchedulerName == "" {
return "schedulerName", errors.New("string cannot be empty")
}
if c.ReconcileWorkers <= 0 {
return "reconcileWorkers", errors.New("value must be > 0")
}
if c.LogSuccessiveFailuresThreshold <= 0 {
return "logSuccessiveFailuresThreshold", errors.New("value must be > 0")
}
if c.StartupEventHandlingTimeoutSeconds <= 0 {
return "startupEventHandlingTimeoutSeconds", errors.New("value must be > 0")
}
if c.K8sCRUDTimeoutSeconds <= 0 {
return "k8sCRUDTimeoutSeconds", errors.New("value must be > 0")
}
if c.PatchRetryWaitSeconds <= 0 {
return "patchRetryWaitSeconds", errors.New("value must be > 0")
}
if c.Watermark <= 0.0 {
return "watermark", errors.New("value must be > 0")
} else if c.Watermark > 1.0 {
return "watermark", errors.New("value must be <= 1")
}
return "", nil
}
func (c *ScoringConfig) validate() (string, error) {
if c.MinUsageScore < 0 || c.MinUsageScore > 1 {
return "minUsageScore", errors.New("value must be between 0 and 1, inclusive")
} else if c.MaxUsageScore < 0 || c.MaxUsageScore > 1 {
return "maxUsageScore", errors.New("value must be between 0 and 1, inclusive")
} else if c.ScorePeak < 0 || c.ScorePeak > 1 {
return "scorePeak", errors.New("value must be between 0 and 1, inclusive")
}
return "", nil
}
////////////////////
// CONFIG READING //
////////////////////
const DefaultConfigPath = "/etc/scheduler-plugin-config/autoscale-enforcer-config.json"
func ReadConfig(path string) (*Config, error) {
file, err := os.Open(path)
if err != nil {
return nil, fmt.Errorf("Error opening config file %q: %w", path, err)
}
defer file.Close()
var config Config
jsonDecoder := json.NewDecoder(file)
jsonDecoder.DisallowUnknownFields()
if err = jsonDecoder.Decode(&config); err != nil {
return nil, fmt.Errorf("Error decoding JSON config in %q: %w", path, err)
}
if path, err = config.validate(); err != nil {
return nil, fmt.Errorf("Invalid config at %s: %w", path, err)
}
return &config, nil
}
//////////////////////////////////////
// HELPER METHODS FOR USING CONFIGS //
//////////////////////////////////////
func (c Config) ignoredNamespace(namespace string) bool {
return slices.Contains(c.IgnoredNamespaces, namespace)
}
package plugin
import (
"context"
"fmt"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/samber/lo"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
"k8s.io/kubernetes/pkg/scheduler/framework"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
vmclient "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
"github.com/neondatabase/autoscaling/pkg/plugin/initevents"
"github.com/neondatabase/autoscaling/pkg/plugin/metrics"
"github.com/neondatabase/autoscaling/pkg/plugin/reconcile"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/watch"
)
func NewAutoscaleEnforcerPlugin(
baseCtx context.Context,
logger *zap.Logger,
handle framework.Handle,
config *Config,
) (_ *AutoscaleEnforcer, finalError error) {
// create the NeonVM client
if err := vmv1.AddToScheme(scheme.Scheme); err != nil {
return nil, err
}
vmConfig := rest.CopyConfig(handle.KubeConfig())
// The handler's ContentType is not the default "application/json" (it's protobuf), so we need
// to set it back to JSON because NeonVM doesn't support protobuf.
vmConfig.ContentType = "application/json"
vmConfig.QPS = 1000 // default QPS is 5. That's too little to handle thousands of pods.
vmClient, err := vmclient.NewForConfig(vmConfig)
if err != nil {
return nil, fmt.Errorf("could not create NeonVM client: %w", err)
}
// set up a new context to cancel the background tasks if we bail early.
ctx, cancel := context.WithCancel(baseCtx)
defer func() {
if finalError != nil {
cancel()
}
}()
promReg := prometheus.NewRegistry()
metrics.RegisterDefaultCollectors(promReg)
// pre-define this so that we can reference it in the handlers, knowing that it won't be used
// until we start the workers (which we do *after* we've set this value).
var pluginState *PluginState
initEvents := initevents.NewInitEventsMiddleware()
reconcileQueue, err := reconcile.NewQueue(
map[reconcile.Object]reconcile.HandlerFunc{
&corev1.Node{}: func(logger *zap.Logger, k reconcile.EventKind, obj reconcile.Object) (reconcile.Result, error) {
return lo.Empty[reconcile.Result](), pluginState.HandleNodeEvent(logger, k, obj.(*corev1.Node))
},
&corev1.Pod{}: func(logger *zap.Logger, k reconcile.EventKind, obj reconcile.Object) (reconcile.Result, error) {
result, err := pluginState.HandlePodEvent(logger, k, obj.(*corev1.Pod))
return lo.FromPtr(result), err
},
&vmv1.VirtualMachineMigration{}: func(logger *zap.Logger, k reconcile.EventKind, obj reconcile.Object) (reconcile.Result, error) {
vmm := obj.(*vmv1.VirtualMachineMigration)
return lo.Empty[reconcile.Result](), pluginState.HandleMigrationEvent(logger, k, vmm)
},
},
reconcile.WithBaseContext(ctx),
reconcile.WithMiddleware(initEvents),
// Note: we need one layer of indirection for callbacks referencing pluginState, because
// it's initialized later, so directly referencing the methods at this point will use the
// nil pluginState and panic on use.
reconcile.WithQueueWaitDurationCallback(func(duration time.Duration) {
pluginState.reconcileQueueWaitCallback(duration)
}),
reconcile.WithResultCallback(func(params reconcile.ObjectParams, duration time.Duration, err error) {
pluginState.reconcileResultCallback(params, duration, err)
}),
reconcile.WithErrorStatsCallback(func(params reconcile.ObjectParams, stats reconcile.ErrorStats) {
pluginState.reconcileErrorStatsCallback(logger, params, stats)
}),
reconcile.WithPanicCallback(func(params reconcile.ObjectParams) {
pluginState.reconcilePanicCallback(params)
}),
)
if err != nil {
return nil, fmt.Errorf("could not setup reconcile queue: %w", err)
}
watchMetrics := watch.NewMetrics("autoscaling_plugin_watchers", promReg)
// Fetch the nodes first, so that they'll *tend* to be added to the state before we try to
// handle the pods that are on them.
// It's not guaranteed, because parallel workers acquiring the same lock ends up with *some*
// reordered handling, but it helps dramatically reduce the number of warnings in practice.
nodeHandlers := watchHandlers[*corev1.Node](reconcileQueue, initEvents)
nodeStore, err := watchNodeEvents(ctx, logger, handle.ClientSet(), watchMetrics, nodeHandlers)
if err != nil {
return nil, fmt.Errorf("could not start watch on Node events: %w", err)
}
podHandlers := watchHandlers[*corev1.Pod](reconcileQueue, initEvents)
podStore, err := watchPodEvents(ctx, logger, handle.ClientSet(), watchMetrics, podHandlers)
if err != nil {
return nil, fmt.Errorf("could not start watch on Pod events: %w", err)
}
// we make these handlers with nil instead of initEvents so that we're not blocking plugin setup
// on the migration objects being handled.
vmmHandlers := watchHandlers[*vmv1.VirtualMachineMigration](reconcileQueue, nil)
if err := watchMigrationEvents(ctx, logger, vmClient, watchMetrics, vmmHandlers); err != nil {
return nil, fmt.Errorf("could not start watch on VirtualMachineMigration events: %w", err)
}
pluginState = NewPluginState(*config, vmClient, promReg, podStore, nodeStore)
// Start the workers for the queue. We can't do these earlier because our handlers depend on the
// PluginState that only exists now.
reconcileLogger := logger.Named("reconcile")
for i := 0; i < config.ReconcileWorkers; i++ {
go reconcileWorker(ctx, reconcileLogger, reconcileQueue)
}
err = util.StartPrometheusMetricsServer(ctx, logger.Named("prometheus"), 9100, promReg)
if err != nil {
return nil, fmt.Errorf("could not start prometheus server: %w", err)
}
indexedPodStore := watch.NewIndexedStore(podStore, watch.NewNameIndex[corev1.Pod]())
getPod := func(p util.NamespacedName) (*corev1.Pod, bool) {
return indexedPodStore.GetIndexed(func(index *watch.NameIndex[corev1.Pod]) (*corev1.Pod, bool) {
return index.Get(p.Namespace, p.Name)
})
}
err = pluginState.startPermitHandler(ctx, logger.Named("agent-handler"), getPod, podStore.Listen)
if err != nil {
return nil, fmt.Errorf("could not start agent request handler: %w", err)
}
// The reconciles are ongoing -- we need to wait until they're finished.
timeout := time.Second * time.Duration(config.StartupEventHandlingTimeoutSeconds)
start := time.Now()
select {
case <-ctx.Done():
logger.Warn("Context unexpectedly canceled while waiting for initial events to be handled")
return nil, ctx.Err()
case <-time.After(timeout):
logger.Error("Timed out handling initial events")
// intentionally use separate log lines, to emit *something* if it deadlocks.
logger.Warn("Objects remaining to be reconciled", zap.Any("Remaining", initEvents.Remaining()))
return nil, fmt.Errorf("timed out after %s while handling initial events", time.Since(start))
case <-initEvents.Done():
logger.Info("Handled all initial events", zap.Duration("duration", time.Since(start)))
}
// Reconciles are finished -- for now. Some of them may be waiting on startup to complete, in
// order to guarantee accuracy. Let's mark startup as done, and requeue those:
pluginState.mu.Lock()
defer pluginState.mu.Unlock()
pluginState.startupDone = true
for uid := range pluginState.requeueAfterStartup {
err := pluginState.requeuePod(uid)
if err != nil {
logger.Warn(
"Could not requeue Pod after startup, maybe it was deleted?",
zap.String("UID", string(uid)),
)
}
}
clear(pluginState.requeueAfterStartup)
return &AutoscaleEnforcer{
logger: logger.Named("plugin"),
state: pluginState,
metrics: &pluginState.metrics.Framework,
}, nil
}
package plugin
import (
"context"
"fmt"
"math/rand"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/kubernetes/pkg/scheduler/framework"
"github.com/neondatabase/autoscaling/pkg/plugin/metrics"
"github.com/neondatabase/autoscaling/pkg/plugin/reconcile"
"github.com/neondatabase/autoscaling/pkg/plugin/state"
)
const PluginName = "AutoscaleEnforcer"
// AutoscaleEnforcer implements Kubernetes scheduling plugins to account for available autoscaling
// resources during scheduling.
//
// For more info on k8s scheduling plugins, see:
// https://kubernetes.io/docs/concepts/scheduling-eviction/scheduling-framework/
type AutoscaleEnforcer struct {
logger *zap.Logger
state *PluginState
metrics *metrics.Framework
}
// Compile-time checks that AutoscaleEnforcer actually implements the interfaces we want it to
var (
_ framework.Plugin = (*AutoscaleEnforcer)(nil)
_ framework.PostFilterPlugin = (*AutoscaleEnforcer)(nil)
_ framework.FilterPlugin = (*AutoscaleEnforcer)(nil)
_ framework.ScorePlugin = (*AutoscaleEnforcer)(nil)
_ framework.ReservePlugin = (*AutoscaleEnforcer)(nil)
)
// Name returns the name of the AutoscaleEnforcer plugin
//
// Name implements framework.Plugin.
func (e *AutoscaleEnforcer) Name() string {
return PluginName
}
func logFieldForNodeName(nodeName string) zap.Field {
return zap.Object("Node", zapcore.ObjectMarshalerFunc(
func(enc zapcore.ObjectEncoder) error {
enc.AddString("Name", nodeName)
return nil
},
))
}
func (e *AutoscaleEnforcer) checkSchedulerName(logger *zap.Logger, pod *corev1.Pod) *framework.Status {
if e.state.config.SchedulerName != pod.Spec.SchedulerName {
err := fmt.Errorf(
"mismatched SchedulerName for pod: our config has %q, but the pod has %q",
e.state.config.SchedulerName, pod.Spec.SchedulerName,
)
logger.Error("Pod has unexpected SchedulerName", zap.Error(err))
return framework.NewStatus(framework.Error, err.Error())
}
return nil
}
// PostFilter is used by us for metrics on filter cycles that reject a Pod by filtering out all
// applicable nodes.
//
// Quoting the docs for PostFilter:
//
// > These plugins are called after Filter phase, but only when no feasible nodes were found for the
// > pod.
//
// PostFilter implements framework.PostFilterPlugin.
func (e *AutoscaleEnforcer) PostFilter(
ctx context.Context,
state *framework.CycleState,
pod *corev1.Pod,
filteredNodeStatusMap framework.NodeToStatusMap,
) (_ *framework.PostFilterResult, status *framework.Status) {
ignored := e.state.config.ignoredNamespace(pod.Namespace)
e.metrics.IncMethodCall("PostFilter", pod, ignored)
defer func() {
e.metrics.IncFailIfnotSuccess("PostFilter", pod, ignored, status)
}()
logger := e.logger.With(
zap.String("method", "PostFilter"),
reconcile.ObjectMetaLogField("FilterPod", pod),
)
logger.Error("Pod rejected by all Filter method calls")
return nil, nil // PostFilterResult is optional, nil Status is success.
}
// Filter gives our plugin a chance to signal that a pod shouldn't be put onto a particular node
//
// Filter implements framework.FilterPlugin.
func (e *AutoscaleEnforcer) Filter(
ctx context.Context,
_state *framework.CycleState,
pod *corev1.Pod,
nodeInfo *framework.NodeInfo,
) (status *framework.Status) {
ignored := e.state.config.ignoredNamespace(pod.Namespace)
e.metrics.IncMethodCall("Filter", pod, ignored)
defer func() {
e.metrics.IncFailIfnotSuccess("Filter", pod, ignored, status)
}()
nodeName := nodeInfo.Node().Name
logger := e.logger.With(
zap.String("method", "Filter"),
reconcile.ObjectMetaLogField("FilterPod", pod),
reconcile.ObjectMetaLogField("Node", nodeInfo.Node()),
)
logger.Info("Handling Filter request")
if status := e.checkSchedulerName(logger, pod); status != nil {
return status
}
podState, err := state.PodStateFromK8sObj(pod)
if err != nil {
msg := "Error extracting local information for Pod"
logger.Error(msg, zap.Error(err))
return framework.NewStatus(
framework.UnschedulableAndUnresolvable,
fmt.Sprintf("%s: %s", msg, err.Error()),
)
}
// precreate a map for the pods that are proposed to exist on this node, so that we're not doing
// this with the lock acquired.
proposedPods := make(map[types.UID]*framework.PodInfo)
for _, p := range nodeInfo.Pods {
proposedPods[p.Pod.UID] = p
}
e.state.mu.Lock()
defer e.state.mu.Unlock()
ns, ok := e.state.nodes[nodeName]
if !ok {
msg := "Node not found in local state"
logger.Error(msg)
return framework.NewStatus(framework.Error, msg)
}
var approve bool
ns.node.Speculatively(func(n *state.Node) (commit bool) {
approve = e.filterCheck(logger, ns.node, n, podState, proposedPods)
return false // never commit these changes; we're just using this for a temp node.
})
if !approve {
return framework.NewStatus(framework.Unschedulable, "Not enough resources for Pod")
} else {
return nil
}
}
func (e *AutoscaleEnforcer) filterCheck(
logger *zap.Logger,
oldNode *state.Node,
tmpNode *state.Node,
filterPod state.Pod,
otherPods map[types.UID]*framework.PodInfo,
) (ok bool) {
type podInfo struct {
Namespace string
Name string
UID types.UID
}
// Our strategy here is to make the set of pods on the temporary node match what was supplied as
// the proposal by the core scheduler -- basically making sure that it's exactly otherPods and
// nothing more.
//
// We *could* derive a new node state from scratch using the node and pod objects we were
// provided, but this risks us making decisions on inconsistent state (maybe the core
// scheduler's view is outdated), and error handling is trickier -- here, we can at least use
// the last known good state of the pod.
var localNotInProposed []podInfo
for uid, pod := range tmpNode.Pods() {
if _, ok := otherPods[uid]; !ok {
// pod is not in the set given to the filter method. We should remove it from the temp
// node, and mark that for later.
tmpNode.RemovePod(uid)
localNotInProposed = append(localNotInProposed, podInfo{
Namespace: pod.Namespace,
Name: pod.Name,
UID: pod.UID,
})
} else {
// Otherwise, the pod *is* in the set for filter, and so we should remove it from
// 'otherPods' to mark it as already included.
delete(otherPods, uid)
}
}
// all that remains in otherPods are the pods proposed by the scheduler that were not already
// present in tmpNode's local state.
var proposedNotInLocalState []podInfo
for _, p := range otherPods {
proposedNotInLocalState = append(proposedNotInLocalState, podInfo{
Namespace: p.Pod.Namespace,
Name: p.Pod.Name,
UID: p.Pod.UID,
})
pod, err := state.PodStateFromK8sObj(p.Pod)
if err != nil {
logger.Error(
"Ignoring extra Pod in Filter stage because extracting custom state failed",
reconcile.ObjectMetaLogField("Pod", p.Pod),
zap.Error(err),
)
continue
}
tmpNode.AddPod(pod)
}
// At this point:
// * oldNode has the actual local state for the node
// * tmpNode has the state as given by the filter pods
// * localNotInProposed are the pods in oldNode but not tmpNode
// * proposedNotInLocalState are the pods in tmpNode but not oldNode
//
// We'll use (another) Speculatively() to simultaneously show all these, plus the state
// resulting from adding the Pod to filter.
var canAddToNode bool
tmpNode.Speculatively(func(n *state.Node) (commit bool) {
n.AddPod(filterPod)
canAddToNode = !n.OverBudget()
var msg string
if canAddToNode {
msg = "Allowing Pod placement onto this Node"
} else {
msg = "Rejecting Pod placement onto this Node"
}
logger.Info(
msg,
zap.Object("Node", oldNode),
zap.Object("FilterNode", tmpNode),
zap.Object("FilterNodeWithPod", n),
zap.Object("Pod", filterPod),
zap.Any("LocalPodsNotInFilterState", localNotInProposed),
zap.Any("FilterPodsNotInLocalState", proposedNotInLocalState),
)
return false // don't commit. Doesn't really matter because we're operating on the temp node.
})
return canAddToNode
}
// Score allows our plugin to express which nodes should be preferred for scheduling new pods onto
//
// Even though this function is given (pod, node) pairs, our scoring is only really dependent on
// values of the node. However, we have special handling for when the pod no longer fits in the node
// (even though it might have during the Filter plugin) - we can't return a failure, because that
// would cause *all* scheduling of the pod to fail, so we instead return the minimum score.
//
// The scores might not be consistent with each other, due to ongoing changes in the node. That's
// ok, because nothing relies on strict correctness here, and they should be approximately correct
// anyways.
//
// Score implements framework.ScorePlugin.
func (e *AutoscaleEnforcer) Score(
ctx context.Context,
_state *framework.CycleState,
pod *corev1.Pod,
nodeName string,
) (_ int64, status *framework.Status) {
ignored := e.state.config.ignoredNamespace(pod.Namespace)
e.metrics.IncMethodCall("Score", pod, ignored)
defer func() {
e.metrics.IncFailIfnotSuccess("Score", pod, ignored, status)
}()
logger := e.logger.With(
zap.String("method", "Score"),
reconcile.ObjectMetaLogField("Pod", pod),
)
logger.Info("Handling Score request", logFieldForNodeName(nodeName))
if status := e.checkSchedulerName(logger, pod); status != nil {
return framework.MinNodeScore, status
}
podState, err := state.PodStateFromK8sObj(pod)
if err != nil {
msg := "Error extracting local information for Pod"
logger.Error(msg, zap.Error(err))
return framework.MinNodeScore, framework.NewStatus(
framework.UnschedulableAndUnresolvable,
fmt.Sprintf("%s: %s", msg, err.Error()),
)
}
e.state.mu.Lock()
defer e.state.mu.Unlock()
ns, ok := e.state.nodes[nodeName]
if !ok {
msg := "Node not found in local state"
logger.Error(msg)
status := framework.NewStatus(framework.Error, msg)
return framework.MinNodeScore, status
}
var score int64
ns.node.Speculatively(func(tmp *state.Node) (commit bool) {
tmp.AddPod(podState)
overBudget := tmp.OverBudget()
if overBudget {
score = framework.MinNodeScore
logger.Warn(
"No room for Pod on Node, giving minimum score (typically handled by Filter instead)",
zap.Int64("Score", score),
zap.Object("NodeWithPod", tmp),
)
} else {
cfg := e.state.config.Scoring
cpuScore := calculateScore(cfg, tmp.CPU.Reserved, tmp.CPU.Total, e.state.maxNodeCPU)
memScore := calculateScore(cfg, tmp.Mem.Reserved, tmp.Mem.Total, e.state.maxNodeMem)
scoreFraction := min(cpuScore, memScore)
scoreLen := framework.MaxNodeScore - framework.MinNodeScore
score = framework.MinNodeScore + int64(float64(scoreLen)*scoreFraction)
logger.Info(
"Scored Pod placement for Node",
zap.Int64("Score", score),
zap.Float64("CPUFraction", cpuScore),
zap.Float64("MemFraction", memScore),
zap.Object("NodeWithPod", tmp),
)
}
return false // never commit, we're doing this just to check.
})
return score, nil
}
type floatable interface {
AsFloat64() float64
}
// Refer to the comments in ScoringConfig for more. Also, see: https://www.desmos.com/calculator/wg8s0yn63s
func calculateScore[T floatable](
cfg ScoringConfig,
reserved T,
total T,
maxTotalSeen T,
) float64 {
y0 := cfg.MinUsageScore
y1 := cfg.MaxUsageScore
xp := cfg.ScorePeak
fraction := reserved.AsFloat64() / total.AsFloat64()
scale := total.AsFloat64() / maxTotalSeen.AsFloat64()
score := float64(1) // if fraction == cfg.ScorePeak
if fraction < cfg.ScorePeak {
score = y0 + (1-y0)/xp*fraction
} else if fraction > cfg.ScorePeak {
score = y1 + (1-y1)/(1-xp)*(1-fraction)
}
return score * scale
}
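// As a worked example (illustrative numbers, not a recommended configuration): with
// minUsageScore=0.5, maxUsageScore=0, and scorePeak=0.8, a node of the maximum size (scale=1)
// that is 40% reserved scores 0.5 + (0.5/0.8)*0.4 = 0.75, while the same node at 90% reserved
// scores 0 + (1/0.2)*(1-0.9) = 0.5; a node half the size of the largest has both scores halved.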
// NormalizeScore randomizes each node's score uniformly in the range [minScore, trueScore],
// where minScore is framework.MinNodeScore + 1.
//
// NormalizeScore implements framework.ScoreExtensions.
func (e *AutoscaleEnforcer) NormalizeScore(
ctx context.Context,
state *framework.CycleState,
pod *corev1.Pod,
scores framework.NodeScoreList,
) (status *framework.Status) {
ignored := e.state.config.ignoredNamespace(pod.Namespace)
e.metrics.IncMethodCall("NormalizeScore", pod, ignored)
defer func() {
e.metrics.IncFailIfnotSuccess("NormalizeScore", pod, ignored, status)
}()
logger := e.logger.With(
zap.String("method", "NormalizeScore"),
reconcile.ObjectMetaLogField("Pod", pod),
)
type scoring struct {
Node string
OldScore int64
NewScore int64
}
var scoreInfos []scoring
for _, node := range scores {
oldScore := node.Score
// rand.Intn will panic if we pass in 0
if oldScore == 0 {
scoreInfos = append(scoreInfos, scoring{
Node: node.Name,
OldScore: oldScore,
NewScore: node.Score,
})
continue
}
// This is different from framework.MinNodeScore. We use framework.MinNodeScore
// to indicate that a pod should not be placed on a node. The lowest
// actual score we assign a node is thus framework.MinNodeScore + 1
minScore := framework.MinNodeScore + 1
// We want to pick a score in the range [minScore, score], so use score + 1 - minScore, as
// rand.Intn picks a number in the *half open* range [0, n).
newScore := minScore + int64(rand.Intn(int(oldScore+1-minScore)))
node.Score = newScore
scoreInfos = append(scoreInfos, scoring{
Node: node.Name,
OldScore: oldScore,
NewScore: newScore,
})
}
logger.Info("Randomized Node scores for Pod", zap.Any("scores", scoreInfos))
return nil
}
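// For example, assuming the framework's usual MinNodeScore of 0: a node that Score gave 80 ends
// up with a uniformly random score in [1, 80], while a node scored 0 is left untouched so that
// it still reads as "do not place the pod here".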
// ScoreExtensions is required for framework.ScorePlugin, and can return nil if it's not used.
// However, we do use it, to randomize scores (when enabled).
func (e *AutoscaleEnforcer) ScoreExtensions() framework.ScoreExtensions {
if e.state.config.Scoring.Randomize {
return e
} else {
return nil
}
}
// Reserve signals to our plugin that a particular pod will (probably) be bound to a node, giving us
// a chance to both (a) reserve the resources it needs within the node and (b) reject the pod if
// there aren't enough.
//
// Reserve implements framework.ReservePlugin.
func (e *AutoscaleEnforcer) Reserve(
ctx context.Context,
_state *framework.CycleState,
pod *corev1.Pod,
nodeName string,
) (status *framework.Status) {
ignored := e.state.config.ignoredNamespace(pod.Namespace)
e.metrics.IncMethodCall("Reserve", pod, ignored)
defer func() {
e.metrics.IncFailIfnotSuccess("Reserve", pod, ignored, status)
}()
logger := e.logger.With(
zap.String("method", "Reserve"),
reconcile.ObjectMetaLogField("Pod", pod),
)
if ignored {
logger.Info("Skipping Reserve request for ignored namespace", logFieldForNodeName(nodeName))
return nil
}
logger.Info("Handling Reserve request", logFieldForNodeName(nodeName))
if status := e.checkSchedulerName(logger, pod); status != nil {
return status
}
podState, err := state.PodStateFromK8sObj(pod)
if err != nil {
msg := "Error extracting local information for Pod"
logger.Error(msg, zap.Error(err))
return framework.NewStatus(
framework.UnschedulableAndUnresolvable,
fmt.Sprintf("%s: %s", msg, err.Error()),
)
}
e.state.mu.Lock()
defer e.state.mu.Unlock()
if _, ok := e.state.tentativelyScheduled[pod.UID]; ok {
msg := "Pod already exists in set of tentatively scheduled pods"
logger.Error(msg)
return framework.NewStatus(framework.UnschedulableAndUnresolvable, msg)
}
ns, ok := e.state.nodes[nodeName]
if !ok {
msg := "Node not found in local state"
logger.Error(msg)
return framework.NewStatus(framework.Error, msg)
}
// use Speculatively() to compare before/after
//
// Note that we always allow the change to go through, even though we *could* deny the Reserve()
// if there isn't room. We don't deny, because that's ultimately less reliable.
// For more, see https://github.com/neondatabase/autoscaling/issues/869
ns.node.Speculatively(func(n *state.Node) (commit bool) {
n.AddPod(podState)
e.state.tentativelyScheduled[pod.UID] = nodeName
logger.Info(
"Reserved tentatively scheduled Pod on Node",
zap.Object("Pod", podState),
zap.Object("OldNode", ns.node),
zap.Object("Node", n),
)
return true // Yes, commit these changes.
})
if ns.node.OverBudget() {
e.metrics.IncReserveOverBudget(ignored, ns.node)
}
return nil
}
// Unreserve marks a pod as no longer on-track to being bound to a node, so we can release the
// resources we previously reserved for it.
//
// Note: the documentation for ReservePlugin indicates that Unreserve both (a) must be idempotent
// and (b) may be called without a previous call to Reserve for the same pod.
//
// Unreserve implements framework.ReservePlugin.
func (e *AutoscaleEnforcer) Unreserve(
ctx context.Context,
_state *framework.CycleState,
pod *corev1.Pod,
nodeName string,
) {
ignored := e.state.config.ignoredNamespace(pod.Namespace)
e.metrics.IncMethodCall("Unreserve", pod, ignored)
logger := e.logger.With(
zap.String("method", "Unreserve"),
reconcile.ObjectMetaLogField("Pod", pod),
)
logger.Info("Handling Unreserve request", logFieldForNodeName(nodeName))
e.state.mu.Lock()
defer e.state.mu.Unlock()
nn, ok := e.state.tentativelyScheduled[pod.UID]
if !ok {
logger.Warn("Cannot unreserve Pod, it isn't in the set of tentatively scheduled pods")
return
} else if nn != nodeName {
logger.Panic(
"Pod is tentatively scheduled on an unexpected node",
zap.String("ExpectedNode", nodeName),
zap.String("ActualNode", nn),
)
return
}
ns, ok := e.state.nodes[nodeName]
if !ok {
logger.Error("Node not found in local state", logFieldForNodeName(nodeName))
return
}
ns.node.Speculatively(func(n *state.Node) (commit bool) {
p, ok := n.GetPod(pod.UID)
if !ok {
logger.Panic("Pod unexpectedly doesn't exist on Node", logFieldForNodeName(nodeName))
return
}
n.RemovePod(pod.UID)
delete(e.state.tentativelyScheduled, pod.UID)
logger.Info(
"Unreserved tentatively scheduled Pod",
zap.Object("Pod", p),
zap.Object("OldNode", ns.node),
zap.Object("Node", n),
)
return true // yes, commit these changes.
})
}
package plugin
// Root state for the plugin.
import (
"context"
"encoding/json"
"errors"
"fmt"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
vmclient "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/plugin/metrics"
"github.com/neondatabase/autoscaling/pkg/plugin/state"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/patch"
"github.com/neondatabase/autoscaling/pkg/util/watch"
)
// PluginState stores the state of the scheduler plugin in its entirety
type PluginState struct {
mu sync.Mutex
config Config
nodes map[string]*nodeState
// tentativelyScheduled stores the UIDs of pods that have been approved for final scheduling
// with the Reserve plugin method, but haven't yet been processed internally as finally being
// assigned to those nodes.
//
// the string associated with each pod is the name of the node.
tentativelyScheduled map[types.UID]string
startupDone bool
requeueAfterStartup map[types.UID]struct{}
// maxNodeCPU is the maximum amount of CPU we've seen available for a node.
// We use this when scoring pod placements.
maxNodeCPU vmv1.MilliCPU
// maxNodeMem is the maximum amount of memory we've seen available for a node.
// We use this when scoring pod placements.
maxNodeMem api.Bytes
metrics metrics.Plugin
requeuePod func(uid types.UID) error
requeueNode func(nodeName string) error
createMigration func(*zap.Logger, *vmv1.VirtualMachineMigration) error
deleteMigration func(*zap.Logger, *vmv1.VirtualMachineMigration) error
patchVM func(util.NamespacedName, []patch.Operation) error
}
type nodeState struct {
node *state.Node
// requestedMigrations stores the set of pods that we've decided we should migrate.
//
// When they are reconciled, we will (a) double-check that we should still migrate them, and (b)
// if so, create a VirtualMachineMigration object to handle it.
requestedMigrations map[types.UID]struct{}
// podsVMPatchedAt stores the last time that the VirtualMachine object for a Pod was patched, so
// that we can avoid spamming patch requests if the Pod is just slightly out of date.
//
// The map is keyed by the *Pod* UID, even though it stores when we patched the *VM*.
podsVMPatchedAt map[types.UID]time.Time
}
func NewPluginState(
config Config,
vmClient vmclient.Interface,
reg prometheus.Registerer,
podWatchStore *watch.Store[corev1.Pod],
nodeWatchStore *watch.Store[corev1.Node],
) *PluginState {
crudTimeout := time.Second * time.Duration(config.K8sCRUDTimeoutSeconds)
indexedNodeStore := watch.NewIndexedStore(nodeWatchStore, watch.NewFlatNameIndex[corev1.Node]())
metrics := metrics.BuildPluginMetrics(config.NodeMetricLabels, reg)
return &PluginState{
mu: sync.Mutex{},
config: config,
nodes: make(map[string]*nodeState),
tentativelyScheduled: make(map[types.UID]string),
startupDone: false,
requeueAfterStartup: make(map[types.UID]struct{}),
// these values will be set as we handle node events:
maxNodeCPU: 0,
maxNodeMem: 0,
metrics: metrics,
requeuePod: func(uid types.UID) error {
ok := podWatchStore.NopUpdate(uid)
if !ok {
return errors.New("pod not found in watch store")
}
return nil
},
requeueNode: func(nodeName string) error {
node, ok := indexedNodeStore.GetIndexed(
func(i *watch.FlatNameIndex[corev1.Node]) (*corev1.Node, bool) {
return i.Get(nodeName)
},
)
if !ok {
return errors.New("node not found in watch store")
}
_ = nodeWatchStore.NopUpdate(node.UID)
return nil
},
createMigration: func(logger *zap.Logger, vmm *vmv1.VirtualMachineMigration) error {
ctx, cancel := context.WithTimeout(context.TODO(), crudTimeout)
defer cancel()
_, err := vmClient.NeonvmV1().VirtualMachineMigrations(vmm.Namespace).
Create(ctx, vmm, metav1.CreateOptions{})
metrics.RecordK8sOp("Create", "VirtualMachineMigration", vmm.Name, err)
if err != nil && apierrors.IsAlreadyExists(err) {
logger.Warn("Migration already exists for this pod")
return nil
}
return err
},
deleteMigration: func(logger *zap.Logger, vmm *vmv1.VirtualMachineMigration) error {
ctx, cancel := context.WithTimeout(context.TODO(), crudTimeout)
defer cancel()
opts := metav1.DeleteOptions{
// Include the extra pre-condition that we're deleting exactly the migration object
// that was specified.
Preconditions: &metav1.Preconditions{
UID: &vmm.UID,
ResourceVersion: nil,
},
}
err := vmClient.NeonvmV1().VirtualMachineMigrations(vmm.Namespace).
Delete(ctx, vmm.Name, opts)
metrics.RecordK8sOp("Delete", "VirtualMachineMigration", vmm.Name, err)
return err
},
patchVM: func(vm util.NamespacedName, patches []patch.Operation) error {
patchPayload, err := json.Marshal(patches)
if err != nil {
panic(fmt.Errorf("could not marshal JSON patch: %w", err))
}
ctx, cancel := context.WithTimeout(context.TODO(), crudTimeout)
defer cancel()
_, err = vmClient.NeonvmV1().VirtualMachines(vm.Namespace).
Patch(ctx, vm.Name, types.JSONPatchType, patchPayload, metav1.PatchOptions{})
metrics.RecordK8sOp("Patch", "VirtualMachine", vm.Name, err)
return err
},
}
}
package plugin
// Handling of Node events.
import (
"fmt"
"time"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"github.com/neondatabase/autoscaling/pkg/plugin/reconcile"
"github.com/neondatabase/autoscaling/pkg/plugin/state"
)
func (s *PluginState) HandleNodeEvent(logger *zap.Logger, kind reconcile.EventKind, node *corev1.Node) error {
expectExists := kind == reconcile.EventKindModified || kind == reconcile.EventKindDeleted
switch kind {
case reconcile.EventKindAdded, reconcile.EventKindModified:
return s.updateNode(logger, node, expectExists)
case reconcile.EventKindDeleted, reconcile.EventKindEphemeral:
return s.deleteNode(logger, node, expectExists)
default:
panic("unreachable")
}
}
func (s *PluginState) updateNode(logger *zap.Logger, node *corev1.Node, expectExists bool) error {
newNode, err := state.NodeStateFromK8sObj(node, s.config.Watermark, s.metrics.Nodes.InheritedLabels)
if err != nil {
return fmt.Errorf("could not get state from Node object: %w", err)
}
s.mu.Lock()
defer s.mu.Unlock()
var updated *nodeState
oldNS, ok := s.nodes[node.Name]
if !ok {
if expectExists {
logger.Warn("Adding node that unexpectedly doesn't exist in local state")
}
s.maxNodeCPU = max(s.maxNodeCPU, newNode.CPU.Total)
s.maxNodeMem = max(s.maxNodeMem, newNode.Mem.Total)
entry := &nodeState{
node: newNode,
requestedMigrations: make(map[types.UID]struct{}),
podsVMPatchedAt: make(map[types.UID]time.Time),
}
logger.Info("Adding base node state", zap.Object("Node", entry.node))
s.nodes[node.Name] = entry
updated = entry
} else /* oldNode DOES exist, let's update it */ {
if !expectExists {
logger.Warn("Updating node that unexpectedly exists in local state")
}
// Use (*Node).Speculatively() so that we can log both states before committing, and provide
// protection from panics if .Update() has issues.
oldNS.node.Speculatively(func(n *state.Node) (commit bool) {
changed := n.Update(newNode)
if changed {
logger.Warn("Updating base node state", zap.Object("OldNode", oldNS.node), zap.Object("Node", n))
}
return true // yes, apply the change
})
updated = oldNS
}
return s.reconcileNode(logger, updated)
}
func (s *PluginState) deleteNode(logger *zap.Logger, node *corev1.Node, expectExists bool) error {
s.mu.Lock()
defer s.mu.Unlock()
n, exists := s.nodes[node.Name]
if exists && !expectExists {
logger.Warn("Deleting node that unexpectedly exists in local state")
} else if !exists && expectExists {
logger.Warn("No-op deleting node that unexpectedly doesn't exist in local state")
}
if exists {
s.cleanupNode(logger, n)
}
return nil
}
// reconcileNode makes any updates necessary given the current state of the node.
// In particular, this method:
//
// 1. Triggers live migration if reserved resources are above the watermark; and
// 2. Updates the prometheus metrics we expose about the node
//
// NOTE: this function expects that the caller has acquired s.mu.
func (s *PluginState) reconcileNode(logger *zap.Logger, ns *nodeState) error {
defer s.metrics.Nodes.Update(ns.node)
err := s.balanceNode(logger, ns)
if err != nil {
return fmt.Errorf("could not trigger live migrations: %w", err)
}
return nil
}
// updateNodeMetricsAndRequeue updates the node's metrics and puts it back in the reconcile queue.
//
// This is typically used to force nodes to stay up-to-date after we update a pod on the node, while
// helping with fairness between time spent reconciling the pod vs the node.
func (s *PluginState) updateNodeMetricsAndRequeue(logger *zap.Logger, ns *nodeState) {
if err := s.requeueNode(ns.node.Name); err != nil {
logger.Error("Failed to requeue Node", zap.Error(err))
}
s.metrics.Nodes.Update(ns.node)
}
func (s *PluginState) balanceNode(logger *zap.Logger, ns *nodeState) error {
var err error
// use Speculatively() to produce a temporary node that triggerMigrationsIfNecessary can use to
// evaluate what the state *will* look like after the migrations are running.
ns.node.Speculatively(func(tmpNode *state.Node) (commit bool) {
originalNode := ns.node
requestedMigrations := []types.UID{}
for uid := range ns.requestedMigrations {
requestedMigrations = append(requestedMigrations, uid)
}
err = triggerMigrationsIfNecessary(
logger,
originalNode,
tmpNode,
requestedMigrations,
func(podUID types.UID) error {
if err := s.requeuePod(podUID); err != nil {
return err
}
ns.requestedMigrations[podUID] = struct{}{}
return nil
},
)
return false // Never actually commit; we're just using Speculatively() for a cheap copy.
})
return err
}
// NOTE: this function expects that the caller has acquired s.mu.
func (s *PluginState) cleanupNode(logger *zap.Logger, ns *nodeState) {
// remove any tentatively scheduled pods that are on this node
for uid, nodeName := range s.tentativelyScheduled {
if nodeName == ns.node.Name {
delete(s.tentativelyScheduled, uid)
}
}
s.metrics.Nodes.Remove(ns.node)
delete(s.nodes, ns.node.Name)
logger.Info("Removed node", zap.Object("Node", ns.node))
}
package plugin
// Handling of Pod events.
import (
"encoding/json"
"errors"
"fmt"
"time"
"github.com/samber/lo"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/plugin/reconcile"
"github.com/neondatabase/autoscaling/pkg/plugin/state"
"github.com/neondatabase/autoscaling/pkg/util/patch"
)
func (s *PluginState) HandlePodEvent(
logger *zap.Logger,
kind reconcile.EventKind,
pod *corev1.Pod,
) (*reconcile.Result, error) {
if s.config.ignoredNamespace(pod.Namespace) {
// We intentionally don't track pods from ignored namespaces in the local state.
return nil, nil
}
expectExists := kind == reconcile.EventKindModified || kind == reconcile.EventKindDeleted
switch kind {
case reconcile.EventKindAdded, reconcile.EventKindModified:
updateResult, err := s.updatePod(logger, pod, expectExists)
var reconcileResult *reconcile.Result
if updateResult != nil && updateResult.retryAfter != nil {
reconcileResult = &reconcile.Result{RetryAfter: *updateResult.retryAfter}
}
if err != nil {
return reconcileResult, err
}
var retryAfter time.Duration
if updateResult != nil {
if updateResult.afterUnlock != nil {
if err := updateResult.afterUnlock(); err != nil {
return reconcileResult, err
}
}
if updateResult.needsMoreResources {
// mark this as failing; don't try again sooner than 5 seconds later.
return &reconcile.Result{RetryAfter: 5 * time.Second}, errors.New("not enough resources to grant request for pod")
}
retryAfter = lo.FromPtr(updateResult.retryAfter)
}
return &reconcile.Result{RetryAfter: retryAfter}, nil
case reconcile.EventKindDeleted, reconcile.EventKindEphemeral:
err := s.deletePod(logger, pod, expectExists)
return nil, err
default:
panic("unreachable")
}
}
type podUpdateResult struct {
needsMoreResources bool
afterUnlock func() error
retryAfter *time.Duration
}
func (s *PluginState) updatePod(
logger *zap.Logger,
pod *corev1.Pod,
expectExists bool,
) (*podUpdateResult, error) {
newPod, err := state.PodStateFromK8sObj(pod)
if err != nil {
return nil, fmt.Errorf("could not get state from Pod object: %w", err)
}
s.mu.Lock()
defer s.mu.Unlock()
var ns *nodeState // pre-declare this so we can update metrics in a defer
defer func() {
if ns != nil {
s.updateNodeMetricsAndRequeue(logger, ns)
}
}()
tentativeNode, scheduled := s.tentativelyScheduled[pod.UID]
if scheduled {
if pod.Spec.NodeName == tentativeNode {
// oh hey, this pod has been properly scheduled now! Let's remove it from the
// "tentatively scheduled" set.
delete(s.tentativelyScheduled, pod.UID)
logger.Info("Pod was scheduled as expected")
} else if pod.Spec.NodeName != "" {
logger.Panic(
"Pod was scheduled onto a different Node than tentatively recorded",
zap.String("OriginalNodeName", tentativeNode),
zap.String("NewNodeName", pod.Spec.NodeName),
)
}
}
if !scheduled && pod.Spec.NodeName == "" {
// still hasn't been scheduled, nothing to do yet.
logger.Info("Skipping event for Pod that has not yet been scheduled")
return nil, nil
}
nodeName := pod.Spec.NodeName
if nodeName == "" {
nodeName = tentativeNode
}
logger = logger.With(logFieldForNodeName(nodeName))
var ok bool
ns, ok = s.nodes[nodeName]
if !ok {
return nil, fmt.Errorf("pod's node %q is not present in local state", nodeName)
}
// make the changes in Speculatively() so that we can log both states before committing, and
// provide protection from panics.
ns.node.Speculatively(func(n *state.Node) (commit bool) {
oldPod, exists := ns.node.GetPod(newPod.UID)
// note: only warn if the pod unexpectedly *does* exist; the normal path is that pods are
// modified to be assigned their node, so we can't reliably say when a pod should not have
// *previously* been present.
if exists && !expectExists {
logger.Warn("Updating Pod that unexpectedly exists in local state")
}
if exists {
podChanged := n.UpdatePod(oldPod, newPod)
if podChanged {
logger.Info(
"Updated Pod in local state",
zap.Object("OldPod", oldPod),
zap.Object("Pod", newPod),
zap.Object("OldNode", ns.node),
zap.Object("Node", n),
)
}
} else {
n.AddPod(newPod)
logger.Info(
"Added Pod to local state",
zap.Object("Pod", newPod),
zap.Object("OldNode", ns.node),
zap.Object("Node", n),
)
}
// Commit the changes so far, then keep going.
return true
})
// At this point, our local state has been updated according to the Pod object from k8s.
//
// All that's left is to handle VMs that are the responsibility of *this* scheduler.
if lo.IsEmpty(newPod.VirtualMachine) || pod.Spec.SchedulerName != s.config.SchedulerName {
return nil, nil
}
if _, ok := ns.requestedMigrations[newPod.UID]; ok {
// If the pod is already migrating, remove it from requestedMigrations.
if newPod.Migrating {
delete(ns.requestedMigrations, newPod.UID)
} else if !newPod.Migratable {
logger.Warn("Canceling previously wanted migration because Pod is not migratable")
delete(ns.requestedMigrations, newPod.UID)
} else {
// Otherwise: the pod is not migrating, but *is* migratable. Let's trigger migration.
logger.Info("Creating migration for Pod")
return &podUpdateResult{
needsMoreResources: false,
// we need to release the lock to trigger the migration, otherwise we may slow down
// processing due to API delays.
afterUnlock: func() error {
if err := s.createMigrationForPod(logger, newPod); err != nil {
return fmt.Errorf("could not create migration for Pod: %w", err)
}
return nil
},
// All done for now; retry in 5s if the pod is not migrating yet.
retryAfter: lo.ToPtr(5 * time.Second),
}, nil
}
}
if !newPod.Migrating {
return s.reconcilePodResources(logger, ns, pod, newPod), nil
}
return nil, nil
}
func (s *PluginState) createMigrationForPod(logger *zap.Logger, pod state.Pod) error {
vmm := &vmv1.VirtualMachineMigration{
ObjectMeta: metadataForNewMigration(pod),
Spec: vmv1.VirtualMachineMigrationSpec{
VmName: pod.VirtualMachine.Name,
// FIXME: NeonVM's VirtualMachineMigrationSpec has a bunch of boolean fields that aren't
// pointers, which means we need to explicitly set them when using the Go API.
PreventMigrationToSameHost: true,
CompletionTimeout: 3600,
Incremental: true,
AutoConverge: true,
MaxBandwidth: resource.MustParse("1Gi"),
AllowPostCopy: false,
},
}
return s.createMigration(logger, vmm)
}
func (s *PluginState) reconcilePodResources(
logger *zap.Logger,
ns *nodeState,
oldPodObj *corev1.Pod,
oldPod state.Pod,
) *podUpdateResult {
// Quick check: Does this pod have autoscaling enabled? if no, then we shouldn't set our
// annotations on it -- particularly because we may end up with stale approved resources when
// the VM scales, and that can cause issues if autoscaling is enabled later.
if !api.HasAutoscalingEnabled(oldPodObj) {
return nil
}
var needsMoreResources bool
desiredPod := oldPod
ns.node.Speculatively(func(n *state.Node) (commit bool) {
// Do a pass of reconciling this pod, in case there's resources it's requested that we can
// now grant.
done := n.ReconcilePodReserved(&desiredPod)
needsMoreResources = !done
// Don't accept these changes -- more on that below.
return false
})
_, hasApprovedAnnotation := oldPodObj.Annotations[api.InternalAnnotationResourcesApproved]
// At this point, desiredPod has the updated state of the pod that *would* be the case if we
// fully reconcile it.
//
// If there are changes, there are a few things to consider.
//
// 1. If we haven't finished handling the initial state, we cannot accept the changes (maybe not
// all pods are present). We should requeue this Pod once we're acting on more complete
// information.
//
// 2. If we're changing the reserved resources, we'll need to update the annotation for approved
// resources on the VM object to communicate that. We need to release the lock on the state
// *before* doing that, because otherwise we'll add significant processing delays.
//
// 3. If we fail to update the VM object, we need to *not* have decreased the resources reserved
// for the Pod -- we may not know if the change actually took effect, and if it didn't, we
// should still admit the possibility that the resources previously reserved will go back to
// being used.
//
// So, putting all that together:
//
// - Don't do anything if we haven't completed startup.
// - Set the pod state to the maximum reserved between oldPod and newPod, and *then* patch
// the VM object.
if !s.startupDone {
s.requeueAfterStartup[oldPod.UID] = struct{}{}
// don't report anything, even if needsMoreResources. We're waiting for startup to finish!
return nil
}
if oldPod == desiredPod && hasApprovedAnnotation {
// no changes, nothing to do. Although, if we *do* need more resources, log something about
// it so we're not failing silently.
if needsMoreResources {
logger.Warn(
"Unable to satisfy requested resources for Pod",
zap.Object("Pod", oldPod),
zap.Object("Node", ns.node),
)
}
return &podUpdateResult{
needsMoreResources: needsMoreResources,
afterUnlock: nil,
retryAfter: nil,
}
}
// Startup done. Either we have changes or the pod is missing the approved resources annotation.
//
// If it hasn't been too soon since the last patch:
// Update the local state if necessary; release the lock; patch the VM.
//
// Otherwise, mark retryAfter with the wait time necessary.
now := time.Now()
lastPatch, previouslyPatched := ns.podsVMPatchedAt[oldPod.UID]
canRetryAt := now
if previouslyPatched {
canRetryAt = lastPatch.Add(time.Second * time.Duration(s.config.PatchRetryWaitSeconds))
}
if now.Before(canRetryAt) {
retryAfter := canRetryAt.Sub(now)
logger.Warn(
"Want to patch VirtualMachine for reserved resources, but too soon to re-patch. Waiting.",
zap.Duration("retryAfter", retryAfter),
)
return &podUpdateResult{
needsMoreResources: needsMoreResources,
afterUnlock: nil,
retryAfter: &retryAfter,
}
}
newPod := desiredPod
newPod.CPU.Reserved = max(desiredPod.CPU.Reserved, oldPod.CPU.Reserved)
newPod.Mem.Reserved = max(desiredPod.Mem.Reserved, oldPod.Mem.Reserved)
if newPod == oldPod {
if oldPod != desiredPod {
logger.Info(
"Reserved resources can be updated for Pod, patching VirtualMachine without updating local state",
zap.Object("Pod", oldPod),
zap.Object("DesiredPod", desiredPod),
zap.Object("Node", ns.node),
)
} else /* implies !hasApprovedAnnotation */ {
logger.Info(
"Pod is missing approved resources annotation, patching VirtualMachine",
zap.Object("Pod", oldPod),
)
}
} else {
ns.node.Speculatively(func(newNode *state.Node) (commit bool) {
newNode.UpdatePod(oldPod, newPod)
logger.Info(
"Reserved resources updated for Pod, patching VirtualMachine",
zap.Object("OldPod", oldPod),
zap.Object("DesiredPod", desiredPod),
zap.Object("Pod", newPod),
zap.Object("OldNode", ns.node),
zap.Object("Node", newNode),
)
return true // yes, commit the changes to use newPod
})
}
ns.podsVMPatchedAt[oldPod.UID] = now
return &podUpdateResult{
needsMoreResources: needsMoreResources,
afterUnlock: func() error {
return s.patchReservedResourcesForPod(logger, oldPodObj, desiredPod)
},
retryAfter: nil,
}
}
func (s *PluginState) patchReservedResourcesForPod(
logger *zap.Logger,
oldPodObj *corev1.Pod,
newPod state.Pod,
) error {
// Broadly, the idea with the patch is that we only want to update the reserved resources if the
// resources that were requested are still current.
//
// So, because JSON Patch allows "tests" to check equality, we'll use those here to check
// against the requested resources.
marshalJSON := func(value any) string {
bs, err := json.Marshal(value)
if err != nil {
panic(fmt.Sprintf("failed to marshal value: %s", err))
}
return string(bs)
}
var patches []patch.Operation
// Check that the scaling unit and requested resources are the same:
if scalingUnitJSON, ok := oldPodObj.Annotations[api.AnnotationAutoscalingUnit]; ok {
patches = append(patches, patch.Operation{
Op: patch.OpTest,
Path: fmt.Sprintf(
"/metadata/annotations/%s",
patch.PathEscape(api.AnnotationAutoscalingUnit),
),
Value: scalingUnitJSON,
})
}
if requestedJSON, ok := oldPodObj.Annotations[api.InternalAnnotationResourcesRequested]; ok {
patches = append(patches, patch.Operation{
Op: patch.OpTest,
Path: fmt.Sprintf(
"/metadata/annotations/%s",
patch.PathEscape(api.InternalAnnotationResourcesRequested),
),
Value: requestedJSON,
})
}
// ... and then if so, set the approved resources appropriately:
reservedJSON := marshalJSON(api.Resources{
VCPU: newPod.CPU.Reserved,
Mem: newPod.Mem.Reserved,
})
patches = append(patches, patch.Operation{
Op: patch.OpReplace,
Path: fmt.Sprintf(
"/metadata/annotations/%s",
patch.PathEscape(api.InternalAnnotationResourcesApproved),
),
Value: reservedJSON,
})
_, hasApprovedAnnotation := oldPodObj.Annotations[api.InternalAnnotationResourcesApproved]
hasKnownAnnotations := len(patches) > 1 || hasApprovedAnnotation
// If there's no other known annotations at this point, it's possible that the VM's annotations
// are completely empty. If so, any operations to add an annotation will fail because the
// 'annotations' field doesn't exist!
//
// So we'll try a simple patch to create the annotations field as part of the operation, and
// then fall through to the normal one if that gets a conflict:
if !hasKnownAnnotations {
addPatches := []patch.Operation{
{
Op: patch.OpTest,
Path: "/metadata/annotations",
Value: (*struct{})(nil), // typed nil, so that it shows up as 'null'
},
{
Op: patch.OpAdd,
Path: "/metadata/annotations",
Value: struct{}{},
},
patches[0],
}
err := s.patchVM(newPod.VirtualMachine, addPatches)
if err != nil {
if apierrors.IsInvalid(err) {
logger.Warn(
"Failed to add-path patch VirtualMachine because preconditions failed, trying again with normal path",
zap.Any("patches", addPatches),
zap.Error(err),
)
// fall through below...
} else {
logger.Error("Failed to add-path patch VirtualMachine", zap.Any("patches", addPatches), zap.Error(err))
return err
}
} else {
// we successfully patched the VM!
logger.Info("Patched VirtualMachine for approved resources", zap.Any("patches", addPatches))
return nil
}
}
err := s.patchVM(newPod.VirtualMachine, patches)
// When a JSON patch "test" fails, the API server returns 422 which is internally represented in
// the k8s error types as a "StatusReasonInvalid".
// We'll special-case that here -- it's still an error but we want to be more clear about it.
if err != nil {
if apierrors.IsInvalid(err) {
logger.Warn("Failed to patch VirtualMachine because preconditions failed", zap.Any("patches", patches), zap.Error(err))
return errors.New("local pod state doesn't match most recent VM state")
} else {
logger.Error("Failed to patch VirtualMachine", zap.Any("patches", patches), zap.Error(err))
return err
}
}
logger.Info("Patched VirtualMachine for approved resources", zap.Any("patches", patches))
return nil
}
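// To make the shape of the request concrete, the patch built above looks roughly like the
// following JSON Patch document (annotation keys abbreviated as placeholders here; the real keys
// come from the api package, with '/' inside them escaped as '~1' per JSON Pointer rules):
//
//	[
//	  {"op": "test",    "path": "/metadata/annotations/<unit-key>",      "value": "<scaling unit JSON>"},
//	  {"op": "test",    "path": "/metadata/annotations/<requested-key>", "value": "<requested resources JSON>"},
//	  {"op": "replace", "path": "/metadata/annotations/<approved-key>",  "value": "<reserved resources JSON>"}
//	]
//
// If either "test" fails, the API server rejects the whole patch with a 422, which is how we
// detect that our local view of the requested resources is stale.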
func (s *PluginState) deletePod(logger *zap.Logger, pod *corev1.Pod, expectExists bool) error {
s.mu.Lock()
defer s.mu.Unlock()
nodeName := pod.Spec.NodeName
if nodeName == "" {
var ok bool
if nodeName, ok = s.tentativelyScheduled[pod.UID]; !ok {
logger.Info("Nothing to do for Pod deletion as it has no Node")
return nil
}
}
logger = logger.With(logFieldForNodeName(nodeName))
ns, ok := s.nodes[nodeName]
if !ok {
logger.Error("Deleting Pod from internal state on a Node that doesn't exist")
return nil // nothing we can do, all the local state is node-scoped
}
defer s.updateNodeMetricsAndRequeue(logger, ns)
// Check if the pod exists:
oldPod, exists := ns.node.GetPod(pod.UID)
if !exists && expectExists {
logger.Warn("Deleting Pod that unexpectedly doesn't exist in local state")
} else if exists && !expectExists {
logger.Warn("Deleting Pod that unexpectedly exists in local state")
}
// Clear any extra state for this pod
delete(ns.requestedMigrations, pod.UID)
delete(ns.podsVMPatchedAt, pod.UID)
if exists {
// ... and run the actual removal in Speculatively() so we can log the before/after in a single
// line, and for panic safety.
oldNode := ns.node
ns.node.Speculatively(func(n *state.Node) (commit bool) {
n.RemovePod(pod.UID)
logger.Info(
"Removed Pod from Node",
zap.Object("Pod", oldPod),
zap.Object("OldNode", oldNode),
zap.Object("Node", n),
)
return true
})
} else {
logger.Info(
"Node unchanged from Pod deletion because it already isn't in local state",
zap.Object("Node", ns.node),
)
}
// Remove from tentatively scheduled, if it's there.
// We need to do this last because earlier stages depend on this, and we might end up with
// incomplete deletions if we clear this first, and hit an error later.
if tentativeNode, ok := s.tentativelyScheduled[pod.UID]; ok {
if pod.Spec.NodeName != "" && tentativeNode != pod.Spec.NodeName {
logger.Panic(
"Pod was scheduled onto a different Node than tentatively recorded",
zap.String("OriginalNodeName", tentativeNode),
zap.String("NewNodeName", pod.Spec.NodeName),
)
}
delete(s.tentativelyScheduled, pod.UID)
}
return nil
}
package plugin
// Handling of VirtualMachineMigration events.
import (
"fmt"
"go.uber.org/zap"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/plugin/reconcile"
"github.com/neondatabase/autoscaling/pkg/plugin/state"
"github.com/neondatabase/autoscaling/pkg/util"
)
// LabelPluginCreatedMigration marks all VirtualMachineMigrations that are created automatically by
// the scheduler plugin.
const LabelPluginCreatedMigration = "autoscaling.neon.tech/created-by-scheduler"
func (s *PluginState) HandleMigrationEvent(
logger *zap.Logger,
kind reconcile.EventKind,
vmm *vmv1.VirtualMachineMigration,
) error {
logger = logger.With(zap.Object("VirtualMachine", util.NamespacedName{
Name: vmm.Spec.VmName,
Namespace: vmm.Namespace,
}))
switch kind {
case reconcile.EventKindDeleted, reconcile.EventKindEphemeral:
// Migration was deleted. Nothing to do.
return nil
case reconcile.EventKindAdded, reconcile.EventKindModified:
return s.deleteMigrationIfNeeded(logger, vmm)
default:
panic("unreachable")
}
}
func metadataForNewMigration(pod state.Pod) metav1.ObjectMeta {
return metav1.ObjectMeta{
// NOTE: We derive the name of the migration from the name of the *pod* so that
// we don't accidentally believe that there's already a migration ongoing for a
// pod when it's actually a different pod of the same VM.
Name: fmt.Sprintf("schedplugin-%s", pod.Name),
Namespace: pod.Namespace,
Labels: map[string]string{
LabelPluginCreatedMigration: "true",
},
}
}
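// For example (pod name hypothetical): a VM pod "vm-example-abcde" in namespace "default" gets a
// migration named "schedplugin-vm-example-abcde" in "default", labeled with
// LabelPluginCreatedMigration so that deleteMigrationIfNeeded knows we own it.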
// deleteMigrationIfNeeded deletes the migration object if it was created by the scheduler plugin
// and has reached a terminal state (succeeded or failed).
//
// This is basically automatic cleanup of migrations once they're finished; otherwise we'd leak
// migration objects.
func (s *PluginState) deleteMigrationIfNeeded(logger *zap.Logger, vmm *vmv1.VirtualMachineMigration) error {
// Check that the migration is owned by the scheduler plugin:
if _, ok := vmm.Labels[LabelPluginCreatedMigration]; !ok {
return nil
}
// Check if the migration is in a terminal state:
switch vmm.Status.Phase {
case vmv1.VmmSucceeded, vmv1.VmmFailed:
// terminal state! it should be cleaned up.
default:
// non-terminal state, do nothing.
return nil
}
// Check if the migration is already going to be deleted
if vmm.DeletionTimestamp != nil {
return nil
}
// Ok: we own this migration, it's done, and not yet being deleted. Let's delete it.
if vmm.Status.Phase == vmv1.VmmFailed {
logger.Warn("Deleting failed VirtualMachineMigration", zap.Any("VirtualMachineMigration", vmm))
} else {
logger.Info("Deleting successful VirtualMachineMigration", zap.Any("VirtualMachineMigration", vmm))
}
if err := s.deleteMigration(logger, vmm); err != nil {
return fmt.Errorf("could not delete migration: %w", err)
}
return nil
}
package initevents
// Reconcile middleware to allow us to know when all of a set of events have been handled.
import (
"sync"
"sync/atomic"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/plugin/reconcile"
)
// compile-time check that InitEventsMiddleware implements reconcile.Middleware
var _ reconcile.Middleware = (*InitEventsMiddleware)(nil)
// InitEventsMiddleware is middleware for reconcile.Queue that allows us to be notified when all of
// a set of events have been successfully processed.
//
// The initial setup of the scheduler plugin uses this to ensure that we don't make any decisions on
// partial state.
type InitEventsMiddleware struct {
done atomic.Bool
notifyDone chan struct{}
mu sync.Mutex
doneAdding bool
remaining map[reconcile.Key]struct{}
}
func NewInitEventsMiddleware() *InitEventsMiddleware {
return &InitEventsMiddleware{
done: atomic.Bool{},
notifyDone: make(chan struct{}),
mu: sync.Mutex{},
doneAdding: false,
remaining: make(map[reconcile.Key]struct{}),
}
}
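// A rough usage sketch (the initial listing, queue setup, and timeout are assumptions of this
// example, not something this package prescribes):
//
//	mw := initevents.NewInitEventsMiddleware()
//	for _, obj := range initialObjects {
//		mw.AddRequired(obj)
//	}
//	// ... construct the reconcile queue with reconcile.WithMiddleware(mw) and start feeding it events ...
//	select {
//	case <-mw.Done():
//		// every required object has been reconciled successfully at least once
//	case <-time.After(startupTimeout):
//		logger.Warn("Still waiting on initial objects", zap.Any("remaining", mw.Remaining()))
//	}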
// Call implements reconcile.Middleware.
func (m *InitEventsMiddleware) Call(
logger *zap.Logger,
params reconcile.ObjectParams,
handler reconcile.MiddlewareHandlerFunc,
) (reconcile.Result, error) {
result, err := handler(logger, params)
if err == nil {
m.success(params.Key())
}
return result, err
}
// AddRequired adds the object to the set of objects that are required to reconcile successfully.
//
// This method can be called up until the first call to (*InitEventsMiddleware).Done(), after which
// the set of objects is sealed and further calls to this method will panic.
func (m *InitEventsMiddleware) AddRequired(obj reconcile.Object) {
m.mu.Lock()
defer m.mu.Unlock()
if m.doneAdding {
panic("AddRequired() called after Done()")
}
k := reconcile.Key{
GVK: obj.GetObjectKind().GroupVersionKind(),
UID: obj.GetUID(),
}
m.remaining[k] = struct{}{}
}
// Done returns a channel that will be closed when all of the required objects have been
// successfully reconciled.
func (m *InitEventsMiddleware) Done() <-chan struct{} {
m.mu.Lock()
defer m.mu.Unlock()
m.doneAdding = true
m.checkDone()
return m.notifyDone
}
// Remaining returns the set of objects that we're waiting on to be successfully reconciled.
func (m *InitEventsMiddleware) Remaining() []reconcile.Key {
m.mu.Lock()
defer m.mu.Unlock()
var keys []reconcile.Key
for k := range m.remaining {
keys = append(keys, k)
}
return keys
}
// helper function for when reconciling is successful
func (m *InitEventsMiddleware) success(k reconcile.Key) {
// fast path: don't do anything if we're already done, avoiding waiting on an extra lock.
if m.done.Load() {
return
}
m.mu.Lock()
defer m.mu.Unlock()
delete(m.remaining, k)
m.checkDone()
}
// NOTE: this method expects that the caller has acquired m.mu.
func (m *InitEventsMiddleware) checkDone() {
// we've already signaled that we're done. Avoid double-closing the channel.
if m.done.Load() {
return
}
if m.doneAdding && len(m.remaining) == 0 {
close(m.notifyDone)
m.done.Store(true)
}
}
package metrics
import (
"strconv"
"github.com/prometheus/client_golang/prometheus"
corev1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/scheduler/framework"
"github.com/neondatabase/autoscaling/pkg/plugin/state"
"github.com/neondatabase/autoscaling/pkg/util"
)
type Framework struct {
// inheritedNodeLabels are the labels on the node that are directly included in the metrics,
// given in the order that they appear in the metric labels.
inheritedNodeLabels []string
methodCalls *prometheus.CounterVec
methodCallFails *prometheus.CounterVec
reserveOverBudget *prometheus.CounterVec
}
func (m *Framework) IncMethodCall(method string, pod *corev1.Pod, ignored bool) {
az := util.PodPreferredAZIfPresent(pod)
m.methodCalls.WithLabelValues(method, az, strconv.FormatBool(ignored)).Inc()
}
func (m *Framework) IncFailIfnotSuccess(method string, pod *corev1.Pod, ignored bool, status *framework.Status) {
// it's normal for Filter to return Unschedulable, because that's its way of filtering out pods.
if status.IsSuccess() || (method == "Filter" && status.Code() == framework.Unschedulable) {
return
}
az := util.PodPreferredAZIfPresent(pod)
m.methodCallFails.
WithLabelValues(method, az, strconv.FormatBool(ignored), status.Code().String()).
Inc()
}
func (m *Framework) IncReserveOverBudget(ignored bool, node *state.Node) {
labelValues := []string{node.Name}
for _, label := range m.inheritedNodeLabels {
value, _ := node.Labels.Get(label)
labelValues = append(labelValues, value)
}
labelValues = append(labelValues, strconv.FormatBool(ignored))
m.reserveOverBudget.WithLabelValues(labelValues...).Inc()
}
func buildSchedFrameworkMetrics(labels nodeLabeling, reg prometheus.Registerer) Framework {
reserveLabels := []string{"node"}
reserveLabels = append(reserveLabels, labels.metricLabelNames...)
reserveLabels = append(reserveLabels, "ignored_namespace")
return Framework{
inheritedNodeLabels: labels.k8sLabelNames,
methodCalls: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_plugin_extension_calls_total",
Help: "Number of calls to scheduler plugin extension points",
},
[]string{"method", "desired_availability_zone", "ignored_namespace"},
)),
methodCallFails: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_plugin_extension_call_fails_total",
Help: "Number of unsuccessful calls to scheduler plugin extension points",
},
[]string{"method", "desired_availability_zone", "ignored_namespace", "status"},
)),
reserveOverBudget: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_plugin_reserve_should_deny_total",
Help: "Number of times the plugin should deny a reservation",
},
reserveLabels,
)),
}
}
package metrics
import (
"fmt"
"slices"
"strings"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
"github.com/neondatabase/autoscaling/pkg/util"
)
func RegisterDefaultCollectors(reg prometheus.Registerer) {
reg.MustRegister(collectors.NewGoCollector())
reg.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
}
type Plugin struct {
nodeLabels nodeLabeling
Framework Framework
Nodes *Node
Reconcile Reconcile
ResourceRequests *prometheus.CounterVec
ValidResourceRequests *prometheus.CounterVec
K8sOps *prometheus.CounterVec
}
func BuildPluginMetrics(nodeMetricLabels map[string]string, reg prometheus.Registerer) Plugin {
nodeLabels := buildNodeLabels(nodeMetricLabels)
return Plugin{
nodeLabels: nodeLabels,
Framework: buildSchedFrameworkMetrics(nodeLabels, reg),
Nodes: buildNodeMetrics(nodeLabels, reg),
Reconcile: buildReconcileMetrics(reg),
ResourceRequests: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_plugin_resource_requests_total",
Help: "Number of resource requests received by the scheduler plugin",
},
[]string{"code"},
)),
ValidResourceRequests: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_plugin_resource_requests_results_total",
Help: "Number of resource requests to the scheduler plugin with various results",
},
[]string{"code", "node"},
)),
K8sOps: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_plugin_k8s_ops_total",
Help: "Number of k8s API requests and their outcome",
},
[]string{"op", "kind", "outcome"},
)),
}
}
func (m *Plugin) RecordK8sOp(opKind string, objKind string, objName string, err error) {
if err == nil {
m.K8sOps.WithLabelValues(opKind, objKind, "success").Inc()
return
}
// error is non-nil; let's prepare it to be a metric label.
errMsg := util.RootError(err).Error()
// Some error messages contain the object name. We could try to filter them all out, but
// it's probably more maintainable to just keep them as-is and remove the name.
errMsg = strings.ReplaceAll(errMsg, objName, "<name>")
outcome := fmt.Sprintf("error: %s", errMsg)
m.K8sOps.WithLabelValues(opKind, objKind, outcome).Inc()
}
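// For example (error text hypothetical): if deleting pod "vm-foo" fails and the root error's
// message is `pods "vm-foo" not found`, the recorded outcome label is `error: pods "<name>" not
// found`, so metric cardinality doesn't grow with individual object names.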
type nodeLabeling struct {
// k8sLabelNames is the ordered list of labels on Node objects that we directly include in
// node-related metrics.
k8sLabelNames []string
// metricLabelNames is the ordered list of the *metric* labels that we use to represent the
// kubernetes labels from k8sLabelNames.
//
// Each metricLabelNames[i] is the metric label marking the value of the Node object's
// .metadata.labels[k8sLabelNames[i]].
metricLabelNames []string
}
func buildNodeLabels(nodeMetricLabels map[string]string) nodeLabeling {
type labelPair struct {
metricLabel string
k8sLabel string
}
labels := []labelPair{}
for metricLabel, k8sLabel := range nodeMetricLabels {
labels = append(labels, labelPair{
metricLabel: metricLabel,
k8sLabel: k8sLabel,
})
}
slices.SortFunc(labels, func(x, y labelPair) int {
if x.metricLabel == y.metricLabel {
return strings.Compare(x.k8sLabel, y.k8sLabel)
}
return strings.Compare(x.metricLabel, y.metricLabel)
})
k8sLabels := []string{}
metricLabels := []string{}
for _, p := range labels {
k8sLabels = append(k8sLabels, p.k8sLabel)
metricLabels = append(metricLabels, p.metricLabel)
}
return nodeLabeling{
k8sLabelNames: k8sLabels,
metricLabelNames: metricLabels,
}
}
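// For example, a hypothetical configuration of
//
//	map[string]string{"zone": "topology.kubernetes.io/zone", "group": "example.com/nodegroup"}
//
// is sorted by metric label name, yielding
//
//	metricLabelNames: ["group", "zone"]
//	k8sLabelNames:    ["example.com/nodegroup", "topology.kubernetes.io/zone"]
//
// so node metrics get a "group" label filled from the node's "example.com/nodegroup" label and a
// "zone" label filled from "topology.kubernetes.io/zone".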
package metrics
import (
"slices"
"sync"
"github.com/prometheus/client_golang/prometheus"
"github.com/neondatabase/autoscaling/pkg/plugin/state"
"github.com/neondatabase/autoscaling/pkg/util"
)
type Node struct {
// InheritedLabels are the labels on the node that are directly used as part of the metrics
InheritedLabels []string
// mu locks access to lastLabels
mu sync.Mutex
// map of node name -> list of labels that were last used in metrics
lastLabels map[string][]string
cpu *prometheus.GaugeVec
mem *prometheus.GaugeVec
}
func buildNodeMetrics(labels nodeLabeling, reg prometheus.Registerer) *Node {
finalMetricLabels := []string{"node"}
finalMetricLabels = append(finalMetricLabels, labels.metricLabelNames...)
finalMetricLabels = append(finalMetricLabels, "field")
return &Node{
InheritedLabels: labels.k8sLabelNames,
mu: sync.Mutex{},
lastLabels: make(map[string][]string),
cpu: util.RegisterMetric(reg, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "autoscaling_plugin_node_cpu_resources_current",
Help: "Current amount of CPU for 'state.NodeResources' fields",
},
finalMetricLabels,
)),
mem: util.RegisterMetric(reg, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "autoscaling_plugin_node_mem_resources_current",
Help: "Current amount of memory (in bytes) for 'state.NodeResources' fields",
},
finalMetricLabels,
)),
}
}
func (m *Node) Update(node *state.Node) {
commonLabels := []string{node.Name}
for _, label := range m.InheritedLabels {
value, _ := node.Labels.Get(label)
commonLabels = append(commonLabels, value)
}
m.mu.Lock()
defer m.mu.Unlock()
if !slices.Equal(commonLabels, m.lastLabels[node.Name]) {
// Remove old metrics before setting the new ones
m.removeLocked(node)
}
for _, f := range node.CPU.Fields() {
//nolint:gocritic // assigning append value to a different slice is intentional here
labels := append(commonLabels, f.Name)
m.cpu.WithLabelValues(labels...).Set(f.Value.AsFloat64())
}
for _, f := range node.Mem.Fields() {
//nolint:gocritic // assigning append value to a different slice is intentional here
labels := append(commonLabels, f.Name)
m.mem.WithLabelValues(labels...).Set(f.Value.AsFloat64())
}
m.lastLabels[node.Name] = commonLabels
}
func (m *Node) Remove(node *state.Node) {
m.mu.Lock()
defer m.mu.Unlock()
m.removeLocked(node)
}
func (m *Node) removeLocked(node *state.Node) {
baseMatch := prometheus.Labels{"node": node.Name}
m.cpu.DeletePartialMatch(baseMatch)
m.mem.DeletePartialMatch(baseMatch)
delete(m.lastLabels, node.Name)
}
package metrics
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/neondatabase/autoscaling/pkg/util"
)
type Reconcile struct {
WaitDurations prometheus.Histogram
ProcessDurations *prometheus.HistogramVec
Failing *prometheus.GaugeVec
Panics *prometheus.CounterVec
}
func buildReconcileMetrics(reg prometheus.Registerer) Reconcile {
return Reconcile{
WaitDurations: util.RegisterMetric(reg, prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "autoscaling_plugin_reconcile_queue_wait_durations",
Help: "Duration that items in the reconcile queue are waiting to be picked up",
Buckets: []float64{
// 10µs, 100µs,
0.00001, 0.0001,
// 1ms, 5ms, 10ms, 50ms, 100ms, 250ms, 500ms, 750ms
0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75,
// 1s, 2.5s, 5s, 10s, 20s, 45s
1.0, 2.5, 5, 10, 20, 45,
},
},
)),
ProcessDurations: util.RegisterMetric(reg, prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "autoscaling_plugin_reconcile_duration_seconds",
Help: "Duration that items take to be reconciled",
Buckets: []float64{
// 10µs, 100µs,
0.00001, 0.0001,
// 1ms, 5ms, 10ms, 50ms, 100ms, 250ms, 500ms, 750ms
0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75,
// 1s, 2.5s, 5s, 10s, 20s, 45s
1.0, 2.5, 5, 10, 20, 45,
},
},
[]string{"kind", "outcome"},
)),
Failing: util.RegisterMetric(reg, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "autoscaling_plugin_reconcile_failing_objects",
Help: "Number of objects currently failing to be reconciled",
},
[]string{"kind"},
)),
Panics: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_plugin_reconcile_panics_count",
Help: "Number of times reconcile operations have panicked",
},
[]string{"kind"},
)),
}
}
package plugin
// Decision-making for live migrations.
import (
"fmt"
"slices"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"k8s.io/apimachinery/pkg/types"
"github.com/neondatabase/autoscaling/pkg/plugin/state"
)
// triggerMigrationsIfNecessary uses the state of the temporary node to request any migrations that
// may be necessary to reduce the reserved resources below the watermark.
func triggerMigrationsIfNecessary(
logger *zap.Logger,
originalNode *state.Node,
tmpNode *state.Node,
requestedMigrations []types.UID,
requestMigrationAndRequeue func(podUID types.UID) error,
) error {
// To get an accurate count of the amount that's migrating, mark all the pods in
// requestedMigrations as if they're already migrating.
// They technically might not be! That's why we operate on tmpNode -- the speculative copy that the
// caller produced via Speculatively() -- so that we can use the existing node methods without
// actually committing these changes.
for _, uid := range requestedMigrations {
p, ok := tmpNode.GetPod(uid)
if !ok {
logger.Warn(
"Node state marked pod as migrating that doesn't exist locally",
zap.Object("Pod", zapcore.ObjectMarshalerFunc(func(enc zapcore.ObjectEncoder) error {
enc.AddString("UID", string(uid))
return nil
})),
)
continue
}
// Mark the pod as migrating and update it in the (speculative) node
newPod := p
newPod.Migrating = true
tmpNode.UpdatePod(p, newPod)
}
cpuAbove := tmpNode.CPU.UnmigratedAboveWatermark()
memAbove := tmpNode.Mem.UnmigratedAboveWatermark()
// if we're below the watermark (or already migrating enough to be below the watermark),
// there's nothing to do:
if cpuAbove == 0 && memAbove == 0 {
return nil
}
logger.Info(
"Not enough resources are being migrated to reduce to the watermark. Finding migration targets",
zap.Object("Node", originalNode),
zap.String("CPUToMigrate", fmt.Sprint(cpuAbove)),
zap.String("MemToMigrate", fmt.Sprint(memAbove)),
)
var candidates []state.Pod
for _, pod := range tmpNode.MigratablePods() {
// Maybe this pod is currently being migrated. If so, don't include it on the list of
// new candidates:
if pod.Migrating {
continue
}
candidates = append(candidates, pod)
}
// Ok, we have some migration candidates. Let's sort them and keep triggering migrations
// until it'll be enough to get below the watermark.
slices.SortFunc(candidates, func(cx, cy state.Pod) int {
return cx.BetterMigrationTargetThan(cy)
})
for _, pod := range candidates {
podLogger := logger.With(zap.Any("CandidatePod", pod))
// If we find a pod that is singularly above the watermark, don't migrate it! We'll
// likely just end up above the watermark on the new node.
// NOTE that this is NOT true in heterogeneous clusters (i.e., where the nodes are
// different sizes), but scheduling is much more complex there, and we'd have to be
// careful to only migrate when there's candidate nodes *with room* where the VM could
// fit without going over the watermark.
//
// That's all quite complicated -- hence why we're taking the easy way out.
tooBig := pod.CPU.Reserved > tmpNode.CPU.Watermark || pod.Mem.Reserved > tmpNode.Mem.Watermark
if tooBig {
podLogger.Warn("Skipping potential migration of candidate Pod because it's too big")
continue
}
// Trigger migration of this pod!
podLogger.Info("Internally triggering migration for candidate Pod")
if err := requestMigrationAndRequeue(pod.UID); err != nil {
podLogger.Error("Failed to requeue reconciling of candidate Pod")
return fmt.Errorf("could not requeue pod %v with UID %s: %w", pod.NamespacedName, pod.UID, err)
}
// update the pod state in the speculative node
newPod := pod
newPod.Migrating = true
tmpNode.UpdatePod(pod, newPod)
// ... and then check if we need to keep migrating more ...
cpuAbove = tmpNode.CPU.UnmigratedAboveWatermark()
memAbove = tmpNode.Mem.UnmigratedAboveWatermark()
if cpuAbove <= 0 && memAbove <= 0 {
// We've triggered enough migrations that it should get us below the watermark.
// We're done for now.
break
}
}
if cpuAbove > 0 || memAbove > 0 {
logger.Warn(
"Could not trigger enough migrations to get below watermark",
zap.Object("SpeculativeNode", tmpNode),
)
}
return nil
}
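// A worked example of the loop above (numbers hypothetical, and assuming UnmigratedAboveWatermark()
// reports the reserved amount above the watermark that isn't already covered by migrating pods):
// with a CPU watermark of 48 cores and 56 cores reserved, cpuAbove starts at 8. Marking a 6-core
// candidate as migrating brings it to 2, so we continue; marking a further 4-core candidate brings
// it to 0 and we stop, having requested two migrations.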
package plugin
import (
"context"
"fmt"
"time"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/plugin/reconcile"
)
func (s *PluginState) reconcileQueueWaitCallback(duration time.Duration) {
s.metrics.Reconcile.WaitDurations.Observe(duration.Seconds())
}
func (s *PluginState) reconcileResultCallback(params reconcile.ObjectParams, duration time.Duration, err error) {
outcome := "success"
if err != nil {
outcome = "failure"
}
s.metrics.Reconcile.ProcessDurations.
WithLabelValues(params.GVK.Kind, outcome).
Observe(duration.Seconds())
}
func (s *PluginState) reconcileErrorStatsCallback(logger *zap.Logger, params reconcile.ObjectParams, stats reconcile.ErrorStats) {
// update count of current failing objects
s.metrics.Reconcile.Failing.
WithLabelValues(params.GVK.Kind).
Set(float64(stats.TypedCount))
// Make sure that repeatedly failing objects are sufficiently noisy
if stats.SuccessiveFailures >= s.config.LogSuccessiveFailuresThreshold {
logger.Warn(
fmt.Sprintf("%s has failed to reconcile >%d times in a row", params.GVK.Kind, s.config.LogSuccessiveFailuresThreshold),
zap.Int("SuccessiveFailures", stats.SuccessiveFailures),
zap.String("EventKind", string(params.EventKind)),
reconcile.ObjectMetaLogField(params.GVK.Kind, params.Obj),
)
}
}
func (s *PluginState) reconcilePanicCallback(params reconcile.ObjectParams) {
s.metrics.Reconcile.Panics.WithLabelValues(params.GVK.Kind).Inc()
}
func reconcileWorker(ctx context.Context, logger *zap.Logger, queue *reconcile.Queue) {
wait := queue.WaitChan()
for {
select {
case <-ctx.Done():
return
case _, ok := <-wait:
if !ok {
// channel closed; we're done.
return
}
callback, ok := queue.Next()
if !ok {
// Spurious wake-up; retry.
continue
}
callback(logger)
}
}
}
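// This is designed to be run as one or more goroutines against the same queue, e.g. (worker count
// hypothetical):
//
//	for i := 0; i < 4; i++ {
//		go reconcileWorker(ctx, logger.Named(fmt.Sprintf("worker-%d", i)), queue)
//	}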
package reconcile
import (
"fmt"
)
// EventKind is the kind of change that happened to the object that the handler is now tasked with
// responding to.
type EventKind string
const (
EventKindAdded EventKind = "Added"
EventKindModified EventKind = "Modified"
EventKindDeleted EventKind = "Deleted"
// EventKindEphemeral represents the combination of an Added event that was not yet handled when
// the object was deleted; we may still want to process the combined event.
EventKindEphemeral EventKind = "Ephemeral"
)
// Merge returns the combination of the two events
//
// To be precise, the results are:
//
// - Added + Modified = Added
// - Modified + Modified = Modified
// - Modified + Deleted = Deleted
// - Added + Deleted = Ephemeral
//
// And Ephemeral events are expected not to be merged with anything.
//
// In all cases, the more recent state of the object is expected to be used.
func (k EventKind) Merge(other EventKind) EventKind {
if k == EventKindEphemeral || other == EventKindEphemeral {
panic(fmt.Sprintf("cannot merge(%s, %s) involving an ephemeral event", k, other))
}
// modified + anything is ok, for the most part
if k == EventKindModified {
return other
} else if other == EventKindModified {
return k
}
if k == EventKindAdded && other == EventKindDeleted || k == EventKindDeleted && other == EventKindAdded {
return EventKindEphemeral
}
// All that's left is Added+Added and Deleted+Deleted, neither of which makes sense.
panic(fmt.Sprintf("cannot merge(%s, %s)", k, other))
}
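// For example, if an object is added and then modified before a worker picks it up, the two
// pending events collapse into a single Added event carrying the newer object:
//
//	merged := EventKindAdded.Merge(EventKindModified) // == EventKindAdded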
package reconcile
import (
"fmt"
"sync"
"time"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"github.com/neondatabase/autoscaling/pkg/util/stack"
)
// Middleware wraps a reconcile operation to insert its own logic
type Middleware interface {
Call(*zap.Logger, ObjectParams, MiddlewareHandlerFunc) (Result, error)
}
// MiddlewareHandlerFunc is an enriched version of HandlerFunc that accepts more parameters, so that
// middleware functions don't need to know whether they're calling the base HandlerFunc or another
// piece of middleware.
type MiddlewareHandlerFunc = func(*zap.Logger, ObjectParams) (Result, error)
// ObjectParams stores the miscellaneous parameters about an object that are made available to all
// middleware.
type ObjectParams struct {
GVK schema.GroupVersionKind
UID types.UID
Name string
Namespace string
EventKind EventKind
Obj Object
}
// Key returns the fields uniquely identifying the associated object
func (p ObjectParams) Key() Key {
return Key{GVK: p.GVK, UID: p.UID}
}
func applyMiddleware(middleware []Middleware, handler HandlerFunc) HandlerFunc {
f := func(l *zap.Logger, p ObjectParams) (Result, error) {
return handler(l, p.EventKind, p.Obj)
}
// Iterate backwards, so that the first middleware in the slice ends up outermost -- i.e., it is
// given a callback representing all of the remaining middleware plus the final handler.
for i := len(middleware) - 1; i >= 0; i-- {
// copy to avoid capturing the loop variable (unnecessary as of Go 1.22's per-iteration loop variables, but harmless)
m := middleware[i]
// copy 'f', so that we don't recurse -- otherwise, the function will be referenced by name.
// See https://go.dev/play/p/8f4EgbL4Rm2 for an example of the difference.
oldF := f
f = func(l *zap.Logger, p ObjectParams) (Result, error) {
return m.Call(l, p, oldF)
}
}
return func(logger *zap.Logger, k EventKind, obj Object) (Result, error) {
// Extract the common parameters:
params := ObjectParams{
GVK: obj.GetObjectKind().GroupVersionKind(),
UID: obj.GetUID(),
Namespace: obj.GetNamespace(),
Name: obj.GetName(),
EventKind: k,
Obj: obj,
}
return f(logger, params)
}
}
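// To make the wrapping order concrete: for middleware = []Middleware{A, B}, the returned
// HandlerFunc behaves like
//
//	A.Call(logger, params, func(l *zap.Logger, p ObjectParams) (Result, error) {
//		return B.Call(l, p, baseHandler)
//	})
//
// where baseHandler is the adapter around the original HandlerFunc -- i.e., the first middleware
// in the slice runs outermost and the original handler runs innermost.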
func defaultMiddleware(
types []schema.GroupVersionKind,
resultCallback ResultCallback,
errorCallback ErrorStatsCallback,
panicCallback PanicCallback,
) []Middleware {
return []Middleware{
NewLogMiddleware(resultCallback),
NewErrorBackoffMiddleware(types, errorCallback),
NewCatchPanicMiddleware(panicCallback),
}
}
// ResultCallback represents the signature of the optional callback that may be registered with the
// LogMiddleware to update metrics or similar based on the result of each reconcile operation.
type ResultCallback = func(params ObjectParams, duration time.Duration, err error)
// LogMiddleware is middleware for the reconcile queue that augments the logger with fields
// describing the object being reconciled, as well as logging the results of each reconcile
// operation.
//
// This middleware is always included. It's public to provide additional documentation.
type LogMiddleware struct {
resultCallback ResultCallback
}
func NewLogMiddleware(callback ResultCallback) *LogMiddleware {
return &LogMiddleware{
resultCallback: callback,
}
}
// ObjectMetaLogField returns a zap.Field for an object, in the same format that the default logging
// middleware uses.
//
// The returned zap.Field has the given key, and is a zap.Object with the namespace, name, and UID
// of the kubernetes object.
func ObjectMetaLogField(key string, obj Object) zap.Field {
return objLogFieldFromParams(key, obj.GetNamespace(), obj.GetName(), obj.GetUID())
}
func objLogFieldFromParams(key string, namespace string, name string, uid types.UID) zap.Field {
return zap.Object(key, zapcore.ObjectMarshalerFunc(func(enc zapcore.ObjectEncoder) error {
if namespace != "" {
enc.AddString("Namespace", namespace)
}
enc.AddString("Name", name)
enc.AddString("UID", string(uid))
return nil
}))
}
// Call implements Middleware
func (m *LogMiddleware) Call(
logger *zap.Logger,
params ObjectParams,
handler MiddlewareHandlerFunc,
) (Result, error) {
logger = logger.With(objLogFieldFromParams(params.GVK.Kind, params.Namespace, params.Name, params.UID))
started := time.Now()
result, err := handler(logger, params)
duration := time.Since(started)
if m.resultCallback != nil {
m.resultCallback(params, duration, err)
}
if err != nil {
logger.Error(
fmt.Sprintf("Failed to reconcile %s %s", params.EventKind, params.GVK.Kind),
zap.Duration("duration", duration),
zap.Error(err),
)
} else {
logger.Info(
fmt.Sprintf("Reconciled %s %s", params.EventKind, params.GVK.Kind),
zap.Duration("duration", duration),
)
}
return result, err
}
// PanicCallback represents the signature of the optional callback that may be registered with the
// CatchPanicMiddleware.
type PanicCallback = func(ObjectParams)
// CatchPanicMiddleware is middleware for the reconcile queue that turns panics into errors.
//
// It can optionally be provided a PanicCallback to exfiltrate information about the panics that
// occur.
//
// This middleware is always included. It's public to provide additional documentation.
type CatchPanicMiddleware struct {
callback PanicCallback
}
// NewCatchPanicMiddleware returns middleware to turn panics into errors.
//
// If not nil, the callback will be called whenever there is a panic.
//
// Because CatchPanicMiddleware is automatically included in calls to NewQueue, providing the
// callback is best done with the WithPanicCallback QueueOption.
func NewCatchPanicMiddleware(callback PanicCallback) *CatchPanicMiddleware {
return &CatchPanicMiddleware{
callback: callback,
}
}
// Call implements Middleware
func (m *CatchPanicMiddleware) Call(
logger *zap.Logger,
params ObjectParams,
handler MiddlewareHandlerFunc,
) (_ Result, err error) {
defer func() {
if r := recover(); r != nil {
st := stack.GetStackTrace(nil, 0).String()
logger.Error("Reconcile panicked", zap.Any("payload", r), zap.String("stack", st))
if m.callback != nil {
m.callback(params)
}
err = fmt.Errorf("panic: %v", r)
}
}()
return handler(logger, params)
}
// ErrorStatsCallback represents the signature of the optional callback that may be registered with
// the ErrorBackoffMiddleware.
type ErrorStatsCallback = func(ObjectParams, ErrorStats)
// ErrorStats are the values provided to the callback for ErrorBackoffMiddleware.
type ErrorStats struct {
// GlobalCount is the total number of objects currently failing to be reconciled.
GlobalCount int
// TypedCount is the number of objects of this type that are failing to be reconciled.
TypedCount int
// SuccessiveFailures gives the number of times in a row that this object has failed to be
// reconciled.
// On success, this value is equal to zero.
SuccessiveFailures int
}
// ErrorBackoffMiddleware performs two key functions:
//
// 1. It sets the RetryAfter time for reconcile operations that return error, using exponential
// backoff if the object previously failed.
// 2. It exposes this information to the ErrorStatsCallback (if provided) for use in metrics.
type ErrorBackoffMiddleware struct {
globalCounterMu sync.Mutex
globalFailing int
byType map[schema.GroupVersionKind]*typedTimingSet
callback ErrorStatsCallback
}
type typedTimingSet struct {
mu sync.Mutex
byUID map[types.UID]backoff
}
type backoff struct {
successiveFailures int
waitDuration time.Duration
}
const (
initialErrorWait = 100 * time.Millisecond
backoffFactor = 2.03 // factor of 2.03 results in 0.1s -> 60s after 10 failures.
maxErrorWait = time.Minute
)
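// Concretely: the first failure waits 0.1s, the second ~0.2s, and each further failure multiplies
// the wait by 2.03, so the 10th successive failure waits about 0.1s * 2.03^9 ≈ 58.5s; from the
// 11th failure onward the wait is clamped to maxErrorWait (one minute).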
// NewErrorBackoffMiddleware creates a new ErrorBackoffMiddleware, using the set of known types
// provided and optionally a callback for observability.
//
// The callback is NOT assumed to be thread-safe.
func NewErrorBackoffMiddleware(typs []schema.GroupVersionKind, callback ErrorStatsCallback) *ErrorBackoffMiddleware {
byType := make(map[schema.GroupVersionKind]*typedTimingSet)
for _, gvk := range typs {
byType[gvk] = &typedTimingSet{
mu: sync.Mutex{},
byUID: make(map[types.UID]backoff),
}
}
return &ErrorBackoffMiddleware{
globalCounterMu: sync.Mutex{},
globalFailing: 0,
byType: byType,
callback: callback,
}
}
func (m *ErrorBackoffMiddleware) Call(
logger *zap.Logger,
params ObjectParams,
handler MiddlewareHandlerFunc,
) (Result, error) {
typed, ok := m.byType[params.GVK]
if !ok {
panic(fmt.Sprintf("received reconcile for unknown type %s", fmtGVK(params.GVK)))
}
result, err := handler(logger, params)
typed.mu.Lock()
defer typed.mu.Unlock()
failed := err != nil
b, wasFailing := typed.byUID[params.UID]
var change int
if failed {
b.successiveFailures += 1
if wasFailing {
b.waitDuration = min(maxErrorWait, time.Duration(float64(b.waitDuration)*backoffFactor))
} else {
b.waitDuration = initialErrorWait
}
if result.RetryAfter != 0 {
// Cap the current wait duration with the requested retry, IF the handler left an
// explicit amount it wanted to wait.
// This is to avoid long retries on a spurious failure after the situation has resolved.
b.waitDuration = min(result.RetryAfter, b.waitDuration)
}
// use max(..) so that the backoff MUST be respected, but longer waits requested by the handler are still allowed.
result.RetryAfter = max(result.RetryAfter, b.waitDuration)
typed.byUID[params.UID] = b
if !wasFailing {
change = 1 // +1 item failing
}
} else /* !failed */ {
// reset the counters, for below
b = backoff{waitDuration: 0, successiveFailures: 0}
// remove the tracking for this value, if it was present - it's not failing
delete(typed.byUID, params.UID)
if wasFailing {
change = -1 // -1 item failing
}
}
if change != 0 {
m.globalCounterMu.Lock()
defer m.globalCounterMu.Unlock()
m.globalFailing += change
if m.callback != nil {
m.callback(params, ErrorStats{
GlobalCount: m.globalFailing,
TypedCount: len(typed.byUID),
SuccessiveFailures: b.successiveFailures,
})
}
}
return result, err
}
package reconcile
import (
"context"
"time"
)
// QueueOption customizes the behavior of NewQueue.
type QueueOption struct {
apply func(*queueSettings)
}
// queueSettings is the internal, temporary structure that we use to hold the results of applying
// the various QueueOptions
type queueSettings struct {
baseContext context.Context
middleware []Middleware
waitCallback QueueWaitDurationCallback
resultCallback ResultCallback
errorCallback ErrorStatsCallback
panicCallback PanicCallback
}
func defaultQueueSettings() *queueSettings {
return &queueSettings{
baseContext: context.Background(),
middleware: []Middleware{},
waitCallback: nil,
resultCallback: nil,
errorCallback: nil,
panicCallback: nil,
}
}
// WithBaseContext sets a context to use for the queue, equivalent to automatically calling
// (*Queue).Stop() when the context is canceled.
func WithBaseContext(ctx context.Context) QueueOption {
return QueueOption{
apply: func(s *queueSettings) {
s.baseContext = ctx
},
}
}
// WithMiddleware appends the specified middleware callback for the Queue.
//
// Additional middleware is executed later -- i.e., the first middleware provided will be given a
// callback representing all remaining middleware plus the final handler.
func WithMiddleware(mw Middleware) QueueOption {
return QueueOption{
apply: func(s *queueSettings) {
s.middleware = append(s.middleware, mw)
},
}
}
// QueueWaitDurationCallback represents the signature of the callback that may be provided to add
// observability for how long items are waiting in the queue before being reconciled.
type QueueWaitDurationCallback = func(time.Duration)
// WithQueueWaitDurationCallback sets the QueueWaitDurationCallback that will be called with the
// wait time from the desired reconcile time whenever a reconcile operation starts.
func WithQueueWaitDurationCallback(cb QueueWaitDurationCallback) QueueOption {
return QueueOption{
apply: func(s *queueSettings) {
s.waitCallback = cb
},
}
}
// WithResultCallback sets the ResultCallback to provide to the LogMiddleware.
//
// It will be called after every reconcile operation completes with the relevant information about
// the operation and its execution.
func WithResultCallback(cb ResultCallback) QueueOption {
return QueueOption{
apply: func(s *queueSettings) {
s.resultCallback = cb
},
}
}
// WithErrorStatsCallback sets the callback to provide to the ErrorBackoffMiddleware.
//
// It will be called whenever the error statistics change.
//
// Determining whether the reconcile operation failed is possible by checking
// ErrorStats.SuccessiveFailures -- if it's zero, the operation was successful.
func WithErrorStatsCallback(cb ErrorStatsCallback) QueueOption {
return QueueOption{
apply: func(s *queueSettings) {
s.errorCallback = cb
},
}
}
// WithPanicCallback sets the callback to provide to the CatchPanicMiddleware.
//
// It will be called on each panic.
func WithPanicCallback(cb PanicCallback) QueueOption {
return QueueOption{
apply: func(s *queueSettings) {
s.panicCallback = cb
},
}
}
package reconcile
import (
"context"
"fmt"
"sync"
"time"
"go.uber.org/zap"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/queue"
)
// Queue is the unified queue for managing and distributing reconcile operations for kubernetes
// objects
type Queue struct {
mu sync.Mutex
// queue is the changes that are due to be processed but have not yet been picked up by any
// workers.
queue queue.PriorityQueue[kv]
// queued stores the handles for objects in the queue. This is needed so that we can update the
// objects while they're in the queue, rather than requeueing on each change we receive from the
// kubernetes API server.
queued map[Key]queue.ItemHandle[kv]
// pending stores the changes to objects that the Queue has received, but can't actually push to
// the queue because there are ongoing operations for those objects that would cause conflicts.
pending map[Key]value
// ongoing tracks the set of all ongoing reconcile operations.
// When we receive an update, we use ongoing to check whether we can add it to the queue, or if
// we must instead wait for it to finish.
ongoing map[Key]struct{}
// next is a synchronous channel to distribute notifications that there are items in the queue.
//
// the sending half of the channel is owned by a separate goroutine running
// (*Queue).handleNotifications().
//
// NOTE: This field is immutable.
next <-chan struct{}
// NOTE: This field is immutable.
stopNotificationHandling func()
// NOTE: This field is immutable.
notifyEnqueued func()
// NOTE: This field is immutable.
handlers map[schema.GroupVersionKind]HandlerFunc
// if not nil, a callback that records how long each item was waiting to be reconciled
queueWaitCallback QueueWaitDurationCallback
}
type kv struct {
k Key
v value
}
// Key uniquely identifies a kubernetes object
type Key struct {
GVK schema.GroupVersionKind
UID types.UID
}
// value stores the information about a pending reconcile operation for a kubernetes object
type value struct {
reconcileAt time.Time
eventKind EventKind
object Object
handler HandlerFunc
}
// HandlerFunc represents the signature of functions that will be called to reconcile objects in the
// queue.
//
// Handlers are registered in the call to NewQueue with the mapping from each type to its handler.
type HandlerFunc = func(*zap.Logger, EventKind, Object) (Result, error)
// Result is the outcome of reconciling, storing whether the reconcile operation should be retried
// (and if so, how long should we wait?)
type Result struct {
// RetryAfter, if not zero, gives the duration that we should wait before retrying.
//
// A value of zero means that no retry is requested (though middleware such as
// ErrorBackoffMiddleware may still set one on error).
RetryAfter time.Duration
}
// NewQueue builds and returns a new Queue with the provided middleware and handlers for various
// types.
func NewQueue(handlers map[Object]HandlerFunc, opts ...QueueOption) (*Queue, error) {
settings := defaultQueueSettings()
for _, o := range opts {
o.apply(settings)
}
types := []schema.GroupVersionKind{}
handlersByType := make(map[schema.GroupVersionKind]HandlerFunc)
for obj, handler := range handlers {
// Look up the GVK registered for this object's type.
gvk, err := util.LookupGVKForType(obj)
if err != nil {
return nil, err
}
// Check that this isn't a duplicate
if _, ok := handlersByType[gvk]; ok {
return nil, fmt.Errorf("duplicate handler for object type %T with GVK %q", obj, fmtGVK(gvk))
}
handlersByType[gvk] = handler
types = append(types, gvk)
}
middleware := defaultMiddleware(types, settings.resultCallback, settings.errorCallback, settings.panicCallback)
middleware = append(middleware, settings.middleware...)
// Apply middleware to all handlers
enrichedHandlers := make(map[schema.GroupVersionKind]HandlerFunc)
for gvk, handler := range handlersByType {
enrichedHandlers[gvk] = applyMiddleware(middleware, handler)
}
next := make(chan struct{})
ctx, cancel := context.WithCancel(settings.baseContext)
enqueuedSndr := util.NewBroadcaster()
enqueuedRcvr := enqueuedSndr.NewReceiver()
q := &Queue{
mu: sync.Mutex{},
queue: queue.New(func(x, y kv) bool {
return x.v.isHigherPriority(y.v)
}),
queued: make(map[Key]queue.ItemHandle[kv]),
pending: make(map[Key]value),
ongoing: make(map[Key]struct{}),
next: next,
// note: context.WithCancel returns a thread-safe cancel function.
stopNotificationHandling: cancel,
notifyEnqueued: enqueuedSndr.Broadcast,
handlers: enrichedHandlers,
queueWaitCallback: settings.waitCallback,
}
go q.handleNotifications(ctx, next, enqueuedRcvr)
return q, nil
}
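// A minimal construction sketch (handler and callback names here are hypothetical):
//
//	q, err := reconcile.NewQueue(
//		map[reconcile.Object]reconcile.HandlerFunc{
//			&corev1.Pod{}:                   handlePod,
//			&vmv1.VirtualMachineMigration{}: handleMigration,
//		},
//		reconcile.WithBaseContext(ctx),
//		reconcile.WithQueueWaitDurationCallback(recordWait),
//	)
//
// Producers then call q.Enqueue(eventKind, obj) as watch events arrive, while workers block on
// q.WaitChan() and execute the callbacks returned by q.Next().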
func (q *Queue) handleNotifications(ctx context.Context, next chan<- struct{}, enqueued util.BroadcastReceiver) {
done := ctx.Done()
timer := time.NewTimer(0)
for {
timer.Stop()
// Check if the context is done. If so, bail early.
select {
case <-done:
return
default:
}
// Wait until we can send a notification:
var deadline time.Time
func() {
q.mu.Lock()
defer q.mu.Unlock()
nextKV, ok := q.queue.Peek()
if !ok {
return
}
deadline = nextKV.v.reconcileAt
if deadline.IsZero() {
panic("item in queue has unexpected zero deadline")
}
}()
if deadline.IsZero() {
// Nothing in the queue. Wait until there is something.
select {
case <-done:
return
case <-enqueued.Wait():
enqueued.Awake() // record that we got the message
continue // go through the loop again, so we get non-zero deadline.
}
}
now := time.Now()
waitDuration := deadline.Sub(now)
if waitDuration > 0 {
timer.Reset(waitDuration)
// Sleep until the deadline is reached.
select {
case <-done:
return
case <-enqueued.Wait():
enqueued.Awake() // record that we got the message
continue // go through the loop again, in case the deadline changed.
case <-timer.C:
// we reached the deadline; we can wake up a worker to handle the item.
}
}
// Message that there's something to be handled
select {
case <-done:
return
case next <- struct{}{}:
}
}
}
// Stop ceases the distribution of new reconcile operations, and additionally cleans up the
// long-running goroutine that's responsible for dispatching notifications.
//
// For usage with contexts, refer to the WithBaseContext QueueOption.
func (q *Queue) Stop() {
q.stopNotificationHandling()
}
// ReconcileCallback represents the signature of functions that are handed out to reconcile individual items
//
// Callbacks are returned by calls to (*Queue).Next().
type ReconcileCallback = func(*zap.Logger)
// WaitChan returns a channel on which at least one empty struct will be sent for each item waiting
// to be reconciled (note that sometimes there may be spurious wake-ups!)
//
// The channel is shared and persistent, and only closed when (*Queue).Stop() is called or the base
// context (if provided) is canceled.
func (q *Queue) WaitChan() <-chan struct{} {
return q.next
}
func (v value) isHigherPriority(other value) bool {
return v.reconcileAt.Before(other.reconcileAt)
}
func fmtGVK(gvk schema.GroupVersionKind) string {
if gvk.Empty() {
return "<empty>"
} else if gvk.Group == "" {
// v1 handling
return fmt.Sprintf("%s.%s", gvk.Version, gvk.Kind)
} else {
return fmt.Sprintf("%s/%s.%s", gvk.Group, gvk.Version, gvk.Kind)
}
}
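// For example (using well-known Kubernetes kinds, purely for illustration):
//
//	fmtGVK(schema.GroupVersionKind{Group: "", Version: "v1", Kind: "Pod"})            // "v1.Pod"
//	fmtGVK(schema.GroupVersionKind{Group: "apps", Version: "v1", Kind: "Deployment"}) // "apps/v1.Deployment"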
type Object interface {
runtime.Object
metav1.Object
}
func (q *Queue) Enqueue(eventKind EventKind, obj Object) {
now := time.Now()
gvk := obj.GetObjectKind().GroupVersionKind()
// Fetch the handler for this object. This doubles as checking that the type is known.
// We want to do this up front, so that the callstack is clear if this fails.
//
// Note: this doesn't require locking the queue because the handlers field is immutable.
handler, ok := q.handlers[gvk]
if !ok {
panic(fmt.Sprintf("unknown object GVK %s of type %T", fmtGVK(gvk), obj))
}
k := Key{
GVK: gvk,
UID: obj.GetUID(),
}
v := value{
reconcileAt: now, // reconcile as soon as possible
eventKind: eventKind,
object: obj,
handler: handler,
}
q.mu.Lock()
defer q.mu.Unlock()
// If the object is already being reconciled, we should store the update in 'pending'
_, ongoingReconcile := q.ongoing[k]
if ongoingReconcile {
q.enqueuePendingChange(k, v)
} else {
q.enqueueInactive(k, v)
}
}
// Next returns a callback to execute the next waiting reconcile operation in the queue, or false if
// there are none.
func (q *Queue) Next() (_ ReconcileCallback, ok bool) {
q.mu.Lock()
defer q.mu.Unlock()
kv, ok := q.queue.Peek()
if !ok || kv.v.reconcileAt.After(time.Now()) {
return nil, false
}
q.queue.Pop()
delete(q.queued, kv.k)
// mark the item as ongoing, and then return it:
q.ongoing[kv.k] = struct{}{}
callback := func(logger *zap.Logger) {
q.reconcile(logger, kv.k, kv.v)
}
return callback, true
}
// reconcile is the outermost function that is called in order to reconcile an object.
//
// It calls the outermost middleware, which in turn calls the next, and so forth, until the original
// handler is run.
func (q *Queue) reconcile(logger *zap.Logger, k Key, v value) {
if q.queueWaitCallback != nil {
wait := time.Since(v.reconcileAt)
q.queueWaitCallback(wait)
}
// Noteworthy functionality that we don't need to worry about here:
//
// - Catching panics is already handled by CatchPanicMiddleware.
// - Retry backoff for errors is already handled by ErrorBackoffMiddleware.
// - Logging the result is handled by LogResultMiddleware (so, we can ignore the error)
//
// All of these are included by default.
result, _ := v.handler(logger, v.eventKind, v.object)
requeue := result.RetryAfter != 0
if requeue {
retryAt := time.Now().Add(result.RetryAfter)
// Now that we know when we're retrying, let's schedule that!
v = value{
reconcileAt: retryAt,
eventKind: v.eventKind,
object: v.object,
handler: v.handler,
}
}
q.mu.Lock()
defer q.mu.Unlock()
q.finishAndMaybeRequeue(k, v, requeue)
}
func (v value) mergeWithNewer(newer value) value {
var reconcileAt time.Time
if v.reconcileAt.Before(newer.reconcileAt) {
reconcileAt = v.reconcileAt
} else {
reconcileAt = newer.reconcileAt
}
return value{
reconcileAt: reconcileAt,
eventKind: v.eventKind.Merge(newer.eventKind),
object: newer.object,
handler: newer.handler,
}
}
// enqueues a change to an object that is already being reconciled.
//
// NOTE: this method assumes that the caller has acquired q.mu.
func (q *Queue) enqueuePendingChange(k Key, v value) {
// if there's already something pending, merge with that:
if pendingValue, ok := q.pending[k]; ok {
v = pendingValue.mergeWithNewer(v)
}
q.pending[k] = v
}
// enqueues a change to an object that is not currently being reconciled
//
// NOTE: this method assumes that the caller has acquired q.mu.
func (q *Queue) enqueueInactive(k Key, v value) {
// if there's already something in the queue, just merge with that:
if queuedHandle, ok := q.queued[k]; ok {
queuedHandle.Update(func(queuedValue *kv) {
queuedValue.v = queuedValue.v.mergeWithNewer(v)
})
// the value of reconcileAt for the item may have changed; we should notify just in case, so
// it's not waiting.
q.notifyEnqueued()
return
}
// ... otherwise, add it to the queue!
handle := q.queue.Push(kv{k, v})
q.queued[k] = handle
// and make sure that someone picks it up:
q.notifyEnqueued()
}
// finalizes the state for an object that has just finished being reconciled, requeueing it if necessary.
//
// NOTE: this method assumes that the caller has acquired q.mu.
func (q *Queue) finishAndMaybeRequeue(k Key, v value, requeue bool) {
// First, mark the item as no longer in progress:
delete(q.ongoing, k)
// Then, merge with anything pending, if we should requeue
// Note that if there IS something pending, then it's actually newer than this value, so we
// merge backwards compared to how we normally would.
if pendingValue, ok := q.pending[k]; ok {
if requeue {
// we should merge, because explicit requeueing was requested
v = v.mergeWithNewer(pendingValue)
} else {
// just use what was in pending -- but mark that we should actually requeue, because
// there are pending changes.
v = pendingValue
requeue = true
}
delete(q.pending, k)
}
// now that everything has been cleared, we can actually add it to the queue, if desired
if requeue {
handle := q.queue.Push(kv{k, v})
q.queued[k] = handle
// ... and make sure someone picks it up:
q.notifyEnqueued()
}
}
package plugin
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"strconv"
"time"
"github.com/tychoish/fun/srv"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/plugin/state"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/patch"
)
const (
MaxHTTPBodySize int64 = 1 << 10 // 1 KiB
ContentTypeJSON string = "application/json"
ContentTypeError string = "text/plain"
)
const (
MinPluginProtocolVersion api.PluginProtoVersion = api.PluginProtoV5_0
MaxPluginProtocolVersion api.PluginProtoVersion = api.PluginProtoV5_0
)
// startPermitHandler runs the server for handling each resourceRequest from a pod
func (s *PluginState) startPermitHandler(
ctx context.Context,
logger *zap.Logger,
getPod func(util.NamespacedName) (*corev1.Pod, bool),
listenerForPod func(types.UID) (util.BroadcastReceiver, bool),
) error {
mux := http.NewServeMux()
mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
logger := logger // copy locally, so that we can add fields and refer to it in defers
var finalStatus int
defer func() {
s.metrics.ResourceRequests.WithLabelValues(strconv.Itoa(finalStatus)).Inc()
}()
// Catch any potential panics and report them as 500s
defer func() {
if err := recover(); err != nil {
msg := "request handler panicked"
logger.Error(msg, zap.String("error", fmt.Sprint(err)))
finalStatus = 500
w.WriteHeader(finalStatus)
_, _ = w.Write([]byte(msg))
}
}()
if r.Method != "POST" {
finalStatus = 400
w.WriteHeader(400)
_, _ = w.Write([]byte("must be POST"))
return
}
defer r.Body.Close()
var req api.AgentRequest
jsonDecoder := json.NewDecoder(io.LimitReader(r.Body, MaxHTTPBodySize))
if err := jsonDecoder.Decode(&req); err != nil {
logger.Warn("Received bad JSON in request", zap.Error(err))
w.Header().Add("Content-Type", ContentTypeError)
finalStatus = 400
w.WriteHeader(400)
_, _ = w.Write([]byte("bad JSON"))
return
}
logger = logger.With(zap.Object("pod", req.Pod), zap.Any("request", req))
resp, statusCode, err := s.handleAgentRequest(logger, req, getPod, listenerForPod)
finalStatus = statusCode
if err != nil {
logFunc := logger.Warn
if 500 <= statusCode && statusCode < 600 {
logFunc = logger.Error
}
logFunc(
"Responding to autoscaler-agent request with error",
zap.Int("status", statusCode),
zap.Error(err),
)
w.Header().Add("Content-Type", ContentTypeError)
w.WriteHeader(statusCode)
_, _ = w.Write([]byte(err.Error()))
return
}
responseBody, err := json.Marshal(&resp)
if err != nil {
logger.Panic("Failed to encode response JSON", zap.Error(err))
}
w.Header().Add("Content-Type", ContentTypeJSON)
w.WriteHeader(statusCode)
_, _ = w.Write(responseBody)
})
orca := srv.GetOrchestrator(ctx)
logger.Info("Starting resource request server")
hs := srv.HTTP("resource-request", 5*time.Second, &http.Server{Addr: "0.0.0.0:10299", Handler: mux})
if err := hs.Start(ctx); err != nil {
return fmt.Errorf("Error starting resource request server: %w", err)
}
if err := orca.Add(hs); err != nil {
return fmt.Errorf("Error adding resource request server to orchestrator: %w", err)
}
return nil
}
// Returns body (if successful), status code, error (if unsuccessful)
func (s *PluginState) handleAgentRequest(
logger *zap.Logger,
req api.AgentRequest,
getPod func(util.NamespacedName) (*corev1.Pod, bool),
listenerForPod func(types.UID) (util.BroadcastReceiver, bool),
) (_ *api.PluginResponse, status int, _ error) {
nodeName := "<none>" // override this later if we have a node name
defer func() {
s.metrics.ValidResourceRequests.
WithLabelValues(strconv.Itoa(status), nodeName).
Inc()
}()
// Before doing anything, check that the version is within the range we're expecting.
expectedProtoRange := api.VersionRange[api.PluginProtoVersion]{
Min: MinPluginProtocolVersion,
Max: MaxPluginProtocolVersion,
}
if !req.ProtoVersion.IsValid() {
return nil, 400, fmt.Errorf("Invalid protocol version %v", req.ProtoVersion)
}
reqProtoRange := req.ProtocolRange()
if _, ok := expectedProtoRange.LatestSharedVersion(reqProtoRange); !ok {
return nil, 400, fmt.Errorf(
"Protocol version mismatch: Need %v but got %v", expectedProtoRange, reqProtoRange,
)
}
// check that req.ComputeUnit has no zeros
if err := req.ComputeUnit.ValidateNonZero(); err != nil {
return nil, 400, fmt.Errorf("computeUnit fields must be non-zero: %w", err)
}
podObj, ok := getPod(req.Pod)
if !ok {
logger.Warn("Received request for Pod we don't know") // pod already in the logger's context
return nil, 404, errors.New("pod not found")
} else if podObj.Spec.NodeName == "" {
logger.Warn("Received request for Pod we don't know where it was scheduled")
return nil, 404, errors.New("pod's node is unknown")
}
nodeName = podObj.Spec.NodeName // set nodeName for deferred metrics
vmRef, ok := vmv1.VirtualMachineOwnerForPod(podObj)
if !ok {
logger.Error("Received request for non-VM Pod")
return nil, 400, errors.New("pod is not associated with a VM")
}
vmName := util.NamespacedName{
Namespace: podObj.Namespace,
Name: vmRef.Name,
}
// From this point, we'll:
//
// 1. Update the annotations on the VirtualMachine object, if this request should change them;
// and
//
// 2. Wait for the annotations on the Pod object to change so that the approved resources are
// increased towards what was requested -- only if the amount requested was greater than what
// was last approved.
patches, changed := vmPatchForAgentRequest(podObj, req)
// Start listening *before* we update the VM.
updateReceiver, podExists := listenerForPod(podObj.UID)
// Only patch the VM object if it changed:
if changed {
if err := s.patchVM(vmName, patches); err != nil {
logger.Error("Failed to patch VM object", zap.Error(err))
return nil, 500, errors.New("failed to patch VM object")
}
logger.Info("Patched VirtualMachine for agent request", zap.Any("patches", patches))
}
// If we should be able to instantly approve the request, don't bother waiting to observe it.
if req.LastPermit != nil && !req.Resources.HasFieldGreaterThan(*req.LastPermit) {
resp := api.PluginResponse{
Permit: req.Resources,
Migrate: nil,
}
status = 200
logger.Info("Handled agent request", zap.Int("status", status), zap.Any("response", resp))
return &resp, status, nil
}
// We want to wait for updates on the pod, but if it no longer exists, we should just return.
if !podExists {
logger.Warn("Pod for request no longer exists")
return nil, 404, errors.New("pod not found")
}
// FIXME: make the timeout configurable.
updateTimeout := time.NewTimer(time.Second)
defer updateTimeout.Stop()
for {
timedOut := false
// Only listen for updates if we need to wait.
needToWait := req.LastPermit == nil || req.LastPermit.HasFieldLessThan(req.Resources)
if needToWait {
select {
case <-updateTimeout.C:
timedOut = true
case <-updateReceiver.Wait():
updateReceiver.Awake()
}
}
podObj, ok := getPod(req.Pod)
if !ok {
logger.Warn("Pod for request on longer exists")
return nil, 404, errors.New("pod not found")
}
podState, err := state.PodStateFromK8sObj(podObj)
if err != nil {
logger.Error("Failed to extract Pod state from Pod object for agent request")
return nil, 500, errors.New("failed to extract state from pod")
}
// Reminder: We're only listening for updates if the requested resources are greater than
// what was last approved.
//
// So, we should keep waiting until the approved resources have increased from the
// LastPermit in the request.
approved := api.Resources{
VCPU: podState.CPU.Reserved,
Mem: podState.Mem.Reserved,
}
requested := api.Resources{
VCPU: podState.CPU.Requested,
Mem: podState.Mem.Requested,
}
canReturn := requested == req.Resources
var shouldReturn bool
if req.LastPermit == nil {
_, hasApproved := podObj.Annotations[api.InternalAnnotationResourcesApproved]
shouldReturn = canReturn && hasApproved
} else {
shouldReturn = canReturn && approved.HasFieldGreaterThan(*req.LastPermit)
}
// Return if we have results, or if we've timed out and it's good enough.
if shouldReturn || (timedOut && canReturn) {
if timedOut {
logger.Warn("Timed out while waiting for updates to respond to agent request")
}
resp := api.PluginResponse{
Permit: approved,
Migrate: nil,
}
status = 200
logger.Info("Handled agent request", zap.Int("status", status), zap.Any("response", resp))
return &resp, status, nil
}
// ... otherwise, if we timed out and our updates to the VM *haven't* yet been reflected on
// the pod, we don't have anything we can return, so we should return an error.
if timedOut {
logger.Error("Timed out while waiting for updates without suitable response to agent request")
return nil, 500, errors.New("timed out waiting for updates to be processed")
}
// ... other-otherwise, we'll wait for more updates.
continue
}
}
func vmPatchForAgentRequest(pod *corev1.Pod, req api.AgentRequest) (_ []patch.Operation, changed bool) {
marshalJSON := func(value any) string {
bs, err := json.Marshal(value)
if err != nil {
panic(fmt.Sprintf("failed to marshal value: %s", err))
}
return string(bs)
}
var patches []patch.Operation
computeUnitJSON := marshalJSON(req.ComputeUnit)
if computeUnitJSON != pod.Annotations[api.AnnotationAutoscalingUnit] {
changed = true
}
// Always include the patch, even if it's the same as the current value. We'll only execute it
// if there are differences from what's there currently.
patches = append(patches, patch.Operation{
Op: patch.OpReplace,
Path: fmt.Sprintf(
"/metadata/annotations/%s",
patch.PathEscape(api.AnnotationAutoscalingUnit),
),
Value: computeUnitJSON,
})
requestedJSON := marshalJSON(req.Resources)
if requestedJSON != pod.Annotations[api.InternalAnnotationResourcesRequested] {
changed = true
}
patches = append(patches, patch.Operation{
Op: patch.OpReplace,
Path: fmt.Sprintf(
"/metadata/annotations/%s",
patch.PathEscape(api.InternalAnnotationResourcesRequested),
),
Value: requestedJSON,
})
if req.LastPermit != nil {
approvedJSON := marshalJSON(*req.LastPermit)
if approvedJSON != pod.Annotations[api.InternalAnnotationResourcesApproved] {
changed = true
}
patches = append(patches, patch.Operation{
Op: patch.OpReplace,
Path: fmt.Sprintf(
"/metadata/annotations/%s",
patch.PathEscape(api.InternalAnnotationResourcesApproved),
),
Value: approvedJSON,
})
}
return patches, changed
}
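// For reference, the resulting patch is a standard JSON Patch (RFC 6902) document of "replace"
// operations against the annotations. This sketches only the shape -- the actual annotation names
// come from the api package, and we assume patch.PathEscape applies the usual JSON Pointer
// escaping (RFC 6901, e.g. '/' -> '~1') so that annotation keys can appear in the paths:
//
//	[
//	  {"op": "replace", "path": "/metadata/annotations/<escaped unit annotation>", "value": "{...}"},
//	  {"op": "replace", "path": "/metadata/annotations/<escaped requested annotation>", "value": "{...}"},
//	  {"op": "replace", "path": "/metadata/annotations/<escaped approved annotation>", "value": "{...}"}
//	]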
package state
import (
"errors"
"fmt"
"iter"
"go.uber.org/zap/zapcore"
"golang.org/x/exp/constraints"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
type Node struct {
Name string
Labels *XactMap[string, string]
// pods stores the set of pods on the node.
//
// NOTE: It's important that this is a map of Pod and not *Pod, because it means that
// speculative changes can't leak through to changes on the underlying Pods.
pods *XactMap[types.UID, Pod]
// migratablePods stores the UIDs of pods that are currently migratable. This is to allow more
// efficiently fetching candidate pods to migrate, so that we don't perform a lot of unnecessary
// work when nodes are over-full but don't have any migratable pods.
migratablePods *XactMap[types.UID, struct{}]
CPU NodeResources[vmv1.MilliCPU]
Mem NodeResources[api.Bytes]
}
// MarshalLogObject implements zapcore.ObjectMarshaler so that Node can be used with zap.Object
// without emitting large lists of pods.
func (n Node) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddString("Name", n.Name)
err := enc.AddObject("Labels", zapcore.ObjectMarshalerFunc(func(e zapcore.ObjectEncoder) error {
for label, value := range n.Labels.Entries() {
e.AddString(label, value)
}
return nil
}))
if err != nil {
return err
}
if err := enc.AddReflected("CPU", n.CPU); err != nil {
return err
}
if err := enc.AddReflected("Mem", n.Mem); err != nil {
return err
}
return nil
}
type NodeResources[T constraints.Unsigned] struct {
// Total is the total amount of T available on the node.
//
// This value does not change.
Total T
// Reserved is exactly equal to the sum of all Pods' <resource>.Reserved values.
//
// It SHOULD be less than or equal to Total, and - when live migration is enabled - we take
// active measures to reduce it once it is above Watermark.
//
// Reserved can be greater than Total if:
// * There is misbehavior between the autoscaler-agent and scheduler plugin;
// * Eventual consistency causes us to operate on stale data; or
// * Other pods are scheduled without going through our scheduler plugin
Reserved T
// Migrating is the amount of T that we expect will be removed by ongoing live migration.
Migrating T
// Watermark is the amount of T reserved to pods above which we attempt to reduce usage via
// migration.
//
// This value does not change.
Watermark T
}
type NodeResourceField[T any] struct {
Name string
Value T
}
func (r NodeResources[T]) Fields() []NodeResourceField[T] {
return []NodeResourceField[T]{
{"Total", r.Total},
{"Reserved", r.Reserved},
{"Migrating", r.Migrating},
{"Watermark", r.Watermark},
}
}
func NodeStateFromK8sObj(
node *corev1.Node,
watermarkFraction float64,
keepLabels []string,
) (*Node, error) {
// Note that node.Status.Allocatable has the following docs:
//
// "Allocatable represents the resources of a node that are available for scheduling. Defaults
// to Capacity."
//
// So we should be able to assume that the resources exist there.
// cpuQ is the CPU amount as a k8s resource.Quantity
cpuQ := node.Status.Allocatable.Cpu()
if cpuQ == nil {
return nil, errors.New("Node hsa no Allocatable CPU limit")
}
totalCPU := vmv1.MilliCPUFromResourceQuantity(*cpuQ)
memQ := node.Status.Allocatable.Memory()
if memQ == nil {
return nil, errors.New("Node has no Allocatable Memory limit")
}
totalMem := api.BytesFromResourceQuantity(*memQ)
labels := make(map[string]string)
for _, lbl := range keepLabels {
labels[lbl] = node.Labels[lbl]
}
return NodeStateFromParams(node.Name, totalCPU, totalMem, watermarkFraction, labels), nil
}
// NodeStateFromParams is a helper to construct a *Node, primarily for use in tests.
//
// For practical usage, see NodeStateFromK8sObj.
func NodeStateFromParams(
name string,
totalCPU vmv1.MilliCPU,
totalMem api.Bytes,
watermarkFraction float64,
labels map[string]string,
) *Node {
return &Node{
Name: name,
Labels: func() *XactMap[string, string] {
m := NewXactMap[string, string]()
for k, v := range labels {
m.Set(k, v)
}
return m
}(),
pods: NewXactMap[types.UID, Pod](),
migratablePods: NewXactMap[types.UID, struct{}](),
CPU: NodeResources[vmv1.MilliCPU]{
Total: totalCPU,
Reserved: 0,
Migrating: 0,
Watermark: vmv1.MilliCPU(float64(totalCPU) * watermarkFraction),
},
Mem: NodeResources[api.Bytes]{
Total: totalMem,
Reserved: 0,
Migrating: 0,
Watermark: api.Bytes(float64(totalMem) * watermarkFraction),
},
}
}
// OverBudget returns whether this node has more resources reserved than in total
func (n *Node) OverBudget() bool {
return n.CPU.Reserved > n.CPU.Total || n.Mem.Reserved > n.Mem.Total
}
// Speculatively allows attempting a modification to the node before deciding whether to actually
// commit that change.
//
// Any of the fields of the node can be updated, including its pods.
func (n *Node) Speculatively(modify func(n *Node) (commit bool)) (committed bool) {
tmp := &Node{
Name: n.Name,
Labels: n.Labels.NewTransaction(),
pods: n.pods.NewTransaction(),
migratablePods: n.migratablePods.NewTransaction(),
CPU: n.CPU,
Mem: n.Mem,
}
commit := modify(tmp)
if commit {
tmp.Labels.Commit()
tmp.pods.Commit()
tmp.migratablePods.Commit()
n.CPU = tmp.CPU
n.Mem = tmp.Mem
}
return commit
}
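// A minimal usage sketch (hypothetical caller code): try adding a pod and only keep the change
// if the node wouldn't end up over budget:
//
//	added := node.Speculatively(func(tmp *Node) bool {
//		tmp.AddPod(pod)
//		// commit only if the node still fits within its budget:
//		return !tmp.OverBudget()
//	})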
// Update sets the resource state of the node, corresponding to the changes in the totals present as
// part of newState.
//
// GENERALLY there should be no change here, but if something happens, it's better to accept the
// change and continue rather than to operate on stale data.
func (n *Node) Update(newState *Node) (changed bool) {
if n.Name != newState.Name {
panic(fmt.Sprintf("Node name changed from %q to %q", n.Name, newState.Name))
}
changed = newState.CPU.Total != n.CPU.Total || newState.Mem.Total != n.Mem.Total ||
newState.CPU.Watermark != n.CPU.Watermark || newState.Mem.Watermark != n.Mem.Watermark
// Propagate changes to labels:
for label, value := range newState.Labels.Entries() {
v, ok := n.Labels.Get(label)
if !ok || v != value {
n.Labels.Set(label, value)
changed = true
}
}
for label := range n.Labels.Entries() {
// remove labels that no longer exist
if _, ok := newState.Labels.Get(label); !ok {
n.Labels.Delete(label)
changed = true
}
}
if !changed {
return
}
*n = Node{
Name: n.Name,
Labels: n.Labels,
pods: n.pods,
migratablePods: n.migratablePods,
CPU: NodeResources[vmv1.MilliCPU]{
Total: newState.CPU.Total,
Reserved: n.CPU.Reserved,
Migrating: n.CPU.Migrating,
Watermark: newState.CPU.Watermark,
},
Mem: NodeResources[api.Bytes]{
Total: newState.Mem.Total,
Reserved: n.Mem.Reserved,
Migrating: n.Mem.Migrating,
Watermark: newState.Mem.Watermark,
},
}
return
}
// GetPod returns a copy of the state for the Pod with the given UID, or false if no such Pod is
// present in the Node's state.
func (n *Node) GetPod(uid types.UID) (_ Pod, ok bool) {
return n.pods.Get(uid)
}
// Pods returns an iterator over all pods on the node.
func (n *Node) Pods() iter.Seq2[types.UID, Pod] {
return n.pods.Entries()
}
// MigratablePods returns an iterator through the migratable pods on the node.
//
// This method is provided as a specialized version of (*Node).Pods() in order to support more
// efficient look-ups when trying to balance nodes.
func (n *Node) MigratablePods() iter.Seq2[types.UID, Pod] {
return func(yield func(types.UID, Pod) bool) {
for uid := range n.migratablePods.Entries() {
pod, ok := n.pods.Get(uid)
if !ok {
panic(fmt.Sprintf("pod with UID %s preset in migratablePods map but not pods map", uid))
}
if !yield(uid, pod) {
break
}
}
}
}
// AddPod adds a pod to the node, updating resources as required.
func (n *Node) AddPod(pod Pod) {
if _, ok := n.pods.Get(pod.UID); ok {
panic("cannot add Pod that already exists")
}
n.CPU.add(&pod.CPU, pod.Migrating)
n.Mem.add(&pod.Mem, pod.Migrating)
n.pods.Set(pod.UID, pod)
if pod.Migratable {
n.migratablePods.Set(pod.UID, struct{}{})
}
}
// UpdatePod updates the node based on the change in the pod from old to new.
//
// NOTE: this DOES NOT make any follow-up changes -- e.g., updating the reserved resources to match
// what was requested. Those must be done by a call to ReconcilePodReserved().
func (n *Node) UpdatePod(oldPod, newPod Pod) (changed bool) {
// Some safety checks:
if oldPod.UID != newPod.UID {
panic(fmt.Sprintf("Pod UID changed from %q to %q", oldPod.UID, newPod.UID))
} else if oldPod.NamespacedName != newPod.NamespacedName {
panic(fmt.Sprintf("Pod name changed from %v to %v", oldPod.NamespacedName, newPod.NamespacedName))
} else if newPod.Migrating && !newPod.Migratable {
panic("new pod state is migrating but not migratable")
} else if _, ok := n.pods.Get(oldPod.UID); !ok {
panic("cannot update Pod that doesn't exist in the node state")
}
// remove the old pod; replace it with the new one! Simple as.
n.RemovePod(oldPod.UID)
n.AddPod(newPod)
return oldPod != newPod
}
// ReconcilePodReserved will make all possible progress on updating the Pod's reserved resources
// based on what's requested for the pod and what's available.
//
// Currently, that's just updating the reserved resources to match what's requested (or, as much as
// is possible).
//
// The new values of the Pod's resources will be left in the provided Pod.
//
// This method will misbehave if the values of the Pod do not match the value of what's stored for
// that Pod in the node.
func (n *Node) ReconcilePodReserved(pod *Pod) (done bool) {
if _, ok := n.pods.Get(pod.UID); !ok {
panic("cannot reconcile reserved resources for Pod that doesn't exist in the node state")
}
cpuDone := n.CPU.reconcilePod(&pod.CPU, pod.Migrating)
memDone := n.Mem.reconcilePod(&pod.Mem, pod.Migrating)
n.pods.Set(pod.UID, *pod)
return cpuDone && memDone
}
// RemovePod removes the pod from the node, given its UID, returning true iff the pod existed on the
// node.
//
// Resources are updated as required.
func (n *Node) RemovePod(uid types.UID) (exists bool) {
pod, ok := n.pods.Get(uid)
if !ok {
return false
}
n.pods.Delete(uid)
n.migratablePods.Delete(uid)
n.CPU.remove(pod.CPU, pod.Migrating)
n.Mem.remove(pod.Mem, pod.Migrating)
return true
}
func (r *NodeResources[T]) add(p *PodResources[T], migrating bool) {
r.Reserved += p.Reserved
if migrating {
r.Migrating += p.Reserved
}
}
func (r *NodeResources[T]) remove(p PodResources[T], migrating bool) {
r.Reserved -= p.Reserved
if migrating {
r.Migrating -= p.Reserved
}
}
func (r *NodeResources[T]) reconcilePod(p *PodResources[T], migrating bool) (done bool) {
if p.Requested == p.Reserved {
return true // nothing to do!
}
if p.Requested < p.Reserved {
// Easy enough - we can just make the reduction.
r.remove(*p, migrating)
p.Reserved = p.Requested
r.add(p, migrating)
return true // done -- the reduction fully satisfies the request
}
// Difficult case: Requested is greater than Reserved -- how much can we give?
desiredIncrease := p.Requested - p.Reserved
remaining := util.SaturatingSub(r.Total, r.Reserved)
// (X / M) * M uses integer division, so it's equivalent to floor(X / M) * M -- any amount that
// we give must be a multiple of the factor (roughly, the compute unit).
maxIncrease := (remaining / p.Factor) * p.Factor
actualIncrease := min(maxIncrease, desiredIncrease)
if actualIncrease != 0 {
r.remove(*p, migrating)
p.Reserved += actualIncrease
r.add(p, migrating)
}
// We're done iff everything that was asked for has been granted
return p.Reserved == p.Requested
}
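// A worked example of the arithmetic above (values are illustrative): suppose the pod has
// Reserved = 1000m, Requested = 2000m, Factor = 250m, and the node has Total = 4000m with
// Reserved = 3300m. Then desiredIncrease = 1000m, remaining = 700m, and
// maxIncrease = (700m / 250m) * 250m = 500m, so actualIncrease = min(500m, 1000m) = 500m.
// The pod ends up with Reserved = 1500m, the node with Reserved = 3800m, and we return false
// because 1500m != 2000m.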
// UnmigratedAboveWatermark returns the amount of T above Watermark that isn't already being
// migrated.
//
// This method will panic if r.Migrating is greater than r.Reserved.
func (r NodeResources[T]) UnmigratedAboveWatermark() T {
if r.Migrating > r.Reserved {
panic(fmt.Sprintf(
"unexpectedly migrating more resources than are reserved: %v > %v",
r.Migrating, r.Reserved,
))
}
unmigrating := r.Reserved - r.Migrating
return util.SaturatingSub(unmigrating, r.Watermark)
}
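// For example (illustrative numbers): with Reserved = 10, Migrating = 3, and Watermark = 6,
// the amount not already being migrated is 10 - 3 = 7, so UnmigratedAboveWatermark returns
// 7 - 6 = 1. If Watermark were 8 instead, the saturating subtraction would return 0.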
package state
import (
"errors"
"time"
"github.com/samber/lo"
"go.uber.org/zap/zapcore"
"golang.org/x/exp/constraints"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
type Pod struct {
// NOTE: It's important that Pod objects contain no references, otherwise speculative changes on
// a node could accidentally leak through after choosing not to commit them.
util.NamespacedName
UID types.UID
CreatedAt time.Time
// VirtualMachine, if not empty, gives the name of the VirtualMachine object that owns this Pod.
VirtualMachine util.NamespacedName
// Migratable is true if this Pod is owned by a VirtualMachine and it has the appropriate label
// to enable automatic live migration.
Migratable bool
// AlwaysMigrate is true if this Pod is owned by a VirtualMachine and it has the (TESTING ONLY)
// label to mark that this pod should be continuously migrated.
AlwaysMigrate bool
// Migrating is true iff there is a VirtualMachineMigration with this pod as the source.
Migrating bool
CPU PodResources[vmv1.MilliCPU]
Mem PodResources[api.Bytes]
}
// MarshalLogObject implements zapcore.ObjectMarshaler so that Pod can be used with zap.Object.
func (p Pod) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddString("Namespace", p.Namespace)
enc.AddString("Name", p.Name)
enc.AddString("UID", string(p.UID))
enc.AddTime("CreatedAt", p.CreatedAt)
if !lo.IsEmpty(p.VirtualMachine) {
if err := enc.AddObject("VirtualMachine", p.VirtualMachine); err != nil {
return err
}
enc.AddBool("Migratable", p.Migratable)
enc.AddBool("AlwaysMigrate", p.AlwaysMigrate)
enc.AddBool("Migrating", p.Migrating)
}
if err := enc.AddReflected("CPU", p.CPU); err != nil {
return err
}
if err := enc.AddReflected("Mem", p.Mem); err != nil {
return err
}
return nil
}
// PodResources describes the resources reserved by and requested for the pod, as extracted from
// the pod's spec and annotations.
type PodResources[T constraints.Unsigned] struct {
// Reserved is the amount of T that has been set aside for usage by this Pod.
//
// For a regular pod, this is simply the sum of the resource requests for its containers, but
// for a VM, it is equal to the last resources that the scheduler has approved for the
// autoscaler-agent OR the CPU/Memory '.use' field, if that has not yet happened.
Reserved T
// Requested is the amount of T that the Pod would like to have available to it.
//
// For a regular Pod, this is exactly equal to Reserved.
//
// For a VM, this is equal to the last amount that the autoscaler-agent requested -- or if that
// hasn't happened yet, simply Reserved.
//
// If Requested is ever less than Reserved, the scheduler is expected to immediately reduce
// Reserved -- in effect, it's been given back resources that it previously set aside.
Requested T
// Factor is the smallest incremental change in T that can be allocated to the pod.
//
// For pods that aren't VMs, this should be set to zero, as it has no impact.
Factor T
}
func PodStateFromK8sObj(pod *corev1.Pod) (Pod, error) {
if vmRef, ok := vmv1.VirtualMachineOwnerForPod(pod); ok {
return podStateForVMRunner(pod, vmRef)
} else {
return podStateForNormalPod(pod), nil
}
}
func podStateForNormalPod(pod *corev1.Pod) Pod {
// this pod is *not* a VM runner pod -- we should use the standard kubernetes resources.
var cpu vmv1.MilliCPU
var mem api.Bytes
for _, container := range pod.Spec.Containers {
// For each resource, add the requests, if they're provided.
//
// NB: .Cpu()/.Memory() return a pointer to a value equal to zero if the resource is not
// present. So we can just add it either way.
cpu += vmv1.MilliCPUFromResourceQuantity(*container.Resources.Requests.Cpu())
mem += api.BytesFromResourceQuantity(*container.Resources.Requests.Memory())
}
return Pod{
NamespacedName: util.GetNamespacedName(pod),
UID: pod.UID,
CreatedAt: pod.CreationTimestamp.Time,
VirtualMachine: lo.Empty[util.NamespacedName](),
Migratable: false,
AlwaysMigrate: false,
Migrating: false,
CPU: PodResources[vmv1.MilliCPU]{
Reserved: cpu,
Requested: cpu,
Factor: 0,
},
Mem: PodResources[api.Bytes]{
Reserved: mem,
Requested: mem,
Factor: 0,
},
}
}
func podStateForVMRunner(pod *corev1.Pod, vmRef metav1.OwnerReference) (Pod, error) {
// this pod is a VM runner pod
vm := util.NamespacedName{Namespace: pod.Namespace, Name: vmRef.Name}
_, migrationRole, ownedByMigration := vmv1.MigrationOwnerForPod(pod)
alwaysMigrate := api.HasAlwaysMigrateLabel(pod)
autoMigrate := api.HasAutoMigrationEnabled(pod)
migrating := ownedByMigration && migrationRole == vmv1.MigrationRoleSource
// allow ongoing migrations to continue. Don't allow migrations of current migration
// targets. New migrations can be started when auto migrations are enabled, or if the
// testing-only "always migrate" flag is enabled.
migratable := migrating || (migrationRole != vmv1.MigrationRoleTarget && (autoMigrate || alwaysMigrate))
autoscalable := api.HasAutoscalingEnabled(pod)
res, err := vmv1.VirtualMachineResourcesFromPod(pod)
if err != nil {
return lo.Empty[Pod](), err
}
actualResources := &api.Resources{
VCPU: res.CPUs.Use,
Mem: api.BytesFromResourceQuantity(res.MemorySlotSize) * api.Bytes(res.MemorySlots.Use),
}
var scalingUnit, requested, approved *api.Resources
if !autoscalable {
approved = actualResources
requested = actualResources
} else {
scalingUnit, err = api.ExtractScalingUnit(pod)
if err != nil {
return lo.Empty[Pod](), err
}
requested, err = api.ExtractRequestedScaling(pod)
if err != nil {
return lo.Empty[Pod](), err
} else if requested == nil {
requested = actualResources
} else {
// We cannot have requested scaling but no scaling unit -- disallow that here.
if scalingUnit == nil {
return lo.Empty[Pod](), errors.New("Pod has requested scaling but no scaling unit annotation")
}
}
approved, err = api.ExtractApprovedScaling(pod)
if err != nil {
return lo.Empty[Pod](), err
} else if approved == nil {
approved = actualResources
}
}
if scalingUnit == nil {
// default the scaling unit to zero; if we got here, it's not needed.
scalingUnit = &api.Resources{
VCPU: 0,
Mem: 0,
}
}
return Pod{
NamespacedName: util.GetNamespacedName(pod),
UID: pod.UID,
CreatedAt: pod.CreationTimestamp.Time,
VirtualMachine: vm,
Migratable: migratable,
AlwaysMigrate: alwaysMigrate,
Migrating: migrating,
CPU: PodResources[vmv1.MilliCPU]{
Reserved: approved.VCPU,
Requested: requested.VCPU,
Factor: scalingUnit.VCPU,
},
Mem: PodResources[api.Bytes]{
Reserved: approved.Mem,
Requested: requested.Mem,
Factor: scalingUnit.Mem,
},
}, nil
}
// BetterMigrationTargetThan returns <0 iff the pod is a better migration target than the 'other'
// pod.
func (p Pod) BetterMigrationTargetThan(other Pod) int {
// For now, just prioritize migration for older pods, so that we naturally avoid continuously
// re-migrating the same VMs.
return p.CreatedAt.Compare(other.CreatedAt)
}
package state
import (
"iter"
)
// XactMap is a map with support for transactions.
type XactMap[K comparable, V any] struct {
parent *XactMap[K, V]
newObjs map[K]V
deletes map[K]struct{}
}
func NewXactMap[K comparable, V any]() *XactMap[K, V] {
return &XactMap[K, V]{
parent: nil,
newObjs: make(map[K]V),
deletes: make(map[K]struct{}),
}
}
// NewTransaction creates a new XactMap that acts as a shallow copy of the parent -- any changes in
// the child will not affect the parent until a call to Commit(), if desired.
//
// NOTE: Once you have already made some changes to the child, it is unsound to make changes to the
// parent and then continue using the child.
func (m *XactMap[K, V]) NewTransaction() *XactMap[K, V] {
return &XactMap[K, V]{
parent: m,
newObjs: make(map[K]V),
deletes: make(map[K]struct{}),
}
}
// Commit propagates all changes from this local XactMap into its parent.
//
// Afterwards, this map can continue to be used as normal, if you want.
func (m *XactMap[K, V]) Commit() {
if m.parent == nil {
panic("(*XactMap).Commit() called with nil parent")
}
for k := range m.deletes {
m.parent.Delete(k)
}
for k, v := range m.newObjs {
m.parent.Set(k, v)
}
// clean up our maps to make this safe for potential reuse.
clear(m.newObjs)
clear(m.deletes)
}
// Get returns the value for the key if it's present in the map, else (zero, false).
func (m *XactMap[K, V]) Get(key K) (V, bool) {
var emptyValue V
// Value is overridden here:
if v, ok := m.newObjs[key]; ok {
return v, ok
}
// Value is deleted here:
if _, ok := m.deletes[key]; ok {
return emptyValue, false
}
// fall through to the parent:
if m.parent != nil {
return m.parent.Get(key)
}
// otherwise, nothing.
return emptyValue, false
}
func (m *XactMap[K, V]) Set(key K, value V) {
m.newObjs[key] = value
// un-delete the key, if necessary:
delete(m.deletes, key)
}
// Delete removes the key from the map, if it's present.
func (m *XactMap[K, V]) Delete(key K) {
delete(m.newObjs, key)
// To make sure we don't leak memory at the base, we need deleted objects FULLY deleted if
// there's no parent -- so we should only add the key to deletes if there's a parent:
if m.parent != nil {
if _, ok := m.parent.Get(key); ok {
// it exists in the parent -- delete it here.
m.deletes[key] = struct{}{}
}
}
}
// Entries returns an iterator over key-value pairs in the map.
//
// Deleting elements from the map during iteration is always sound. They will not be visited later.
func (m *XactMap[K, V]) Entries() iter.Seq2[K, V] {
return m.entriesImpl
}
// entriesImpl is the inner implementation of (*XactMap).Entries(), extracted for recursive use.
func (m *XactMap[K, V]) entriesImpl(yield func(K, V) bool) {
// General plan:
// 1. Iterate through all the elements added here
// 2. Iterate through all the elements in the parent, as long as they weren't added or deleted
// in this map instead.
for k, v := range m.newObjs {
if !yield(k, v) {
return
}
}
if m.parent != nil {
m.parent.entriesImpl(func(k K, v V) bool {
if _, ok := m.newObjs[k]; ok {
return true
}
if _, ok := m.deletes[k]; ok {
return true
}
return yield(k, v)
})
}
}
package plugin
// Helper functions to set up the persistent listening for watch events.
import (
"context"
"fmt"
"time"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
coreclient "k8s.io/client-go/kubernetes"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
vmclient "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
"github.com/neondatabase/autoscaling/pkg/plugin/initevents"
"github.com/neondatabase/autoscaling/pkg/plugin/reconcile"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/watch"
)
// watchHandlers builds the default handler functions to bridge between the k8s watch events and the
// plugin's internal reconcile queue.
func watchHandlers[P reconcile.Object](
queue *reconcile.Queue,
inits *initevents.InitEventsMiddleware,
) watch.HandlerFuncs[P] {
return watch.HandlerFuncs[P]{
AddFunc: func(obj P, preexisting bool) {
if preexisting && inits != nil {
inits.AddRequired(obj)
}
queue.Enqueue(reconcile.EventKindAdded, obj)
},
UpdateFunc: func(oldObj, newObj P) {
queue.Enqueue(reconcile.EventKindModified, newObj)
},
DeleteFunc: func(obj P, mayBeStale bool) {
queue.Enqueue(reconcile.EventKindDeleted, obj)
},
}
}
// onlyErr is a helper that discards the first return value and keeps only the error.
func onlyErr[T any](_ T, err error) error {
return err
}
func watchConfig[T any](metrics watch.Metrics) watch.Config {
sampleObj := any(new(T)).(runtime.Object)
gvk, err := util.LookupGVKForType(sampleObj)
if err != nil {
panic(err)
}
kind := gvk.Kind
return watch.Config{
ObjectNameLogField: kind,
Metrics: watch.MetricsConfig{
Metrics: metrics,
Instance: fmt.Sprint(kind, "s"),
},
// FIXME: make these configurable.
RetryRelistAfter: util.NewTimeRange(time.Second, 3, 5),
RetryWatchAfter: util.NewTimeRange(time.Second, 3, 5),
}
}
func watchNodeEvents(
ctx context.Context,
parentLogger *zap.Logger,
client coreclient.Interface,
metrics watch.Metrics,
callbacks watch.HandlerFuncs[*corev1.Node],
) (*watch.Store[corev1.Node], error) {
return watch.Watch(
ctx,
parentLogger.Named("watch-nodes"),
client.CoreV1().Nodes(),
watchConfig[corev1.Node](metrics),
watch.Accessors[*corev1.NodeList, corev1.Node]{
Items: func(list *corev1.NodeList) []corev1.Node { return list.Items },
},
watch.InitModeSync,
metav1.ListOptions{},
callbacks,
)
}
func watchPodEvents(
ctx context.Context,
parentLogger *zap.Logger,
client coreclient.Interface,
metrics watch.Metrics,
callbacks watch.HandlerFuncs[*corev1.Pod],
) (*watch.Store[corev1.Pod], error) {
return watch.Watch(
ctx,
parentLogger.Named("watch-pods"),
client.CoreV1().Pods(corev1.NamespaceAll),
watchConfig[corev1.Pod](metrics),
watch.Accessors[*corev1.PodList, corev1.Pod]{
Items: func(list *corev1.PodList) []corev1.Pod { return list.Items },
},
watch.InitModeSync,
metav1.ListOptions{},
callbacks,
)
}
func watchMigrationEvents(
ctx context.Context,
parentLogger *zap.Logger,
client vmclient.Interface,
metrics watch.Metrics,
callbacks watch.HandlerFuncs[*vmv1.VirtualMachineMigration],
) error {
return onlyErr(watch.Watch(
ctx,
parentLogger.Named("watch-migrations"),
client.NeonvmV1().VirtualMachineMigrations(corev1.NamespaceAll),
watchConfig[vmv1.VirtualMachineMigration](metrics),
watch.Accessors[*vmv1.VirtualMachineMigrationList, vmv1.VirtualMachineMigration]{
Items: func(list *vmv1.VirtualMachineMigrationList) []vmv1.VirtualMachineMigration { return list.Items },
},
watch.InitModeSync,
metav1.ListOptions{
// NB: Including just the label itself means that we select for objects that *have* the
// label, without caring about the actual value.
//
// See also:
// https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#set-based-requirement
LabelSelector: LabelPluginCreatedMigration,
},
callbacks,
))
}
package reporting
import (
"encoding/json"
"fmt"
)
var _ BatchBuilder[int] = (*JSONArrayBuilder[int])(nil)
// JSONArrayBuilder is a BatchBuilder where all the events in a batch are serialized as a single
// large JSON array.
type JSONArrayBuilder[E any] struct {
buf IOBuffer
started bool
nestingCount int
}
// NewJSONArrayBuilder creates a new JSONArrayBuilder using the underlying IOBuffer to potentially
// process the JSON encoding -- either with ByteBuffer for plaintext or GZIPBuffer for gzip
// compression.
func NewJSONArrayBuilder[E any](buf IOBuffer, nestedFields ...string) *JSONArrayBuilder[E] {
for _, fieldName := range nestedFields {
// note: use a discrete json.Marshal here instead of json.Encoder because encoder adds a
// newline at the end, and that'll make the formatting weird for us.
encodedField, err := json.Marshal(fieldName)
if err != nil {
panic(fmt.Sprintf("failed to JSON encode: %s", fieldName))
}
if _, err := buf.Write([]byte{'{'}); err != nil {
panic(fmt.Sprintf("failed to write: %s", err))
}
if _, err := buf.Write(encodedField); err != nil {
panic(fmt.Sprintf("failed to write: %s", err))
}
if _, err := buf.Write([]byte{':'}); err != nil {
panic(fmt.Sprintf("failed to write: %s", err))
}
}
// open the array:
if _, err := buf.Write([]byte{'['}); err != nil {
panic(fmt.Sprintf("failed to write: %s", err))
}
return &JSONArrayBuilder[E]{
buf: buf,
started: false,
nestingCount: len(nestedFields),
}
}
func (b *JSONArrayBuilder[E]) Add(event E) {
if b.started {
if _, err := b.buf.Write([]byte("\n\t,")); err != nil {
panic(fmt.Sprintf("failed to write: %s", err))
}
}
// note: we use a discrete json.Marshal here instead of json.Encoder because encoder adds a
// newline at the end, and that'll make the formatting weird for us.
tmpJSON, err := json.Marshal(event)
if err != nil {
panic(fmt.Sprintf("failed to JSON encode: %s", err))
}
if _, err := b.buf.Write(tmpJSON); err != nil {
panic(fmt.Sprintf("failed to write: %s", err))
}
b.started = true
}
func (b *JSONArrayBuilder[E]) Finish() []byte {
if _, err := b.buf.Write([]byte("\n]")); err != nil {
panic(fmt.Sprintf("failed to write: %s", err))
}
for i := 0; i < b.nestingCount; i++ {
if _, err := b.buf.Write([]byte("}")); err != nil {
panic(fmt.Sprintf("failed to write: %s", err))
}
}
return b.buf.Collect()
}
package reporting
import (
"encoding/json"
"fmt"
)
var _ BatchBuilder[int] = (*JSONLinesBuilder[int])(nil)
// JSONLinesBuilder is a BatchBuilder where each event in the batch is serialized as a separate JSON
// object on its own line, adhering to the "JSON lines"/"jsonl" format.
type JSONLinesBuilder[E any] struct {
buf IOBuffer
}
func NewJSONLinesBuilder[E any](buf IOBuffer) *JSONLinesBuilder[E] {
return &JSONLinesBuilder[E]{
buf: buf,
}
}
func (b *JSONLinesBuilder[E]) Add(event E) {
tmpJSON, err := json.Marshal(event)
if err != nil {
panic(fmt.Sprintf("failed to JSON encode: %s", err))
}
if _, err := b.buf.Write(tmpJSON); err != nil {
panic(fmt.Sprintf("failed to write: %s", err))
}
if _, err := b.buf.Write([]byte{'\n'}); err != nil {
panic(fmt.Sprintf("failed to write: %s", err))
}
}
func (b *JSONLinesBuilder[E]) Finish() []byte {
return b.buf.Collect()
}
package reporting
import (
"sync"
"github.com/prometheus/client_golang/prometheus"
)
// BatchBuilder is an interface for gradually converting []E to []byte, allowing us to construct
// batches of events without buffering them uncompressed, in memory.
//
// Implementations of BatchBuilder are defined in various 'batch_*.go' files.
type BatchBuilder[E any] interface {
// Add appends an event to the in-progress batch.
Add(event E)
// Finish completes the in-progress batch, returning the events serialized as bytes.
Finish() []byte
}
type eventBatcher[E any] struct {
mu sync.Mutex
targetBatchSize int
newBatch func() BatchBuilder[E]
ongoing BatchBuilder[E]
ongoingSize int
completed []batch[E]
onComplete func()
completedSize int
sizeGauge prometheus.Gauge
}
type batch[E any] struct {
serialized []byte
count int
}
func newEventBatcher[E any](
targetBatchSize int,
newBatch func() BatchBuilder[E],
notifyCompletedBatch func(),
sizeGauge prometheus.Gauge,
) *eventBatcher[E] {
return &eventBatcher[E]{
mu: sync.Mutex{},
targetBatchSize: targetBatchSize,
newBatch: newBatch,
ongoing: newBatch(),
ongoingSize: 0,
completed: []batch[E]{},
onComplete: notifyCompletedBatch,
completedSize: 0,
sizeGauge: sizeGauge,
}
}
// enqueue adds an event to the current in-progress batch.
//
// If the target batch size is reached, the batch will be packaged up for consumption by
// (*eventBatcher[E]).peekLatestCompleted() and b.onComplete() will be called.
func (b *eventBatcher[E]) enqueue(event E) {
b.mu.Lock()
defer b.mu.Unlock()
b.ongoing.Add(event)
b.ongoingSize += 1
b.updateGauge()
if b.ongoingSize >= b.targetBatchSize {
b.finishCurrentBatch()
}
}
// finishOngoing collects any events that have not yet been packaged up into a batch, adding them to
// a batch visible via (*eventBatcher[E]).peekLatestCompleted().
//
// If there are outstanding events when this method is called, b.onComplete() will be called.
// Otherwise, it will not be called.
func (b *eventBatcher[E]) finishOngoing() {
b.mu.Lock()
defer b.mu.Unlock()
if b.ongoingSize == 0 {
return
}
b.finishCurrentBatch()
}
// completedCount returns the number of completed batches
func (b *eventBatcher[E]) completedCount() int {
b.mu.Lock()
defer b.mu.Unlock()
return len(b.completed)
}
// peekLatestCompleted returns the oldest completed batch that has not yet been removed by
// (*eventBatcher[E]).dropLatestCompleted() -- i.e. the next batch that should be sent.
//
// The batcher is not modified by this call.
//
// Once done with this batch, you should call (*eventBatcher[E]).dropLatestCompleted() to remove it
// from future consideration.
func (b *eventBatcher[E]) peekLatestCompleted() batch[E] {
b.mu.Lock()
defer b.mu.Unlock()
return b.completed[0]
}
// dropLatestCompleted drops the oldest completed batch (the one returned by peekLatestCompleted)
// from internal storage.
//
// This method will panic if (*eventBatcher[E]).completedCount() is zero.
func (b *eventBatcher[E]) dropLatestCompleted() {
b.mu.Lock()
defer b.mu.Unlock()
batch := b.completed[0]
b.completed = b.completed[1:]
b.completedSize -= batch.count
b.updateGauge()
}
// NB: must hold mu
func (b *eventBatcher[E]) updateGauge() {
b.sizeGauge.Set(float64(b.ongoingSize + b.completedSize))
}
// NB: must hold mu
func (b *eventBatcher[E]) finishCurrentBatch() {
b.completed = append(b.completed, batch[E]{
serialized: b.ongoing.Finish(),
count: b.ongoingSize,
})
b.completedSize += b.ongoingSize
b.ongoingSize = 0
b.ongoing = b.newBatch()
b.onComplete()
}
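// A minimal consumer sketch (hypothetical names; the real consumer lives in the event sender):
// because completion notifications are coalesced, the consumer drains every completed batch each
// time it is woken up:
//
//	for {
//		select {
//		case <-ctx.Done():
//			return
//		case <-batchComplete:
//		}
//		for batcher.completedCount() > 0 {
//			b := batcher.peekLatestCompleted()
//			send(b.serialized) // 'send' is a stand-in for the real client call
//			batcher.dropLatestCompleted()
//		}
//	}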
package reporting
import (
"context"
"fmt"
"github.com/Azure/azure-sdk-for-go/sdk/azcore"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/policy"
"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
)
type AzureAuthSharedKey struct {
AccountName string `json:"accountName"`
AccountKey string `json:"accountKey"`
}
type AzureBlobStorageClientConfig struct {
// In Azure, a Container is roughly equivalent to a bucket in AWS S3
Container string `json:"container"`
// Example Endpoint: "https://MYSTORAGEACCOUNT.blob.core.windows.net/"
Endpoint string `json:"endpoint"`
}
type AzureClient struct {
cfg AzureBlobStorageClientConfig
client *azblob.Client
generateKey func() string
}
type AzureError struct {
Err error
}
func (e AzureError) Error() string {
return fmt.Sprintf("%s: %s", e.Simplified(), e.Err.Error())
}
func (e AzureError) Unwrap() error {
return e.Err
}
func (e AzureError) Simplified() string {
return "Azure Blob error"
}
func NewAzureBlobStorageClient(
cfg AzureBlobStorageClientConfig,
generateKey func() string,
) (*AzureClient, error) {
//nolint:exhaustruct // It's part of Azure SDK
clientOptions := &azblob.ClientOptions{
ClientOptions: azcore.ClientOptions{
Telemetry: policy.TelemetryOptions{ApplicationID: "neon-autoscaler"},
},
}
credential, err := azidentity.NewDefaultAzureCredential(nil)
if err != nil {
return nil, err
}
client, err := azblob.NewClient(cfg.Endpoint, credential, clientOptions)
if err != nil {
return nil, &AzureError{err}
}
return NewAzureBlobStorageClientWithBaseClient(client, cfg, generateKey), nil
}
func NewAzureBlobStorageClientWithBaseClient(
client *azblob.Client,
cfg AzureBlobStorageClientConfig,
generateKey func() string,
) *AzureClient {
return &AzureClient{
cfg: cfg,
client: client,
generateKey: generateKey,
}
}
// NewRequest implements BaseClient
func (c AzureClient) NewRequest() ClientRequest {
return &azureRequest{
AzureClient: c,
key: c.generateKey(),
}
}
// azureRequest is the implementation of ClientRequest used by AzureClient
type azureRequest struct {
AzureClient
key string
}
// LogFields implements ClientRequest
func (r *azureRequest) LogFields() zap.Field {
return zap.Inline(zapcore.ObjectMarshalerFunc(func(enc zapcore.ObjectEncoder) error {
enc.AddString("container", r.cfg.Container)
enc.AddString("key", r.key)
enc.AddString("endpoint", r.cfg.Endpoint)
return nil
}))
}
// Send implements ClientRequest
func (r *azureRequest) Send(ctx context.Context, payload []byte) SimplifiableError {
var err error
opts := azblob.UploadBufferOptions{}
_, err = r.client.UploadBuffer(ctx, r.cfg.Container, r.key, payload, &opts)
if err != nil {
return AzureError{Err: err}
}
return nil
}
package reporting
import (
"bytes"
"context"
"fmt"
"net/http"
"github.com/lithammer/shortuuid"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"github.com/neondatabase/autoscaling/pkg/util"
)
type HTTPClient struct {
client *http.Client
cfg HTTPClientConfig
}
type HTTPClientConfig struct {
URL string `json:"url"`
Method string `json:"method"`
}
type httpRequestError struct {
err error
}
func (e httpRequestError) Error() string {
return fmt.Sprintf("Error making request: %s", e.err.Error())
}
func (e httpRequestError) Unwrap() error {
return e.err
}
func (e httpRequestError) Simplified() string {
return util.RootError(e.err).Error()
}
type httpUnexpectedStatusCodeError struct {
statusCode int
}
func (e httpUnexpectedStatusCodeError) Error() string {
return fmt.Sprintf("Unexpected HTTP status code %d", e.statusCode)
}
func (e httpUnexpectedStatusCodeError) Simplified() string {
return fmt.Sprintf("HTTP code %d", e.statusCode)
}
func NewHTTPClient(client *http.Client, cfg HTTPClientConfig) HTTPClient {
return HTTPClient{
client: client,
cfg: cfg,
}
}
// NewRequest implements BaseClient
func (c HTTPClient) NewRequest() ClientRequest {
return &httpRequest{
HTTPClient: c,
traceID: shortuuid.New(),
}
}
// httpRequest is the implementation of ClientRequest used by HTTPClient
type httpRequest struct {
HTTPClient
traceID string
}
// Send implements ClientRequest
func (r *httpRequest) Send(ctx context.Context, payload []byte) SimplifiableError {
req, err := http.NewRequestWithContext(ctx, r.cfg.Method, r.cfg.URL, bytes.NewReader(payload))
if err != nil {
return httpRequestError{err: err}
}
req.Header.Set("content-type", "application/json")
req.Header.Set("x-trace-id", r.traceID)
resp, err := r.client.Do(req)
if err != nil {
return httpRequestError{err: err}
}
defer resp.Body.Close()
// In theory, if retries are wanted/needed, we should use an HTTP transport that handles
// retrying, to avoid writing that logic here.
if resp.StatusCode != http.StatusOK {
return httpUnexpectedStatusCodeError{statusCode: resp.StatusCode}
}
return nil
}
// LogFields implements ClientRequest
func (r *httpRequest) LogFields() zap.Field {
return zap.Inline(zapcore.ObjectMarshalerFunc(func(enc zapcore.ObjectEncoder) error {
enc.AddString("url", r.cfg.URL)
enc.AddString("method", r.cfg.Method)
enc.AddString("traceID", r.traceID)
return nil
}))
}
package reporting
import (
"bytes"
"context"
"fmt"
"time"
awsconfig "github.com/aws/aws-sdk-go-v2/config"
"github.com/aws/aws-sdk-go-v2/service/s3"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
)
// S3Client is a BaseClient for S3
type S3Client struct {
cfg S3ClientConfig
client *s3.Client
generateKey func() string
}
type S3ClientConfig struct {
Bucket string `json:"bucket"`
Region string `json:"region"`
Endpoint string `json:"endpoint"`
}
type S3Error struct {
Err error
}
func (e S3Error) Error() string {
return fmt.Sprintf("%s: %s", e.Simplified(), e.Err.Error())
}
func (e S3Error) Unwrap() error {
return e.Err
}
func (e S3Error) Simplified() string {
return "S3 error"
}
func NewS3Client(
ctx context.Context,
cfg S3ClientConfig,
generateKey func() string,
) (*S3Client, error) {
// Timeout in case we have hidden IO inside config creation
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
s3Config, err := awsconfig.LoadDefaultConfig(ctx, awsconfig.WithRegion(cfg.Region))
if err != nil {
return nil, S3Error{Err: err}
}
client := s3.NewFromConfig(s3Config, func(o *s3.Options) {
if cfg.Endpoint != "" {
o.BaseEndpoint = &cfg.Endpoint
}
o.UsePathStyle = true // required for minio
})
return &S3Client{
cfg: cfg,
client: client,
generateKey: generateKey,
}, nil
}
// NewRequest implements BaseClient
func (c *S3Client) NewRequest() ClientRequest {
return &s3Request{
S3Client: c,
key: c.generateKey(),
}
}
// s3Request is the implementation of ClientRequest used by S3Client
type s3Request struct {
*S3Client
key string
}
// LogFields implements ClientRequest
func (r *s3Request) LogFields() zap.Field {
return zap.Inline(zapcore.ObjectMarshalerFunc(func(enc zapcore.ObjectEncoder) error {
enc.AddString("bucket", r.cfg.Bucket)
enc.AddString("key", r.key)
enc.AddString("region", r.cfg.Region)
enc.AddString("endpoint", r.cfg.Endpoint)
return nil
}))
}
// Send implements ClientRequest
func (r *s3Request) Send(ctx context.Context, payload []byte) SimplifiableError {
var err error
body := bytes.NewReader(payload)
_, err = r.client.PutObject(ctx, &s3.PutObjectInput{ //nolint:exhaustruct // AWS SDK
Bucket: &r.cfg.Bucket,
Key: &r.key,
Body: body,
})
if err != nil {
return S3Error{Err: err}
}
return nil
}
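// Illustrative usage sketch (not part of the original source); the bucket, region, and key
// format below are hypothetical:
//
//	client, err := NewS3Client(ctx, S3ClientConfig{
//		Bucket:   "billing-events",
//		Region:   "us-east-1",
//		Endpoint: "", // empty means the default AWS endpoint; set it when targeting minio
//	}, func() string {
//		return fmt.Sprintf("events/%s.ndjson.gz", time.Now().UTC().Format(time.RFC3339))
//	})
//	if err != nil {
//		return err
//	}
//	req := client.NewRequest() // each request gets a fresh key from generateKey()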
package reporting
import (
"bytes"
"compress/gzip"
"fmt"
"io"
)
var (
_ IOBuffer = ByteBuffer{} //nolint:exhaustruct // just for typechecking
_ IOBuffer = GZIPBuffer{} //nolint:exhaustruct // just for typechecking
)
type IOBuffer interface {
io.Writer
Collect() []byte
}
// ByteBuffer is an IOBuffer that does nothing special, just wrapping a bytes.Buffer to return the
// bytes when done.
type ByteBuffer struct {
buf *bytes.Buffer
}
func NewByteBuffer() ByteBuffer {
return ByteBuffer{
buf: &bytes.Buffer{},
}
}
func (b ByteBuffer) Write(bytes []byte) (int, error) {
return b.buf.Write(bytes)
}
func (b ByteBuffer) Collect() []byte {
return b.buf.Bytes()
}
// WithByteBuffer is a convenience function to produce a generator for a type that requires an
// IOBuffer as input by providing a ByteBuffer.
//
// For example, this can be used with something like NewJSONLinesBuilder to create a BatchBuilder
// generator, e.g. WithByteBuffer(NewJSONLinesBuilder).
func WithByteBuffer[T any](mk func(IOBuffer) T) func() T {
return func() T {
return mk(NewByteBuffer())
}
}
// GZIPBuffer is an IOBuffer that GZIP compresses all data that's written to it.
type GZIPBuffer struct {
buf *bytes.Buffer
gz *gzip.Writer
}
func NewGZIPBuffer() GZIPBuffer {
buf := &bytes.Buffer{}
gz := gzip.NewWriter(buf)
return GZIPBuffer{
buf: buf,
gz: gz,
}
}
func (b GZIPBuffer) Write(bytes []byte) (int, error) {
return b.gz.Write(bytes)
}
func (b GZIPBuffer) Collect() []byte {
if err := b.gz.Close(); err != nil {
panic(fmt.Sprintf("unexpected gzip error: %s", err))
}
return b.buf.Bytes()
}
// WithGZIPBuffer is a convenience function to produce a generator for a type that requires an
// IOBuffer as input by providing a GZIPBuffer.
//
// For example, this can be used with something like NewJSONLinesBuilder to create a BatchBuilder
// generator, e.g. WithGZIPBuffer(NewJSONLinesBuilder).
func WithGZIPBuffer[T any](mk func(IOBuffer) T) func() T {
return func() T {
return mk(NewGZIPBuffer())
}
}
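// Illustrative sketch of how these generators are typically used (not part of the original
// source; newBuilder is a hypothetical constructor taking an IOBuffer):
//
//	// newBuilder := func(buf IOBuffer) *myBuilder { ... }
//	gen := WithGZIPBuffer(newBuilder)
//	b := gen() // a fresh builder writing into its own GZIPBuffer
//	_ = b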
package reporting
import (
"context"
"time"
"go.uber.org/zap"
)
type eventSender[E any] struct {
client Client[E]
metrics *EventSinkMetrics
queue *eventBatcher[E]
// batchComplete is a buffered channel with an item placed into it whenever a batch is finished
// and ready for consumption.
// This means that notifications will be coalesced, so it is the eventSender's responsibility to
// drain all batches whenever there is a notification.
batchComplete <-chan struct{}
// lastSendDuration tracks the "real" last full duration of (eventSender).sendAllCompletedBatches().
//
// It's separate from metrics.lastSendDuration because (a) we'd like to include the duration of
// ongoing calls to sendAllCompletedBatches, but (b) we don't want the bias towards lower
// durations that comes with that.
//
// Here's some more detail:
//
// To make sure that long-running sendAllCompletedBatches() loops show up in the metrics while
// they're still running, we want to periodically update metrics.lastSendDuration before the
// loop has finished. A side-effect of doing this naively is that the gauge will sometimes
// return durations that are much shorter than the *actual* previous send loop duration.
//
// In order to fix this, we store that *actual* previous duration in this field, but only
// update the metric when either (a) the loop is done, or (b) the duration so far is already
// longer than the previous one.
//
// This means that we remove the bias towards shorter durations, at the expense of sometimes
// returning higher durations for too long. IMO that's ok, and we'd rather have our metrics give
// a pessimistic but more accurate view.
lastSendDuration time.Duration
}
func (s eventSender[E]) senderLoop(ctx context.Context, logger *zap.Logger) {
heartbeat := time.Second * time.Duration(s.client.BaseConfig.PushEverySeconds)
// note: Why a timer and not a ticker? The idea here is that we should only allow the
// "heartbeat" to trigger sending a batch if we haven't *already* sent something in the last
// PushEverySeconds. So instead, we reset the timer every time we send a batch, meaning that
// we'll only ever get a signal from the timer if it's been too long since the last batch.
timer := time.NewTimer(heartbeat)
defer timer.Stop()
for {
final := false
select {
case <-ctx.Done():
logger.Info("Received notification that events submission is done")
final = true
// finish up any in-progress batch, so that we can send it before we exit.
s.queue.finishOngoing()
case <-s.batchComplete:
// We've been notified that there's completed batches to be sent!
// Do that below...
case <-timer.C:
// The timer expired without any notification of a completed batch. Explicitly ask for
// in-progress events to be wrapped up into a batch, so that we still ship them soon
// enough, and reset the timer.
//
// If that produced a batch, the next loop iteration will be woken up via s.batchComplete;
// otherwise we just wait until the timer expires again.
s.queue.finishOngoing()
timer.Reset(heartbeat)
continue
}
// Make sure that if there are no more events within the next heartbeat duration, that we'll
// push the events that have been accumulated so far.
timer.Reset(heartbeat)
s.sendAllCompletedBatches(logger)
if final {
logger.Info("Ending events sender loop")
return
}
}
}
func (s eventSender[E]) sendAllCompletedBatches(logger *zap.Logger) {
logger.Info("Pushing all available event batches")
if s.queue.completedCount() == 0 {
logger.Info("No event batches to push")
s.lastSendDuration = 0
s.metrics.lastSendDuration.WithLabelValues(s.client.Name).Set(1e-6) // small value, to indicate that nothing happened
return
}
totalEvents := 0
totalBatches := 0
startTime := time.Now()
// while there are still batches of events in the queue, send them
//
// If batches are being added to the queue faster than we can send them, this loop will not
// terminate. For the most part, that's ok: worst-case, we miss that the parent context has
// expired, which isn't the end of the world (eventually the autoscaler-agent will just be
// force-killed). Any long-running call to this function will be reported by
// s.metrics.lastSendDuration as we go (provided the request timeout isn't too long), so we
// should get observability for it either way.
for {
remainingBatchesCount := s.queue.completedCount()
if remainingBatchesCount != 0 {
logger.Info("Current queue size is non-zero", zap.Int("batchCount", remainingBatchesCount))
} else {
totalTime := time.Since(startTime)
s.lastSendDuration = totalTime
s.metrics.lastSendDuration.WithLabelValues(s.client.Name).Set(totalTime.Seconds())
logger.Info(
"All available event batches have been sent",
zap.Int("totalEvents", totalEvents),
zap.Int("totalBatches", totalBatches),
zap.Duration("totalTime", totalTime),
)
return
}
batch := s.queue.peekLatestCompleted()
req := s.client.Base.NewRequest()
logger.Info(
"Pushing events batch",
zap.Int("count", batch.count),
req.LogFields(),
)
reqStart := time.Now()
err := func() SimplifiableError {
reqCtx, cancel := context.WithTimeout(
context.TODO(),
time.Second*time.Duration(s.client.BaseConfig.PushRequestTimeoutSeconds),
)
defer cancel()
return req.Send(reqCtx, batch.serialized)
}()
reqDuration := time.Since(reqStart)
if err != nil {
// Something went wrong and we're going to abandon attempting to push any further
// events.
logger.Error(
"Failed to push billing events",
zap.Int("count", batch.count),
zap.Duration("after", reqDuration),
req.LogFields(),
zap.Int("totalEvents", totalEvents),
zap.Int("totalBatches", totalBatches),
zap.Duration("totalTime", time.Since(startTime)),
zap.Error(err),
)
rootErr := err.Simplified()
s.metrics.sendErrorsTotal.WithLabelValues(s.client.Name, rootErr).Inc()
s.lastSendDuration = 0
s.metrics.lastSendDuration.WithLabelValues(s.client.Name).Set(0.0) // use 0 as a flag that something went wrong; there's no valid time here.
return
}
s.queue.dropLatestCompleted() // mark this batch as complete
totalEvents += batch.count
totalBatches += 1
currentTotalTime := time.Since(startTime)
logger.Info(
"Successfully pushed some events",
zap.Int("count", batch.count),
zap.Duration("after", reqDuration),
req.LogFields(),
zap.Int("totalEvents", totalEvents),
zap.Int("totalBatches", totalBatches),
zap.Duration("totalTime", currentTotalTime),
)
if currentTotalTime > s.lastSendDuration {
s.lastSendDuration = currentTotalTime
s.metrics.lastSendDuration.WithLabelValues(s.client.Name).Set(currentTotalTime.Seconds())
}
}
}
package reporting
// public API for event reporting
import (
"context"
"fmt"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/taskgroup"
)
type EventSink[E any] struct {
queueWriters []*eventBatcher[E]
runSenders func(context.Context) error
}
// NewEventSink creates a new EventSink with the given clients to dispatch events into.
//
// You MUST call (*EventSink[E]).Run() if you wish for any enqueued events to actually be sent via
// the clients.
func NewEventSink[E any](logger *zap.Logger, metrics *EventSinkMetrics, clients ...Client[E]) *EventSink[E] {
var queueWriters []*eventBatcher[E]
var senders []eventSender[E]
for _, c := range clients {
batchComplete := make(chan struct{}, 1)
notifyComplete := func() {
// Send into the channel, only if it doesn't already have an item in it.
select {
case batchComplete <- struct{}{}:
default:
}
}
sizeGauge := metrics.queueSizeCurrent.WithLabelValues(c.Name)
batcher := newEventBatcher[E](int(c.BaseConfig.MaxBatchSize), c.NewBatchBuilder, notifyComplete, sizeGauge)
queueWriters = append(queueWriters, batcher)
// Create the sender -- we'll save starting it for the call to Run()
senders = append(senders, eventSender[E]{
client: c,
metrics: metrics,
queue: batcher,
batchComplete: batchComplete,
lastSendDuration: 0,
})
}
var runSenders func(context.Context) error
if len(senders) > 0 {
runSenders = func(ctx context.Context) error {
tg := taskgroup.NewGroup(logger, taskgroup.WithParentContext(ctx))
for _, sender := range senders {
taskName := fmt.Sprintf("send-%s", sender.client.Name)
tg.Go(taskName, func(logger *zap.Logger) error {
sender.senderLoop(tg.Ctx(), logger)
return nil
})
}
return tg.Wait()
}
} else {
// Special case when there are no clients -- we want our run function to just wait until the
// context is done, matching what the behavior *would* be if there actually were sender
// threads we were waiting on.
runSenders = func(ctx context.Context) error {
<-ctx.Done()
return nil
}
}
return &EventSink[E]{
queueWriters: queueWriters,
runSenders: runSenders,
}
}
// Run executes the client threads responsible for actually pushing enqueued events to the
// appropriate places.
//
// The clients will periodically push events until the context expires, at which point they will
// push any remaining events. Run() only completes after these final events have been pushed.
//
// Calling Run() more than once is unsound.
func (s *EventSink[E]) Run(ctx context.Context) error {
return s.runSenders(ctx)
}
// Enqueue submits the event to the internal client sending queues, returning without blocking.
func (s *EventSink[E]) Enqueue(event E) {
for _, q := range s.queueWriters {
q.enqueue(event)
}
}
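// Illustrative usage sketch (not part of the original source). The metrics prefix and the
// surrounding reg, logger, ctx, clients, and someEvent values are assumptions:
//
//	metrics := NewEventSinkMetrics("autoscaling_agent_billing", reg)
//	sink := NewEventSink(logger, metrics, clients...)
//
//	go func() {
//		// Runs the sender loops until ctx is done, then pushes any remaining events.
//		if err := sink.Run(ctx); err != nil {
//			logger.Error("event sink failed", zap.Error(err))
//		}
//	}()
//
//	sink.Enqueue(someEvent)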
type EventSinkMetrics struct {
queueSizeCurrent *prometheus.GaugeVec
lastSendDuration *prometheus.GaugeVec
sendErrorsTotal *prometheus.CounterVec
}
func NewEventSinkMetrics(prefix string, reg prometheus.Registerer) *EventSinkMetrics {
return &EventSinkMetrics{
queueSizeCurrent: util.RegisterMetric(reg, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: fmt.Sprintf("%s_queue_size", prefix),
Help: "Size of the billing subsystem's queue of unsent events",
},
[]string{"client"},
)),
lastSendDuration: util.RegisterMetric(reg, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: fmt.Sprintf("%s_last_send_duration_seconds", prefix),
Help: "Duration, in seconds, that it took to send the latest set of billing events (or current time if ongoing)",
},
[]string{"client"},
)),
sendErrorsTotal: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: fmt.Sprintf("%s_send_errors_total", prefix),
Help: "Total errors from attempting to send billing events",
},
[]string{"client", "cause"},
)),
}
}
package util
// Helper arithmetic methods
import (
"golang.org/x/exp/constraints"
)
// SaturatingSub returns x - y if x >= y, otherwise zero
func SaturatingSub[T constraints.Unsigned](x, y T) T {
if x >= y {
return x - y
} else {
var zero T
return zero
}
}
// AbsDiff returns the absolute value of the difference between x and y
func AbsDiff[T constraints.Unsigned](x, y T) T {
if x > y {
return x - y
} else {
return y - x
}
}
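// Example sketch (not part of the original source):
//
//	SaturatingSub(uint64(3), uint64(5)) // == 0, instead of wrapping around
//	AbsDiff(uint64(3), uint64(5))       // == 2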
// AtomicInt represents the shared interface provided by various atomic.<NAME> integers
//
// This interface type is primarily used by AtomicMax.
type AtomicInt[I any] interface {
Add(delta I) (new I) //nolint:predeclared // same var names as methods
CompareAndSwap(old, new I) (swapped bool) //nolint:predeclared // same var names as methods
Load() I
Store(val I)
Swap(new I) (old I) //nolint:predeclared // same var names as methods
}
// AtomicMax atomically sets a to the maximum of *a and i, returning the old value at a.
//
// On ISAs without atomic maximum/minimum instructions, a fallback is typically implemented as the
// Load + CompareAndSwap loop that this function uses. At time of writing (Go 1.20), the Go standard
// library does not include atomic maximum/minimum functions.
//
// This function is lock-free but not wait-free.
func AtomicMax[A AtomicInt[I], I constraints.Integer](a A, i I) I {
for {
current := a.Load()
if current >= i {
return current
}
if a.CompareAndSwap(current, i) {
return current
}
}
}
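// Example sketch (not part of the original source):
//
//	var highWater atomic.Uint64
//	prev := AtomicMax(&highWater, uint64(42)) // highWater is now at least 42
//	_ = prev                                  // prev holds the value before the update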
package util
// A channel-based sync.Cond-like interface, with support for broadcast operations (but some
// additional restrictions). Refer to the documentation of Wait for detailed usage.
import (
"sync"
)
func NewBroadcaster() *Broadcaster {
return &Broadcaster{
mu: sync.Mutex{},
ch: make(chan struct{}),
sent: 0,
}
}
type Broadcaster struct {
mu sync.Mutex
ch chan struct{}
sent uint64
}
type BroadcastReceiver struct {
b *Broadcaster
viewed uint64
}
// Broadcast sends a signal to all receivers
func (b *Broadcaster) Broadcast() {
b.mu.Lock()
defer b.mu.Unlock()
close(b.ch)
b.ch = make(chan struct{})
b.sent += 1
}
// NewReceiver creates a new BroadcastReceiver that will receive only future broadcasted events.
//
// It's generally not recommended to call (*BroadcastReceiver).Wait() on a single BroadcastReceiver
// from more than one thread at a time, although it *is* thread-safe.
func (b *Broadcaster) NewReceiver() BroadcastReceiver {
b.mu.Lock()
defer b.mu.Unlock()
return BroadcastReceiver{
b: b,
viewed: b.sent,
}
}
var closedChannel = func() <-chan struct{} {
ch := make(chan struct{})
close(ch)
return ch
}()
// Wait returns a channel that will be closed once there has been an event broadcasted since
// the BroadcastReceiver was created, or the last call to Awake().
//
// Typical usage of Wait will involve selecting on the channel returned and calling Awake
// immediately in the branch handling the event, for example:
//
// select {
// case <-ctx.Done():
// return
// case <-receiver.Wait():
// receiver.Awake()
// ...
// }
func (r *BroadcastReceiver) Wait() <-chan struct{} {
r.b.mu.Lock()
defer r.b.mu.Unlock()
if r.b.sent == r.viewed {
return r.b.ch
} else {
return closedChannel
}
}
// Awake marks the most recent broadcast event as received, so that the next call to Wait returns a
// channel that will only be closed once there's been a new event after this call to Awake.
func (r *BroadcastReceiver) Awake() {
r.b.mu.Lock()
defer r.b.mu.Unlock()
r.viewed = r.b.sent
}
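// Illustrative usage sketch (not part of the original source); ctx and the shared state being
// watched are assumptions:
//
//	b := NewBroadcaster()
//	r := b.NewReceiver()
//
//	// Elsewhere, after updating the shared state:
//	b.Broadcast()
//
//	// Receiver side:
//	select {
//	case <-ctx.Done():
//		return
//	case <-r.Wait():
//		r.Awake()
//		// re-read the shared state and handle the change
//	}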
package util
// Implementation of a channel-based mutex, so that it can be combined with Context.Done and other
// select-able methods, without dealing with the hassle of creating separate goroutines
import (
"context"
"fmt"
"time"
)
// ChanMutex is a select-able mutex
//
// It is fair if and only if receiving on a channel is fair. As of Go 1.19/2022-01-17, receiving on
// a channel appears to be fair. However: this is a runtime implementation detail, and so it may
// change without notice in the future.
//
// Unlike sync.Mutex, ChanMutex requires initialization before use because it's basically just a
// channel.
//
// Also unlike sync.Mutex, a ChanMutex may be copied without issue (again, because it's just a
// channel).
type ChanMutex struct {
ch chan struct{}
}
// NewChanMutex creates a new ChanMutex
func NewChanMutex() ChanMutex {
ch := make(chan struct{}, 1)
ch <- struct{}{}
return ChanMutex{ch}
}
// Lock locks m
//
// This method is semantically equivalent to sync.Mutex.Lock
func (m *ChanMutex) Lock() {
if m.ch == nil {
panic("called Lock on uninitialized ChanMutex")
}
<-m.ch
}
// WaitLock is like Lock, but instead returns a channel
//
// If receiving on the channel succeeds, the caller "holds" the lock and must now be responsible for
// Unlock-ing it.
func (m *ChanMutex) WaitLock() <-chan struct{} {
if m.ch == nil {
panic("called WaitLock on uninitialized ChanMutex")
}
return m.ch
}
// TryLock blocks until locking m succeeds or the context is cancelled
//
// If the context is cancelled while waiting to lock m, the lock will be left unchanged and
// ctx.Err() will be returned.
func (m *ChanMutex) TryLock(ctx context.Context) error {
if m.ch == nil {
panic("called TryLock on uninitialized ChanMutex")
}
select {
case <-m.ch:
return nil
case <-ctx.Done():
return ctx.Err()
}
}
// Unlock unlocks m
//
// This method is semantically equivalent to sync.Mutex.Unlock
func (m *ChanMutex) Unlock() {
select {
case m.ch <- struct{}{}:
default:
panic("ChanMutex.Unlock called while already unlocked")
}
}
// DeadlockChecker creates a function that, when called, periodically attempts to acquire the lock,
// panicking if it fails
//
// The returned function exits when the context is done.
func (m *ChanMutex) DeadlockChecker(timeout, delay time.Duration) func(ctx context.Context) {
return func(ctx context.Context) {
for {
// Delay between checks
select {
case <-ctx.Done():
return
case <-time.After(delay):
}
select {
case <-ctx.Done():
return
case <-m.WaitLock():
m.Unlock()
case <-time.After(timeout):
panic(fmt.Errorf("likely deadlock detected, could not get lock after %s", timeout))
}
}
}
}
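// Illustrative usage sketch (not part of the original source); ctx is an assumption. The two
// acquisition styles below are alternatives -- use one or the other:
//
//	mu := NewChanMutex()
//
//	// Blocking, like sync.Mutex:
//	mu.Lock()
//	defer mu.Unlock()
//
//	// ... or cancellable, via the context:
//	if err := mu.TryLock(ctx); err != nil {
//		return err // ctx was cancelled before the lock was acquired
//	}
//	defer mu.Unlock()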
package util
import (
"encoding/base64"
"encoding/binary"
"os"
"path/filepath"
"sort"
"golang.org/x/crypto/blake2b"
)
// Calculate the checksum over all files in a directory, assuming the directory is flat (contains no subdirs).
func ChecksumFlatDir(path string) (string, error) {
files, err := ReadAllFiles(path)
if err != nil {
return "", err
}
// sort the file names for a reproducible hash
var keys []string
for k := range files {
keys = append(keys, k)
}
sort.Strings(keys)
// note: any changes to the hash need to be synchronised between neonvm-runner and neonvm-daemon.
// Since they are updated independently, this is not trivial.
// If in doubt, make a new function and don't touch this one.
hasher, err := blake2b.New256(nil)
if err != nil {
return "", err
}
for _, filename := range keys {
data := files[filename]
// File hash with the following encoding: "{name}\0{len(data)}{data}".
//
// This format prevents any possible (even if unrealistic) hash confusion problems.
// If we only hashed filename and data, then there's no difference between:
// name = "file1"
// data = []
// and
// name = "file"
// data = [b'1']
//
// We are trusting that filenames on linux cannot have a nul character.
hasher.Write([]byte(filename))
hasher.Write([]byte{0})
hasher.Write(binary.LittleEndian.AppendUint64([]byte{}, uint64(len(data))))
hasher.Write(data)
}
sum := hasher.Sum(nil)
sumBase64 := base64.RawStdEncoding.EncodeToString(sum)
return sumBase64, nil
}
// Read all files in a directory, assuming the directory is flat (contains no subdirs).
func ReadAllFiles(path string) (map[string][]byte, error) {
entries, err := os.ReadDir(path)
if err != nil {
return nil, err
}
output := make(map[string][]byte)
for _, entry := range entries {
if entry.IsDir() {
continue
}
data, err := os.ReadFile(filepath.Join(path, entry.Name()))
if err != nil {
return nil, err
}
output[entry.Name()] = data
}
return output, nil
}
package util
// Utilities for errors
import (
"errors"
)
// RootError returns the root cause of the error, calling errors.Unwrap until it returns nil
func RootError(err error) error {
for {
next := errors.Unwrap(err)
if next == nil {
return err
}
err = next
}
}
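// Example sketch (not part of the original source):
//
//	base := errors.New("connection refused")
//	wrapped := fmt.Errorf("connecting to VM: %w", fmt.Errorf("dial failed: %w", base))
//	RootError(wrapped) == base // true: unwraps through both layers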
package util
// Wrapper file for the AddHandler function
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"go.uber.org/zap"
)
// AddHandler is a helper function to wrap the handle function with JSON [de]serialization and check
// that the HTTP method is correct
//
// The provided logPrefix is prepended to every log line emitted by the wrapped handler function, to
// offer distinction where that's useful.
func AddHandler[T any, R any](
logger *zap.Logger,
mux *http.ServeMux,
endpoint string,
method string,
reqTypeName string,
handle func(context.Context, *zap.Logger, *T) (_ *R, statusCode int, _ error),
) {
errBadMethod := []byte("request method must be " + method)
logger = logger.With(zap.String("endpoint", endpoint))
hlogger := logger.Named("http")
mux.HandleFunc(endpoint, func(w http.ResponseWriter, r *http.Request) {
if r.Method != method {
w.WriteHeader(http.StatusMethodNotAllowed)
_, _ = w.Write(errBadMethod)
return
}
defer r.Body.Close()
var req T
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
hlogger.Error("Failed to read request body as JSON", zap.String("type", reqTypeName), zap.Error(err))
w.WriteHeader(400)
_, _ = w.Write([]byte("bad JSON"))
return
}
hlogger.Info(
"Received request",
zap.String("endpoint", endpoint),
zap.String("client", r.RemoteAddr),
zap.Any("request", req),
)
resp, status, err := handle(r.Context(), logger.With(zap.Any("request", req)), &req)
if err == nil && status != http.StatusOK {
err = errors.New("HTTP handler error: status != 200 OK, but no error message")
status = 500
}
var respBody []byte
var respBodyFormatted zap.Field
var logFunc func(string, ...zap.Field)
if err != nil {
if 500 <= status && status < 600 {
logFunc = hlogger.Error
} else if 400 <= status && status < 500 {
logFunc = hlogger.Warn
} else /* unexpected status */ {
err = fmt.Errorf("HTTP handler error: invalid status %d for error response: %w", status, err)
logFunc = hlogger.Error
}
respBodyFormatted = zap.NamedError("response", err)
respBody = []byte(err.Error())
} else {
if status == 0 {
hlogger.Warn("non-error response with status = 0")
}
respBodyFormatted = zap.Any("response", resp)
respBody, err = json.Marshal(resp)
if err != nil {
hlogger.Error("Failed to encode JSON response", respBodyFormatted)
w.WriteHeader(500)
_, _ = w.Write([]byte("Error encoding JSON response"))
return
}
logFunc = hlogger.Info
}
logFunc(
"Responding to request",
zap.String("endpoint", endpoint), zap.Int("status", status), respBodyFormatted,
)
w.WriteHeader(status)
_, _ = w.Write(respBody)
})
}
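// Illustrative usage sketch (not part of the original source); the echo types, endpoint, and
// handler body are hypothetical, and logger is assumed to be provided by the caller:
//
//	type echoRequest struct {
//		Msg string `json:"msg"`
//	}
//	type echoResponse struct {
//		Msg string `json:"msg"`
//	}
//
//	mux := http.NewServeMux()
//	AddHandler(logger, mux, "/echo", http.MethodPost, "echoRequest",
//		func(ctx context.Context, logger *zap.Logger, req *echoRequest) (*echoResponse, int, error) {
//			return &echoResponse{Msg: req.Msg}, http.StatusOK, nil
//		})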
package util
// Kubernetes-specific utility functions
import (
"fmt"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/kubernetes/scheme"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
// PodReady returns true iff the pod is marked as ready (as determined by the pod's
// Status.Conditions)
func PodReady(pod *corev1.Pod) bool {
for _, c := range pod.Status.Conditions {
if c.Type == corev1.PodReady {
return c.Status == corev1.ConditionTrue
}
}
return false
}
// PodCompleted returns true iff all of the Pod's containers have stopped and will not be restarted
func PodCompleted(pod *corev1.Pod) bool {
return pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed
}
// PodStartedBefore returns true iff Pod p started before Pod q
func PodStartedBefore(p, q *corev1.Pod) bool {
return p.Status.StartTime.Before(q.Status.StartTime)
}
func azForTerm(term corev1.NodeSelectorTerm) string {
for _, expr := range term.MatchExpressions {
isAZ := expr.Key == "topology.kubernetes.io/zone" &&
expr.Operator == corev1.NodeSelectorOpIn &&
len(expr.Values) == 1
if isAZ {
return expr.Values[0]
}
}
return ""
}
// PodPreferredAZIfPresent returns the desired availability zone of the Pod, if it has one
func PodPreferredAZIfPresent(pod *corev1.Pod) string {
if pod.Spec.Affinity == nil || pod.Spec.Affinity.NodeAffinity == nil {
return ""
}
affinity := pod.Spec.Affinity.NodeAffinity
// First, check required affinities for AZ:
if affinity.RequiredDuringSchedulingIgnoredDuringExecution != nil {
for _, term := range affinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms {
if az := azForTerm(term); az != "" {
return az
}
}
}
// Then, check preferred:
for _, term := range affinity.PreferredDuringSchedulingIgnoredDuringExecution {
if az := azForTerm(term.Preference); az != "" {
return az
}
}
// no AZ present
return ""
}
// TryPodOwnerVirtualMachine returns the name of the VirtualMachine that owns the pod, if there is
// one that does. Otherwise returns nil.
func TryPodOwnerVirtualMachine(pod *corev1.Pod) *NamespacedName {
ref, ok := vmv1.VirtualMachineOwnerForPod(pod)
if !ok {
return nil
}
// note: OwnerReferences are not permitted to have a different namespace than the owned
// object, so because VirtualMachines are namespaced, the owning VirtualMachine must have
// the same namespace as the Pod.
return &NamespacedName{Namespace: pod.Namespace, Name: ref.Name}
}
// TryPodOwnerVirtualMachineMigration returns the name of the VirtualMachineMigration that owns the
// pod, if there is one. Otherwise returns nil.
func TryPodOwnerVirtualMachineMigration(pod *corev1.Pod) *NamespacedName {
ref, _, ok := vmv1.MigrationOwnerForPod(pod)
if !ok {
return nil
}
return &NamespacedName{Namespace: pod.Namespace, Name: ref.Name}
}
// LookupGVKForType determines the GroupVersionKind for the type by checking against pre-registered
// types in the client-go scheme.
//
// This internally requires some reflection, so it's advisable to pre-calculate this if possible.
func LookupGVKForType(sampleObj runtime.Object) (schema.GroupVersionKind, error) {
var empty schema.GroupVersionKind
gvks, _, err := scheme.Scheme.ObjectKinds(sampleObj)
if err != nil {
return empty, fmt.Errorf("could not get GVKs for object type %T: %w", sampleObj, err)
}
if len(gvks) == 0 {
return empty, fmt.Errorf("no GVKs found for object type %T", sampleObj)
} else if len(gvks) > 1 {
return empty, fmt.Errorf("more than one GVK found for object type %T", sampleObj)
}
return gvks[0], nil
}
package util
import (
"context"
"errors"
"fmt"
"net"
"net/http"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.uber.org/zap"
)
func RegisterMetric[P prometheus.Collector](reg prometheus.Registerer, collector P) P {
reg.MustRegister(collector)
return collector
}
// Prometheus metrics server common to >1 component.
//
// StartPrometheusMetricsServer starts the prometheus server in a background goroutine, returning
// an error if binding to the port fails.
func StartPrometheusMetricsServer(ctx context.Context, logger *zap.Logger, port uint16, reg *prometheus.Registry) error {
// Separate binding from serving, so that we can catch any error here, rather than in the
// server's goroutine.
listener, err := net.ListenTCP("tcp", &net.TCPAddr{IP: net.IPv4zero, Port: int(port)})
if err != nil {
return fmt.Errorf("Error listening on TCP port %d: %w", port, err)
}
shutdownCtx, shutdown := context.WithCancel(ctx)
mux := http.NewServeMux()
mux.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{Registry: reg}))
baseContext := context.Background()
srv := &http.Server{Handler: mux, BaseContext: func(net.Listener) context.Context { return baseContext }}
go func() {
<-shutdownCtx.Done()
if err := srv.Shutdown(context.Background()); err != nil {
logger.Error("Error shutting down prometheus server", zap.Error(err))
}
}()
go func() {
// shutdown the shutdown watcher if we exit before it
defer shutdown()
if err := srv.Serve(listener); !errors.Is(err, http.ErrServerClosed) {
logger.Error("Prometheus server exited with unexpected error", zap.Error(err))
}
}()
return nil
}
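// Illustrative usage sketch (not part of the original source); ctx, logger, and the port are
// assumptions:
//
//	reg := prometheus.NewRegistry()
//	if err := StartPrometheusMetricsServer(ctx, logger, 9100, reg); err != nil {
//		return fmt.Errorf("failed to start metrics server: %w", err)
//	}
//	// Metrics registered with reg are now served at http://0.0.0.0:9100/metrics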
package util
// same as k8s.io/apimachinery/pkg/types/namespacedname.go, but with JSON (de)serialization
import (
"fmt"
"go.uber.org/zap/zapcore"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
const Separator = '/'
// NamespacedName represents a resource name with the namespace it's in.
//
// When printed with '%v', NamespacedName is rendered as "<namespace>/<name>". Printing with
// '%+v' or '%#v' renders as it would normally.
type NamespacedName struct {
Namespace string `json:"namespace"`
Name string `json:"name"`
}
func GetNamespacedName(obj metav1.ObjectMetaAccessor) NamespacedName {
meta := obj.GetObjectMeta()
return NamespacedName{Namespace: meta.GetNamespace(), Name: meta.GetName()}
}
func (n NamespacedName) Format(state fmt.State, verb rune) {
switch {
case verb == 'v' && state.Flag('+'):
// Show fields, e.g. `{Namespace:foo Name:bar}`
_, _ = state.Write([]byte("{Namespace:"))
_, _ = state.Write([]byte(n.Namespace))
_, _ = state.Write([]byte(" Name:"))
_, _ = state.Write([]byte(n.Name))
_, _ = state.Write([]byte{'}'})
case verb == 'v' && state.Flag('#'):
// Go syntax representation, e.g. `util.NamespacedName{Namespace:"foo", Name:"bar"}`
_, _ = state.Write([]byte(fmt.Sprintf("util.NamespacedName{Namespace:%q, Name:%q}", n.Namespace, n.Name)))
default:
// Pretty-printed representation, e.g. `foo/bar`
_, _ = state.Write([]byte(n.Namespace))
_, _ = state.Write([]byte(string(Separator)))
_, _ = state.Write([]byte(n.Name))
}
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that NamespacedName can be used with zap.Object
func (n NamespacedName) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddString("namespace", n.Namespace)
enc.AddString("name", n.Name)
return nil
}
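// Formatting sketch (not part of the original source), matching the verbs described above:
//
//	n := NamespacedName{Namespace: "default", Name: "example"}
//	fmt.Sprintf("%v", n)  // "default/example"
//	fmt.Sprintf("%+v", n) // "{Namespace:default Name:example}"
//	fmt.Sprintf("%#v", n) // util.NamespacedName{Namespace:"default", Name:"example"}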
// Construction of JSON patches. See https://jsonpatch.com/
package patch
import (
"strings"
)
// OpKind is the kind of operation being performed in a single step
type OpKind string
const (
OpAdd OpKind = "add"
OpRemove OpKind = "remove"
OpReplace OpKind = "replace"
OpMove OpKind = "move"
OpCopy OpKind = "copy"
OpTest OpKind = "test"
)
type JSONPatch = []Operation
// Operation is a single step in the overall JSON patch
type Operation struct {
// Op is the kind of operation being performed in this step. See [OpKind] for more.
Op OpKind `json:"op"`
// Path is a [JSON pointer] to the target location of the operation.
//
// In general, nesting is separated by '/'s, with special characters escaped by '~'.
// [PathEscape] is provided to handle escaping, because it can get a little gnarly.
//
// As an example, if you want to add a field "foo" to the first element of an array,
// you'd use the path `/0/foo`. The jsonpatch website has more details (and clearer examples),
// refer there for more information: https://jsonpatch.com/#json-pointer
//
// [JSON pointer]: https://datatracker.ietf.org/doc/html/rfc6901/
Path string `json:"path"`
// From gives the source location for "copy" or "move" operations.
From string `json:"from,omitempty"`
// Value is the new value to use, for "add", "replace", or "test" operations.
Value any `json:"value,omitempty"`
}
var pathEscaper = strings.NewReplacer("~", "~0", "/", "~1")
// PathEscape escapes a string for use in a segment of the Path field of an Operation
//
// This is useful, for example, when using arbitrary strings as map keys (like Kubernetes labels or
// annotations).
func PathEscape(s string) string {
return pathEscaper.Replace(s)
}
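// Illustrative sketch (not part of the original source); the annotation key is hypothetical:
//
//	p := JSONPatch{{
//		Op:    OpAdd,
//		Path:  "/metadata/annotations/" + PathEscape("example.com/some-annotation"),
//		Value: "true",
//	}}
//	body, _ := json.Marshal(p) // send with content type "application/json-patch+json"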
package util
import (
"net/http"
"net/http/pprof"
"time"
)
func MakePPROF(addr string) *http.Server {
mux := http.NewServeMux()
mux.HandleFunc("/debug/pprof/", pprof.Index)
mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
mux.HandleFunc("/debug/pprof/trace", pprof.Trace)
return &http.Server{
Addr: addr,
Handler: mux,
ReadHeaderTimeout: time.Second,
}
}
package queue
// Wrapper around Go's container/heap so that it works with generic types
import (
"container/heap"
)
// PriorityQueue is a generic priority queue of type T, given a function to determine which of two
// values should be returned sooner.
type PriorityQueue[T any] struct {
inner *innerQueue[T]
}
type innerQueue[T any] struct {
values []*item[T]
less func(T, T) bool
}
// ItemHandle is a stable reference to a particular value in the queue
type ItemHandle[T any] struct {
queue *innerQueue[T]
item *item[T]
}
type item[T any] struct {
v T
index int
}
// New creates a new queue, given a function that returns whether a value should be returned sooner
// than another.
func New[T any](sooner func(T, T) bool) PriorityQueue[T] {
return PriorityQueue[T]{
inner: &innerQueue[T]{
values: []*item[T]{},
less: sooner,
},
}
}
// Len returns the length of the queue
func (q PriorityQueue[T]) Len() int {
return q.inner.Len()
}
// Push adds the value to the queue and returns a stable handle to it
func (q PriorityQueue[T]) Push(value T) ItemHandle[T] {
item := &item[T]{
v: value,
index: -1,
}
heap.Push(q.inner, item)
return ItemHandle[T]{
queue: q.inner,
item: item,
}
}
// Peek returns the value at the front of the queue without removing it, returning false iff the
// queue is empty.
func (p PriorityQueue[T]) Peek() (_ T, ok bool) {
var empty T
if p.Len() == 0 {
return empty, false
}
// container/heap guarantees that the value at index 0 is the one that will be returned by Pop.
// See the docs for Pop -- "Pop is equivalent to Remove(h, 0)."
return p.inner.values[0].v, true
}
// Pop returns a value from the queue, returning false iff the queue is empty
func (q PriorityQueue[T]) Pop() (_ T, ok bool) {
if q.inner.Len() == 0 {
var empty T
return empty, false
}
item := heap.Pop(q.inner).(*item[T])
return item.v, true
}
// Value returns the value associated with the item
//
// NOTE: Any updates to the value here will not be reflected by changing its position in the queue.
// For that, you must use (ItemHandle[T]).Update().
func (it ItemHandle[T]) Value() T {
return it.item.v
}
// Update sets the value of this item, updating its position in the queue accordingly
func (it ItemHandle[T]) Update(update func(value *T)) {
if it.item.index == -1 {
panic("item has since been removed from the queue")
}
update(&it.item.v)
heap.Fix(it.queue, it.item.index)
}
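// Illustrative usage sketch (not part of the original source):
//
//	// A queue of deadlines where the earliest deadline is returned first:
//	q := New(func(x, y time.Time) bool { return x.Before(y) })
//	handle := q.Push(time.Now().Add(10 * time.Second))
//	q.Push(time.Now().Add(5 * time.Second))
//
//	// Move the first item to the front by making its deadline earlier:
//	handle.Update(func(t *time.Time) { *t = time.Now() })
//
//	if next, ok := q.Pop(); ok {
//		_ = next // the value with the earliest deadline
//	}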
///////////////////////////////////////////////////////////
// INTERNAL METHODS, FOR container/heap TO USE //
///////////////////////////////////////////////////////////
// Len implements heap.Interface
func (q *innerQueue[T]) Len() int {
return len(q.values)
}
// Less implements heap.Interface
func (q *innerQueue[T]) Less(i, j int) bool {
return q.less(q.values[i].v, q.values[j].v)
}
// Swap implements heap.Interface
func (q *innerQueue[T]) Swap(i, j int) {
// copied from the example in container/heap
q.values[i], q.values[j] = q.values[j], q.values[i]
q.values[i].index = i
q.values[j].index = j
}
// Push implements heap.Interface
func (q *innerQueue[T]) Push(x any) {
// copied from the example in container/heap
n := len(q.values)
item := x.(*item[T])
item.index = n
q.values = append(q.values, item)
}
// Pop implements heap.Interface
func (q *innerQueue[T]) Pop() any {
// copied from the example in container/heap
old := q.values
n := len(old)
item := old[n-1]
old[n-1] = nil // don't stop the GC from reclaiming the item eventually
item.index = -1 // for safety
q.values = old[0 : n-1]
return item
}
package util
import "time"
// RecentCounter is a struct that keeps track of recent timestamps within a given interval.
type RecentCounter struct {
interval time.Duration
timestamps []time.Time
}
func NewRecentCounter(interval time.Duration) *RecentCounter {
return &RecentCounter{
interval: interval,
timestamps: make([]time.Time, 0),
}
}
// cleanup removes all timestamps that are beyond the interval from the current time.
func (rc *RecentCounter) cleanup(now time.Time) {
checkpoint := now.Add(-rc.interval)
i := 0
for ; i < len(rc.timestamps); i++ {
if rc.timestamps[i].After(checkpoint) {
break
}
}
rc.timestamps = rc.timestamps[i:]
}
// inc is separated from its exported version to provide more flexibility around testing.
func (rc *RecentCounter) inc(now time.Time) {
rc.cleanup(now)
rc.timestamps = append(rc.timestamps, now)
}
// get is separated from its exported version to provide more flexibility around testing.
func (rc *RecentCounter) get(now time.Time) uint {
rc.cleanup(now)
return uint(len(rc.timestamps))
}
// Inc increments the counter and adds the current timestamp to the list of timestamps.
func (rc *RecentCounter) Inc() {
rc.inc(time.Now())
}
// Get returns the number of recent timestamps stored in the RecentCounter.
func (rc *RecentCounter) Get() uint {
return rc.get(time.Now())
}
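// Illustrative usage sketch (not part of the original source):
//
//	restarts := NewRecentCounter(time.Hour)
//	restarts.Inc()      // record an occurrence at the current time
//	n := restarts.Get() // number of occurrences within the last hour
//	_ = n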
package util
// Signalling primitives: a single-use signal sender/receiver pair, and a sync.Cond-like interface
// exposed over a channel
import (
"sync"
)
func NewSingleSignalPair[T any]() (SignalSender[T], SignalReceiver[T]) {
sigCh := make(chan T, 1)
once := &sync.Once{}
closeSigCh := func() { once.Do(func() { close(sigCh) }) }
return SignalSender[T]{
send: func(data T) {
once.Do(func() {
sigCh <- data
close(sigCh)
})
},
}, SignalReceiver[T]{sigCh: sigCh, closeSigCh: closeSigCh}
}
type SignalSender[T any] struct {
send func(T)
}
type SignalReceiver[T any] struct {
sigCh chan T
closeSigCh func()
}
func (s SignalSender[T]) Send(data T) {
s.send(data)
}
func (s SignalReceiver[T]) Recv() <-chan T {
return s.sigCh
}
func (s SignalReceiver[T]) Close() {
s.closeSigCh()
}
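// Illustrative usage sketch (not part of the original source):
//
//	sender, receiver := NewSingleSignalPair[error]()
//
//	go func() {
//		// ... do some work ...
//		sender.Send(nil) // only the first Send has any effect
//	}()
//
//	err := <-receiver.Recv() // receives the sent value (or the zero value, if closed)
//	_ = err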
// NewCondChannelPair creates a sender/receiver pair for a sync.Cond-like interface
//
// The differences from sync.Cond are that receiving is exposed through a channel (so it can be
// select-ed) and there is no equivalent to (*Cond).Broadcast()
func NewCondChannelPair() (CondChannelSender, CondChannelReceiver) {
ch := make(chan struct{}, 1)
return CondChannelSender{ch: ch}, CondChannelReceiver{ch: ch}
}
// CondChannelSender is the sending half of a sync.Cond-like interface
type CondChannelSender struct {
ch chan struct{}
}
// CondChannelReceiver is the receiving half of a sync.Cond-like interface
type CondChannelReceiver struct {
ch chan struct{}
}
// Send performs a non-blocking notify of the associated CondChannelReceiver
//
// If there is currently a receiver waiting via Recv, then this will immediately wake them.
// Otherwise, the next receive on the channel returned by Recv will complete immediately.
func (c *CondChannelSender) Send() {
select {
case c.ch <- struct{}{}:
default:
}
}
// Unsend cancels an existing signal that has been sent but not yet received.
//
// It returns whether there was a signal to be cancelled.
func (c *CondChannelSender) Unsend() bool {
select {
case <-c.ch:
return true
default:
return false
}
}
// Consume removes any existing signal created by Send, requiring an additional Send to be made
// before the receiving on Recv will unblock
//
// This method is non-blocking.
func (c *CondChannelReceiver) Consume() {
select {
case <-c.ch:
default:
}
}
// Recv returns a channel for which receiving will complete either (a) immediately, if Send has been
// called without Consume or another receive since; or (b) as soon as Send is next called
//
// This method is non-blocking but receiving on the returned channel may block.
func (c *CondChannelReceiver) Recv() <-chan struct{} {
return c.ch
}
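// Illustrative usage sketch (not part of the original source); ctx and the pending work are
// assumptions:
//
//	send, recv := NewCondChannelPair()
//
//	// Producer side: signal that there's new work (signals coalesce until consumed).
//	send.Send()
//
//	// Consumer side:
//	for {
//		select {
//		case <-ctx.Done():
//			return
//		case <-recv.Recv():
//			// handle all pending work
//		}
//	}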
package stack
// Originally taken from https://github.com/sharnoff/chord
// TODO - want to have some kind of "N skipped" when (a) there's lots of frames and (b) many of
// those frames are duplicates
import (
"runtime"
"strconv"
"sync"
)
// StackTrace represents a collected stack trace, possibly with a parent (i.e. caller)
//
// StackTraces are designed to make it easy to track callers across goroutines. They are typically
// produced by [GetStackTrace]; refer to that function for more information.
type StackTrace struct {
// Frames provides the frames of this stack trace. Each frame's caller is at the index following
// it; the first frame is the direct caller.
Frames []StackFrame
// Parent, if not nil, provides the "parent" stack trace - typically the stack trace at the
// point this goroutine was spawned.
Parent *StackTrace
}
// Individual stack frame, contained in a [StackTrace], produced by [GetStackTrace].
type StackFrame struct {
// Function provides the name of the function being called, or the empty string if unknown.
Function string
// File gives the name of the file, or an empty string if the file is unknown.
File string
// Line gives the line number (starting from 1), or zero if the line number is unknown.
Line int
}
// GetStackTrace produces a StackTrace, optionally with a parent's stack trace to append.
//
// skip sets the number of initial calling stack frames to exclude. Setting skip to zero will
// produce a StackTrace where the first [StackFrame] represents the location where GetStackTrace was
// called.
func GetStackTrace(parent *StackTrace, skip uint) StackTrace {
frames := getFrames(skip + 1) // skip the additional frame introduced by GetStackTrace
return StackTrace{Frames: frames, Parent: parent}
}
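// Illustrative sketch (not part of the original source) of carrying the spawning goroutine's
// trace across a goroutine boundary:
//
//	parent := GetStackTrace(nil, 0) // trace at the spawn site
//	go func() {
//		trace := GetStackTrace(&parent, 0)
//		fmt.Println(trace.String()) // this goroutine's frames, then "called by " + the parent's
//	}()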
// String produces a string representation of the stack trace, roughly similar to the default panic
// handler's.
//
// For some examples of formatting, refer to the StackTrace tests.
func (st StackTrace) String() string {
var buf []byte
for {
if len(st.Frames) == 0 {
buf = append(buf, "<empty stack>\n"...)
} else {
for _, f := range st.Frames {
var function, functionTail, file, fileLineSep, line string
if f.Function == "" {
function = "<unknown function>"
} else {
function = f.Function
functionTail = "(...)"
}
if f.File == "" {
file = "<unknown file>"
} else {
file = f.File
if f.Line != 0 {
fileLineSep = ":"
line = strconv.Itoa(f.Line)
}
}
buf = append(buf, function...)
buf = append(buf, functionTail...)
buf = append(buf, "\n\t"...)
buf = append(buf, file...)
buf = append(buf, fileLineSep...)
buf = append(buf, line...)
buf = append(buf, byte('\n'))
}
}
if st.Parent == nil {
break
}
st = *st.Parent
buf = append(buf, "called by "...)
continue
}
return string(buf)
}
var pcBufPool = sync.Pool{
New: func() any {
buf := make([]uintptr, 128)
return &buf
},
}
func putPCBuffer(buf *[]uintptr) {
if len(*buf) < 1024 {
pcBufPool.Put(buf)
}
}
func getFrames(skip uint) []StackFrame {
skip += 2 // skip the frame introduced by this function and runtime.Callers
pcBuf := pcBufPool.Get().(*[]uintptr)
defer putPCBuffer(pcBuf)
if len(*pcBuf) == 0 {
panic("internal error: len(*pcBuf) == 0")
}
// read program counters into the buffer, repeating until buffer is big enough.
//
// This is O(n log n), where n is the true number of program counters.
var pc []uintptr
for {
n := runtime.Callers(0, *pcBuf)
if n == 0 {
panic("runtime.Callers(0, ...) returned zero")
}
if n < len(*pcBuf) {
pc = (*pcBuf)[:n]
break
} else {
*pcBuf = make([]uintptr, 2*len(*pcBuf))
}
}
framesIter := runtime.CallersFrames(pc)
var frames []StackFrame
more := true
for more {
var frame runtime.Frame
frame, more = framesIter.Next()
if skip > 0 {
skip -= 1
continue
}
frames = append(frames, StackFrame{
Function: frame.Function,
File: frame.File,
Line: frame.Line,
})
}
return frames
}
// Originally taken from https://github.com/ptxmac/multierrgroup
// Package taskgroup provides a mix of multierr and errgroup
// See documentation for https://pkg.go.dev/go.uber.org/multierr and https://pkg.go.dev/golang.org/x/sync/errgroup
package taskgroup
import (
"context"
"fmt"
"sync"
"go.uber.org/multierr"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/util/stack"
)
// Group manages goroutines and collects all the errors.
// See https://pkg.go.dev/golang.org/x/sync/errgroup#group for more information
type Group interface {
Ctx() context.Context
Wait() error
Go(name string, f func(logger *zap.Logger) error)
}
type group struct {
cancel context.CancelFunc
ctx context.Context
logger *zap.Logger
panicHandler func(any)
wg sync.WaitGroup
errMutex sync.Mutex
err error
}
type GroupOption func(*group)
// WithParentContext sets the parent context for the group.
func WithParentContext(ctx context.Context) GroupOption {
return func(g *group) {
g.ctx, g.cancel = context.WithCancel(ctx)
}
}
// WithPanicHandler sets a panic handler for the group.
func WithPanicHandler(f func(any)) GroupOption {
return func(g *group) {
g.panicHandler = f
}
}
// NewGroup returns a new Group.
func NewGroup(logger *zap.Logger, opts ...GroupOption) Group {
g := &group{
cancel: nil, // Set separately by Ctx
ctx: nil, // Set separately by Ctx
panicHandler: nil, // Set separately by WithPanicHandler
logger: logger,
wg: sync.WaitGroup{},
errMutex: sync.Mutex{},
err: nil,
}
for _, opt := range opts {
opt(g)
}
if g.ctx == nil {
// If parent context is not set, use background context
WithParentContext(context.Background())(g)
}
return g
}
// Ctx returns a context that will be canceled if any of the following happens:
// 1. Any of the tasks in the group fail.
// 2. All tasks in the group have completed.
// 3. Parent context is canceled, if it was set.
func (g *group) Ctx() context.Context {
return g.ctx
}
// Wait blocks until all goroutines have completed.
//
// All errors returned from the goroutines will be combined into one using multierr and returned from this method.
func (g *group) Wait() error {
g.wg.Wait()
if g.cancel != nil {
g.cancel()
}
return g.err
}
func (g *group) call(f func() error) (err error) {
defer func() {
if r := recover(); r != nil {
if g.panicHandler != nil {
g.panicHandler(r)
}
// Omit 1 frame - the f() call below
st := stack.GetStackTrace(nil, 1).String()
g.logger.Error("Task panicked", zap.Any("payload", r), zap.String("stack", st))
err = fmt.Errorf("panic: %v", r)
}
}()
err = f()
return err
}
// Go calls the function in a new goroutine.
// If a non-nil error is returned, the context is canceled and
// the error is collected using multierr and will be returned by Wait.
func (g *group) Go(name string, f func(logger *zap.Logger) error) {
g.wg.Add(1)
go func() {
defer g.wg.Done()
logger := g.logger.Named(name)
cb := func() error {
return f(logger)
}
if err := g.call(cb); err != nil {
err = fmt.Errorf("task %s failed: %w", name, err)
g.errMutex.Lock()
g.err = multierr.Append(g.err, err)
g.errMutex.Unlock()
logger.Error(err.Error())
if g.cancel != nil {
g.cancel()
}
}
}()
}
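// Illustrative usage sketch (not part of the original source); logger, ctx, and the worker
// functions are assumptions:
//
//	g := NewGroup(logger, WithParentContext(ctx))
//	g.Go("worker-a", func(logger *zap.Logger) error {
//		return runWorkerA(g.Ctx(), logger) // hypothetical task
//	})
//	g.Go("worker-b", func(logger *zap.Logger) error {
//		return runWorkerB(g.Ctx(), logger) // hypothetical task
//	})
//	// Wait returns the multierr-combined errors from all tasks (nil if none failed).
//	if err := g.Wait(); err != nil {
//		logger.Error("one or more tasks failed", zap.Error(err))
//	}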
package util
import (
"errors"
"math/rand"
"time"
)
type TimeRange struct {
min int
max int
units time.Duration
}
func NewTimeRange(units time.Duration, minTime, maxTime int) *TimeRange {
if minTime < 0 {
panic(errors.New("bad time range: min < 0"))
} else if minTime == 0 && maxTime == 0 {
panic(errors.New("bad time range: min and max = 0"))
} else if maxTime < minTime {
panic(errors.New("bad time range: max < min"))
}
return &TimeRange{min: minTime, max: maxTime, units: units}
}
// Random returns a random time.Duration within the range
func (r TimeRange) Random() time.Duration {
if r.max == r.min {
return time.Duration(r.min) * r.units
}
count := rand.Intn(r.max-r.min) + r.min
return time.Duration(count) * r.units
}
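// Illustrative usage sketch (not part of the original source):
//
//	// Retry delay chosen uniformly from [5s, 10s):
//	retryAfter := NewTimeRange(time.Second, 5, 10)
//	time.Sleep(retryAfter.Random())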
package util
// Helper for creating a zap.Field for a VM
import (
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
corev1 "k8s.io/api/core/v1"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
type nameFields struct {
virtualmachine NamespacedName
pod NamespacedName
}
// MarshalLogObject implements zapcore.ObjectMarshaler
func (f nameFields) MarshalLogObject(enc zapcore.ObjectEncoder) error {
if err := enc.AddObject("virtualmachine", f.virtualmachine); err != nil {
return err
}
if err := enc.AddObject("pod", f.pod); err != nil {
return err
}
return nil
}
func VMNameFields(vm *vmv1.VirtualMachine) zap.Field {
vmName := GetNamespacedName(vm)
// If the VM has a pod, log both the VM and the pod, otherwise just the VM.
if vm.Status.PodName == "" {
return zap.Object("virtualmachine", vmName)
} else {
podName := NamespacedName{Namespace: vm.Namespace, Name: vm.Status.PodName}
return zap.Inline(nameFields{
virtualmachine: vmName,
pod: podName,
})
}
}
func PodNameFields(pod *corev1.Pod) zap.Field {
podName := GetNamespacedName(pod)
if vmName, ok := pod.Labels[vmv1.VirtualMachineNameLabel]; ok {
vmName := NamespacedName{Namespace: pod.Namespace, Name: vmName}
return zap.Inline(nameFields{
virtualmachine: vmName,
pod: podName,
})
} else {
return zap.Object("pod", podName)
}
}
package watch
// Metrics for Watch()
import (
"fmt"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/apimachinery/pkg/watch"
"github.com/neondatabase/autoscaling/pkg/util"
)
// Metrics holds some common prometheus collectors that are used by Watch
//
// The metrics used are:
//
// - client_calls_total (number of calls to k8s client.{Watch,List}, labeled by method)
// - relist_requests_total (number of "relist" requests from the Store)
// - events_total (number of K8s watch.Events that have occurred, including errors)
// - errors_total (number of errors, either error events or re-List errors, labeled by source: ["List", "Watch", "Watch.Event"])
// - alive_current (1 iff the watcher is currently running or failing, else 0)
// - failing_current (1 iff the watcher's last request failed *and* it's waiting to retry, else 0)
//
// Prefixes are typically of the form "COMPONENT_watchers" (e.g. "autoscaling_agent_watchers").
// Separate reporting per call to Watch is automatically done with the "watcher_instance" label
// attached to the metrics, using MetricsConfig.
//
// A brief note about "alive" and "failing": Reading from a pair of collectors is fundamentally
// racy. It may be possible to temporarily view "failing" but not "alive".
type Metrics struct {
clientCallsTotal *prometheus.CounterVec
relistRequestsTotal *prometheus.CounterVec
eventsTotal *prometheus.CounterVec
errorsTotal *prometheus.CounterVec
aliveCurrent *prometheus.GaugeVec
failingCurrent *prometheus.GaugeVec
// note: all usage of Metrics is by value, so this field gets copied in on each Watch call.
// It gives us a bit of state to use for the failing and unfailing functions.
isFailing bool
}
type MetricsConfig struct {
Metrics
// Instance provides the value of the "watcher_instance" label that will be applied to all
// metrics collected for the Watch call
Instance string
}
const metricInstanceLabel = "watcher_instance"
// NewMetrics creates a new set of metrics for many Watch calls within the same service.
//
// The metrics will be registered with prometheus.Registerer.
//
// All metric names will be prefixed with the provided string, and when used by Watch, will be
// additionally labeled with the particular Watch instance.
func NewMetrics(prefix string, reg prometheus.Registerer) Metrics {
return Metrics{
isFailing: false,
clientCallsTotal: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: fmt.Sprint(prefix, "_client_calls_total"),
Help: "Number of calls to k8s client.{Watch,List}, labeled by method",
},
[]string{metricInstanceLabel, "method"},
)),
relistRequestsTotal: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: fmt.Sprint(prefix, "_relist_requests_total"),
Help: "Number of internal manual relisting requests",
},
[]string{metricInstanceLabel},
)),
eventsTotal: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: fmt.Sprint(prefix, "_events_total"),
Help: "Number of k8s watch.Events that have occurred, including errors, labeled by type",
},
[]string{metricInstanceLabel, "type"},
)),
errorsTotal: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: fmt.Sprint(prefix, "_errors_total"),
Help: "Number of errors, either error events or re-list errors, labeled by source",
},
[]string{metricInstanceLabel, "source"},
)),
aliveCurrent: util.RegisterMetric(reg, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: fmt.Sprint(prefix, "_alive_current"),
Help: "For each watcher, 1 iff the watcher is currently running or failing, else 0",
},
[]string{metricInstanceLabel},
)),
failingCurrent: util.RegisterMetric(reg, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: fmt.Sprint(prefix, "_failing_current"),
Help: "For each watcher, 1 iff the watcher's last request failed *and* it's waiting to retry, else 0",
},
[]string{metricInstanceLabel},
)),
}
}
///////////////////////////////////////////////
// Internal helper methods for MetricsConfig //
///////////////////////////////////////////////
func (m *MetricsConfig) alive() {
m.aliveCurrent.WithLabelValues(m.Instance).Inc()
// Explicitly set the 'failing' count so that it's present (and set to zero)
m.failingCurrent.WithLabelValues(m.Instance).Add(0.0)
}
func (m *MetricsConfig) unalive() {
m.aliveCurrent.WithLabelValues(m.Instance).Dec()
}
func (m *MetricsConfig) failing() {
if !m.isFailing {
m.failingCurrent.WithLabelValues(m.Instance).Inc()
}
m.isFailing = true
}
func (m *MetricsConfig) unfailing() {
if m.isFailing {
m.failingCurrent.WithLabelValues(m.Instance).Dec()
}
m.isFailing = false
}
func (m *MetricsConfig) startList() {
m.clientCallsTotal.WithLabelValues(m.Instance, "List").Inc()
}
func (m *MetricsConfig) startWatch() {
m.clientCallsTotal.WithLabelValues(m.Instance, "Watch").Inc()
}
func (m *MetricsConfig) relistRequested() {
m.relistRequestsTotal.WithLabelValues(m.Instance).Inc()
}
func (m *MetricsConfig) doneList(err error) {
if err != nil {
m.errorsTotal.WithLabelValues(m.Instance, "List").Inc()
}
}
func (m *MetricsConfig) doneWatch(err error) {
if err != nil {
m.errorsTotal.WithLabelValues(m.Instance, "Watch").Inc()
}
}
func (m *MetricsConfig) recordEvent(ty watch.EventType) {
m.eventsTotal.WithLabelValues(m.Instance, string(ty)).Inc()
if ty == watch.Error {
m.errorsTotal.WithLabelValues(m.Instance, "Watch.Event").Inc()
}
}
package watch
import (
"context"
"errors"
"fmt"
stdruntime "runtime"
"sync"
"sync/atomic"
"time"
"github.com/samber/lo"
"go.uber.org/zap"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/watch"
"github.com/neondatabase/autoscaling/pkg/util"
)
// Client is implemented by the specific interfaces of kubernetes clients, like
// `Clientset.CoreV1().Pods(namespace)` or `..Nodes()`
//
// This interface should be *already implemented* by whatever the correct client is.
type Client[L any] interface {
List(context.Context, metav1.ListOptions) (L, error)
Watch(context.Context, metav1.ListOptions) (watch.Interface, error)
}
// Config is the miscellaneous configuration used by Watch
type Config struct {
// ObjectNameLogField determines the key given to the logger to use when describing the type
// being watched -- for example, "pod" or "virtualmachine"
//
// This can help with standardizing keys between the watcher and everything else using it.
ObjectNameLogField string
// Metrics will be used by the Watch call to report some information about its internal
// operations
//
// Refer to the Metrics and MetricsConfig types for more information.
Metrics MetricsConfig
// RetryRelistAfter gives a retry interval when a re-list fails. If left nil, then Watch will
// not retry.
RetryRelistAfter *util.TimeRange
// RetryWatchAfter gives a retry interval when a non-initial watch fails. If left nil, then
// Watch will not retry.
RetryWatchAfter *util.TimeRange
}
// Accessors provides the "glue" functions for Watch to go from a list L (returned by the
// client's List) to the underlying slice of items []T
type Accessors[L any, T any] struct {
Items func(L) []T
}
// Object is implemented by pointers to T, where T is typically the resource that we're
// actually watching.
//
// Example implementers: *corev1.Pod, *corev1.Node
type Object[T any] interface {
~*T
runtime.Object
metav1.ObjectMetaAccessor
}
// HandlerFuncs provides the set of callbacks to use for events from Watch
type HandlerFuncs[P any] struct {
AddFunc func(obj P, preexisting bool)
UpdateFunc func(oldObj P, newObj P)
DeleteFunc func(obj P, mayBeStale bool)
}
// Index represents types that provide some kind of additional index on top of the base listing
//
// Indexing is functionally implemented in the same way that HandlerFuncs is, with the main
// difference being that more things are done for you with indexes. In particular, indexes can
// be added and removed after the Watch has already started, and the locking behavior is explicit.
type Index[T any] interface {
Add(obj *T)
Update(oldObj, newObj *T)
Delete(obj *T)
}
// InitMode dictates the behavior of Watch with respect to any initial calls to
// handlers.AddFunc before returning
//
// If set to InitModeSync, then AddFunc will be called while processing the initial listing,
// meaning that the returned Store is guaranteed to contain the state of the cluster (although it
// may update before any access).
//
// Otherwise, if set to InitModeDefer, then AddFunc will not be called until after Watch
// returns. Correspondingly, the Store will not update until then either.
type InitMode string
const (
InitModeSync InitMode = "sync"
InitModeDefer InitMode = "defer"
)
// Watch starts a goroutine for watching events, using the provided HandlerFuncs as the
// callbacks for each type of event.
//
// The type C is the kubernetes client we use to get the objects, L is the list type returned by
// its List method, T is the object type, and P is a pointer to T.
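//
// A sketch of typical usage, watching Pods in a single namespace (illustrative only: the
// clientset, logger, metrics, and retry intervals are assumed to be constructed elsewhere, and
// the handler bodies are placeholders):
//
//	store, err := watch.Watch(
//		ctx, logger, clientset.CoreV1().Pods("default"),
//		watch.Config{
//			ObjectNameLogField: "pod",
//			Metrics:            metrics,      // a watch.MetricsConfig
//			RetryRelistAfter:   relistRetry,  // a *util.TimeRange
//			RetryWatchAfter:    rewatchRetry, // a *util.TimeRange
//		},
//		watch.Accessors[*corev1.PodList, corev1.Pod]{
//			Items: func(list *corev1.PodList) []corev1.Pod { return list.Items },
//		},
//		watch.InitModeSync,
//		metav1.ListOptions{},
//		watch.HandlerFuncs[*corev1.Pod]{
//			AddFunc:    func(pod *corev1.Pod, preexisting bool) { /* ... */ },
//			UpdateFunc: func(oldPod, newPod *corev1.Pod) { /* ... */ },
//			DeleteFunc: func(pod *corev1.Pod, mayBeStale bool) { /* ... */ },
//		},
//	)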
func Watch[C Client[L], L metav1.ListMetaAccessor, T any, P Object[T]](
ctx context.Context,
logger *zap.Logger,
client C,
config Config,
accessors Accessors[L, T],
mode InitMode,
listOpts metav1.ListOptions,
handlers HandlerFuncs[P],
) (*Store[T], error) {
if accessors.Items == nil {
panic(errors.New("accessors.Items == nil"))
}
// Workaround for https://github.com/kubernetes/kubernetes/issues/98925 :
//
// Pre-calculate the GVK for the object types, because List() operations only set the
// Kind+APIVersion on the List type, and not the individual elements.
sampleObj := P(new(T))
gvk, err := util.LookupGVKForType(sampleObj)
if err != nil {
return nil, err
}
// do the conversion from P -> *T. We wanted the handlers to be provided with P so that the
// caller doesn't need to manually specify the generics, but in order to store the callbacks
// inside the watch store, we need to convert them so we're not carrying around more generic
// parameters than we need.
actualHandlers := HandlerFuncs[*T]{
AddFunc: func(obj *T, preexisting bool) {},
UpdateFunc: func(oldObj, newObj *T) {},
DeleteFunc: func(obj *T, mayBeStale bool) {},
}
if handlers.AddFunc != nil {
actualHandlers.AddFunc = func(obj *T, preexisting bool) {
handlers.AddFunc(P(obj), preexisting)
}
}
if handlers.UpdateFunc != nil {
actualHandlers.UpdateFunc = func(oldObj, newObj *T) {
handlers.UpdateFunc(P(oldObj), P(newObj))
}
}
if handlers.DeleteFunc != nil {
actualHandlers.DeleteFunc = func(obj *T, mayBeStale bool) {
handlers.DeleteFunc(P(obj), mayBeStale)
}
}
// use a copy of the options for watching vs listing:
// We want to avoid setting some values for the list requests - specifically, in order to
// provide synchronization guarantees that the contents of the store are up-to-date strictly
// *after* the start of an explicit Relist() request, we need to *not* set a resource version in
// the request to get the most recent data.
// For more, see: https://kubernetes.io/docs/reference/using-api/api-concepts/#resource-versions
watchOpts := listOpts
// Handling bookmarks means that sometimes the API server will be kind, allowing us to continue
// the watch instead of resyncing.
watchOpts.AllowWatchBookmarks = true
// Perform an initial listing
config.Metrics.startList()
initialList, err := client.List(ctx, listOpts)
config.Metrics.doneList(err)
if err != nil {
return nil, fmt.Errorf("Initial list failed: %w", err)
}
// set ResourceVersion so that the client.Watch request(s) show only the changes since we made
// the initial list
watchOpts.ResourceVersion = initialList.GetListMeta().GetResourceVersion()
sendStop, stopSignal := util.NewSingleSignalPair[struct{}]()
store := Store[T]{
mutex: sync.Mutex{},
objects: make(map[types.UID]*T),
listeners: make(map[types.UID]*util.Broadcaster),
handlers: actualHandlers,
triggerRelist: make(chan struct{}, 1), // ensure sends are non-blocking
relisted: make(chan struct{}),
nextIndexID: 0,
indexes: make(map[uint64]Index[T]),
stopSignal: sendStop,
stopped: atomic.Bool{},
failing: atomic.Bool{},
deepCopy: func(t *T) *T {
return (*T)(P(t).DeepCopyObject().(P))
},
}
items := accessors.Items(initialList)
var deferredAdds []T
if mode == InitModeDefer {
deferredAdds = items
} else {
for i := range items {
obj := &items[i]
P(obj).GetObjectKind().SetGroupVersionKind(gvk)
uid := P(obj).GetObjectMeta().GetUID()
store.objects[uid] = obj
store.handlers.AddFunc(obj, true)
// Check if the context has been cancelled. This can happen in practice if AddFunc
// takes a long time to complete.
if err := ctx.Err(); err != nil {
return nil, err
}
}
}
items = nil // reset to allow GC
// Start watching
config.Metrics.startWatch()
watcher, err := client.Watch(ctx, watchOpts)
config.Metrics.doneWatch(err)
if err != nil {
return nil, fmt.Errorf("Initial watch failed: %w", err)
}
// Lock the store to pass it into the goroutine, so that we don't have to worry about immediate
// operations on the store racing with any deferred additions.
store.mutex.Lock()
// With the successful Watch call underway, we hand off responsibility to a new goroutine.
go func() {
holdingInitialLock := true
defer func() {
if holdingInitialLock {
store.mutex.Unlock()
}
}()
// note: instead of deferring watcher.Stop() directly, wrapping it in an outer function
// means that we'll always Stop the most recent watcher.
defer func() {
watcher.Stop()
}()
// explicitly stop on exit so that it's possible to know when the store is stopped
defer store.Stop()
config.Metrics.alive()
defer config.Metrics.unalive()
if len(deferredAdds) != 0 {
logger.Info("Handling deferred adds")
}
// Handle any deferred calls to AddFunc
// NB: This is only sound because we're still holding store.mutex; otherwise we'd have to
// deal with possible racy operations (including adding an index).
for i := range deferredAdds {
obj := &deferredAdds[i]
P(obj).GetObjectKind().SetGroupVersionKind(gvk)
uid := P(obj).GetObjectMeta().GetUID()
store.objects[uid] = obj
store.handlers.AddFunc(obj, true)
if err := ctx.Err(); err != nil {
logger.Warn("Ending: because Context expired", zap.Error(ctx.Err()))
return
}
}
holdingInitialLock = false
store.mutex.Unlock()
defer config.Metrics.unfailing()
logger.Info("All setup complete, entering event loop")
for {
// this is used exclusively for relisting, but must be defined up here so that our gotos
// don't jump over variables.
var signalRelistComplete []chan struct{}
for {
select {
case <-stopSignal.Recv():
logger.Info("Ending: because we got a stop signal")
return
case <-ctx.Done():
logger.Info("Ending: because Context expired", zap.Error(ctx.Err()))
return
case <-store.triggerRelist:
config.Metrics.relistRequested()
goto relist
case event, ok := <-watcher.ResultChan():
if !ok {
logger.Info("Watcher ended gracefully, restarting")
goto newWatcher
}
config.Metrics.recordEvent(event.Type)
if event.Type == watch.Error {
err := apierrors.FromObject(event.Object)
// note: we can get 'too old resource version' errors when there's been a
// lot of resource updates that our ListOptions filtered out.
if apierrors.IsResourceExpired(err) {
logger.Warn("Received error event", zap.Error(err))
} else {
logger.Error("Received error event", zap.Error(err))
}
goto relist
}
obj, ok := event.Object.(P)
if !ok {
var p P
logger.Error(
"Error casting event object to desired type",
zap.String("eventType", string(event.Type)),
zap.String("eventObjectType", fmt.Sprintf("%T", event.Object)),
zap.String("desiredObjectType", fmt.Sprintf("%T", p)),
)
continue
}
P(obj).GetObjectKind().SetGroupVersionKind(gvk)
meta := obj.GetObjectMeta()
// Update ResourceVersion so subsequent calls to client.Watch won't include this
// event, which we're currently processing.
watchOpts.ResourceVersion = meta.GetResourceVersion()
// The remainder of the event handling happens in handleEvent, which takes the store's lock itself.
uid := meta.GetUID()
err := store.handleEvent(event.Type, uid, obj)
if err != nil {
name := util.NamespacedName{Namespace: meta.GetNamespace(), Name: meta.GetName()}
logger.Error(
"failed to handle event",
zap.Error(err),
zap.String("UID", string(uid)),
zap.Object(config.ObjectNameLogField, name),
)
goto relist
}
}
}
relist:
// Every time we make a new request, we create a channel for it. That's because we need
// to make sure that any user's call to (*Store[T]).Relist() that happens *while* we're
// actually making the request to K8s won't get overwritten by that request. Basically,
// we need to make sure that relisting is only marked as complete if there was a request
// that occurred *after* the call to Relist() returned.
//
// There's probably other ways we could do this - it's an area for possible improvement.
//
// Note: if we didn't do this at all, the alternative would be to ignore additional
// relist requests, having them handled naturally as we get around to watching again.
// This can amplify request failures - particularly if the K8s API server is overloaded.
signalRelistComplete = make([]chan struct{}, 0, 1)
// When we get to this point in the control flow, it's not guaranteed that the watcher
// has stopped.
//
// As of 2023-12-05, the implementation of the API's watchers (internally handled by
// k8s.io/apimachinery@.../pkg/watch/streamwatcher.go) explicitly allows multiple calls
// to Stop().
//
// This all means that it's always safe for us to call Stop() here, and sometimes we
// MUST call it here (to avoid leaking watchers after relisting), so it's worth just
// always calling it.
watcher.Stop()
logger.Info("Relisting")
for first := true; ; first = false {
func() {
store.mutex.Lock()
defer store.mutex.Unlock()
newRelistTriggered := false
// Consume any additional relist request.
// All usage of triggerRelist from within (*Store[T]).Relist() is asynchronous,
// because triggerRelist has capacity=1 and has an item in it iff relisting has
// been requested, so if Relist() *would* block on sending, the signal has
// already been given.
// That's all to say: Receiving only once from triggerRelist is sufficient.
select {
case <-store.triggerRelist:
newRelistTriggered = true
config.Metrics.relistRequested()
default:
}
if first || newRelistTriggered {
signalRelistComplete = append(signalRelistComplete, store.relisted)
store.relisted = make(chan struct{})
}
}()
config.Metrics.startList()
relistList, err := client.List(ctx, listOpts) // don't include resource version, so it's guaranteed most recent
config.Metrics.doneList(err)
if err != nil {
logger.Error("Relist failed", zap.Error(err))
if config.RetryRelistAfter == nil {
logger.Info("Ending: because relist failed and RetryWatchAfter is nil")
return
}
retryAfter := config.RetryRelistAfter.Random()
logger.Info("Retrying relist after delay", zap.Duration("delay", retryAfter))
store.failing.Store(true)
config.Metrics.failing()
select {
case <-time.After(retryAfter):
logger.Info("Relist delay reached, retrying", zap.Duration("delay", retryAfter))
continue
case <-ctx.Done():
logger.Info("Ending: because Context expired", zap.Error(ctx.Err()))
return
case <-stopSignal.Recv():
logger.Info("Ending: because we got a stop signal")
return
}
}
store.failing.Store(false)
config.Metrics.unfailing()
// err == nil, process relistList
relistItems := accessors.Items(relistList)
func() {
store.mutex.Lock()
defer store.mutex.Unlock()
// Copy the current contents of objects, and start tracking which ones have
// since been deleted.
oldObjects := make(map[types.UID]*T)
deleted := make(map[types.UID]struct{}) // set of UIDs that have been deleted
for uid, obj := range store.objects {
oldObjects[uid] = obj
deleted[uid] = struct{}{} // initially mark everything as deleted, until we find it isn't
}
// Mark all items we still have as not deleted
for i := range relistItems {
uid := P(&relistItems[i]).GetObjectMeta().GetUID()
delete(deleted, uid)
}
// Generate deletion events for all objects that are no longer present. We do
// this first so that when there's externally-enforced uniqueness that isn't
// unique *across time* (e.g. object names), users can still rely on uniqueness
// at any time that handlers are called.
for uid := range deleted {
obj := store.objects[uid]
if broadcaster, ok := store.listeners[uid]; ok {
broadcaster.Broadcast()
delete(store.listeners, uid)
}
delete(store.objects, uid)
for _, index := range store.indexes {
index.Delete(obj)
}
store.handlers.DeleteFunc(obj, true)
}
for i := range relistItems {
obj := &relistItems[i]
uid := P(obj).GetObjectMeta().GetUID()
P(obj).GetObjectKind().SetGroupVersionKind(gvk)
store.objects[uid] = obj
oldObj, hasObj := oldObjects[uid]
if hasObj {
if broadcaster, ok := store.listeners[uid]; ok {
broadcaster.Broadcast()
}
for _, index := range store.indexes {
index.Update(oldObj, obj)
}
store.handlers.UpdateFunc(oldObj, obj)
} else {
for _, index := range store.indexes {
index.Add(obj)
}
store.handlers.AddFunc(obj, false)
}
}
}()
// Update ResourceVersion, recreate watcher.
watchOpts.ResourceVersion = relistList.GetListMeta().GetResourceVersion()
logger.Info("Relist complete, restarting watcher")
for _, ch := range signalRelistComplete {
close(ch)
}
goto newWatcher
}
newWatcher:
// In the loop, retry the API call to watch.
//
// It's possible that we attempt to watch with a resource version that's too old, in
// which case the API call *does* succeed, but the first event is an error (which we use
// to trigger relisting).
for {
config.Metrics.startWatch()
watcher, err = client.Watch(ctx, watchOpts)
config.Metrics.doneWatch(err)
if err != nil {
logger.Error("Re-watch failed", zap.Error(err))
if config.RetryWatchAfter == nil {
logger.Info("Ending: because re-watch failed and RetryWatchAfter is nil")
return
}
retryAfter := config.RetryWatchAfter.Random()
logger.Info("Retrying re-watch after delay", zap.Duration("delay", retryAfter))
store.failing.Store(true)
config.Metrics.failing()
select {
case <-time.After(retryAfter):
logger.Info("Re-watch delay reached, retrying", zap.Duration("delay", retryAfter))
continue
case <-ctx.Done():
logger.Info("Ending: because Context expired", zap.Error(ctx.Err()))
return
case <-stopSignal.Recv():
logger.Info("Ending: because we got a stop signal")
return
}
}
// err == nil
store.failing.Store(false)
config.Metrics.unfailing()
break
}
}
}()
return &store, nil
}
// helper for Watch. Error events are expected to already have been handled by the caller.
func (store *Store[T]) handleEvent(
eventType watch.EventType,
uid types.UID,
obj *T,
) error {
// Some of the cases below don't actually require locking the store. Most of the events that we
// receive *do* though, so we're better off doing it here for simplicity.
store.mutex.Lock()
defer store.mutex.Unlock()
switch eventType {
case watch.Added:
if _, ok := store.objects[uid]; ok {
return fmt.Errorf("received add event for object we already have")
}
store.objects[uid] = obj
for _, index := range store.indexes {
index.Add(obj)
}
store.handlers.AddFunc(obj, false)
case watch.Deleted:
// We're given the state of the object immediately before deletion, which
// *may* be different to what we currently have stored.
old, ok := store.objects[uid]
if !ok {
return errors.New("received delete event for object that's not present")
}
// If there is a listener, *do* notify them, and then delete the listeners just like we're
// deleting the object from the map.
if broadcaster, ok := store.listeners[uid]; ok {
broadcaster.Broadcast()
delete(store.listeners, uid)
}
// Update:
for _, index := range store.indexes {
index.Update(old, obj)
}
store.handlers.UpdateFunc(old, obj)
// Delete:
delete(store.objects, uid)
for _, index := range store.indexes {
index.Delete(obj)
}
store.handlers.DeleteFunc(obj, false)
case watch.Modified:
old, ok := store.objects[uid]
if !ok {
return errors.New("received update event for object that's not present")
}
store.objects[uid] = obj
if broadcaster, ok := store.listeners[uid]; ok {
broadcaster.Broadcast()
}
for _, index := range store.indexes {
index.Update(old, obj)
}
store.handlers.UpdateFunc(old, obj)
case watch.Bookmark:
// Nothing to do, just serves to give us a new ResourceVersion, which should be handled by
// the caller.
case watch.Error:
panic(errors.New("handleEvent unexpectedly called with eventType Error"))
default:
panic(errors.New("unknown watch event"))
}
return nil
}
// Store provides an interface for getting information about a list of Ts using the event
// listener from a previous call to Watch
type Store[T any] struct {
mutex sync.Mutex
objects map[types.UID]*T
listeners map[types.UID]*util.Broadcaster
handlers HandlerFuncs[*T]
// helper function, created in Watch() using knowledge that *T (or, something based on it) is a
// runtime.Object.
// This is required for the implementation of (*Store[T]).NopUpdate() in order to produce a
// second object without having any guarantees about T.
deepCopy func(*T) *T
// triggerRelist has capacity=1 and *if* the channel contains an item, then relisting has been
// requested by some call to (*Store[T]).Relist().
triggerRelist chan struct{}
// relisted is replaced and closed whenever relisting happens. Refer to its usage in Watch or
// (*Store[T]).Relist() for more detail.
relisted chan struct{}
nextIndexID uint64
indexes map[uint64]Index[T]
stopSignal util.SignalSender[struct{}]
stopped atomic.Bool
failing atomic.Bool
}
// Relist triggers re-listing the Store, returning a channel that will be closed once the
// re-list is complete
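//
// For example (an illustrative sketch; `store` and `ctx` are assumed to come from elsewhere):
//
//	select {
//	case <-store.Relist():
//		// the store now reflects a List made after this Relist() call
//	case <-ctx.Done():
//		// gave up waiting for the relist to finish
//	}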
func (w *Store[T]) Relist() <-chan struct{} {
w.mutex.Lock()
defer w.mutex.Unlock()
// Because triggerRelist has capacity=1, failing to immediately send to the channel means that
// there's already a signal to request relisting that has not yet been processed.
select {
case w.triggerRelist <- struct{}{}:
default:
}
// note: w.relisted is replaced immediately before every attempt at the API call for relisting,
// so that there's a strict happens-before relation that guarantees that *when* w.relisted is
// closed, the relevant List call *must* have happened after any attempted send on
// w.triggerRelist.
return w.relisted
}
// NopUpdate runs the update handler for the object with the given UID, blocking until completion.
//
// This method returns false if there is no object with the given UID.
//
// Why does this exist? Well, watch events are often going to be handled by adding the object to a
// queue, and sometimes you want to re-inject something into that queue. But it's tricky for that to
// be synchronized unless it's guaranteed to agree with the ongoing watch -- so this method allows
// one to re-inject something into the queue if and only if the watch still believes it exists in
// kubernetes.
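//
// A sketch of the re-queueing pattern described above (the surrounding queue logic is assumed to
// live in the store's UpdateFunc):
//
//	if !store.NopUpdate(uid) {
//		// the watch no longer has this object; don't re-queue it.
//		return
//	}
//	// otherwise, UpdateFunc ran synchronously and re-injected the object for us.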
func (w *Store[T]) NopUpdate(uid types.UID) (ok bool) {
w.mutex.Lock()
defer w.mutex.Unlock()
obj, ok := w.objects[uid]
if !ok {
return false
}
copied := w.deepCopy(obj)
w.handlers.UpdateFunc(copied, obj)
return true
}
// Listen returns a util.BroadcastReceiver that will be updated whenever the object is modified or
// eventually deleted.
//
// This method returns false if the object with the given UID does not exist.
func (w *Store[T]) Listen(uid types.UID) (_ util.BroadcastReceiver, ok bool) {
w.mutex.Lock()
defer w.mutex.Unlock()
if _, ok := w.objects[uid]; !ok {
return lo.Empty[util.BroadcastReceiver](), false
}
if b, ok := w.listeners[uid]; ok {
return b.NewReceiver(), true
} else {
b := util.NewBroadcaster()
w.listeners[uid] = b
return b.NewReceiver(), true
}
}
func (w *Store[T]) Stop() {
w.stopSignal.Send(struct{}{})
w.stopped.Store(true)
}
func (w *Store[T]) Failing() bool {
return w.failing.Load()
}
func (w *Store[T]) Stopped() bool {
return w.stopped.Load()
}
func (w *Store[T]) Items() []*T {
w.mutex.Lock()
defer w.mutex.Unlock()
items := make([]*T, len(w.objects))
i := 0
for _, val := range w.objects {
items[i] = val
i += 1
}
return items
}
// NewIndexedStore creates a new IndexedStore from the Store and the index to use.
//
// Note: the index type is assumed to have reference semantics; i.e. any shallow copy of the value
// will affect any other shallow copy.
//
// For more information, refer to IndexedStore.
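//
// Example pairing with NameIndex (an illustrative sketch; podStore is assumed to come from a
// prior call to Watch over corev1.Pod objects):
//
//	indexed := watch.NewIndexedStore(podStore, watch.NewNameIndex[corev1.Pod]())
//	pod, ok := indexed.GetIndexed(func(idx *watch.NameIndex[corev1.Pod]) (*corev1.Pod, bool) {
//		return idx.Get("default", "some-pod")
//	})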
func NewIndexedStore[T any, I Index[T]](store *Store[T], index I) IndexedStore[T, I] {
store.mutex.Lock()
defer store.mutex.Unlock()
for _, obj := range store.objects {
index.Add(obj)
}
id := store.nextIndexID
store.nextIndexID += 1
store.indexes[id] = index
collector := &struct{}{}
// when this IndexedStore is GC'd, remove its index from the Store. This should
// provide a reliable way of making sure that indexes always get cleaned up.
stdruntime.SetFinalizer(collector, func(_ any) {
// note: finalizers always run in a separate goroutine, so it's ok to lock here.
store.mutex.Lock()
defer store.mutex.Unlock()
delete(store.indexes, id)
})
return IndexedStore[T, I]{store, index, id, collector}
}
// IndexedStore represents a Store, wrapped with a privileged Index that can be used
// to efficiently answer queries.
type IndexedStore[T any, I Index[T]] struct {
*Store[T]
index I
// id stores the id of this index in the Store
id uint64
// collector has a destructor attached to it so that the index can be automatically removed from
// the Store when it's no longer in use, without requiring users to manually get rid of it.
collector *struct{}
}
// WithIndex calls a function with the current state of the index, locking the Store around it.
//
// It is almost always a mistake to return the index (directly or indirectly) from this function,
// because using it outside the callback is no longer protected by the Store's lock.
func (w IndexedStore[T, I]) WithIndex(f func(I)) {
w.Store.mutex.Lock()
defer w.Store.mutex.Unlock()
f(w.index)
}
func (w IndexedStore[T, I]) GetIndexed(f func(I) (*T, bool)) (obj *T, ok bool) {
w.WithIndex(func(i I) {
obj, ok = f(i)
})
return
}
func (w IndexedStore[T, I]) ListIndexed(f func(I) []*T) (list []*T) {
w.WithIndex(func(i I) {
list = f(i)
})
return
}
func NewNameIndex[T any]() *NameIndex[T] {
// check that *T implements metav1.ObjectMetaAccessor
var zero T
ptrToZero := any(&zero)
if _, ok := ptrToZero.(metav1.ObjectMetaAccessor); !ok {
panic("type *T must implement metav1.ObjectMetaAccessor")
}
// This doesn't *need* to be a pointer, but the intent is a little more clear this way.
return &NameIndex[T]{
namespacedNames: make(map[util.NamespacedName]*T),
}
}
// NameIndex is an Index that provides efficient lookup for a value with a particular name
type NameIndex[T any] struct {
namespacedNames map[util.NamespacedName]*T
}
// note: requires that *T implements metav1.ObjectMetaAccessor
func keyForObj[T any](obj *T) util.NamespacedName {
meta := any(obj).(metav1.ObjectMetaAccessor).GetObjectMeta()
return util.NamespacedName{Namespace: meta.GetNamespace(), Name: meta.GetName()}
}
func (i *NameIndex[T]) Add(obj *T) {
i.namespacedNames[keyForObj(obj)] = obj
}
func (i *NameIndex[T]) Update(oldObj, newObj *T) {
i.Delete(oldObj)
i.Add(newObj)
}
func (i *NameIndex[T]) Delete(obj *T) {
delete(i.namespacedNames, keyForObj(obj))
}
func (i *NameIndex[T]) Get(namespace string, name string) (obj *T, ok bool) {
obj, ok = i.namespacedNames[util.NamespacedName{Namespace: namespace, Name: name}]
return
}
func NewFlatNameIndex[T any]() *FlatNameIndex[T] {
// check that *T implements metav1.ObjectMetaAccessor
var zero T
ptrToZero := any(&zero)
if _, ok := ptrToZero.(metav1.ObjectMetaAccessor); !ok {
panic("type *T must implement metav1.ObjectMetaAccessor")
}
return &FlatNameIndex[T]{
names: make(map[string]*T),
}
}
type FlatNameIndex[T any] struct {
names map[string]*T
}
// note: requires that *T implements metav1.ObjectMetaAccessor
func getName[T any](obj *T) string {
meta := any(obj).(metav1.ObjectMetaAccessor).GetObjectMeta()
return meta.GetName()
}
func (i *FlatNameIndex[T]) Add(obj *T) {
i.names[getName(obj)] = obj
}
func (i *FlatNameIndex[T]) Update(oldObj, newObj *T) {
i.Delete(oldObj)
i.Add(newObj)
}
func (i *FlatNameIndex[T]) Delete(obj *T) {
delete(i.names, getName(obj))
}
func (i *FlatNameIndex[T]) Get(name string) (obj *T, ok bool) {
obj, ok = i.names[name]
return
}
package main
import (
"archive/tar"
"bytes"
"context"
_ "embed"
"errors"
"flag"
"fmt"
"io"
"log"
"os"
"path/filepath"
"strings"
"text/template"
"github.com/alessio/shellescape"
"github.com/distribution/reference"
cliconfig "github.com/docker/cli/cli/config"
"github.com/docker/docker/api/types"
"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/registry"
"github.com/docker/docker/client"
"github.com/docker/docker/pkg/jsonmessage"
"golang.org/x/term"
"gopkg.in/yaml.v3"
)
// vm-builder --src alpine:3.19 --dst vm-alpine:dev --file vm-alpine.qcow2
var (
//go:embed files/img.Dockerfile
dockerfileVmBuilder string
//go:embed files/helper.move-bins.sh
scriptMoveBinsHelper string
//go:embed files/vmstart
scriptVmStart string
//go:embed files/inittab
scriptInitTab string
//go:embed files/vmacpi
scriptVmAcpi string
//go:embed files/vmshutdown
scriptVmShutdown string
//go:embed files/vminit
scriptVmInit string
//go:embed files/udev-init.sh
scriptUdevInit string
//go:embed files/resize-swap.sh
scriptResizeSwap string
//go:embed files/set-disk-quota.sh
scriptSetDiskQuota string
//go:embed files/vector.yaml
configVector string
//go:embed files/chrony.conf
configChrony string
//go:embed files/sshd_config
configSshd string
)
const (
targetArchLinuxAmd64 = "linux/amd64"
targetArchLinuxArm64 = "linux/arm64"
)
var (
Version string
NeonvmDaemonImage string
srcImage = flag.String("src", "", `Docker image used as source for virtual machine disk image: --src=alpine:3.19`)
dstImage = flag.String("dst", "", `Docker image with resulting disk image: --dst=vm-alpine:3.19`)
size = flag.String("size", "1G", `Size for disk image: --size=1G`)
outFile = flag.String("file", "", `Save disk image as file: --file=vm-alpine.qcow2`)
specFile = flag.String("spec", "", `File containing additional customization: --spec=spec.yaml`)
quiet = flag.Bool("quiet", false, `Show less output from the docker build process`)
forcePull = flag.Bool("pull", false, `Pull src image even if already present locally`)
version = flag.Bool("version", false, `Print vm-builder version`)
daemonImageFlag = flag.String("daemon-image", "", `Specify the neonvm-daemon image: --daemon-image=neonvm-daemon:dev`)
targetArch = flag.String("target-arch", "", fmt.Sprintf("Target architecture: --arch %s | %s", targetArchLinuxAmd64, targetArchLinuxArm64))
)
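// AddTemplatedFileToTar renders tmplString as a text/template with tmplArgs and adds the rendered
// output to the tar archive under the given filename.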
func AddTemplatedFileToTar(tw *tar.Writer, tmplArgs any, filename string, tmplString string) error {
tmpl, err := template.New(filename).Parse(tmplString)
if err != nil {
return fmt.Errorf("failed to parse template for %q: %w", filename, err)
}
var buf bytes.Buffer
if err = tmpl.Execute(&buf, tmplArgs); err != nil {
return fmt.Errorf("failed to execute template for %q: %w", filename, err)
}
return addFileToTar(tw, filename, buf.Bytes())
}
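// addFileToTar writes a single in-memory file into the tar archive, using mode 0755 for all files.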
func addFileToTar(tw *tar.Writer, filename string, contents []byte) error {
tarHeader := &tar.Header{
Name: filename,
Size: int64(len(contents)),
Mode: 0o755, // TODO: shouldn't just set this for everything.
}
if err := tw.WriteHeader(tarHeader); err != nil {
return fmt.Errorf("failed to write tar header for %q: %w", filename, err)
}
if _, err := tw.Write(contents); err != nil {
return fmt.Errorf("failed to write file content for %q: %w", filename, err)
}
return nil
}
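// TemplatesContext is the data made available to the embedded file templates (the Dockerfile,
// inittab, vmstart, and so on) when they are rendered into the docker build context.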
type TemplatesContext struct {
User string
Entrypoint []string
Cmd []string
Env []string
RootDiskImage string
NeonvmDaemonImage string
SpecBuild string
SpecMerge string
InittabCommands []inittabCommand
AgettyTTY string
ShutdownHook string
}
type inittabCommand struct {
SysvInitAction string
CommandUser string
ShellEscapedCommand string
}
func main() {
flag.Parse()
var dstIm string
if *version {
fmt.Println(Version)
os.Exit(0)
}
if len(*daemonImageFlag) == 0 && len(NeonvmDaemonImage) == 0 {
log.Println("neonvm-daemon image not set, needs to be explicitly passed in, or compiled with -ldflags '-X main.NeonvmDaemonImage=...'")
flag.PrintDefaults()
os.Exit(1)
}
if *targetArch == "" {
log.Println("Target architecture not set, see usage info:")
flag.PrintDefaults()
os.Exit(1)
}
if *targetArch != targetArchLinuxAmd64 && *targetArch != targetArchLinuxArm64 {
log.Printf("Unsupported target architecture: %q", *targetArch)
flag.PrintDefaults()
os.Exit(1)
}
neonvmDaemonImage := NeonvmDaemonImage
if len(*daemonImageFlag) != 0 {
neonvmDaemonImage = *daemonImageFlag
}
if len(*srcImage) == 0 {
log.Println("-src not set, see usage info:")
flag.PrintDefaults()
os.Exit(1)
}
if len(*dstImage) == 0 {
dstIm = fmt.Sprintf("vm-%s", *srcImage)
log.Printf("-dst not set, using %s\n", dstIm)
} else {
dstIm = *dstImage
}
var spec *imageSpec
if *specFile != "" {
var err error
spec, err = readImageSpec(*specFile)
if err != nil {
log.Fatalln(err)
}
}
log.Println("Load docker credentials")
dockerConfig, err := cliconfig.Load("" /* auto-detect right directory */)
if err != nil {
log.Fatalln(err)
}
credentials, err := dockerConfig.GetAllCredentials()
if err != nil {
log.Fatalln(err)
}
authConfigs := make(map[string]registry.AuthConfig)
for key, value := range credentials {
log.Printf("Found docker credentials for %s", key)
authConfigs[key] = registry.AuthConfig(value)
}
ctx := context.Background()
log.Println("Setup docker connection")
cli, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation())
if err != nil {
log.Fatalln(err)
}
defer cli.Close()
hostContainsSrcImage := false
if !*forcePull {
hostImages, err := cli.ImageList(ctx, types.ImageListOptions{})
if err != nil {
log.Fatalln(err) //nolint:gocritic // linter complains that Fatalln circumvents deferred cli.Close(). Too much work to fix in #721, leaving for later.
}
for _, img := range hostImages {
for _, name := range img.RepoTags {
if name == *srcImage {
hostContainsSrcImage = true
break
}
}
if hostContainsSrcImage {
break
}
}
}
if !hostContainsSrcImage {
// pull source image
// use a closure so deferred close is closer
err := func() error {
named, err := reference.ParseNormalizedNamed(*srcImage)
if err != nil {
return err
}
reg := reference.Domain(named)
imagePullOptions := types.ImagePullOptions{}
if authConfig, ok := authConfigs[reg]; ok {
encoded, err := registry.EncodeAuthConfig(authConfig)
if err != nil {
return err
}
imagePullOptions.RegistryAuth = encoded
} else {
// Special case handling of docker.io weirdness.
// ref https://github.com/moby/moby/blob/e7347f8a8c2fd3d2abd34b638d6fc8c18b0278d1/registry/config.go#L26-L49
// (and other handling around index.docker.io in that file...)
//
// See also e.g. https://github.com/containrrr/watchtower/issues/1176
legacyConfig, hasLegacyDockerConfig := authConfigs["https://index.docker.io/v1/"]
if hasLegacyDockerConfig && (reg == "docker.io" || reg == "registry-1.docker.io") {
encoded, err := registry.EncodeAuthConfig(legacyConfig)
if err != nil {
return err
}
imagePullOptions.RegistryAuth = encoded
} else {
log.Printf("No docker credentials found for %s", reg)
}
}
log.Printf("Pull source docker image: %s", *srcImage)
pull, err := cli.ImagePull(ctx, *srcImage, imagePullOptions)
if err != nil {
return err
}
defer pull.Close()
// do quiet pull - discard output
_, err = io.Copy(io.Discard, pull)
return err
}()
if err != nil {
log.Fatalln(err)
}
}
log.Printf("Build docker image for virtual machine (disk size %s): %s\n", *size, dstIm)
imageSpec, _, err := cli.ImageInspectWithRaw(ctx, *srcImage)
if err != nil {
log.Fatalln(err)
}
// Shell-escape all the command pieces, twice. We need to do it twice because we're generating
// a shell script that appends these to a second shell script.
for i := range imageSpec.Config.Entrypoint {
imageSpec.Config.Entrypoint[i] = shellescape.Quote(shellescape.Quote(imageSpec.Config.Entrypoint[i]))
}
for i := range imageSpec.Config.Cmd {
imageSpec.Config.Cmd[i] = shellescape.Quote(shellescape.Quote(imageSpec.Config.Cmd[i]))
}
tmplArgs := TemplatesContext{
User: "root", // overridden below, if imageSpec.Config.User != ""
Entrypoint: imageSpec.Config.Entrypoint,
Cmd: imageSpec.Config.Cmd,
Env: imageSpec.Config.Env,
RootDiskImage: *srcImage,
NeonvmDaemonImage: neonvmDaemonImage,
SpecBuild: "", // overridden below if spec != nil
SpecMerge: "", // overridden below if spec != nil
InittabCommands: nil, // overridden below if spec != nil
ShutdownHook: "", // overridden below if spec != nil
AgettyTTY: getAgettyTTY(*targetArch),
}
if len(imageSpec.Config.User) != 0 {
tmplArgs.User = imageSpec.Config.User
}
tarBuffer := new(bytes.Buffer)
tw := tar.NewWriter(tarBuffer)
defer tw.Close()
if spec != nil {
tmplArgs.SpecBuild = spec.Build
tmplArgs.SpecMerge = spec.Merge
tmplArgs.ShutdownHook = strings.ReplaceAll(spec.ShutdownHook, "\n", "\n\t")
for _, c := range spec.Commands {
// Allow core dumps for all inittab targets
c.Shell = fmt.Sprintf("ulimit -c unlimited; %s", c.Shell)
tmplArgs.InittabCommands = append(tmplArgs.InittabCommands, inittabCommand{
SysvInitAction: c.SysvInitAction,
CommandUser: c.User,
ShellEscapedCommand: shellescape.Quote(c.Shell),
})
}
for _, f := range spec.Files {
var contents []byte
switch {
case f.Content != nil:
contents = []byte(*f.Content)
case f.HostPath != nil:
// the 'host path' is relative to the directory that the spec file is in
path := filepath.Join(filepath.Dir(*specFile), *f.HostPath)
var err error
contents, err = os.ReadFile(path)
if err != nil {
err = fmt.Errorf("failed to read file %q: %w", path, err)
log.Fatalln(err)
}
}
if err := addFileToTar(tw, f.Filename, contents); err != nil {
log.Fatalln(err)
}
}
}
files := []struct {
filename string
tmpl string
}{
{"Dockerfile", dockerfileVmBuilder},
{"helper.move-bins.sh", scriptMoveBinsHelper},
{"vmstart", scriptVmStart},
{"vmshutdown", scriptVmShutdown},
{"inittab", scriptInitTab},
{"vmacpi", scriptVmAcpi},
{"vminit", scriptVmInit},
{"vector.yaml", configVector},
{"chrony.conf", configChrony},
{"sshd_config", configSshd},
{"udev-init.sh", scriptUdevInit},
{"resize-swap.sh", scriptResizeSwap},
{"set-disk-quota.sh", scriptSetDiskQuota},
}
for _, f := range files {
if err := AddTemplatedFileToTar(tw, tmplArgs, f.filename, f.tmpl); err != nil {
log.Fatalln(err)
}
}
buildArgs := make(map[string]*string)
buildArgs["DISK_SIZE"] = size
buildArgs["TARGET_ARCH"] = targetArch
opt := types.ImageBuildOptions{
AuthConfigs: authConfigs,
Tags: []string{dstIm},
BuildArgs: buildArgs,
SuppressOutput: *quiet,
NoCache: false,
Context: tarBuffer,
Dockerfile: "Dockerfile",
Remove: true,
ForceRemove: true,
Platform: *targetArch,
}
buildResp, err := cli.ImageBuild(ctx, tarBuffer, opt)
if err != nil {
log.Fatalln(err)
}
defer buildResp.Body.Close()
out := io.Writer(os.Stdout)
if *quiet {
out = io.Discard
}
err = jsonmessage.DisplayJSONMessagesStream(buildResp.Body, out, os.Stdout.Fd(), term.IsTerminal(int(os.Stdout.Fd())), nil)
if err != nil {
log.Fatalln(err)
}
if len(*outFile) != 0 {
log.Printf("Save disk image as %s", *outFile)
// create container from docker image we just built
containerResp, err := cli.ContainerCreate(ctx, &container.Config{
Image: dstIm,
Tty: false,
Entrypoint: imageSpec.Config.Entrypoint,
Cmd: imageSpec.Config.Cmd,
}, nil, nil, nil, "")
if err != nil {
log.Fatalln(err)
}
if len(containerResp.Warnings) > 0 {
log.Println(containerResp.Warnings)
}
// copy file from container as tar archive
fromContainer, _, err := cli.CopyFromContainer(ctx, containerResp.ID, "/disk.qcow2")
if err != nil {
log.Fatalln(err)
}
// untar file from tar archive
tarReader := tar.NewReader(fromContainer)
for {
header, err := tarReader.Next()
if errors.Is(err, io.EOF) {
break
} else if err != nil {
log.Fatalln(err)
}
if header.Name != "disk.qcow2" {
log.Printf("skip file %s", header.Name)
continue
}
path := filepath.Join(*outFile) //nolint:gocritic // FIXME: this is probably incorrect, intended to join with header.Name ?
info := header.FileInfo()
// Open and write to the file inside a closure, so we can defer close
err = func() error {
file, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, info.Mode())
if err != nil {
return err
}
defer file.Close()
_, err = io.Copy(file, tarReader)
return err
}()
if err != nil {
log.Fatalln(err)
}
}
// remove container
if err = cli.ContainerRemove(ctx, containerResp.ID, types.ContainerRemoveOptions{}); err != nil {
log.Println(err)
}
}
}
type imageSpec struct {
Commands []command `yaml:"commands"`
ShutdownHook string `yaml:"shutdownHook,omitempty"`
Build string `yaml:"build"`
Merge string `yaml:"merge"`
Files []file `yaml:"files"`
}
type command struct {
Name string `yaml:"name"`
User string `yaml:"user"`
SysvInitAction string `yaml:"sysvInitAction"`
Shell string `yaml:"shell"`
}
type file struct {
Filename string `yaml:"filename"`
HostPath *string `yaml:"hostPath,omitempty"`
Content *string `yaml:"content,omitempty"`
}
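// readImageSpec parses and validates the customization spec passed via --spec.
//
// A sketch of the accepted shape (the values are made up; only the field names come from the
// imageSpec, command, and file types above, and the meaning of 'build'/'merge' is defined by the
// embedded Dockerfile template, not shown here):
//
//	commands:
//	  - name: cron
//	    user: root
//	    sysvInitAction: respawn
//	    shell: '/usr/sbin/crond -f'
//	shutdownHook: |
//	  echo 'shutting down'
//	build: |
//	  # extra directives for the build stage
//	merge: |
//	  # extra directives for the final image
//	files:
//	  - filename: motd
//	    content: 'hello from the VM'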
func readImageSpec(path string) (*imageSpec, error) {
f, err := os.Open(path)
if err != nil {
return nil, fmt.Errorf("failed to open file at %q: %w", path, err)
}
defer f.Close()
var spec imageSpec
dec := yaml.NewDecoder(f)
dec.KnownFields(true) // disallow unknown fields
if err := dec.Decode(&spec); err != nil {
return nil, err
}
var errs []error
for i, c := range spec.Commands {
for _, e := range c.validate() {
errs = append(errs, fmt.Errorf("error in commands[%d]: %w", i, e))
}
}
for i, f := range spec.Files {
for _, e := range f.validate() {
errs = append(errs, fmt.Errorf("error in files[%d]: %w", i, e))
}
}
if err := errors.Join(errs...); err != nil {
return nil, fmt.Errorf("invalid image spec: %w", err)
}
return &spec, nil
}
func (c command) validate() []error {
checkNonempty := func(errs *[]error, field string, value string) {
if value == "" {
*errs = append(*errs, fmt.Errorf("command must have non-empty field '%s'", field))
}
}
var errs []error
checkNonempty(&errs, "name", c.Name)
checkNonempty(&errs, "user", c.User)
checkNonempty(&errs, "sysvInitAction", c.SysvInitAction)
checkNonempty(&errs, "shell", c.Shell)
return errs
}
func (f file) validate() []error {
var errs []error
if f.Filename == "" {
errs = append(errs, errors.New("file must have non-empty field 'filename'"))
}
if f.HostPath == nil && f.Content == nil {
errs = append(errs, errors.New("file missing either 'hostPath' or 'content'"))
} else if f.HostPath != nil && f.Content != nil {
errs = append(errs, errors.New("file must have only one of 'hostPath' or 'content'"))
}
return errs
}
// getAgettyTTY returns the tty device name for agetty based on the target architecture.
func getAgettyTTY(targetArch string) string {
switch targetArch {
case targetArchLinuxAmd64:
return "ttyS0"
case targetArchLinuxArm64:
return "ttyAMA0"
default:
log.Fatalf("Unsupported target architecture: %q", targetArch)
return ""
}
}