package main
import (
"context"
"fmt"
"log"
"os/signal"
"syscall"
"time"
"github.com/tychoish/fun/srv"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"go.uber.org/zap/zapio"
"k8s.io/klog/v2"
"k8s.io/kubernetes/cmd/kube-scheduler/app"
"github.com/neondatabase/autoscaling/pkg/plugin"
"github.com/neondatabase/autoscaling/pkg/util"
)
// main builds the process logger, logs the build info, and hands control to runProgram.
// All of the juicy bits are defined in pkg/plugin/.
func main() {
	zapConfig := zap.NewProductionConfig()
	// The production config enables sampling by default; switch it off.
	zapConfig.Sampling = nil

	logger := zap.Must(zapConfig.Build()).Named("autoscale-scheduler")
	logger.Info("", zap.Any("buildInfo", util.GetBuildInfo()))

	err := runProgram(logger)
	if err != nil {
		log.Fatal(err)
	}
}
// runProgram is the "real" main, but returning an error means that
// the shutdown handling code doesn't have to call os.Exit, even indirectly.
//
// It reads the plugin config from plugin.DefaultConfigPath, registers the pprof HTTP service with
// the orchestrator, redirects klog into logger, and then runs the scheduler command (with the
// autoscale plugin registered) until that command returns.
//
// The returned error is the first failure encountered. If waiting for the orchestrator's services
// fails as well, that failure is attached to the original error instead of replacing it.
func runProgram(logger *zap.Logger) (err error) {
	conf, err := plugin.ReadConfig(plugin.DefaultConfigPath)
	if err != nil {
		return fmt.Errorf("Error reading config at %q: %w", plugin.DefaultConfigPath, err)
	}

	// this: listens for sigterm, when we catch that signal, the
	// context gets canceled, a go routine waits for half a second, and
	// then closes the signal channel, which we block on in a
	// defer. because defers execute in LIFO order, this just
	// pauses for a *very* short period of time before exiting.
	//
	// eventually, the constructed application will track its
	// services and be able to more coherently wait for shutdown
	// without needing a sleep.
	ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGTERM)
	defer cancel()
	ctx = srv.SetShutdownSignal(ctx)
	ctx = srv.WithOrchestrator(ctx)
	ctx = srv.SetBaseContext(ctx)
	orca := srv.GetOrchestrator(ctx)
	defer func() {
		// This runs before the deferred cancel() above (LIFO), so when we get here because of
		// an error rather than a signal, ctx is still live. Cancel it explicitly so that
		// anything tied to ctx is told to stop before we wait on it; cancel is idempotent, so
		// the later deferred call is harmless.
		cancel()

		// Never let a clean shutdown hide the error that made us return in the first place.
		waitErr := orca.Service().Wait()
		switch {
		case waitErr == nil:
			// keep whatever err already holds
		case err == nil:
			err = waitErr
		default:
			err = fmt.Errorf("%w (additionally, waiting for services to stop failed: %v)", err, waitErr)
		}
	}()

	if err := orca.Add(srv.HTTP("scheduler-pprof", time.Second, util.MakePPROF("0.0.0.0:7777"))); err != nil {
		return err
	}

	// The normal scheduler outputs to klog, and there isn't *really* a way to stop that. So to make
	// everything fit nicely, we'll redirect it to zap as well.
	redirectKlog(logger.Named("klog"))

	constructor := plugin.NewAutoscaleEnforcerPlugin(ctx, logger, conf)
	command := app.NewSchedulerCommand(app.WithPlugin(plugin.Name, constructor))
	// Don't output the full usage whenever any error occurs (otherwise, startup errors get drowned
	// out by many pages of scheduler command flags)
	command.SilenceUsage = true

	return command.ExecuteContext(ctx)
}
// redirectKlog sends klog's output for the "info", "warning", "error" and "fatal" severities to
// the given zap logger, each at the corresponding zap level.
func redirectKlog(to *zap.Logger) {
	type severityMapping struct {
		klogLevel string
		zapLevel  zapcore.Level
	}

	mappings := [...]severityMapping{
		{klogLevel: "info", zapLevel: zapcore.InfoLevel},
		{klogLevel: "warning", zapLevel: zapcore.WarnLevel},
		{klogLevel: "error", zapLevel: zapcore.ErrorLevel},
		{klogLevel: "fatal", zapLevel: zapcore.FatalLevel},
	}

	for i := range mappings {
		writer := &zapio.Writer{
			Log:   to,
			Level: mappings[i].zapLevel,
		}
		klog.SetOutputBySeverity(mappings[i].klogLevel, writer)
	}

	// By default, we'll get LogToStderr(true), which completely bypasses any redirecting with
	// SetOutput or SetOutputBySeverity. So... we'd like to avoid that, which thankfully we can do.
	klog.LogToStderr(false)
}
package main
import (
"context"
"os/signal"
"syscall"
"time"
"github.com/tychoish/fun/srv"
"go.uber.org/zap"
"k8s.io/client-go/kubernetes"
scheme "k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
vmclient "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
"github.com/neondatabase/autoscaling/pkg/agent"
"github.com/neondatabase/autoscaling/pkg/util"
)
// main is the entrypoint of the autoscaler-agent.
//
// It builds the logger, loads the environment args and config, constructs the Kubernetes and
// NeonVM clients from the in-cluster config, registers the pprof HTTP service with the
// orchestrator, and then runs agent.MainRunner until it returns. Any setup or runtime failure is
// reported through logger.Panic.
func main() {
	logConfig := zap.NewProductionConfig()
	logConfig.Sampling = nil                // Disable sampling, which the production config enables by default.
	logConfig.Level.SetLevel(zap.InfoLevel) // Only "info" level and above (i.e. not debug logs)
	logger := zap.Must(logConfig.Build()).Named("autoscaler-agent")
	defer logger.Sync() //nolint:errcheck // what are we gonna do, log something about it?

	logger.Info("", zap.Any("buildInfo", util.GetBuildInfo()))

	envArgs, err := agent.ArgsFromEnv()
	if err != nil {
		logger.Panic("Failed to get args from environment", zap.Error(err))
	}
	logger.Info("Got environment args", zap.Any("args", envArgs))

	config, err := agent.ReadConfig(envArgs.ConfigPath)
	if err != nil {
		logger.Panic("Failed to read config", zap.Error(err))
	}
	logger.Info("Got config", zap.Any("config", config))

	kubeConfig, err := rest.InClusterConfig()
	if err != nil {
		logger.Panic("Failed to get in-cluster K8s config", zap.Error(err))
	}
	kubeClient, err := kubernetes.NewForConfig(kubeConfig)
	if err != nil {
		logger.Panic("Failed to make K8S client", zap.Error(err))
	}
	if err = vmapi.AddToScheme(scheme.Scheme); err != nil {
		logger.Panic("Failed to add NeonVM scheme", zap.Error(err))
	}
	vmClient, err := vmclient.NewForConfig(kubeConfig)
	if err != nil {
		logger.Panic("Failed to make VM client", zap.Error(err))
	}

	runner := agent.MainRunner{
		EnvArgs:    envArgs,
		Config:     config,
		KubeClient: kubeClient,
		VMClient:   vmClient,
	}

	ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGTERM)
	defer cancel()
	ctx = srv.SetShutdownSignal(ctx)
	ctx = srv.SetBaseContext(ctx)
	ctx = srv.WithOrchestrator(ctx)

	// mainLoopOK records whether runner.Run returned without error. The deferred function below
	// also runs while unwinding from a logger.Panic, and must not claim success in that case.
	mainLoopOK := false
	defer func() {
		// This runs before the deferred cancel() above (LIFO). Cancel explicitly so that, when
		// we get here without a signal having arrived, anything tied to ctx is told to stop
		// before we wait on the orchestrator. cancel is idempotent.
		cancel()

		if err := srv.GetOrchestrator(ctx).Wait(); err != nil {
			logger.Panic("Failed to shut down orchestrator", zap.Error(err))
		}

		if mainLoopOK {
			logger.Info("Main loop returned without issue. Exiting.")
		}
	}()

	if err := srv.GetOrchestrator(ctx).Add(srv.HTTP("agent-pprof", time.Second, util.MakePPROF("0.0.0.0:7777"))); err != nil {
		logger.Panic("Failed to add pprof service", zap.Error(err))
	}

	if err = runner.Run(logger, ctx); err != nil {
		logger.Panic("Main loop failed", zap.Error(err))
	}
	mainLoopOK = true
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package v1 contains API Schema definitions for the vm v1 API group
// +kubebuilder:object:generate=true
// +groupName=vm.neon.tech
package v1
import (
"sigs.k8s.io/controller-runtime/pkg/scheme"
"k8s.io/apimachinery/pkg/runtime/schema"
)
// Package-level registration helpers for the vm.neon.tech/v1 API group.
var (
	// SchemeGroupVersion is group version used to register these objects
	// (group "vm.neon.tech", version "v1").
	SchemeGroupVersion = schema.GroupVersion{Group: "vm.neon.tech", Version: "v1"}
	// SchemeBuilder is used to add go types to the GroupVersionKind scheme.
	// The init functions in this package register their types with it.
	SchemeBuilder = &scheme.Builder{GroupVersion: SchemeGroupVersion}
	// AddToScheme adds the types in this group-version to the given scheme.
	AddToScheme = SchemeBuilder.AddToScheme
)
// Resource qualifies an unqualified resource name with this package's API group and returns the
// resulting GroupResource.
func Resource(resource string) schema.GroupResource {
	qualified := SchemeGroupVersion.WithResource(resource)
	return qualified.GroupResource()
}
package v1
import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// IPPoolSpec defines the desired state of IPPool
type IPPoolSpec struct {
	// Range is a RFC 4632/4291-style string that represents an IP address and prefix length in CIDR notation
	Range string `json:"range"`
	// Allocations is the set of allocated IPs for the given range. Its indices are a direct mapping to the
	// IP with the same index/offset for the pool's range.
	Allocations map[string]IPAllocation `json:"allocations"`
}
// IPAllocation represents metadata about the pod/container owner of a specific IP.
// It is copied from the Whereabouts CNI, because their allocation functions are used.
type IPAllocation struct {
	// ContainerID is serialized under the JSON key "id".
	ContainerID string `json:"id"`
	// PodRef is optional in the serialized form (omitted when empty).
	PodRef string `json:"podref,omitempty"`
}
//+genclient
//+kubebuilder:object:root=true
//+kubebuilder:resource:singular=ippool

// IPPool is the Schema for the ippools API.
// It carries only a Spec; no status field is defined for it.
type IPPool struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ObjectMeta `json:"metadata,omitempty"`
	Spec IPPoolSpec `json:"spec,omitempty"`
}
// +kubebuilder:object:root=true

// IPPoolList contains a list of IPPool.
type IPPoolList struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ListMeta `json:"metadata,omitempty"`
	// Items holds the IPPool objects in the list.
	Items []IPPool `json:"items"`
}
// init registers the IPPool and IPPoolList types with the package's SchemeBuilder.
func init() {
	SchemeBuilder.Register(&IPPool{}, &IPPoolList{}) //nolint:exhaustruct // just being used to provide the types
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
"encoding/json"
"fmt"
"slices"
"time"
"go.uber.org/zap/zapcore"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// Label and annotation keys attached to NeonVM runner Pods.
const (
	// VirtualMachineNameLabel is the label assigned to each NeonVM Pod, providing the name of the
	// VirtualMachine object for the VM running in it
	//
	// This label can be used both to find which VM is running in a Pod (by getting the value of the
	// label) or to find which Pod a VM is running in (by searching for Pods with the label equal to
	// the VM's name).
	VirtualMachineNameLabel string = "vm.neon.tech/name"
	// RunnerPodVersionLabel is the label that determines the version of runner pod. May be missing
	// on older runners.
	RunnerPodVersionLabel string = "vm.neon.tech/runner-version"
	// VirtualMachineUsageAnnotation is the annotation added to each runner Pod, mirroring
	// information about the current usage of the VM running in the pod.
	//
	// The value of this annotation is always a JSON-encoded VirtualMachineUsage object.
	VirtualMachineUsageAnnotation string = "vm.neon.tech/usage"
	// VirtualMachineResourcesAnnotation is the annotation added to each runner Pod, mirroring
	// information about the resource allocations of the VM running in the pod.
	//
	// The value of this annotation is always a JSON-encoded VirtualMachineResources object.
	VirtualMachineResourcesAnnotation string = "vm.neon.tech/resources"
)
// VirtualMachineUsage provides information about a VM's current usage. This is the type of the
// JSON-encoded data in the VirtualMachineUsageAnnotation attached to each runner pod.
//
// Both fields are pointers, so either may be nil.
type VirtualMachineUsage struct {
	CPU *resource.Quantity `json:"cpu"`
	Memory *resource.Quantity `json:"memory"`
}
// VirtualMachineResources provides information about a VM's resource allocations. This is the
// type of the JSON-encoded data in the VirtualMachineResourcesAnnotation, and the return type of
// VirtualMachineSpec.Resources.
type VirtualMachineResources struct {
	CPUs CPUs `json:"cpus"`
	MemorySlots MemorySlots `json:"memorySlots"`
	MemorySlotSize resource.Quantity `json:"memorySlotSize"`
}
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.

// VirtualMachineSpec defines the desired state of VirtualMachine
type VirtualMachineSpec struct {
	// +kubebuilder:validation:Minimum=1
	// +kubebuilder:validation:Maximum=65535
	// +kubebuilder:default:=20183
	// +optional
	QMP int32 `json:"qmp,omitempty"`
	// +kubebuilder:validation:Minimum=1
	// +kubebuilder:validation:Maximum=65535
	// +kubebuilder:default:=20184
	// +optional
	QMPManual int32 `json:"qmpManual,omitempty"`
	// +kubebuilder:validation:Minimum=1
	// +kubebuilder:validation:Maximum=65535
	// +kubebuilder:default:=25183
	// +optional
	RunnerPort int32 `json:"runnerPort,omitempty"`
	// +kubebuilder:default:=5
	// +optional
	TerminationGracePeriodSeconds *int64 `json:"terminationGracePeriodSeconds"`
	NodeSelector map[string]string `json:"nodeSelector,omitempty"`
	Affinity *corev1.Affinity `json:"affinity,omitempty"`
	Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
	SchedulerName string `json:"schedulerName,omitempty"`
	ServiceAccountName string `json:"serviceAccountName,omitempty"`
	PodResources corev1.ResourceRequirements `json:"podResources,omitempty"`
	// +kubebuilder:default:=Always
	// +optional
	RestartPolicy RestartPolicy `json:"restartPolicy"`
	ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets,omitempty"`
	Guest Guest `json:"guest"`
	// Running init containers is costly, so InitScript field should be preferred over ExtraInitContainers
	ExtraInitContainers []corev1.Container `json:"extraInitContainers,omitempty"`
	// InitScript will be executed in the main container before VM is started.
	// +optional
	InitScript string `json:"initScript,omitempty"`
	// List of disks that can be mounted by the virtual machine.
	// +optional
	Disks []Disk `json:"disks,omitempty"`
	// Extra network interface attached to network provided by Multus CNI.
	// +optional
	ExtraNetwork *ExtraNetwork `json:"extraNetwork,omitempty"`
	// +optional
	ServiceLinks *bool `json:"service_links,omitempty"`
	// Use KVM acceleration
	// +kubebuilder:default:=true
	// +optional
	EnableAcceleration *bool `json:"enableAcceleration,omitempty"`
	// Override for normal neonvm-runner image
	// +optional
	RunnerImage *string `json:"runnerImage,omitempty"`
	// Rely on neonvm-daemon inside the VM for fractional CPU limiting
	// +kubebuilder:default:=false
	// +optional
	DelegatedCPULimits *bool `json:"delegatedCPULimits,omitempty"`
	// Enable SSH on the VM. It works only if the VM image is built using VM Builder that
	// has SSH support (TODO: mention VM Builder version).
	// +kubebuilder:default:=true
	// +optional
	EnableSSH *bool `json:"enableSSH,omitempty"`
	// TargetRevision is the identifier set by external party to track when changes to the spec
	// propagate to the VM.
	//
	// If a certain value is written into Spec.TargetRevision together with the changes, and
	// the same value is observed in Status.CurrentRevision, it means that the changes were
	// propagated to the VM.
	// +optional
	TargetRevision *RevisionWithTime `json:"targetRevision,omitempty"`
}
// Resources collects the CPU, memory slot and memory slot size settings from spec.Guest into a
// VirtualMachineResources value.
func (spec *VirtualMachineSpec) Resources() VirtualMachineResources {
	guest := &spec.Guest

	var res VirtualMachineResources
	res.CPUs = guest.CPUs
	res.MemorySlots = guest.MemorySlots
	res.MemorySlotSize = guest.MemorySlotSize
	return res
}
// RestartPolicy is the type of VirtualMachineSpec.RestartPolicy. Valid values are restricted by
// the enum marker below to the three constants that follow.
// +kubebuilder:validation:Enum=Always;OnFailure;Never
type RestartPolicy string

// Allowed RestartPolicy values.
const (
	RestartPolicyAlways RestartPolicy = "Always"
	RestartPolicyOnFailure RestartPolicy = "OnFailure"
	RestartPolicyNever RestartPolicy = "Never"
)
// Guest is the type of VirtualMachineSpec.Guest: the kernel, CPU, memory, root disk, command,
// environment, port and extra settings for the VM.
type Guest struct {
	// +optional
	KernelImage *string `json:"kernelImage,omitempty"`
	// +optional
	AppendKernelCmdline *string `json:"appendKernelCmdline,omitempty"`
	// +optional
	CPUs CPUs `json:"cpus"`
	// +optional
	// +kubebuilder:default:="1Gi"
	MemorySlotSize resource.Quantity `json:"memorySlotSize"`
	// +optional
	MemorySlots MemorySlots `json:"memorySlots"`
	// +optional
	MemoryProvider *MemoryProvider `json:"memoryProvider,omitempty"`
	// +optional
	RootDisk RootDisk `json:"rootDisk"`
	// Docker image Entrypoint array replacement.
	// +optional
	Command []string `json:"command,omitempty"`
	// Arguments to the entrypoint.
	// The docker image's cmd is used if this is not provided.
	// +optional
	Args []string `json:"args,omitempty"`
	// List of environment variables to set in the vmstart process.
	// +optional
	Env []EnvVar `json:"env,omitempty" patchStrategy:"merge" patchMergeKey:"name"`
	// List of ports to expose from the container.
	// Cannot be updated.
	// +optional
	Ports []Port `json:"ports,omitempty"`
	// Additional settings for the VM.
	// Cannot be updated.
	// +optional
	Settings *GuestSettings `json:"settings,omitempty"`
}
// virtioMemBlockSizeBytes is the block size that VirtioMem memory slot sizes must be a multiple of.
const virtioMemBlockSizeBytes = 8 * 1024 * 1024 // 8 MiB

// ValidateForMemoryProvider returns an error iff the guest memory settings are invalid for the
// MemoryProvider.
//
// This is used in two places. First, to validate VirtualMachine object creation. Second, to handle
// the defaulting behavior for VirtualMachines that would be switching from DIMMSlots to VirtioMem
// on restart. We place more restrictions on VirtioMem because we use 8MiB block sizes, so changing
// to a new default can only happen if the memory slot size is a multiple of 8MiB.
func (g Guest) ValidateForMemoryProvider(p MemoryProvider) error {
	// Only VirtioMem places a constraint on the slot size.
	if p != MemoryProviderVirtioMem {
		return nil
	}

	slotBytes := g.MemorySlotSize.Value()
	if remainder := slotBytes % virtioMemBlockSizeBytes; remainder != 0 {
		return fmt.Errorf("memorySlotSize invalid for memoryProvider VirtioMem: must be a multiple of 8Mi")
	}
	return nil
}
// Flag is a bitmask of flags. The meaning is up to the user.
//
// Used in Revision below.
type Flag uint64

// Set turns on every bit of flag in f.
func (f *Flag) Set(flag Flag) {
	*f = *f | flag
}

// Clear turns off every bit of flag in f.
func (f *Flag) Clear(flag Flag) {
	*f = *f &^ flag
}

// Has reports whether f and flag have at least one bit in common.
func (f *Flag) Has(flag Flag) bool {
	overlap := *f & flag
	return overlap != 0
}
// Revision is an identifier, which can be assigned to a specific configuration of a VM.
// Later it can be used to track the application of the configuration.
type Revision struct {
	// Value is the revision number; Min compares revisions by this field.
	Value int64 `json:"value"`
	// Flags is a bitmask whose meaning is up to the user (see Flag).
	Flags Flag `json:"flags"`
}

// ZeroRevision is the default value when revisions updates are disabled.
var ZeroRevision = Revision{Value: 0, Flags: 0}
// Min returns whichever of r and other has the smaller Value. When the Values are equal, other
// is returned.
func (r Revision) Min(other Revision) Revision {
	if other.Value <= r.Value {
		return other
	}
	return r
}
// WithTime pairs r with the given time, returning a RevisionWithTime whose UpdatedAt is t.
func (r Revision) WithTime(t time.Time) RevisionWithTime {
	var stamped RevisionWithTime
	stamped.Revision = r
	stamped.UpdatedAt = metav1.NewTime(t)
	return stamped
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that Revision can be used with zap.Object.
// It adds the "value" and "flags" fields and always returns nil.
func (r *Revision) MarshalLogObject(enc zapcore.ObjectEncoder) error {
	value, flags := r.Value, uint64(r.Flags)
	enc.AddInt64("value", value)
	enc.AddUint64("flags", flags)
	return nil
}
// RevisionWithTime contains a Revision and the time it was last updated.
type RevisionWithTime struct {
	// Revision is embedded, but serialized under its own "revision" JSON key.
	Revision `json:"revision"`
	// UpdatedAt is the time associated with the revision (set by Revision.WithTime).
	UpdatedAt metav1.Time `json:"updatedAt"`
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that RevisionWithTime can be used with zap.Object.
// It adds "updatedAt" and then the fields of the embedded Revision.
func (r *RevisionWithTime) MarshalLogObject(enc zapcore.ObjectEncoder) error {
	enc.AddTime("updatedAt", r.UpdatedAt.Time)

	embedded := &r.Revision
	return embedded.MarshalLogObject(enc)
}
// GuestSettings holds the additional, optional settings referenced by Guest.Settings.
type GuestSettings struct {
	// Individual lines to add to a sysctl.conf file. See sysctl.conf(5) for more
	// +optional
	Sysctl []string `json:"sysctl,omitempty"`
	// Swap adds a swap disk with the provided size.
	//
	// +optional
	Swap *resource.Quantity `json:"swap,omitempty"`
}
// CPUs holds the Min, Max and Use CPU amounts of a guest, each as a MilliCPU.
//
// On creation, the validating webhook requires Min <= Use <= Max.
type CPUs struct {
	Min MilliCPU `json:"min"`
	Max MilliCPU `json:"max"`
	Use MilliCPU `json:"use"`
}
// MilliCPU is a special type to represent vCPUs * 1000
// e.g. 2 vCPU is 2000, 0.25 is 250
//
// +kubebuilder:validation:XIntOrString
// +kubebuilder:validation:Pattern=^[0-9]+((\.[0-9]*)?|m)
type MilliCPU uint32 // note: pattern is more restrictive than resource.Quantity, because we're just using it for CPU

// RoundedUp returns the smallest integer number of CPUs greater than or equal to the effective
// value of m.
func (m MilliCPU) RoundedUp() uint32 {
	whole, fraction := uint32(m)/1000, uint32(m)%1000
	if fraction == 0 {
		return whole
	}
	// Any leftover fraction of a CPU counts as one more whole CPU.
	return whole + 1
}
// MilliCPUFromResourceQuantity converts resource.Quantity into MilliCPU.
//
// The quantity's milli-value is converted to MilliCPU (a uint32) without any range check.
func MilliCPUFromResourceQuantity(r resource.Quantity) MilliCPU {
	milli := r.MilliValue()
	return MilliCPU(milli)
}
// ToResourceQuantity converts a MilliCPU to resource.Quantity
// this is useful for formatting/serialization
func (m MilliCPU) ToResourceQuantity() *resource.Quantity {
	milli := int64(m)
	return resource.NewMilliQuantity(milli, resource.BinarySI)
}
// AsFloat64 converts the MilliCPU value into a float64 of CPU
//
// This should be preferred over calling m.ToResourceQuantity().AsApproximateFloat64(), because
// going through the resource.Quantity can produce less accurate floats.
func (m MilliCPU) AsFloat64() float64 {
	const milliPerCPU = 1000
	asFloat := float64(m)
	return asFloat / milliPerCPU
}
// UnmarshalJSON is used to parse scheduler config and communication between components.
// We use resource.Quantity as the underlying transport format for MilliCPU.
//
// It returns an error if data cannot be decoded as a resource.Quantity, or if the decoded
// quantity's milli-value is negative or too large for MilliCPU (a uint32). Without that check
// the conversion would silently wrap around to an unrelated value. On error, *m is left
// unchanged.
func (m *MilliCPU) UnmarshalJSON(data []byte) error {
	var quantity resource.Quantity
	if err := json.Unmarshal(data, &quantity); err != nil {
		return err
	}

	// Largest milli-value that fits in MilliCPU's underlying uint32.
	const maxMilliCPU = int64(^uint32(0))
	if milli := quantity.MilliValue(); milli < 0 || milli > maxMilliCPU {
		return fmt.Errorf("CPU quantity %s is out of range for MilliCPU", quantity.String())
	}

	*m = MilliCPUFromResourceQuantity(quantity)
	return nil
}
// MarshalJSON encodes m as a plain JSON integer number of CPUs when m is a whole number of CPUs,
// and as the JSON form of the equivalent resource.Quantity otherwise.
func (m MilliCPU) MarshalJSON() ([]byte, error) {
	if m%1000 != 0 {
		return json.Marshal(m.ToResourceQuantity())
	}

	// Marshal as an integer if we can, for backwards-compatibility with components that wouldn't be
	// expecting a string here.
	wholeCPUs := uint32(m / 1000)
	return json.Marshal(wholeCPUs)
}
// Format implements fmt.Formatter. With the verb 'v' and the '#' flag it writes the raw
// milli-CPU integer; for every other verb/flag combination it writes the value in CPUs as a
// float.
func (m MilliCPU) Format(state fmt.State, verb rune) {
	var rendered string
	if verb == 'v' && state.Flag('#') {
		rendered = fmt.Sprintf("%v", uint32(m))
	} else {
		rendered = fmt.Sprintf("%v", m.AsFloat64())
	}

	//nolint:errcheck // can't do anything about the write error
	state.Write([]byte(rendered))
}
// MemorySlots holds the Min, Max and Use memory slot counts of a guest. Each count is restricted
// by the markers below to the inclusive range 1 to 128.
//
// On creation, the validating webhook requires Min <= Use <= Max.
type MemorySlots struct {
	// +kubebuilder:validation:Minimum=1
	// +kubebuilder:validation:Maximum=128
	// +kubebuilder:validation:ExclusiveMaximum=false
	Min int32 `json:"min"`
	// +kubebuilder:validation:Minimum=1
	// +kubebuilder:validation:Maximum=128
	// +kubebuilder:validation:ExclusiveMaximum=false
	Max int32 `json:"max"`
	// +kubebuilder:validation:Minimum=1
	// +kubebuilder:validation:Maximum=128
	// +kubebuilder:validation:ExclusiveMaximum=false
	Use int32 `json:"use"`
}
// MemoryProvider names the mechanism used to provide memory to the guest. Valid values are
// restricted by the enum marker below to the two constants that follow.
// +kubebuilder:validation:Enum=DIMMSlots;VirtioMem
type MemoryProvider string

// Allowed MemoryProvider values.
const (
	MemoryProviderDIMMSlots MemoryProvider = "DIMMSlots"
	MemoryProviderVirtioMem MemoryProvider = "VirtioMem"
)

// FlagFunc is a parsing function to be used with flag.Func
//
// It stores value in *p when it names a known MemoryProvider; otherwise it returns an error and
// leaves *p untouched.
func (p *MemoryProvider) FlagFunc(value string) error {
	known := []string{
		string(MemoryProviderDIMMSlots),
		string(MemoryProviderVirtioMem),
	}

	if slices.Contains(known, value) {
		*p = MemoryProvider(value)
		return nil
	}
	return fmt.Errorf("Unknown MemoryProvider %q, must be one of %v", value, known)
}
// RootDisk is the type of Guest.RootDisk. Only Image is required; the remaining fields are
// marked optional.
type RootDisk struct {
	Image string `json:"image"`
	// +optional
	Size resource.Quantity `json:"size,omitempty"`
	// +optional
	// +kubebuilder:default:="IfNotPresent"
	ImagePullPolicy corev1.PullPolicy `json:"imagePullPolicy"`
	// +optional
	Execute []string `json:"execute,omitempty"`
}
// EnvVar is a single name/value environment variable; it is the element type of Guest.Env.
type EnvVar struct {
	// Name of the environment variable. Must be a C_IDENTIFIER.
	Name string `json:"name"`
	// +optional
	// +kubebuilder:default:=""
	Value string `json:"value,omitempty"`
}
// Port describes a single port to expose; it is the element type of Guest.Ports.
type Port struct {
	// If specified, this must be an IANA_SVC_NAME and unique within the pod. Each
	// named port in a pod must have a unique name. Name for the port that can be
	// referred to by services.
	Name string `json:"name,omitempty"`
	// Number of port to expose on the pod's IP address.
	// This must be a valid port number, 0 < x < 65536.
	// +kubebuilder:validation:Minimum=1
	// +kubebuilder:validation:Maximum=65535
	Port int `json:"port"`
	// Protocol for port. Must be UDP or TCP.
	// Defaults to "TCP".
	// +kubebuilder:default:=TCP
	Protocol Protocol `json:"protocol,omitempty"`
}
// Protocol is the network protocol of a Port.
type Protocol string

// Protocol values defined by this package.
const (
	// ProtocolTCP is the TCP protocol.
	ProtocolTCP Protocol = "TCP"
	// ProtocolUDP is the UDP protocol.
	ProtocolUDP Protocol = "UDP"
)
// Disk describes one disk that can be mounted by the virtual machine; it is the element type of
// VirtualMachineSpec.Disks.
type Disk struct {
	// Disk's name.
	// Must be a DNS_LABEL and unique within the virtual machine.
	Name string `json:"name"`
	// Mounted read-only if true, read-write otherwise (false or unspecified).
	// Defaults to false.
	// +optional
	// +kubebuilder:default:=false
	ReadOnly *bool `json:"readOnly,omitempty"`
	// Path within the virtual machine at which the disk should be mounted. Must
	// not contain ':'.
	MountPath string `json:"mountPath"`
	// DiskSource represents the location and type of the mounted disk.
	DiskSource `json:",inline"`
}
// DiskSource represents the location and type of a mounted disk. It is embedded inline in Disk,
// and every member is an optional pointer.
type DiskSource struct {
	// EmptyDisk represents a temporary empty qcow2 disk that shares a vm's lifetime.
	EmptyDisk *EmptyDiskSource `json:"emptyDisk,omitempty"`
	// ConfigMap represents a configMap that should populate this disk
	// +optional
	ConfigMap *corev1.ConfigMapVolumeSource `json:"configMap,omitempty"`
	// Secret represents a secret that should populate this disk.
	// +optional
	Secret *corev1.SecretVolumeSource `json:"secret,omitempty"`
	// Tmpfs represents a tmpfs.
	// +optional
	Tmpfs *TmpfsDiskSource `json:"tmpfs,omitempty"`
}
// EmptyDiskSource holds the settings for the disk referenced by DiskSource.EmptyDisk.
type EmptyDiskSource struct {
	// Size is the size of the disk.
	Size resource.Quantity `json:"size"`
	// Discard enables the "discard" mount option for the filesystem
	Discard bool `json:"discard,omitempty"`
	// EnableQuotas enables the "prjquota" mount option for the ext4 filesystem.
	// More info here:
	// https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/9/html/managing_file_systems/limiting-storage-space-usage-on-ext4-with-quotas_managing-file-systems
	EnableQuotas bool `json:"enableQuotas,omitempty"`
}
// TmpfsDiskSource holds the settings for the tmpfs referenced by DiskSource.Tmpfs.
type TmpfsDiskSource struct {
	// Size is the size of the tmpfs.
	Size resource.Quantity `json:"size"`
}
// ExtraNetwork configures the extra network interface referenced by
// VirtualMachineSpec.ExtraNetwork.
type ExtraNetwork struct {
	// Enable extra network interface
	// +kubebuilder:default:=false
	// +optional
	Enable bool `json:"enable"`
	// Interface name.
	// +kubebuilder:default:=net1
	// +optional
	Interface string `json:"interface"`
	// Multus Network name specified in network-attachments-definition.
	// +optional
	MultusNetwork string `json:"multusNetwork,omitempty"`
}
// VirtualMachineStatus defines the observed state of VirtualMachine
type VirtualMachineStatus struct {
	// Represents the observations of a VirtualMachine's current state.
	// VirtualMachine.status.conditions.type are: "Available", "Progressing", and "Degraded"
	// VirtualMachine.status.conditions.status are one of True, False, Unknown.
	// VirtualMachine.status.conditions.reason the value should be a CamelCase string and producers of specific
	// condition types may define expected values and meanings for this field, and whether the values
	// are considered a guaranteed API.
	// VirtualMachine.status.conditions.Message is a human readable message indicating details about the transition.
	// For further information see: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties
	Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"`
	// The phase of a VM is a simple, high-level summary of where the VM is in its lifecycle.
	// +optional
	Phase VmPhase `json:"phase,omitempty"`
	// Number of times the VM runner pod has been recreated.
	// Serialized as "restartCount", and always present in the output (no omitempty).
	// +optional
	RestartCount int32 `json:"restartCount"`
	// +optional
	PodName string `json:"podName,omitempty"`
	// +optional
	PodIP string `json:"podIP,omitempty"`
	// +optional
	ExtraNetIP string `json:"extraNetIP,omitempty"`
	// +optional
	ExtraNetMask string `json:"extraNetMask,omitempty"`
	// +optional
	Node string `json:"node,omitempty"`
	// +optional
	CPUs *MilliCPU `json:"cpus,omitempty"`
	// +optional
	MemorySize *resource.Quantity `json:"memorySize,omitempty"`
	// +optional
	MemoryProvider *MemoryProvider `json:"memoryProvider,omitempty"`
	// +optional
	SSHSecretName string `json:"sshSecretName,omitempty"`
	// CurrentRevision is updated with Spec.TargetRevision's value once
	// the changes are propagated to the VM.
	// +optional
	CurrentRevision *RevisionWithTime `json:"currentRevision,omitempty"`
}
// VmPhase is a simple, high-level summary of where a VM is in its lifecycle.
type VmPhase string

const (
	// VmPending means the VM has been accepted by the system, but vm-runner pod
	// has not been started. This includes time before being bound to a node, as well as time spent
	// pulling images onto the host.
	VmPending VmPhase = "Pending"
	// VmRunning means the vm-runner pod has been bound to a node and have been started.
	VmRunning VmPhase = "Running"
	// VmSucceeded means that all containers in the vm-runner pod have voluntarily terminated
	// with a container exit code of 0, and the system is not going to restart any of these containers.
	VmSucceeded VmPhase = "Succeeded"
	// VmFailed means that all containers in the vm-runner pod have terminated, and at least one container has
	// terminated in a failure (exited with a non-zero exit code or was stopped by the system).
	VmFailed VmPhase = "Failed"
	// VmPreMigrating means that VM in preparation to start migration
	VmPreMigrating VmPhase = "PreMigrating"
	// VmMigrating means that VM in migration to another node
	VmMigrating VmPhase = "Migrating"
	// VmScaling means that devices are plugging/unplugging to/from the VM
	VmScaling VmPhase = "Scaling"
)

// IsAlive returns whether the guest in the VM is expected to be running.
//
// That is the case for the Running, PreMigrating, Migrating and Scaling phases; every other
// value (including the empty string) is not alive.
func (p VmPhase) IsAlive() bool {
	return p == VmRunning ||
		p == VmPreMigrating ||
		p == VmMigrating ||
		p == VmScaling
}
//+genclient
//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:resource:singular=neonvm

// VirtualMachine is the Schema for the virtualmachines API
//
// NOTE: each printcolumn JSONPath below must match the JSON tag of the field it displays. The
// "Restarts" column reads `.status.restartCount`, the serialized name of
// VirtualMachineStatus.RestartCount.
// +kubebuilder:printcolumn:name="Cpus",type=string,JSONPath=`.status.cpus`
// +kubebuilder:printcolumn:name="Memory",type=string,JSONPath=`.status.memorySize`
// +kubebuilder:printcolumn:name="Pod",type=string,JSONPath=`.status.podName`
// +kubebuilder:printcolumn:name="ExtraIP",type=string,JSONPath=`.status.extraNetIP`
// +kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.phase`
// +kubebuilder:printcolumn:name="Restarts",type=string,JSONPath=`.status.restartCount`
// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
// +kubebuilder:printcolumn:name="Node",type=string,priority=1,JSONPath=`.status.node`
// +kubebuilder:printcolumn:name="Image",type=string,priority=1,JSONPath=`.spec.guest.rootDisk.image`
type VirtualMachine struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ObjectMeta `json:"metadata,omitempty"`
	Spec VirtualMachineSpec `json:"spec,omitempty"`
	Status VirtualMachineStatus `json:"status,omitempty"`
}
// Cleanup resets the pod name, pod IP, node, CPU, memory size and memory provider fields of
// vm.Status to their zero values. All other status fields are left as they are.
func (vm *VirtualMachine) Cleanup() {
	status := &vm.Status

	// Pointer-valued fields.
	status.CPUs = nil
	status.MemorySize = nil
	status.MemoryProvider = nil

	// String-valued fields.
	status.PodName = ""
	status.PodIP = ""
	status.Node = ""
}
// HasRestarted reports whether vm.Status.RestartCount is greater than zero.
func (vm *VirtualMachine) HasRestarted() bool {
	restarts := vm.Status.RestartCount
	return restarts >= 1
}
//+kubebuilder:object:root=true

// VirtualMachineList contains a list of VirtualMachine
type VirtualMachineList struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ListMeta `json:"metadata,omitempty"`
	// Items holds the VirtualMachine objects in the list.
	Items []VirtualMachine `json:"items"`
}
// init registers the VirtualMachine and VirtualMachineList types with the package's SchemeBuilder.
func init() {
	SchemeBuilder.Register(&VirtualMachine{}, &VirtualMachineList{}) //nolint:exhaustruct // just being used to provide the types
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
"errors"
"fmt"
"reflect"
"slices"
"sigs.k8s.io/controller-runtime/pkg/webhook"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
"k8s.io/apimachinery/pkg/runtime"
)
//+kubebuilder:webhook:path=/mutate-vm-neon-tech-v1-virtualmachine,mutating=true,failurePolicy=fail,sideEffects=None,groups=vm.neon.tech,resources=virtualmachines,verbs=create;update,versions=v1,name=mvirtualmachine.kb.io,admissionReviewVersions=v1

// Compile-time check that *VirtualMachine implements webhook.Defaulter.
var _ webhook.Defaulter = &VirtualMachine{}

// Default implements webhook.Defaulter
//
// The controller wraps this logic so it can inject extra control in the webhook.
// This method itself is intentionally a no-op.
func (r *VirtualMachine) Default() {
	// Nothing to do.
}
//+kubebuilder:webhook:path=/validate-vm-neon-tech-v1-virtualmachine,mutating=false,failurePolicy=fail,sideEffects=None,groups=vm.neon.tech,resources=virtualmachines,verbs=create;update,versions=v1,name=vvirtualmachine.kb.io,admissionReviewVersions=v1

// Compile-time check that *VirtualMachine implements webhook.Validator.
var _ webhook.Validator = &VirtualMachine{}

// ValidateCreate implements webhook.Validator
//
// The controller wraps this logic so it can inject extra control.
//
// Checks run in a fixed order and the first failure is returned: CPU bounds, memory slot size
// versus memory provider, memory slot bounds, disk names, and finally port names. No warnings
// are ever produced.
func (r *VirtualMachine) ValidateCreate() (admission.Warnings, error) {
	guest := &r.Spec.Guest

	// .spec.guest.cpus.use must lie within [.min, .max]
	cpus := guest.CPUs
	switch {
	case cpus.Use < cpus.Min:
		return nil, fmt.Errorf(".spec.guest.cpus.use (%v) should be greater than or equal to the .spec.guest.cpus.min (%v)",
			cpus.Use,
			cpus.Min)
	case cpus.Use > cpus.Max:
		return nil, fmt.Errorf(".spec.guest.cpus.use (%v) should be less than or equal to the .spec.guest.cpus.max (%v)",
			cpus.Use,
			cpus.Max)
	}

	// .spec.guest.memorySlotSize must be valid for .spec.guest.memoryProvider, when one is set
	if provider := guest.MemoryProvider; provider != nil {
		if err := guest.ValidateForMemoryProvider(*provider); err != nil {
			return nil, fmt.Errorf(".spec.guest: %w", err)
		}
	}

	// .spec.guest.memorySlots.use must lie within [.min, .max]
	slots := guest.MemorySlots
	switch {
	case slots.Use < slots.Min:
		return nil, fmt.Errorf(".spec.guest.memorySlots.use (%d) should be greater than or equal to the .spec.guest.memorySlots.min (%d)",
			slots.Use,
			slots.Min)
	case slots.Use > slots.Max:
		return nil, fmt.Errorf(".spec.guest.memorySlots.use (%d) should be less than or equal to the .spec.guest.memorySlots.max (%d)",
			slots.Use,
			slots.Max)
	}

	// .spec.disks[].name must be neither reserved nor longer than 32 bytes
	reservedDiskNames := []string{
		"virtualmachineimages",
		"rootdisk",
		"runtime",
		"swapdisk",
		"sysfscgroup",
		"containerdsock",
		"ssh-privatekey",
		"ssh-publickey",
		"ssh-authorized-keys",
	}
	for i := range r.Spec.Disks {
		name := r.Spec.Disks[i].Name
		if slices.Contains(reservedDiskNames, name) {
			return nil, fmt.Errorf("'%s' is reserved for .spec.disks[].name", name)
		}
		if len(name) > 32 {
			return nil, fmt.Errorf("disk name '%s' too long, should be less than or equal to 32", name)
		}
	}

	// .spec.guest.ports[].name must not be the reserved name "qmp"
	for i := range guest.Ports {
		if guest.Ports[i].Name == "qmp" {
			return nil, errors.New("'qmp' is reserved name for .spec.guest.ports[].name")
		}
	}

	return nil, nil
}
// ValidateUpdate implements webhook.Validator.
//
// old is the previous version of the object and must be a non-nil *VirtualMachine; any other
// value is rejected with an error rather than being dereferenced.
//
// The update is refused when any field listed in immutableFields differs between old and the
// receiver (compared with reflect.DeepEqual), or when .spec.guest.cpus.use or
// .spec.guest.memorySlots.use falls outside its [min, max] range. The first violation found
// is returned; no warnings are ever produced.
//
// The controller wraps this logic so it can inject extra control.
func (r *VirtualMachine) ValidateUpdate(old runtime.Object) (admission.Warnings, error) {
	// A failed assertion (or a typed nil pointer) would leave `before` nil and make the
	// getters below panic with a nil pointer dereference, so reject it up front.
	before, ok := old.(*VirtualMachine)
	if !ok || before == nil {
		return nil, fmt.Errorf("expected old object to be a non-nil *VirtualMachine, got %T", old)
	}

	// process immutable fields
	immutableFields := []struct {
		fieldName string
		getter    func(*VirtualMachine) any
	}{
		{".spec.guest.cpus.min", func(v *VirtualMachine) any { return v.Spec.Guest.CPUs.Min }},
		{".spec.guest.cpus.max", func(v *VirtualMachine) any { return v.Spec.Guest.CPUs.Max }},
		{".spec.guest.memorySlots.min", func(v *VirtualMachine) any { return v.Spec.Guest.MemorySlots.Min }},
		{".spec.guest.memorySlots.max", func(v *VirtualMachine) any { return v.Spec.Guest.MemorySlots.Max }},
		// nb: we don't check memoryProvider here, so that it's allowed to be mutable as a way of
		// getting flexibility to solidify the memory provider or change it across restarts.
		// ref https://github.com/neondatabase/autoscaling/pull/970#discussion_r1644225986
		{".spec.guest.ports", func(v *VirtualMachine) any { return v.Spec.Guest.Ports }},
		{".spec.guest.rootDisk", func(v *VirtualMachine) any { return v.Spec.Guest.RootDisk }},
		{".spec.guest.command", func(v *VirtualMachine) any { return v.Spec.Guest.Command }},
		{".spec.guest.args", func(v *VirtualMachine) any { return v.Spec.Guest.Args }},
		{".spec.guest.env", func(v *VirtualMachine) any { return v.Spec.Guest.Env }},
		{".spec.guest.settings", func(v *VirtualMachine) any { return v.Spec.Guest.Settings }},
		{".spec.disks", func(v *VirtualMachine) any { return v.Spec.Disks }},
		{".spec.podResources", func(v *VirtualMachine) any { return v.Spec.PodResources }},
		{".spec.enableAcceleration", func(v *VirtualMachine) any { return v.Spec.EnableAcceleration }},
		{".spec.enableSSH", func(v *VirtualMachine) any { return v.Spec.EnableSSH }},
		{".spec.initScript", func(v *VirtualMachine) any { return v.Spec.InitScript }},
	}

	for _, info := range immutableFields {
		if !reflect.DeepEqual(info.getter(r), info.getter(before)) {
			return nil, fmt.Errorf("%s is immutable", info.fieldName)
		}
	}

	// validate .spec.guest.cpus.use
	if r.Spec.Guest.CPUs.Use < r.Spec.Guest.CPUs.Min {
		return nil, fmt.Errorf(".cpus.use (%v) should be greater than or equal to the .cpus.min (%v)",
			r.Spec.Guest.CPUs.Use,
			r.Spec.Guest.CPUs.Min)
	}
	if r.Spec.Guest.CPUs.Use > r.Spec.Guest.CPUs.Max {
		return nil, fmt.Errorf(".cpus.use (%v) should be less than or equal to the .cpus.max (%v)",
			r.Spec.Guest.CPUs.Use,
			r.Spec.Guest.CPUs.Max)
	}

	// validate .spec.guest.memorySlots.use
	if r.Spec.Guest.MemorySlots.Use < r.Spec.Guest.MemorySlots.Min {
		return nil, fmt.Errorf(".memorySlots.use (%d) should be greater than or equal to the .memorySlots.min (%d)",
			r.Spec.Guest.MemorySlots.Use,
			r.Spec.Guest.MemorySlots.Min)
	}
	if r.Spec.Guest.MemorySlots.Use > r.Spec.Guest.MemorySlots.Max {
		return nil, fmt.Errorf(".memorySlots.use (%d) should be less than or equal to the .memorySlots.max (%d)",
			r.Spec.Guest.MemorySlots.Use,
			r.Spec.Guest.MemorySlots.Max)
	}

	return nil, nil
}
// ValidateDelete implements webhook.Validator.
//
// Deletion is always allowed: the method returns no warnings and a nil error.
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachine) ValidateDelete() (admission.Warnings, error) {
// No deletion validation required currently.
return nil, nil
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// MigrationPort is the fixed port number (20187) associated with VM migration.
// NOTE(review): presumably the TCP port the migration target listens on — confirm against the code that uses it.
const MigrationPort int32 = 20187
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
// VirtualMachineMigrationSpec defines the desired state of VirtualMachineMigration
type VirtualMachineMigrationSpec struct {
// VmName is the name of the VirtualMachine to migrate.
VmName string `json:"vmName"`
// TODO: not implemented
// +optional
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
// TODO: not implemented
// +optional
NodeAffinity *corev1.NodeAffinity `json:"nodeAffinity,omitempty"`
// PreventMigrationToSameHost defaults to true.
// +optional
// +kubebuilder:default:=true
PreventMigrationToSameHost bool `json:"preventMigrationToSameHost"`
// TODO: not implemented
// Set 1 hour (3600 seconds) as default timeout for migration
// +optional
// +kubebuilder:default:=3600
CompletionTimeout int32 `json:"completionTimeout"`
// Trigger incremental disk copy migration by default, otherwise full disk copy used in migration
// +optional
// +kubebuilder:default:=true
Incremental bool `json:"incremental"`
// Allow PostCopy migration; disabled by default
// +optional
// +kubebuilder:default:=false
AllowPostCopy bool `json:"allowPostCopy"`
// Use Auto converge by default
// +optional
// +kubebuilder:default:=true
AutoConverge bool `json:"autoConverge"`
// Set 1 Gbyte/sec as default for migration bandwidth
// +optional
// +kubebuilder:default:="1Gi"
MaxBandwidth resource.Quantity `json:"maxBandwidth"`
}
// VirtualMachineMigrationStatus defines the observed state of VirtualMachineMigration
type VirtualMachineMigrationStatus struct {
// Represents the observations of a VirtualMachineMigration's current state.
// VirtualMachineMigration.status.conditions.type are: "Available", "Progressing", and "Degraded"
// VirtualMachineMigration.status.conditions.status are one of True, False, Unknown.
// VirtualMachineMigration.status.conditions.reason the value should be a CamelCase string and producers of specific
// condition types may define expected values and meanings for this field, and whether the values
// are considered a guaranteed API.
// VirtualMachineMigration.status.conditions.Message is a human readable message indicating details about the transition.
// For further information see: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties
Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"`
// The phase of a migration is a simple, high-level summary of where the migration is in its lifecycle.
// +optional
Phase VmmPhase `json:"phase,omitempty"`
// +optional
SourcePodName string `json:"sourcePodName,omitempty"`
// +optional
TargetPodName string `json:"targetPodName,omitempty"`
// +optional
SourcePodIP string `json:"sourcePodIP,omitempty"`
// +optional
TargetPodIP string `json:"targetPodIP,omitempty"`
// +optional
SourceNode string `json:"sourceNode,omitempty"`
// +optional
TargetNode string `json:"targetNode,omitempty"`
// Info carries detailed migration figures (see MigrationInfo).
// +optional
Info MigrationInfo `json:"info,omitempty"`
}
// MigrationInfo holds the detailed figures reported in VirtualMachineMigrationStatus.Info:
// a status string, three durations, and nested RAM and compression figures.
// NOTE(review): the "Ms" suffixes suggest milliseconds — confirm where these values are populated.
type MigrationInfo struct {
// +optional
Status string `json:"status,omitempty"`
// +optional
TotalTimeMs int64 `json:"totalTimeMs,omitempty"`
// +optional
SetupTimeMs int64 `json:"setupTimeMs,omitempty"`
// +optional
DowntimeMs int64 `json:"downtimeMs,omitempty"`
// +optional
Ram MigrationInfoRam `json:"ram,omitempty"`
// +optional
Compression MigrationInfoCompression `json:"compression,omitempty"`
}
// MigrationInfoRam holds the RAM figures nested in MigrationInfo: transferred, remaining and total amounts.
// NOTE(review): units are not stated here (presumably bytes) — confirm.
type MigrationInfoRam struct {
// +optional
Transferred int64 `json:"transferred,omitempty"`
// +optional
Remaining int64 `json:"remaining,omitempty"`
// +optional
Total int64 `json:"total,omitempty"`
}
// MigrationInfoCompression holds the compression figures nested in MigrationInfo.
// NOTE(review): units and scale of both fields are not stated here — confirm.
type MigrationInfoCompression struct {
// +optional
CompressedSize int64 `json:"compressedSize,omitempty"`
// +optional
CompressionRate int64 `json:"compressionRate,omitempty"`
}
// VmmPhase is the string-valued phase of a VirtualMachineMigration, stored in
// VirtualMachineMigrationStatus.Phase. The constants below are its defined values.
type VmmPhase string
const (
// VmmPending means the migration has been accepted by the system, but target vm-runner pod
// has not been started. This includes time before being bound to a node, as well as time spent
// pulling images onto the host.
VmmPending VmmPhase = "Pending"
// VmmRunning means the target vm-runner pod has been bound to a node and has been started.
VmmRunning VmmPhase = "Running"
// VmmSucceeded means that migration finished with success
VmmSucceeded VmmPhase = "Succeeded"
// VmmFailed means that migration failed
VmmFailed VmmPhase = "Failed"
)
//+genclient
//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:resource:singular=neonvmm
// VirtualMachineMigration is the Schema for the virtualmachinemigrations API
// +kubebuilder:printcolumn:name="VM",type=string,JSONPath=`.spec.vmName`
// +kubebuilder:printcolumn:name="Source",type=string,JSONPath=`.status.sourcePodName`
// +kubebuilder:printcolumn:name="SourceIP",type=string,priority=1,JSONPath=`.status.sourcePodIP`
// +kubebuilder:printcolumn:name="Target",type=string,JSONPath=`.status.targetPodName`
// +kubebuilder:printcolumn:name="TargetIP",type=string,priority=1,JSONPath=`.status.targetPodIP`
// +kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.phase`
// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
type VirtualMachineMigration struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
// Spec is the desired state of the migration.
Spec VirtualMachineMigrationSpec `json:"spec,omitempty"`
// Status is the observed state of the migration.
Status VirtualMachineMigrationStatus `json:"status,omitempty"`
}
//+kubebuilder:object:root=true
// VirtualMachineMigrationList contains a list of VirtualMachineMigration
type VirtualMachineMigrationList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
// Items holds the VirtualMachineMigration objects in the list.
Items []VirtualMachineMigration `json:"items"`
}
// init registers the VirtualMachineMigration and VirtualMachineMigrationList types with the
// package's SchemeBuilder. The values are zero-valued and serve only to convey the types.
func init() {
	SchemeBuilder.Register(new(VirtualMachineMigration), new(VirtualMachineMigrationList))
}
/*
Copyright 2023.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
"sigs.k8s.io/controller-runtime/pkg/webhook"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
"k8s.io/apimachinery/pkg/runtime"
)
//+kubebuilder:webhook:path=/mutate-vm-neon-tech-v1-virtualmachinemigration,mutating=true,failurePolicy=fail,sideEffects=None,groups=vm.neon.tech,resources=virtualmachinemigrations,verbs=create;update,versions=v1,name=mvirtualmachinemigration.kb.io,admissionReviewVersions=v1
var _ webhook.Defaulter = &VirtualMachineMigration{}
// Default implements webhook.Defaulter.
//
// It is currently a no-op; this method applies no defaults.
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachineMigration) Default() {
// TODO: implement defaults
}
//+kubebuilder:webhook:path=/validate-vm-neon-tech-v1-virtualmachinemigration,mutating=false,failurePolicy=fail,sideEffects=None,groups=vm.neon.tech,resources=virtualmachinemigrations,verbs=create;update,versions=v1,name=vvirtualmachinemigration.kb.io,admissionReviewVersions=v1
var _ webhook.Validator = &VirtualMachineMigration{}
// ValidateCreate implements webhook.Validator.
//
// It currently accepts every creation: no warnings and a nil error are returned.
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachineMigration) ValidateCreate() (admission.Warnings, error) {
// TODO: implement creation validation webhook (?)
return nil, nil
}
// ValidateUpdate implements webhook.Validator.
//
// It currently accepts every update: old is not inspected, and no warnings and a nil error
// are returned.
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachineMigration) ValidateUpdate(old runtime.Object) (admission.Warnings, error) {
// TODO: implement update validation webhook
return nil, nil
}
// ValidateDelete implements webhook.Validator.
//
// It currently accepts every deletion: no warnings and a nil error are returned.
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachineMigration) ValidateDelete() (admission.Warnings, error) {
// TODO: implement deletion validation webhook (?)
return nil, nil
}
//go:build !ignore_autogenerated
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by controller-gen. DO NOT EDIT.
package v1
import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
)
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// The copy is a single struct assignment; no field gets separate deep-copy handling.
func (in *CPUs) DeepCopyInto(out *CPUs) {
*out = *in
}
// DeepCopy allocates a new CPUs and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *CPUs) DeepCopy() *CPUs {
	var clone *CPUs
	if in != nil {
		clone = new(CPUs)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. in must be non-nil.
// After the shallow struct copy, ReadOnly gets its own allocation and DiskSource is
// deep-copied through its own DeepCopyInto.
func (in *Disk) DeepCopyInto(out *Disk) {
	*out = *in
	if in.ReadOnly != nil {
		out.ReadOnly = new(bool)
		*out.ReadOnly = *in.ReadOnly
	}
	in.DiskSource.DeepCopyInto(&out.DiskSource)
}
// DeepCopy allocates a new Disk and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *Disk) DeepCopy() *Disk {
	var clone *Disk
	if in != nil {
		clone = new(Disk)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. in must be non-nil.
// Each non-nil pointer field (EmptyDisk, ConfigMap, Secret, Tmpfs) is given a fresh
// allocation in out and filled via the pointee's DeepCopyInto.
func (in *DiskSource) DeepCopyInto(out *DiskSource) {
	*out = *in
	if in.EmptyDisk != nil {
		out.EmptyDisk = new(EmptyDiskSource)
		in.EmptyDisk.DeepCopyInto(out.EmptyDisk)
	}
	if in.ConfigMap != nil {
		out.ConfigMap = new(corev1.ConfigMapVolumeSource)
		in.ConfigMap.DeepCopyInto(out.ConfigMap)
	}
	if in.Secret != nil {
		out.Secret = new(corev1.SecretVolumeSource)
		in.Secret.DeepCopyInto(out.Secret)
	}
	if in.Tmpfs != nil {
		out.Tmpfs = new(TmpfsDiskSource)
		in.Tmpfs.DeepCopyInto(out.Tmpfs)
	}
}
// DeepCopy allocates a new DiskSource and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *DiskSource) DeepCopy() *DiskSource {
	var clone *DiskSource
	if in != nil {
		clone = new(DiskSource)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// After the struct assignment, Size is replaced with the result of its own DeepCopy.
func (in *EmptyDiskSource) DeepCopyInto(out *EmptyDiskSource) {
*out = *in
out.Size = in.Size.DeepCopy()
}
// DeepCopy allocates a new EmptyDiskSource and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *EmptyDiskSource) DeepCopy() *EmptyDiskSource {
	var clone *EmptyDiskSource
	if in != nil {
		clone = new(EmptyDiskSource)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// The copy is a single struct assignment; no field gets separate deep-copy handling.
func (in *EnvVar) DeepCopyInto(out *EnvVar) {
*out = *in
}
// DeepCopy allocates a new EnvVar and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *EnvVar) DeepCopy() *EnvVar {
	var clone *EnvVar
	if in != nil {
		clone = new(EnvVar)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// The copy is a single struct assignment; no field gets separate deep-copy handling.
func (in *ExtraNetwork) DeepCopyInto(out *ExtraNetwork) {
*out = *in
}
// DeepCopy allocates a new ExtraNetwork and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *ExtraNetwork) DeepCopy() *ExtraNetwork {
	var clone *ExtraNetwork
	if in != nil {
		clone = new(ExtraNetwork)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. in must be non-nil.
//
// The initial struct assignment already copies CPUs and MemorySlots, so they are not
// re-assigned. Pointer and slice fields that are non-nil receive fresh allocations so that
// out does not share them with in; MemorySlotSize, RootDisk and Settings go through their
// own deep-copy methods.
func (in *Guest) DeepCopyInto(out *Guest) {
	*out = *in
	if in.KernelImage != nil {
		out.KernelImage = new(string)
		*out.KernelImage = *in.KernelImage
	}
	if in.AppendKernelCmdline != nil {
		out.AppendKernelCmdline = new(string)
		*out.AppendKernelCmdline = *in.AppendKernelCmdline
	}
	out.MemorySlotSize = in.MemorySlotSize.DeepCopy()
	if in.MemoryProvider != nil {
		out.MemoryProvider = new(MemoryProvider)
		*out.MemoryProvider = *in.MemoryProvider
	}
	in.RootDisk.DeepCopyInto(&out.RootDisk)
	if in.Command != nil {
		out.Command = make([]string, len(in.Command))
		copy(out.Command, in.Command)
	}
	if in.Args != nil {
		out.Args = make([]string, len(in.Args))
		copy(out.Args, in.Args)
	}
	if in.Env != nil {
		out.Env = make([]EnvVar, len(in.Env))
		copy(out.Env, in.Env)
	}
	if in.Ports != nil {
		out.Ports = make([]Port, len(in.Ports))
		copy(out.Ports, in.Ports)
	}
	if in.Settings != nil {
		out.Settings = new(GuestSettings)
		in.Settings.DeepCopyInto(out.Settings)
	}
}
// DeepCopy allocates a new Guest and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *Guest) DeepCopy() *Guest {
	var clone *Guest
	if in != nil {
		clone = new(Guest)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. in must be non-nil.
// A non-nil Sysctl slice is duplicated element by element, and a non-nil Swap is replaced
// by a pointer to the value returned from its DeepCopy.
func (in *GuestSettings) DeepCopyInto(out *GuestSettings) {
	*out = *in
	if in.Sysctl != nil {
		out.Sysctl = make([]string, len(in.Sysctl))
		copy(out.Sysctl, in.Sysctl)
	}
	if in.Swap != nil {
		swap := in.Swap.DeepCopy()
		out.Swap = &swap
	}
}
// DeepCopy allocates a new GuestSettings and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *GuestSettings) DeepCopy() *GuestSettings {
	var clone *GuestSettings
	if in != nil {
		clone = new(GuestSettings)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// The copy is a single struct assignment; no field gets separate deep-copy handling.
func (in *IPAllocation) DeepCopyInto(out *IPAllocation) {
*out = *in
}
// DeepCopy allocates a new IPAllocation and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *IPAllocation) DeepCopy() *IPAllocation {
	var clone *IPAllocation
	if in != nil {
		clone = new(IPAllocation)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. in must be non-nil.
// TypeMeta is carried over by the struct assignment; ObjectMeta and Spec are deep-copied
// through their own DeepCopyInto methods.
func (in *IPPool) DeepCopyInto(out *IPPool) {
	*out = *in
	in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
	in.Spec.DeepCopyInto(&out.Spec)
}
// DeepCopy allocates a new IPPool and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *IPPool) DeepCopy() *IPPool {
	var clone *IPPool
	if in != nil {
		clone = new(IPPool)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyObject returns a deep copy of the receiver as a runtime.Object.
// When DeepCopy yields nil, an untyped nil is returned so the interface value itself is nil.
func (in *IPPool) DeepCopyObject() runtime.Object {
	clone := in.DeepCopy()
	if clone == nil {
		return nil
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. in must be non-nil.
// TypeMeta is carried over by the struct assignment; ListMeta is deep-copied, and a non-nil
// Items slice is reallocated with every element deep-copied in place.
func (in *IPPoolList) DeepCopyInto(out *IPPoolList) {
	*out = *in
	in.ListMeta.DeepCopyInto(&out.ListMeta)
	if in.Items != nil {
		out.Items = make([]IPPool, len(in.Items))
		for i := range in.Items {
			in.Items[i].DeepCopyInto(&out.Items[i])
		}
	}
}
// DeepCopy allocates a new IPPoolList and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *IPPoolList) DeepCopy() *IPPoolList {
	var clone *IPPoolList
	if in != nil {
		clone = new(IPPoolList)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyObject returns a deep copy of the receiver as a runtime.Object.
// When DeepCopy yields nil, an untyped nil is returned so the interface value itself is nil.
func (in *IPPoolList) DeepCopyObject() runtime.Object {
	clone := in.DeepCopy()
	if clone == nil {
		return nil
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. in must be non-nil.
// A non-nil Allocations map is rebuilt entry by entry so out does not share the map with in.
func (in *IPPoolSpec) DeepCopyInto(out *IPPoolSpec) {
	*out = *in
	if in.Allocations != nil {
		out.Allocations = make(map[string]IPAllocation, len(in.Allocations))
		for name, alloc := range in.Allocations {
			out.Allocations[name] = alloc
		}
	}
}
// DeepCopy allocates a new IPPoolSpec and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *IPPoolSpec) DeepCopy() *IPPoolSpec {
	var clone *IPPoolSpec
	if in != nil {
		clone = new(IPPoolSpec)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// The copy is a single struct assignment; no field gets separate deep-copy handling.
func (in *MemorySlots) DeepCopyInto(out *MemorySlots) {
*out = *in
}
// DeepCopy allocates a new MemorySlots and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *MemorySlots) DeepCopy() *MemorySlots {
	var clone *MemorySlots
	if in != nil {
		clone = new(MemorySlots)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto writes a copy of the receiver into out. in must be non-nil.
// The struct assignment copies every field, including the nested Ram and Compression
// values, so no further per-field work is needed.
func (in *MigrationInfo) DeepCopyInto(out *MigrationInfo) {
	*out = *in
}
// DeepCopy allocates a new MigrationInfo and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *MigrationInfo) DeepCopy() *MigrationInfo {
	var clone *MigrationInfo
	if in != nil {
		clone = new(MigrationInfo)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// The copy is a single struct assignment; no field gets separate deep-copy handling.
func (in *MigrationInfoCompression) DeepCopyInto(out *MigrationInfoCompression) {
*out = *in
}
// DeepCopy allocates a new MigrationInfoCompression and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *MigrationInfoCompression) DeepCopy() *MigrationInfoCompression {
	var clone *MigrationInfoCompression
	if in != nil {
		clone = new(MigrationInfoCompression)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// The copy is a single struct assignment; no field gets separate deep-copy handling.
func (in *MigrationInfoRam) DeepCopyInto(out *MigrationInfoRam) {
*out = *in
}
// DeepCopy allocates a new MigrationInfoRam and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *MigrationInfoRam) DeepCopy() *MigrationInfoRam {
	var clone *MigrationInfoRam
	if in != nil {
		clone = new(MigrationInfoRam)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// The copy is a single struct assignment; no field gets separate deep-copy handling.
func (in *Port) DeepCopyInto(out *Port) {
*out = *in
}
// DeepCopy allocates a new Port and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *Port) DeepCopy() *Port {
	var clone *Port
	if in != nil {
		clone = new(Port)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// The copy is a single struct assignment; no field gets separate deep-copy handling.
func (in *Revision) DeepCopyInto(out *Revision) {
*out = *in
}
// DeepCopy allocates a new Revision and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *Revision) DeepCopy() *Revision {
	var clone *Revision
	if in != nil {
		clone = new(Revision)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. in must be non-nil.
// Revision is carried over by the struct assignment; UpdatedAt is deep-copied through its
// own DeepCopyInto.
func (in *RevisionWithTime) DeepCopyInto(out *RevisionWithTime) {
	*out = *in
	in.UpdatedAt.DeepCopyInto(&out.UpdatedAt)
}
// DeepCopy allocates a new RevisionWithTime and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *RevisionWithTime) DeepCopy() *RevisionWithTime {
	var clone *RevisionWithTime
	if in != nil {
		clone = new(RevisionWithTime)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. in must be non-nil.
// Size is replaced by the result of its DeepCopy, and a non-nil Execute slice is duplicated
// so out does not share its backing array with in.
func (in *RootDisk) DeepCopyInto(out *RootDisk) {
	*out = *in
	out.Size = in.Size.DeepCopy()
	if in.Execute != nil {
		out.Execute = make([]string, len(in.Execute))
		copy(out.Execute, in.Execute)
	}
}
// DeepCopy allocates a new RootDisk and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *RootDisk) DeepCopy() *RootDisk {
	var clone *RootDisk
	if in != nil {
		clone = new(RootDisk)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// After the struct assignment, Size is replaced with the result of its own DeepCopy.
func (in *TmpfsDiskSource) DeepCopyInto(out *TmpfsDiskSource) {
*out = *in
out.Size = in.Size.DeepCopy()
}
// DeepCopy allocates a new TmpfsDiskSource and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *TmpfsDiskSource) DeepCopy() *TmpfsDiskSource {
	var clone *TmpfsDiskSource
	if in != nil {
		clone = new(TmpfsDiskSource)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. in must be non-nil.
// TypeMeta is carried over by the struct assignment; ObjectMeta, Spec and Status are each
// deep-copied through their own DeepCopyInto methods.
func (in *VirtualMachine) DeepCopyInto(out *VirtualMachine) {
	*out = *in
	in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
	in.Spec.DeepCopyInto(&out.Spec)
	in.Status.DeepCopyInto(&out.Status)
}
// DeepCopy allocates a new VirtualMachine and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *VirtualMachine) DeepCopy() *VirtualMachine {
	var clone *VirtualMachine
	if in != nil {
		clone = new(VirtualMachine)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyObject returns a deep copy of the receiver as a runtime.Object.
// When DeepCopy yields nil, an untyped nil is returned so the interface value itself is nil.
func (in *VirtualMachine) DeepCopyObject() runtime.Object {
	clone := in.DeepCopy()
	if clone == nil {
		return nil
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. in must be non-nil.
// TypeMeta is carried over by the struct assignment; ListMeta is deep-copied, and a non-nil
// Items slice is reallocated with every element deep-copied in place.
func (in *VirtualMachineList) DeepCopyInto(out *VirtualMachineList) {
	*out = *in
	in.ListMeta.DeepCopyInto(&out.ListMeta)
	if in.Items != nil {
		out.Items = make([]VirtualMachine, len(in.Items))
		for i := range in.Items {
			in.Items[i].DeepCopyInto(&out.Items[i])
		}
	}
}
// DeepCopy allocates a new VirtualMachineList and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *VirtualMachineList) DeepCopy() *VirtualMachineList {
	var clone *VirtualMachineList
	if in != nil {
		clone = new(VirtualMachineList)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyObject returns a deep copy of the receiver as a runtime.Object.
// When DeepCopy yields nil, an untyped nil is returned so the interface value itself is nil.
func (in *VirtualMachineList) DeepCopyObject() runtime.Object {
	clone := in.DeepCopy()
	if clone == nil {
		return nil
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. in must be non-nil.
// TypeMeta is carried over by the struct assignment; ObjectMeta, Spec and Status are each
// deep-copied through their own DeepCopyInto methods.
func (in *VirtualMachineMigration) DeepCopyInto(out *VirtualMachineMigration) {
	*out = *in
	in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
	in.Spec.DeepCopyInto(&out.Spec)
	in.Status.DeepCopyInto(&out.Status)
}
// DeepCopy allocates a new VirtualMachineMigration and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *VirtualMachineMigration) DeepCopy() *VirtualMachineMigration {
	var clone *VirtualMachineMigration
	if in != nil {
		clone = new(VirtualMachineMigration)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyObject returns a deep copy of the receiver as a runtime.Object.
// When DeepCopy yields nil, an untyped nil is returned so the interface value itself is nil.
func (in *VirtualMachineMigration) DeepCopyObject() runtime.Object {
	clone := in.DeepCopy()
	if clone == nil {
		return nil
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. in must be non-nil.
// TypeMeta is carried over by the struct assignment; ListMeta is deep-copied, and a non-nil
// Items slice is reallocated with every element deep-copied in place.
func (in *VirtualMachineMigrationList) DeepCopyInto(out *VirtualMachineMigrationList) {
	*out = *in
	in.ListMeta.DeepCopyInto(&out.ListMeta)
	if in.Items != nil {
		out.Items = make([]VirtualMachineMigration, len(in.Items))
		for i := range in.Items {
			in.Items[i].DeepCopyInto(&out.Items[i])
		}
	}
}
// DeepCopy allocates a new VirtualMachineMigrationList and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *VirtualMachineMigrationList) DeepCopy() *VirtualMachineMigrationList {
	var clone *VirtualMachineMigrationList
	if in != nil {
		clone = new(VirtualMachineMigrationList)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyObject returns a deep copy of the receiver as a runtime.Object.
// When DeepCopy yields nil, an untyped nil is returned so the interface value itself is nil.
func (in *VirtualMachineMigrationList) DeepCopyObject() runtime.Object {
	clone := in.DeepCopy()
	if clone == nil {
		return nil
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. in must be non-nil.
// A non-nil NodeSelector map is rebuilt entry by entry, a non-nil NodeAffinity gets a fresh
// allocation filled via its DeepCopyInto, and MaxBandwidth is replaced by its DeepCopy.
func (in *VirtualMachineMigrationSpec) DeepCopyInto(out *VirtualMachineMigrationSpec) {
	*out = *in
	if in.NodeSelector != nil {
		out.NodeSelector = make(map[string]string, len(in.NodeSelector))
		for label, value := range in.NodeSelector {
			out.NodeSelector[label] = value
		}
	}
	if in.NodeAffinity != nil {
		out.NodeAffinity = new(corev1.NodeAffinity)
		in.NodeAffinity.DeepCopyInto(out.NodeAffinity)
	}
	out.MaxBandwidth = in.MaxBandwidth.DeepCopy()
}
// DeepCopy allocates a new VirtualMachineMigrationSpec and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *VirtualMachineMigrationSpec) DeepCopy() *VirtualMachineMigrationSpec {
	var clone *VirtualMachineMigrationSpec
	if in != nil {
		clone = new(VirtualMachineMigrationSpec)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. in must be non-nil.
// Info is carried over by the struct assignment; a non-nil Conditions slice is reallocated
// with every condition deep-copied in place.
func (in *VirtualMachineMigrationStatus) DeepCopyInto(out *VirtualMachineMigrationStatus) {
	*out = *in
	if in.Conditions != nil {
		out.Conditions = make([]metav1.Condition, len(in.Conditions))
		for i := range in.Conditions {
			in.Conditions[i].DeepCopyInto(&out.Conditions[i])
		}
	}
}
// DeepCopy allocates a new VirtualMachineMigrationStatus and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *VirtualMachineMigrationStatus) DeepCopy() *VirtualMachineMigrationStatus {
	var clone *VirtualMachineMigrationStatus
	if in != nil {
		clone = new(VirtualMachineMigrationStatus)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. in must be non-nil.
// CPUs and MemorySlots are carried over by the struct assignment; MemorySlotSize is
// replaced by the result of its DeepCopy.
func (in *VirtualMachineResources) DeepCopyInto(out *VirtualMachineResources) {
	*out = *in
	out.MemorySlotSize = in.MemorySlotSize.DeepCopy()
}
// DeepCopy allocates a new VirtualMachineResources and fills it through DeepCopyInto.
// A nil receiver produces a nil result.
func (in *VirtualMachineResources) DeepCopy() *VirtualMachineResources {
	var clone *VirtualMachineResources
	if in != nil {
		clone = new(VirtualMachineResources)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. After a plain
// assignment of the whole struct, every non-nil pointer, map and slice field
// is re-allocated so that out does not share it with in; PodResources and
// Guest are copied through their own DeepCopyInto. in must be non-nil.
func (in *VirtualMachineSpec) DeepCopyInto(out *VirtualMachineSpec) {
	*out = *in

	if in.TerminationGracePeriodSeconds != nil {
		out.TerminationGracePeriodSeconds = new(int64)
		*out.TerminationGracePeriodSeconds = *in.TerminationGracePeriodSeconds
	}

	if in.NodeSelector != nil {
		out.NodeSelector = make(map[string]string, len(in.NodeSelector))
		for k, v := range in.NodeSelector {
			out.NodeSelector[k] = v
		}
	}

	if in.Affinity != nil {
		out.Affinity = new(corev1.Affinity)
		in.Affinity.DeepCopyInto(out.Affinity)
	}

	if in.Tolerations != nil {
		out.Tolerations = make([]corev1.Toleration, len(in.Tolerations))
		for idx := range in.Tolerations {
			in.Tolerations[idx].DeepCopyInto(&out.Tolerations[idx])
		}
	}

	in.PodResources.DeepCopyInto(&out.PodResources)

	if in.ImagePullSecrets != nil {
		// Elements are copied by value with the built-in copy.
		out.ImagePullSecrets = make([]corev1.LocalObjectReference, len(in.ImagePullSecrets))
		copy(out.ImagePullSecrets, in.ImagePullSecrets)
	}

	in.Guest.DeepCopyInto(&out.Guest)

	if in.ExtraInitContainers != nil {
		out.ExtraInitContainers = make([]corev1.Container, len(in.ExtraInitContainers))
		for idx := range in.ExtraInitContainers {
			in.ExtraInitContainers[idx].DeepCopyInto(&out.ExtraInitContainers[idx])
		}
	}

	if in.Disks != nil {
		out.Disks = make([]Disk, len(in.Disks))
		for idx := range in.Disks {
			in.Disks[idx].DeepCopyInto(&out.Disks[idx])
		}
	}

	if in.ExtraNetwork != nil {
		out.ExtraNetwork = new(ExtraNetwork)
		*out.ExtraNetwork = *in.ExtraNetwork
	}

	if in.ServiceLinks != nil {
		out.ServiceLinks = new(bool)
		*out.ServiceLinks = *in.ServiceLinks
	}

	if in.EnableAcceleration != nil {
		out.EnableAcceleration = new(bool)
		*out.EnableAcceleration = *in.EnableAcceleration
	}

	if in.RunnerImage != nil {
		out.RunnerImage = new(string)
		*out.RunnerImage = *in.RunnerImage
	}

	if in.DelegatedCPULimits != nil {
		out.DelegatedCPULimits = new(bool)
		*out.DelegatedCPULimits = *in.DelegatedCPULimits
	}

	if in.EnableSSH != nil {
		out.EnableSSH = new(bool)
		*out.EnableSSH = *in.EnableSSH
	}

	if in.TargetRevision != nil {
		out.TargetRevision = new(RevisionWithTime)
		in.TargetRevision.DeepCopyInto(out.TargetRevision)
	}
}
// DeepCopy allocates a new VirtualMachineSpec and fills it via DeepCopyInto.
// A nil receiver yields a nil result.
func (in *VirtualMachineSpec) DeepCopy() *VirtualMachineSpec {
	var clone *VirtualMachineSpec
	if in != nil {
		clone = new(VirtualMachineSpec)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. After a plain
// assignment of the whole struct, the Conditions slice and each non-nil
// pointer field (CPUs, MemorySize, MemoryProvider, CurrentRevision) are
// re-allocated so that out does not share them with in. in must be non-nil.
func (in *VirtualMachineStatus) DeepCopyInto(out *VirtualMachineStatus) {
	*out = *in

	if in.Conditions != nil {
		out.Conditions = make([]metav1.Condition, len(in.Conditions))
		for idx := range in.Conditions {
			in.Conditions[idx].DeepCopyInto(&out.Conditions[idx])
		}
	}

	if in.CPUs != nil {
		out.CPUs = new(MilliCPU)
		*out.CPUs = *in.CPUs
	}

	if in.MemorySize != nil {
		// DeepCopy returns a value; point out at a fresh local holding it.
		size := in.MemorySize.DeepCopy()
		out.MemorySize = &size
	}

	if in.MemoryProvider != nil {
		out.MemoryProvider = new(MemoryProvider)
		*out.MemoryProvider = *in.MemoryProvider
	}

	if in.CurrentRevision != nil {
		out.CurrentRevision = new(RevisionWithTime)
		in.CurrentRevision.DeepCopyInto(out.CurrentRevision)
	}
}
// DeepCopy allocates a new VirtualMachineStatus and fills it via
// DeepCopyInto. A nil receiver yields a nil result.
func (in *VirtualMachineStatus) DeepCopy() *VirtualMachineStatus {
	var clone *VirtualMachineStatus
	if in != nil {
		clone = new(VirtualMachineStatus)
		in.DeepCopyInto(clone)
	}
	return clone
}
// DeepCopyInto writes a deep copy of the receiver into out. The CPU and
// Memory pointers, when non-nil, are replaced by pointers to fresh values
// obtained from their DeepCopy methods. in must be non-nil.
func (in *VirtualMachineUsage) DeepCopyInto(out *VirtualMachineUsage) {
	*out = *in

	if in.CPU != nil {
		cpu := in.CPU.DeepCopy()
		out.CPU = &cpu
	}

	if in.Memory != nil {
		mem := in.Memory.DeepCopy()
		out.Memory = &mem
	}
}
// DeepCopy allocates a new VirtualMachineUsage and fills it via DeepCopyInto.
// A nil receiver yields a nil result.
func (in *VirtualMachineUsage) DeepCopy() *VirtualMachineUsage {
	var clone *VirtualMachineUsage
	if in != nil {
		clone = new(VirtualMachineUsage)
		in.DeepCopyInto(clone)
	}
	return clone
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package versioned
import (
"fmt"
"net/http"
neonvmv1 "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned/typed/neonvm/v1"
discovery "k8s.io/client-go/discovery"
rest "k8s.io/client-go/rest"
flowcontrol "k8s.io/client-go/util/flowcontrol"
)
// Interface is the method set of a clientset in this package: access to the
// discovery client and to the typed client for the neonvm v1 group.
type Interface interface {
// Discovery returns the discovery client.
Discovery() discovery.DiscoveryInterface
// NeonvmV1 returns the typed client for the neonvm v1 API group.
NeonvmV1() neonvmv1.NeonvmV1Interface
}
// Clientset contains the clients for groups.
type Clientset struct {
// DiscoveryClient is embedded; Discovery() returns it.
*discovery.DiscoveryClient
// neonvmV1 is the typed client returned by NeonvmV1().
neonvmV1 *neonvmv1.NeonvmV1Client
}
// NeonvmV1 returns the typed neonvm v1 client stored in the clientset.
func (c *Clientset) NeonvmV1() neonvmv1.NeonvmV1Interface {
	typed := c.neonvmV1
	return typed
}
// Discovery returns the embedded DiscoveryClient, or a nil interface when the
// receiver itself is nil.
func (c *Clientset) Discovery() discovery.DiscoveryInterface {
	if c != nil {
		return c.DiscoveryClient
	}
	return nil
}
// NewForConfig creates a new Clientset for the given config.
//
// It works on a shallow copy of c, so the caller's config is not modified:
// an empty UserAgent is replaced by rest.DefaultKubernetesUserAgent(), an
// HTTP client is built from the copy with rest.HTTPClientFor, and both are
// passed on to NewForConfigAndClient. Any rate limiter defaulting described
// on NewForConfigAndClient therefore applies here as well.
func NewForConfig(c *rest.Config) (*Clientset, error) {
	cfg := *c
	if cfg.UserAgent == "" {
		cfg.UserAgent = rest.DefaultKubernetesUserAgent()
	}

	// One HTTP client is created here and shared by every group client.
	sharedHTTP, err := rest.HTTPClientFor(&cfg)
	if err != nil {
		return nil, err
	}

	return NewForConfigAndClient(&cfg, sharedHTTP)
}
// NewForConfigAndClient creates a new Clientset for the given config and http client.
// Note the http client provided takes precedence over the configured transport values.
//
// The function works on a shallow copy of c. When the copy has no
// RateLimiter and QPS is positive, a token bucket rate limiter is created
// from QPS and Burst; in that case a non-positive Burst is reported as an
// error. The first error from constructing either the typed client or the
// discovery client is returned with a nil Clientset.
func NewForConfigAndClient(c *rest.Config, httpClient *http.Client) (*Clientset, error) {
	cfg := *c
	if cfg.RateLimiter == nil && cfg.QPS > 0 {
		if cfg.Burst <= 0 {
			return nil, fmt.Errorf("burst is required to be greater than 0 when RateLimiter is not set and QPS is set to greater than 0")
		}
		cfg.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(cfg.QPS, cfg.Burst)
	}

	typed, err := neonvmv1.NewForConfigAndClient(&cfg, httpClient)
	if err != nil {
		return nil, err
	}

	disco, err := discovery.NewDiscoveryClientForConfigAndClient(&cfg, httpClient)
	if err != nil {
		return nil, err
	}

	return &Clientset{DiscoveryClient: disco, neonvmV1: typed}, nil
}
// NewForConfigOrDie wraps NewForConfig and panics with the returned error
// instead of handing it back to the caller.
func NewForConfigOrDie(c *rest.Config) *Clientset {
	clients, err := NewForConfig(c)
	if err == nil {
		return clients
	}
	panic(err)
}
// New builds a Clientset whose typed client and discovery client are both
// constructed from the given RESTClient.
func New(c rest.Interface) *Clientset {
	return &Clientset{
		neonvmV1:        neonvmv1.New(c),
		DiscoveryClient: discovery.NewDiscoveryClient(c),
	}
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package fake
import (
clientset "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
neonvmv1 "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned/typed/neonvm/v1"
fakeneonvmv1 "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned/typed/neonvm/v1/fake"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/discovery"
fakediscovery "k8s.io/client-go/discovery/fake"
"k8s.io/client-go/testing"
)
// NewSimpleClientset returns a clientset that will respond with the provided objects.
// It's backed by a very simple object tracker that processes creates, updates and deletions as-is,
// without applying any validations and/or defaults. It shouldn't be considered a replacement
// for a real clientset and is mostly useful in simple unit tests.
//
// It panics if any of the given objects cannot be added to the tracker.
func NewSimpleClientset(objects ...runtime.Object) *Clientset {
	tracker := testing.NewObjectTracker(scheme, codecs.UniversalDecoder())
	for idx := range objects {
		if err := tracker.Add(objects[idx]); err != nil {
			panic(err)
		}
	}

	fakeSet := &Clientset{tracker: tracker}
	fakeSet.discovery = &fakediscovery.FakeDiscovery{Fake: &fakeSet.Fake}

	// Every verb on every resource is answered from the tracker.
	fakeSet.AddReactor("*", "*", testing.ObjectReaction(tracker))

	// Watches on every resource are served by the tracker as well.
	fakeSet.AddWatchReactor("*", func(action testing.Action) (bool, watch.Interface, error) {
		watcher, err := tracker.Watch(action.GetResource(), action.GetNamespace())
		if err != nil {
			return false, nil, err
		}
		return true, watcher, nil
	})

	return fakeSet
}
// Clientset implements clientset.Interface. Meant to be embedded into a
// struct to get a default implementation. This makes faking out just the method
// you want to test easier.
type Clientset struct {
// Fake is embedded; the typed fake clients and the fake discovery client
// are given a pointer to it.
testing.Fake
// discovery is the value returned by Discovery().
discovery *fakediscovery.FakeDiscovery
// tracker is the value returned by Tracker().
tracker testing.ObjectTracker
}
// Discovery returns the fake discovery client held by the clientset.
func (c *Clientset) Discovery() discovery.DiscoveryInterface {
	fakeDisco := c.discovery
	return fakeDisco
}
// Tracker returns the object tracker held by the clientset.
func (c *Clientset) Tracker() testing.ObjectTracker {
	objTracker := c.tracker
	return objTracker
}
// Compile-time checks that *Clientset satisfies clientset.Interface and
// testing.FakeClient.
var (
_ clientset.Interface = &Clientset{}
_ testing.FakeClient = &Clientset{}
)
// NeonvmV1 returns a newly allocated fake neonvm v1 client that points at
// the clientset's embedded Fake.
func (c *Clientset) NeonvmV1() neonvmv1.NeonvmV1Interface {
	group := &fakeneonvmv1.FakeNeonvmV1{Fake: &c.Fake}
	return group
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package fake
import (
neonvmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
schema "k8s.io/apimachinery/pkg/runtime/schema"
serializer "k8s.io/apimachinery/pkg/runtime/serializer"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
)
// scheme is the runtime.Scheme used by the fake clientset; init registers
// types into it.
var scheme = runtime.NewScheme()
// codecs is a codec factory built on scheme.
var codecs = serializer.NewCodecFactory(scheme)
// localSchemeBuilder lists the AddToScheme functions that AddToScheme applies.
var localSchemeBuilder = runtime.SchemeBuilder{
neonvmv1.AddToScheme,
}
// AddToScheme adds all types of this clientset into the given scheme. This allows composition
// of clientsets, like in:
//
//	import (
//	  "k8s.io/client-go/kubernetes"
//	  clientsetscheme "k8s.io/client-go/kubernetes/scheme"
//	  aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme"
//	)
//
//	kclientset, _ := kubernetes.NewForConfig(c)
//	_ = aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme)
//
// After this, RawExtensions in Kubernetes types will serialize kube-aggregator types
// correctly.
var AddToScheme = localSchemeBuilder.AddToScheme
func init() {
v1.AddToGroupVersion(scheme, schema.GroupVersion{Version: "v1"})
utilruntime.Must(AddToScheme(scheme))
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package scheme
import (
neonvmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
schema "k8s.io/apimachinery/pkg/runtime/schema"
serializer "k8s.io/apimachinery/pkg/runtime/serializer"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
)
// Scheme is the runtime.Scheme of this clientset; init registers types into it.
var Scheme = runtime.NewScheme()
// Codecs is a codec factory built on Scheme.
var Codecs = serializer.NewCodecFactory(Scheme)
// ParameterCodec is a parameter codec built on Scheme.
var ParameterCodec = runtime.NewParameterCodec(Scheme)
// localSchemeBuilder lists the AddToScheme functions that AddToScheme applies.
var localSchemeBuilder = runtime.SchemeBuilder{
neonvmv1.AddToScheme,
}
// AddToScheme adds all types of this clientset into the given scheme. This allows composition
// of clientsets, like in:
//
//	import (
//	  "k8s.io/client-go/kubernetes"
//	  clientsetscheme "k8s.io/client-go/kubernetes/scheme"
//	  aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme"
//	)
//
//	kclientset, _ := kubernetes.NewForConfig(c)
//	_ = aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme)
//
// After this, RawExtensions in Kubernetes types will serialize kube-aggregator types
// correctly.
var AddToScheme = localSchemeBuilder.AddToScheme
func init() {
v1.AddToGroupVersion(Scheme, schema.GroupVersion{Version: "v1"})
utilruntime.Must(AddToScheme(Scheme))
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package fake
import (
"context"
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
labels "k8s.io/apimachinery/pkg/labels"
types "k8s.io/apimachinery/pkg/types"
watch "k8s.io/apimachinery/pkg/watch"
testing "k8s.io/client-go/testing"
)
// FakeIPPools implements IPPoolInterface
type FakeIPPools struct {
// Fake is the group-level fake client that every action is sent to.
Fake *FakeNeonvmV1
// ns is the namespace put into every action this client builds.
ns string
}
// ippoolsResource is the "ippools" resource under v1.SchemeGroupVersion.
var ippoolsResource = v1.SchemeGroupVersion.WithResource("ippools")
// ippoolsKind is the "IPPool" kind under v1.SchemeGroupVersion.
var ippoolsKind = v1.SchemeGroupVersion.WithKind("IPPool")
// Get builds a get action for the named iPPool in the client's namespace and
// passes it to the fake. It returns the resulting object as *v1.IPPool, or
// nil when the fake produced no object, together with the fake's error.
// ctx and options are not consulted.
func (c *FakeIPPools) Get(ctx context.Context, name string, options metav1.GetOptions) (result *v1.IPPool, err error) {
	action := testing.NewGetAction(ippoolsResource, c.ns, name)
	obj, err := c.Fake.Invokes(action, &v1.IPPool{})
	if obj != nil {
		return obj.(*v1.IPPool), err
	}
	return nil, err
}
// List builds a list action from opts, passes it to the fake, and returns
// the items of the resulting IPPoolList whose labels match the label
// selector extracted from opts. When no label selector is extracted, every
// item is kept. The ListMeta of the fake's list is carried over. A nil
// object from the fake yields a nil list. ctx is not consulted.
func (c *FakeIPPools) List(ctx context.Context, opts metav1.ListOptions) (result *v1.IPPoolList, err error) {
	action := testing.NewListAction(ippoolsResource, ippoolsKind, c.ns, opts)
	obj, err := c.Fake.Invokes(action, &v1.IPPoolList{})
	if obj == nil {
		return nil, err
	}

	selector, _, _ := testing.ExtractFromListOptions(opts)
	if selector == nil {
		selector = labels.Everything()
	}

	all := obj.(*v1.IPPoolList)
	filtered := &v1.IPPoolList{ListMeta: all.ListMeta}
	for idx := range all.Items {
		if !selector.Matches(labels.Set(all.Items[idx].Labels)) {
			continue
		}
		filtered.Items = append(filtered.Items, all.Items[idx])
	}
	return filtered, err
}
// Watch builds a watch action for iPPools in the client's namespace from
// opts and returns whatever the fake's InvokesWatch yields for it. ctx is
// not consulted.
func (c *FakeIPPools) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) {
	action := testing.NewWatchAction(ippoolsResource, c.ns, opts)
	return c.Fake.InvokesWatch(action)
}
// Create builds a create action carrying iPPool and passes it to the fake.
// It returns the resulting object as *v1.IPPool, or nil when the fake
// produced no object, together with the fake's error. ctx and opts are not
// consulted.
func (c *FakeIPPools) Create(ctx context.Context, iPPool *v1.IPPool, opts metav1.CreateOptions) (result *v1.IPPool, err error) {
	action := testing.NewCreateAction(ippoolsResource, c.ns, iPPool)
	obj, err := c.Fake.Invokes(action, &v1.IPPool{})
	if obj != nil {
		return obj.(*v1.IPPool), err
	}
	return nil, err
}
// Update builds an update action carrying iPPool and passes it to the fake.
// It returns the resulting object as *v1.IPPool, or nil when the fake
// produced no object, together with the fake's error. ctx and opts are not
// consulted.
func (c *FakeIPPools) Update(ctx context.Context, iPPool *v1.IPPool, opts metav1.UpdateOptions) (result *v1.IPPool, err error) {
	action := testing.NewUpdateAction(ippoolsResource, c.ns, iPPool)
	obj, err := c.Fake.Invokes(action, &v1.IPPool{})
	if obj != nil {
		return obj.(*v1.IPPool), err
	}
	return nil, err
}
// Delete builds a delete action for the named iPPool, including opts, passes
// it to the fake and returns the fake's error. ctx is not consulted.
func (c *FakeIPPools) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
	action := testing.NewDeleteActionWithOptions(ippoolsResource, c.ns, name, opts)
	_, err := c.Fake.Invokes(action, &v1.IPPool{})
	return err
}
// DeleteCollection builds a delete-collection action from listOpts, passes
// it to the fake and returns the fake's error. ctx and opts are not
// consulted.
func (c *FakeIPPools) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error {
	_, err := c.Fake.Invokes(
		testing.NewDeleteCollectionAction(ippoolsResource, c.ns, listOpts),
		&v1.IPPoolList{},
	)
	return err
}
// Patch builds a patch action for the named iPPool from the patch type,
// data and subresources, and passes it to the fake. It returns the
// resulting object as *v1.IPPool, or nil when the fake produced no object,
// together with the fake's error. ctx and opts are not consulted.
func (c *FakeIPPools) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.IPPool, err error) {
	action := testing.NewPatchSubresourceAction(ippoolsResource, c.ns, name, pt, data, subresources...)
	obj, err := c.Fake.Invokes(action, &v1.IPPool{})
	if obj != nil {
		return obj.(*v1.IPPool), err
	}
	return nil, err
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package fake
import (
v1 "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned/typed/neonvm/v1"
rest "k8s.io/client-go/rest"
testing "k8s.io/client-go/testing"
)
// FakeNeonvmV1 is the fake client for the neonvm v1 group. It embeds a
// *testing.Fake and hands itself to the per-resource fake clients it creates.
type FakeNeonvmV1 struct {
*testing.Fake
}
// IPPools returns a new FakeIPPools bound to this fake and the given namespace.
func (c *FakeNeonvmV1) IPPools(namespace string) v1.IPPoolInterface {
	return &FakeIPPools{Fake: c, ns: namespace}
}
// VirtualMachines returns a new FakeVirtualMachines bound to this fake and
// the given namespace.
func (c *FakeNeonvmV1) VirtualMachines(namespace string) v1.VirtualMachineInterface {
	return &FakeVirtualMachines{Fake: c, ns: namespace}
}
// VirtualMachineMigrations returns a new FakeVirtualMachineMigrations bound
// to this fake and the given namespace.
func (c *FakeNeonvmV1) VirtualMachineMigrations(namespace string) v1.VirtualMachineMigrationInterface {
	return &FakeVirtualMachineMigrations{Fake: c, ns: namespace}
}
// RESTClient exists to satisfy the client interface; the fake has no REST
// client. The value returned is a nil *rest.RESTClient stored in a
// rest.Interface, so the interface value itself does not compare equal to
// nil.
func (c *FakeNeonvmV1) RESTClient() rest.Interface {
	return (*rest.RESTClient)(nil)
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package fake
import (
"context"
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
labels "k8s.io/apimachinery/pkg/labels"
types "k8s.io/apimachinery/pkg/types"
watch "k8s.io/apimachinery/pkg/watch"
testing "k8s.io/client-go/testing"
)
// FakeVirtualMachines implements VirtualMachineInterface
type FakeVirtualMachines struct {
// Fake is the group-level fake client that every action is sent to.
Fake *FakeNeonvmV1
// ns is the namespace put into every action this client builds.
ns string
}
// virtualmachinesResource is the "virtualmachines" resource under v1.SchemeGroupVersion.
var virtualmachinesResource = v1.SchemeGroupVersion.WithResource("virtualmachines")
// virtualmachinesKind is the "VirtualMachine" kind under v1.SchemeGroupVersion.
var virtualmachinesKind = v1.SchemeGroupVersion.WithKind("VirtualMachine")
// Get builds a get action for the named virtualMachine in the client's
// namespace and passes it to the fake. It returns the resulting object as
// *v1.VirtualMachine, or nil when the fake produced no object, together with
// the fake's error. ctx and options are not consulted.
func (c *FakeVirtualMachines) Get(ctx context.Context, name string, options metav1.GetOptions) (result *v1.VirtualMachine, err error) {
	action := testing.NewGetAction(virtualmachinesResource, c.ns, name)
	obj, err := c.Fake.Invokes(action, &v1.VirtualMachine{})
	if obj != nil {
		return obj.(*v1.VirtualMachine), err
	}
	return nil, err
}
// List builds a list action from opts, passes it to the fake, and returns
// the items of the resulting VirtualMachineList whose labels match the label
// selector extracted from opts. When no label selector is extracted, every
// item is kept. The ListMeta of the fake's list is carried over. A nil
// object from the fake yields a nil list. ctx is not consulted.
func (c *FakeVirtualMachines) List(ctx context.Context, opts metav1.ListOptions) (result *v1.VirtualMachineList, err error) {
	action := testing.NewListAction(virtualmachinesResource, virtualmachinesKind, c.ns, opts)
	obj, err := c.Fake.Invokes(action, &v1.VirtualMachineList{})
	if obj == nil {
		return nil, err
	}

	selector, _, _ := testing.ExtractFromListOptions(opts)
	if selector == nil {
		selector = labels.Everything()
	}

	all := obj.(*v1.VirtualMachineList)
	filtered := &v1.VirtualMachineList{ListMeta: all.ListMeta}
	for idx := range all.Items {
		if !selector.Matches(labels.Set(all.Items[idx].Labels)) {
			continue
		}
		filtered.Items = append(filtered.Items, all.Items[idx])
	}
	return filtered, err
}
// Watch builds a watch action for virtualMachines in the client's namespace
// from opts and returns whatever the fake's InvokesWatch yields for it. ctx
// is not consulted.
func (c *FakeVirtualMachines) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) {
	action := testing.NewWatchAction(virtualmachinesResource, c.ns, opts)
	return c.Fake.InvokesWatch(action)
}
// Create builds a create action carrying virtualMachine and passes it to the
// fake. It returns the resulting object as *v1.VirtualMachine, or nil when
// the fake produced no object, together with the fake's error. ctx and opts
// are not consulted.
func (c *FakeVirtualMachines) Create(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.CreateOptions) (result *v1.VirtualMachine, err error) {
	action := testing.NewCreateAction(virtualmachinesResource, c.ns, virtualMachine)
	obj, err := c.Fake.Invokes(action, &v1.VirtualMachine{})
	if obj != nil {
		return obj.(*v1.VirtualMachine), err
	}
	return nil, err
}
// Update builds an update action carrying virtualMachine and passes it to
// the fake. It returns the resulting object as *v1.VirtualMachine, or nil
// when the fake produced no object, together with the fake's error. ctx and
// opts are not consulted.
func (c *FakeVirtualMachines) Update(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.UpdateOptions) (result *v1.VirtualMachine, err error) {
	action := testing.NewUpdateAction(virtualmachinesResource, c.ns, virtualMachine)
	obj, err := c.Fake.Invokes(action, &v1.VirtualMachine{})
	if obj != nil {
		return obj.(*v1.VirtualMachine), err
	}
	return nil, err
}
// UpdateStatus builds an update action on the "status" subresource carrying
// virtualMachine and passes it to the fake. It returns the resulting object
// as *v1.VirtualMachine, or nil when the fake produced no object, together
// with the fake's error. ctx and opts are not consulted.
//
// UpdateStatus was generated because the type contains a Status member.
// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus().
func (c *FakeVirtualMachines) UpdateStatus(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.UpdateOptions) (*v1.VirtualMachine, error) {
	action := testing.NewUpdateSubresourceAction(virtualmachinesResource, "status", c.ns, virtualMachine)
	obj, err := c.Fake.Invokes(action, &v1.VirtualMachine{})
	if obj != nil {
		return obj.(*v1.VirtualMachine), err
	}
	return nil, err
}
// Delete builds a delete action for the named virtualMachine, including
// opts, passes it to the fake and returns the fake's error. ctx is not
// consulted.
func (c *FakeVirtualMachines) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
	action := testing.NewDeleteActionWithOptions(virtualmachinesResource, c.ns, name, opts)
	_, err := c.Fake.Invokes(action, &v1.VirtualMachine{})
	return err
}
// DeleteCollection builds a delete-collection action from listOpts, passes
// it to the fake and returns the fake's error. ctx and opts are not
// consulted.
func (c *FakeVirtualMachines) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error {
	_, err := c.Fake.Invokes(
		testing.NewDeleteCollectionAction(virtualmachinesResource, c.ns, listOpts),
		&v1.VirtualMachineList{},
	)
	return err
}
// Patch builds a patch action for the named virtualMachine from the patch
// type, data and subresources, and passes it to the fake. It returns the
// resulting object as *v1.VirtualMachine, or nil when the fake produced no
// object, together with the fake's error. ctx and opts are not consulted.
func (c *FakeVirtualMachines) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.VirtualMachine, err error) {
	action := testing.NewPatchSubresourceAction(virtualmachinesResource, c.ns, name, pt, data, subresources...)
	obj, err := c.Fake.Invokes(action, &v1.VirtualMachine{})
	if obj != nil {
		return obj.(*v1.VirtualMachine), err
	}
	return nil, err
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package fake
import (
"context"
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
labels "k8s.io/apimachinery/pkg/labels"
types "k8s.io/apimachinery/pkg/types"
watch "k8s.io/apimachinery/pkg/watch"
testing "k8s.io/client-go/testing"
)
// FakeVirtualMachineMigrations implements VirtualMachineMigrationInterface
type FakeVirtualMachineMigrations struct {
// Fake is the group-level fake client that every action is sent to.
Fake *FakeNeonvmV1
// ns is the namespace put into every action this client builds.
ns string
}
// virtualmachinemigrationsResource is the "virtualmachinemigrations" resource under v1.SchemeGroupVersion.
var virtualmachinemigrationsResource = v1.SchemeGroupVersion.WithResource("virtualmachinemigrations")
// virtualmachinemigrationsKind is the "VirtualMachineMigration" kind under v1.SchemeGroupVersion.
var virtualmachinemigrationsKind = v1.SchemeGroupVersion.WithKind("VirtualMachineMigration")
// Get builds a get action for the named virtualMachineMigration in the
// client's namespace and passes it to the fake. It returns the resulting
// object as *v1.VirtualMachineMigration, or nil when the fake produced no
// object, together with the fake's error. ctx and options are not consulted.
func (c *FakeVirtualMachineMigrations) Get(ctx context.Context, name string, options metav1.GetOptions) (result *v1.VirtualMachineMigration, err error) {
	action := testing.NewGetAction(virtualmachinemigrationsResource, c.ns, name)
	obj, err := c.Fake.Invokes(action, &v1.VirtualMachineMigration{})
	if obj != nil {
		return obj.(*v1.VirtualMachineMigration), err
	}
	return nil, err
}
// List builds a list action from opts, passes it to the fake, and returns
// the items of the resulting VirtualMachineMigrationList whose labels match
// the label selector extracted from opts. When no label selector is
// extracted, every item is kept. The ListMeta of the fake's list is carried
// over. A nil object from the fake yields a nil list. ctx is not consulted.
func (c *FakeVirtualMachineMigrations) List(ctx context.Context, opts metav1.ListOptions) (result *v1.VirtualMachineMigrationList, err error) {
	action := testing.NewListAction(virtualmachinemigrationsResource, virtualmachinemigrationsKind, c.ns, opts)
	obj, err := c.Fake.Invokes(action, &v1.VirtualMachineMigrationList{})
	if obj == nil {
		return nil, err
	}

	selector, _, _ := testing.ExtractFromListOptions(opts)
	if selector == nil {
		selector = labels.Everything()
	}

	all := obj.(*v1.VirtualMachineMigrationList)
	filtered := &v1.VirtualMachineMigrationList{ListMeta: all.ListMeta}
	for idx := range all.Items {
		if !selector.Matches(labels.Set(all.Items[idx].Labels)) {
			continue
		}
		filtered.Items = append(filtered.Items, all.Items[idx])
	}
	return filtered, err
}
// Watch builds a watch action for virtualMachineMigrations in the client's
// namespace from opts and returns whatever the fake's InvokesWatch yields
// for it. ctx is not consulted.
func (c *FakeVirtualMachineMigrations) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) {
	action := testing.NewWatchAction(virtualmachinemigrationsResource, c.ns, opts)
	return c.Fake.InvokesWatch(action)
}
// Create builds a create action carrying virtualMachineMigration and passes
// it to the fake. It returns the resulting object as
// *v1.VirtualMachineMigration, or nil when the fake produced no object,
// together with the fake's error. ctx and opts are not consulted.
func (c *FakeVirtualMachineMigrations) Create(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.CreateOptions) (result *v1.VirtualMachineMigration, err error) {
	action := testing.NewCreateAction(virtualmachinemigrationsResource, c.ns, virtualMachineMigration)
	obj, err := c.Fake.Invokes(action, &v1.VirtualMachineMigration{})
	if obj != nil {
		return obj.(*v1.VirtualMachineMigration), err
	}
	return nil, err
}
// Update builds an update action carrying virtualMachineMigration and passes
// it to the fake. It returns the resulting object as
// *v1.VirtualMachineMigration, or nil when the fake produced no object,
// together with the fake's error. ctx and opts are not consulted.
func (c *FakeVirtualMachineMigrations) Update(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.UpdateOptions) (result *v1.VirtualMachineMigration, err error) {
	action := testing.NewUpdateAction(virtualmachinemigrationsResource, c.ns, virtualMachineMigration)
	obj, err := c.Fake.Invokes(action, &v1.VirtualMachineMigration{})
	if obj != nil {
		return obj.(*v1.VirtualMachineMigration), err
	}
	return nil, err
}
// UpdateStatus builds an update action on the "status" subresource carrying
// virtualMachineMigration and passes it to the fake. It returns the
// resulting object as *v1.VirtualMachineMigration, or nil when the fake
// produced no object, together with the fake's error. ctx and opts are not
// consulted.
//
// UpdateStatus was generated because the type contains a Status member.
// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus().
func (c *FakeVirtualMachineMigrations) UpdateStatus(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.UpdateOptions) (*v1.VirtualMachineMigration, error) {
	action := testing.NewUpdateSubresourceAction(virtualmachinemigrationsResource, "status", c.ns, virtualMachineMigration)
	obj, err := c.Fake.Invokes(action, &v1.VirtualMachineMigration{})
	if obj != nil {
		return obj.(*v1.VirtualMachineMigration), err
	}
	return nil, err
}
// Delete builds a delete action for the named virtualMachineMigration,
// including opts, passes it to the fake and returns the fake's error. ctx is
// not consulted.
func (c *FakeVirtualMachineMigrations) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
	action := testing.NewDeleteActionWithOptions(virtualmachinemigrationsResource, c.ns, name, opts)
	_, err := c.Fake.Invokes(action, &v1.VirtualMachineMigration{})
	return err
}
// DeleteCollection builds a delete-collection action from listOpts, passes
// it to the fake and returns the fake's error. ctx and opts are not
// consulted.
func (c *FakeVirtualMachineMigrations) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error {
	_, err := c.Fake.Invokes(
		testing.NewDeleteCollectionAction(virtualmachinemigrationsResource, c.ns, listOpts),
		&v1.VirtualMachineMigrationList{},
	)
	return err
}
// Patch builds a patch action for the named virtualMachineMigration from the
// patch type, data and subresources, and passes it to the fake. It returns
// the resulting object as *v1.VirtualMachineMigration, or nil when the fake
// produced no object, together with the fake's error. ctx and opts are not
// consulted.
func (c *FakeVirtualMachineMigrations) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.VirtualMachineMigration, err error) {
	action := testing.NewPatchSubresourceAction(virtualmachinemigrationsResource, c.ns, name, pt, data, subresources...)
	obj, err := c.Fake.Invokes(action, &v1.VirtualMachineMigration{})
	if obj != nil {
		return obj.(*v1.VirtualMachineMigration), err
	}
	return nil, err
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package v1
import (
"context"
"time"
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
scheme "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned/scheme"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
types "k8s.io/apimachinery/pkg/types"
watch "k8s.io/apimachinery/pkg/watch"
rest "k8s.io/client-go/rest"
)
// IPPoolsGetter has a method to return an IPPoolInterface.
// A group's client should implement this interface.
type IPPoolsGetter interface {
// IPPools returns an IPPoolInterface for the given namespace.
IPPools(namespace string) IPPoolInterface
}
// IPPoolInterface has methods to work with IPPool resources.
// It also embeds IPPoolExpansion, which is declared outside this section.
type IPPoolInterface interface {
// Create creates the given IPPool and returns the server's representation of it.
Create(ctx context.Context, iPPool *v1.IPPool, opts metav1.CreateOptions) (*v1.IPPool, error)
// Update updates the given IPPool and returns the server's representation of it.
Update(ctx context.Context, iPPool *v1.IPPool, opts metav1.UpdateOptions) (*v1.IPPool, error)
// Delete deletes the IPPool with the given name.
Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error
// DeleteCollection deletes the collection of IPPools selected by listOpts.
DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error
// Get returns the IPPool with the given name.
Get(ctx context.Context, name string, opts metav1.GetOptions) (*v1.IPPool, error)
// List returns the IPPools matching the selectors in opts.
List(ctx context.Context, opts metav1.ListOptions) (*v1.IPPoolList, error)
// Watch returns a watch.Interface that watches the requested IPPools.
Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error)
// Patch applies the patch data of type pt and returns the patched IPPool.
Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.IPPool, err error)
IPPoolExpansion
}
// iPPools implements IPPoolInterface
type iPPools struct {
// client is the REST client used to build every request.
client rest.Interface
// ns is the namespace passed to Namespace() on every request.
ns string
}
// newIPPools builds an iPPools client that uses c's REST client and is bound to namespace.
func newIPPools(c *NeonvmV1Client, namespace string) *iPPools {
	pools := new(iPPools)
	pools.client = c.RESTClient()
	pools.ns = namespace
	return pools
}
// Get issues a GET for the iPPool called name in namespace c.ns and decodes the response into
// a freshly allocated IPPool. The allocated object is returned even when err is non-nil.
func (c *iPPools) Get(ctx context.Context, name string, options metav1.GetOptions) (result *v1.IPPool, err error) {
	result = new(v1.IPPool)
	req := c.client.Get().
		Namespace(c.ns).
		Resource("ippools").
		Name(name).
		VersionedParams(&options, scheme.ParameterCodec)
	err = req.Do(ctx).Into(result)
	return result, err
}
// List issues a GET for the IPPools in namespace c.ns, using opts as request parameters, and
// decodes the response into a freshly allocated IPPoolList. opts.TimeoutSeconds, when set, is
// used as the request timeout; otherwise a zero duration is passed.
func (c *iPPools) List(ctx context.Context, opts metav1.ListOptions) (result *v1.IPPoolList, err error) {
	var timeout time.Duration
	if secs := opts.TimeoutSeconds; secs != nil {
		timeout = time.Duration(*secs) * time.Second
	}
	result = new(v1.IPPoolList)
	req := c.client.Get().
		Namespace(c.ns).
		Resource("ippools").
		VersionedParams(&opts, scheme.ParameterCodec).
		Timeout(timeout)
	err = req.Do(ctx).Into(result)
	return result, err
}
// Watch opens a watch on the IPPools in namespace c.ns. The local copy of opts has Watch forced
// to true before it is encoded; opts.TimeoutSeconds, when set, is used as the request timeout.
func (c *iPPools) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) {
	opts.Watch = true
	var timeout time.Duration
	if secs := opts.TimeoutSeconds; secs != nil {
		timeout = time.Duration(*secs) * time.Second
	}
	req := c.client.Get().
		Namespace(c.ns).
		Resource("ippools").
		VersionedParams(&opts, scheme.ParameterCodec).
		Timeout(timeout)
	return req.Watch(ctx)
}
// Create POSTs iPPool to namespace c.ns and decodes the server's response into a freshly
// allocated IPPool, which is returned together with any error.
func (c *iPPools) Create(ctx context.Context, iPPool *v1.IPPool, opts metav1.CreateOptions) (result *v1.IPPool, err error) {
	result = new(v1.IPPool)
	req := c.client.Post().
		Namespace(c.ns).
		Resource("ippools").
		VersionedParams(&opts, scheme.ParameterCodec).
		Body(iPPool)
	err = req.Do(ctx).Into(result)
	return result, err
}
// Update PUTs iPPool to the resource named iPPool.Name in namespace c.ns and decodes the
// server's response into a freshly allocated IPPool, which is returned together with any error.
func (c *iPPools) Update(ctx context.Context, iPPool *v1.IPPool, opts metav1.UpdateOptions) (result *v1.IPPool, err error) {
	result = new(v1.IPPool)
	req := c.client.Put().
		Namespace(c.ns).
		Resource("ippools").
		Name(iPPool.Name).
		VersionedParams(&opts, scheme.ParameterCodec).
		Body(iPPool)
	err = req.Do(ctx).Into(result)
	return result, err
}
// Delete issues a DELETE for the iPPool called name in namespace c.ns, sending opts as the
// request body, and returns the error of the request, if any.
func (c *iPPools) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
	req := c.client.Delete().
		Namespace(c.ns).
		Resource("ippools").
		Name(name).
		Body(&opts)
	return req.Do(ctx).Error()
}
// DeleteCollection issues a DELETE on the ippools collection in namespace c.ns. listOpts is
// encoded as request parameters and opts is sent as the body; listOpts.TimeoutSeconds, when
// set, is used as the request timeout.
func (c *iPPools) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error {
	var timeout time.Duration
	if secs := listOpts.TimeoutSeconds; secs != nil {
		timeout = time.Duration(*secs) * time.Second
	}
	req := c.client.Delete().
		Namespace(c.ns).
		Resource("ippools").
		VersionedParams(&listOpts, scheme.ParameterCodec).
		Timeout(timeout).
		Body(&opts)
	return req.Do(ctx).Error()
}
// Patch sends data as a patch of type pt to the iPPool called name (optionally addressing the
// given subresources) and decodes the server's response into a freshly allocated IPPool.
func (c *iPPools) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.IPPool, err error) {
	result = new(v1.IPPool)
	req := c.client.Patch(pt).
		Namespace(c.ns).
		Resource("ippools").
		Name(name).
		SubResource(subresources...).
		VersionedParams(&opts, scheme.ParameterCodec).
		Body(data)
	err = req.Do(ctx).Into(result)
	return result, err
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package v1
import (
"net/http"
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned/scheme"
rest "k8s.io/client-go/rest"
)
// NeonvmV1Interface groups the REST client accessor with the per-resource getters
// (IPPools, VirtualMachines, VirtualMachineMigrations) of the neonvm group.
type NeonvmV1Interface interface {
// RESTClient returns the underlying rest.Interface.
RESTClient() rest.Interface
IPPoolsGetter
VirtualMachinesGetter
VirtualMachineMigrationsGetter
}
// NeonvmV1Client is used to interact with features provided by the neonvm group.
type NeonvmV1Client struct {
// restClient is the REST client returned by RESTClient and handed to the typed clients.
restClient rest.Interface
}
// IPPools returns an IPPool client bound to namespace.
func (c *NeonvmV1Client) IPPools(namespace string) IPPoolInterface {
	pools := newIPPools(c, namespace)
	return pools
}
// VirtualMachines returns a VirtualMachine client bound to namespace.
func (c *NeonvmV1Client) VirtualMachines(namespace string) VirtualMachineInterface {
	vms := newVirtualMachines(c, namespace)
	return vms
}
// VirtualMachineMigrations returns a VirtualMachineMigration client bound to namespace.
func (c *NeonvmV1Client) VirtualMachineMigrations(namespace string) VirtualMachineMigrationInterface {
	migrations := newVirtualMachineMigrations(c, namespace)
	return migrations
}
// NewForConfig creates a new NeonvmV1Client for the given config.
// It works on a copy of c: defaults are applied to the copy, an HTTP client is derived from it
// with rest.HTTPClientFor, and both are handed to NewForConfigAndClient.
func NewForConfig(c *rest.Config) (*NeonvmV1Client, error) {
	cfg := *c
	err := setConfigDefaults(&cfg)
	if err != nil {
		return nil, err
	}
	hc, err := rest.HTTPClientFor(&cfg)
	if err != nil {
		return nil, err
	}
	return NewForConfigAndClient(&cfg, hc)
}
// NewForConfigAndClient creates a new NeonvmV1Client for the given config and http client.
// Note the http client provided takes precedence over the configured transport values.
// Defaults are applied to a copy of c, so the caller's config is not modified.
func NewForConfigAndClient(c *rest.Config, h *http.Client) (*NeonvmV1Client, error) {
	cfg := *c
	if err := setConfigDefaults(&cfg); err != nil {
		return nil, err
	}
	restClient, err := rest.RESTClientForConfigAndClient(&cfg, h)
	if err != nil {
		return nil, err
	}
	return &NeonvmV1Client{restClient: restClient}, nil
}
// NewForConfigOrDie is NewForConfig for callers that cannot handle an error: it returns the
// client on success and panics with the error otherwise.
func NewForConfigOrDie(c *rest.Config) *NeonvmV1Client {
	client, err := NewForConfig(c)
	if err == nil {
		return client
	}
	panic(err)
}
// New wraps the given REST client in a NeonvmV1Client.
func New(c rest.Interface) *NeonvmV1Client {
	return &NeonvmV1Client{restClient: c}
}
// setConfigDefaults fills config in place with the settings this client needs: the group/version
// from v1.SchemeGroupVersion, the "/apis" API path and the scheme's non-converting serializer.
// A user agent is only set when config has none. It always returns nil.
func setConfigDefaults(config *rest.Config) error {
	groupVersion := v1.SchemeGroupVersion
	config.GroupVersion = &groupVersion
	config.APIPath = "/apis"
	config.NegotiatedSerializer = scheme.Codecs.WithoutConversion()

	if len(config.UserAgent) == 0 {
		config.UserAgent = rest.DefaultKubernetesUserAgent()
	}

	return nil
}
// RESTClient returns the REST client this client implementation uses to talk to the API server.
// It is safe to call on a nil receiver, in which case nil is returned.
func (c *NeonvmV1Client) RESTClient() rest.Interface {
	if c != nil {
		return c.restClient
	}
	return nil
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package v1
import (
"context"
"time"
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
scheme "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned/scheme"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
types "k8s.io/apimachinery/pkg/types"
watch "k8s.io/apimachinery/pkg/watch"
rest "k8s.io/client-go/rest"
)
// VirtualMachinesGetter has a method to return a VirtualMachineInterface.
// A group's client should implement this interface.
type VirtualMachinesGetter interface {
// VirtualMachines returns a VirtualMachineInterface for the given namespace.
VirtualMachines(namespace string) VirtualMachineInterface
}
// VirtualMachineInterface has methods to work with VirtualMachine resources.
// It also embeds VirtualMachineExpansion, which is declared outside this section.
type VirtualMachineInterface interface {
// Create creates the given VirtualMachine and returns the server's representation of it.
Create(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.CreateOptions) (*v1.VirtualMachine, error)
// Update updates the given VirtualMachine and returns the server's representation of it.
Update(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.UpdateOptions) (*v1.VirtualMachine, error)
// UpdateStatus updates the "status" subresource of the given VirtualMachine.
UpdateStatus(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.UpdateOptions) (*v1.VirtualMachine, error)
// Delete deletes the VirtualMachine with the given name.
Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error
// DeleteCollection deletes the collection of VirtualMachines selected by listOpts.
DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error
// Get returns the VirtualMachine with the given name.
Get(ctx context.Context, name string, opts metav1.GetOptions) (*v1.VirtualMachine, error)
// List returns the VirtualMachines matching the selectors in opts.
List(ctx context.Context, opts metav1.ListOptions) (*v1.VirtualMachineList, error)
// Watch returns a watch.Interface that watches the requested VirtualMachines.
Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error)
// Patch applies the patch data of type pt and returns the patched VirtualMachine.
Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.VirtualMachine, err error)
VirtualMachineExpansion
}
// virtualMachines implements VirtualMachineInterface
type virtualMachines struct {
// client is the REST client used to build every request.
client rest.Interface
// ns is the namespace passed to Namespace() on every request.
ns string
}
// newVirtualMachines builds a virtualMachines client that uses c's REST client and is bound
// to namespace.
func newVirtualMachines(c *NeonvmV1Client, namespace string) *virtualMachines {
	vms := new(virtualMachines)
	vms.client = c.RESTClient()
	vms.ns = namespace
	return vms
}
// Get issues a GET for the virtualMachine called name in namespace c.ns and decodes the
// response into a freshly allocated VirtualMachine. The allocated object is returned even when
// err is non-nil.
func (c *virtualMachines) Get(ctx context.Context, name string, options metav1.GetOptions) (result *v1.VirtualMachine, err error) {
	result = new(v1.VirtualMachine)
	req := c.client.Get().
		Namespace(c.ns).
		Resource("virtualmachines").
		Name(name).
		VersionedParams(&options, scheme.ParameterCodec)
	err = req.Do(ctx).Into(result)
	return result, err
}
// List issues a GET for the VirtualMachines in namespace c.ns, using opts as request parameters,
// and decodes the response into a freshly allocated VirtualMachineList. opts.TimeoutSeconds,
// when set, is used as the request timeout; otherwise a zero duration is passed.
func (c *virtualMachines) List(ctx context.Context, opts metav1.ListOptions) (result *v1.VirtualMachineList, err error) {
	var timeout time.Duration
	if secs := opts.TimeoutSeconds; secs != nil {
		timeout = time.Duration(*secs) * time.Second
	}
	result = new(v1.VirtualMachineList)
	req := c.client.Get().
		Namespace(c.ns).
		Resource("virtualmachines").
		VersionedParams(&opts, scheme.ParameterCodec).
		Timeout(timeout)
	err = req.Do(ctx).Into(result)
	return result, err
}
// Watch opens a watch on the VirtualMachines in namespace c.ns. The local copy of opts has Watch
// forced to true before it is encoded; opts.TimeoutSeconds, when set, is used as the request
// timeout.
func (c *virtualMachines) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) {
	opts.Watch = true
	var timeout time.Duration
	if secs := opts.TimeoutSeconds; secs != nil {
		timeout = time.Duration(*secs) * time.Second
	}
	req := c.client.Get().
		Namespace(c.ns).
		Resource("virtualmachines").
		VersionedParams(&opts, scheme.ParameterCodec).
		Timeout(timeout)
	return req.Watch(ctx)
}
// Create POSTs virtualMachine to namespace c.ns and decodes the server's response into a freshly
// allocated VirtualMachine, which is returned together with any error.
func (c *virtualMachines) Create(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.CreateOptions) (result *v1.VirtualMachine, err error) {
	result = new(v1.VirtualMachine)
	req := c.client.Post().
		Namespace(c.ns).
		Resource("virtualmachines").
		VersionedParams(&opts, scheme.ParameterCodec).
		Body(virtualMachine)
	err = req.Do(ctx).Into(result)
	return result, err
}
// Update PUTs virtualMachine to the resource named virtualMachine.Name in namespace c.ns and
// decodes the server's response into a freshly allocated VirtualMachine, which is returned
// together with any error.
func (c *virtualMachines) Update(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.UpdateOptions) (result *v1.VirtualMachine, err error) {
	result = new(v1.VirtualMachine)
	req := c.client.Put().
		Namespace(c.ns).
		Resource("virtualmachines").
		Name(virtualMachine.Name).
		VersionedParams(&opts, scheme.ParameterCodec).
		Body(virtualMachine)
	err = req.Do(ctx).Into(result)
	return result, err
}
// UpdateStatus was generated because the type contains a Status member.
// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus().
// It PUTs virtualMachine to the "status" subresource of the resource named virtualMachine.Name
// and decodes the server's response into a freshly allocated VirtualMachine.
func (c *virtualMachines) UpdateStatus(ctx context.Context, virtualMachine *v1.VirtualMachine, opts metav1.UpdateOptions) (result *v1.VirtualMachine, err error) {
	result = new(v1.VirtualMachine)
	req := c.client.Put().
		Namespace(c.ns).
		Resource("virtualmachines").
		Name(virtualMachine.Name).
		SubResource("status").
		VersionedParams(&opts, scheme.ParameterCodec).
		Body(virtualMachine)
	err = req.Do(ctx).Into(result)
	return result, err
}
// Delete issues a DELETE for the virtualMachine called name in namespace c.ns, sending opts as
// the request body, and returns the error of the request, if any.
func (c *virtualMachines) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
	req := c.client.Delete().
		Namespace(c.ns).
		Resource("virtualmachines").
		Name(name).
		Body(&opts)
	return req.Do(ctx).Error()
}
// DeleteCollection issues a DELETE on the virtualmachines collection in namespace c.ns. listOpts
// is encoded as request parameters and opts is sent as the body; listOpts.TimeoutSeconds, when
// set, is used as the request timeout.
func (c *virtualMachines) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error {
	var timeout time.Duration
	if secs := listOpts.TimeoutSeconds; secs != nil {
		timeout = time.Duration(*secs) * time.Second
	}
	req := c.client.Delete().
		Namespace(c.ns).
		Resource("virtualmachines").
		VersionedParams(&listOpts, scheme.ParameterCodec).
		Timeout(timeout).
		Body(&opts)
	return req.Do(ctx).Error()
}
// Patch sends data as a patch of type pt to the virtualMachine called name (optionally
// addressing the given subresources) and decodes the server's response into a freshly allocated
// VirtualMachine.
func (c *virtualMachines) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.VirtualMachine, err error) {
	result = new(v1.VirtualMachine)
	req := c.client.Patch(pt).
		Namespace(c.ns).
		Resource("virtualmachines").
		Name(name).
		SubResource(subresources...).
		VersionedParams(&opts, scheme.ParameterCodec).
		Body(data)
	err = req.Do(ctx).Into(result)
	return result, err
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by client-gen. DO NOT EDIT.
package v1
import (
"context"
"time"
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
scheme "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned/scheme"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
types "k8s.io/apimachinery/pkg/types"
watch "k8s.io/apimachinery/pkg/watch"
rest "k8s.io/client-go/rest"
)
// VirtualMachineMigrationsGetter has a method to return a VirtualMachineMigrationInterface.
// A group's client should implement this interface.
type VirtualMachineMigrationsGetter interface {
// VirtualMachineMigrations returns a VirtualMachineMigrationInterface for the given namespace.
VirtualMachineMigrations(namespace string) VirtualMachineMigrationInterface
}
// VirtualMachineMigrationInterface has methods to work with VirtualMachineMigration resources.
// It also embeds VirtualMachineMigrationExpansion, which is declared outside this section.
type VirtualMachineMigrationInterface interface {
// Create creates the given VirtualMachineMigration and returns the server's representation of it.
Create(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.CreateOptions) (*v1.VirtualMachineMigration, error)
// Update updates the given VirtualMachineMigration and returns the server's representation of it.
Update(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.UpdateOptions) (*v1.VirtualMachineMigration, error)
// UpdateStatus updates the "status" subresource of the given VirtualMachineMigration.
UpdateStatus(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.UpdateOptions) (*v1.VirtualMachineMigration, error)
// Delete deletes the VirtualMachineMigration with the given name.
Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error
// DeleteCollection deletes the collection of VirtualMachineMigrations selected by listOpts.
DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error
// Get returns the VirtualMachineMigration with the given name.
Get(ctx context.Context, name string, opts metav1.GetOptions) (*v1.VirtualMachineMigration, error)
// List returns the VirtualMachineMigrations matching the selectors in opts.
List(ctx context.Context, opts metav1.ListOptions) (*v1.VirtualMachineMigrationList, error)
// Watch returns a watch.Interface that watches the requested VirtualMachineMigrations.
Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error)
// Patch applies the patch data of type pt and returns the patched VirtualMachineMigration.
Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.VirtualMachineMigration, err error)
VirtualMachineMigrationExpansion
}
// virtualMachineMigrations implements VirtualMachineMigrationInterface
type virtualMachineMigrations struct {
// client is the REST client used to build every request.
client rest.Interface
// ns is the namespace passed to Namespace() on every request.
ns string
}
// newVirtualMachineMigrations builds a virtualMachineMigrations client that uses c's REST client
// and is bound to namespace.
func newVirtualMachineMigrations(c *NeonvmV1Client, namespace string) *virtualMachineMigrations {
	migrations := new(virtualMachineMigrations)
	migrations.client = c.RESTClient()
	migrations.ns = namespace
	return migrations
}
// Get issues a GET for the virtualMachineMigration called name in namespace c.ns and decodes the
// response into a freshly allocated VirtualMachineMigration. The allocated object is returned
// even when err is non-nil.
func (c *virtualMachineMigrations) Get(ctx context.Context, name string, options metav1.GetOptions) (result *v1.VirtualMachineMigration, err error) {
	result = new(v1.VirtualMachineMigration)
	req := c.client.Get().
		Namespace(c.ns).
		Resource("virtualmachinemigrations").
		Name(name).
		VersionedParams(&options, scheme.ParameterCodec)
	err = req.Do(ctx).Into(result)
	return result, err
}
// List issues a GET for the VirtualMachineMigrations in namespace c.ns, using opts as request
// parameters, and decodes the response into a freshly allocated VirtualMachineMigrationList.
// opts.TimeoutSeconds, when set, is used as the request timeout; otherwise a zero duration is
// passed.
func (c *virtualMachineMigrations) List(ctx context.Context, opts metav1.ListOptions) (result *v1.VirtualMachineMigrationList, err error) {
	var timeout time.Duration
	if secs := opts.TimeoutSeconds; secs != nil {
		timeout = time.Duration(*secs) * time.Second
	}
	result = new(v1.VirtualMachineMigrationList)
	req := c.client.Get().
		Namespace(c.ns).
		Resource("virtualmachinemigrations").
		VersionedParams(&opts, scheme.ParameterCodec).
		Timeout(timeout)
	err = req.Do(ctx).Into(result)
	return result, err
}
// Watch opens a watch on the VirtualMachineMigrations in namespace c.ns. The local copy of opts
// has Watch forced to true before it is encoded; opts.TimeoutSeconds, when set, is used as the
// request timeout.
func (c *virtualMachineMigrations) Watch(ctx context.Context, opts metav1.ListOptions) (watch.Interface, error) {
	opts.Watch = true
	var timeout time.Duration
	if secs := opts.TimeoutSeconds; secs != nil {
		timeout = time.Duration(*secs) * time.Second
	}
	req := c.client.Get().
		Namespace(c.ns).
		Resource("virtualmachinemigrations").
		VersionedParams(&opts, scheme.ParameterCodec).
		Timeout(timeout)
	return req.Watch(ctx)
}
// Create POSTs virtualMachineMigration to namespace c.ns and decodes the server's response into
// a freshly allocated VirtualMachineMigration, which is returned together with any error.
func (c *virtualMachineMigrations) Create(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.CreateOptions) (result *v1.VirtualMachineMigration, err error) {
	result = new(v1.VirtualMachineMigration)
	req := c.client.Post().
		Namespace(c.ns).
		Resource("virtualmachinemigrations").
		VersionedParams(&opts, scheme.ParameterCodec).
		Body(virtualMachineMigration)
	err = req.Do(ctx).Into(result)
	return result, err
}
// Update PUTs virtualMachineMigration to the resource named virtualMachineMigration.Name in
// namespace c.ns and decodes the server's response into a freshly allocated
// VirtualMachineMigration, which is returned together with any error.
func (c *virtualMachineMigrations) Update(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.UpdateOptions) (result *v1.VirtualMachineMigration, err error) {
	result = new(v1.VirtualMachineMigration)
	req := c.client.Put().
		Namespace(c.ns).
		Resource("virtualmachinemigrations").
		Name(virtualMachineMigration.Name).
		VersionedParams(&opts, scheme.ParameterCodec).
		Body(virtualMachineMigration)
	err = req.Do(ctx).Into(result)
	return result, err
}
// UpdateStatus was generated because the type contains a Status member.
// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus().
// It PUTs virtualMachineMigration to the "status" subresource of the resource named
// virtualMachineMigration.Name and decodes the server's response into a freshly allocated
// VirtualMachineMigration.
func (c *virtualMachineMigrations) UpdateStatus(ctx context.Context, virtualMachineMigration *v1.VirtualMachineMigration, opts metav1.UpdateOptions) (result *v1.VirtualMachineMigration, err error) {
	result = new(v1.VirtualMachineMigration)
	req := c.client.Put().
		Namespace(c.ns).
		Resource("virtualmachinemigrations").
		Name(virtualMachineMigration.Name).
		SubResource("status").
		VersionedParams(&opts, scheme.ParameterCodec).
		Body(virtualMachineMigration)
	err = req.Do(ctx).Into(result)
	return result, err
}
// Delete issues a DELETE for the virtualMachineMigration called name in namespace c.ns, sending
// opts as the request body, and returns the error of the request, if any.
func (c *virtualMachineMigrations) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
	req := c.client.Delete().
		Namespace(c.ns).
		Resource("virtualmachinemigrations").
		Name(name).
		Body(&opts)
	return req.Do(ctx).Error()
}
// DeleteCollection issues a DELETE on the virtualmachinemigrations collection in namespace c.ns.
// listOpts is encoded as request parameters and opts is sent as the body;
// listOpts.TimeoutSeconds, when set, is used as the request timeout.
func (c *virtualMachineMigrations) DeleteCollection(ctx context.Context, opts metav1.DeleteOptions, listOpts metav1.ListOptions) error {
	var timeout time.Duration
	if secs := listOpts.TimeoutSeconds; secs != nil {
		timeout = time.Duration(*secs) * time.Second
	}
	req := c.client.Delete().
		Namespace(c.ns).
		Resource("virtualmachinemigrations").
		VersionedParams(&listOpts, scheme.ParameterCodec).
		Timeout(timeout).
		Body(&opts)
	return req.Do(ctx).Error()
}
// Patch sends data as a patch of type pt to the virtualMachineMigration called name (optionally
// addressing the given subresources) and decodes the server's response into a freshly allocated
// VirtualMachineMigration.
func (c *virtualMachineMigrations) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions, subresources ...string) (result *v1.VirtualMachineMigration, err error) {
	result = new(v1.VirtualMachineMigration)
	req := c.client.Patch(pt).
		Namespace(c.ns).
		Resource("virtualmachinemigrations").
		Name(name).
		SubResource(subresources...).
		VersionedParams(&opts, scheme.ParameterCodec).
		Body(data)
	err = req.Do(ctx).Into(result)
	return result, err
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by informer-gen. DO NOT EDIT.
package externalversions
import (
reflect "reflect"
sync "sync"
time "time"
versioned "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
internalinterfaces "github.com/neondatabase/autoscaling/neonvm/client/informers/externalversions/internalinterfaces"
neonvm "github.com/neondatabase/autoscaling/neonvm/client/informers/externalversions/neonvm"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
schema "k8s.io/apimachinery/pkg/runtime/schema"
cache "k8s.io/client-go/tools/cache"
)
// SharedInformerOption defines the functional option type for SharedInformerFactory.
// An option receives the factory under construction and returns the factory to keep using.
type SharedInformerOption func(*sharedInformerFactory) *sharedInformerFactory
// sharedInformerFactory creates informers on demand, keyed by object type, and tracks
// which of them have been started.
type sharedInformerFactory struct {
// client is handed to the constructor function when a new informer is created.
client versioned.Interface
// namespace and tweakListOptions are passed on to the group interface returned by Neonvm.
namespace string
tweakListOptions internalinterfaces.TweakListOptionsFunc
// lock is held while informers, startedInformers and shuttingDown are accessed.
lock sync.Mutex
// defaultResync is the resync period used when customResync has no entry for a type.
defaultResync time.Duration
// customResync holds per-type resync periods that override defaultResync.
customResync map[reflect.Type]time.Duration
// informers holds the informers created so far, keyed by the type of their object.
informers map[reflect.Type]cache.SharedIndexInformer
// startedInformers is used for tracking which informers have been started.
// This allows Start() to be called multiple times safely.
startedInformers map[reflect.Type]bool
// wg tracks how many goroutines were started.
wg sync.WaitGroup
// shuttingDown is true when Shutdown has been called. It may still be running
// because it needs to wait for goroutines.
shuttingDown bool
}
// WithCustomResyncConfig returns an option that registers a custom resync period for each
// object in resyncConfig, keyed by the object's reflected type.
func WithCustomResyncConfig(resyncConfig map[v1.Object]time.Duration) SharedInformerOption {
	apply := func(f *sharedInformerFactory) *sharedInformerFactory {
		for obj, period := range resyncConfig {
			objType := reflect.TypeOf(obj)
			f.customResync[objType] = period
		}
		return f
	}
	return apply
}
// WithTweakListOptions returns an option that stores tweakListOptions on the factory, so it is
// applied as a custom filter on all listers of the configured SharedInformerFactory.
func WithTweakListOptions(tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerOption {
	apply := func(f *sharedInformerFactory) *sharedInformerFactory {
		f.tweakListOptions = tweakListOptions
		return f
	}
	return apply
}
// WithNamespace returns an option that sets the factory's namespace, limiting the
// SharedInformerFactory to that namespace.
func WithNamespace(namespace string) SharedInformerOption {
	apply := func(f *sharedInformerFactory) *sharedInformerFactory {
		f.namespace = namespace
		return f
	}
	return apply
}
// NewSharedInformerFactory constructs a new instance of sharedInformerFactory for all
// namespaces by calling NewSharedInformerFactoryWithOptions without any options.
func NewSharedInformerFactory(client versioned.Interface, defaultResync time.Duration) SharedInformerFactory {
	factory := NewSharedInformerFactoryWithOptions(client, defaultResync)
	return factory
}
// NewFilteredSharedInformerFactory constructs a new instance of sharedInformerFactory
// restricted to namespace and using tweakListOptions. Listers obtained via this
// SharedInformerFactory will be subject to the same filters as specified here.
//
// Deprecated: Please use NewSharedInformerFactoryWithOptions instead
func NewFilteredSharedInformerFactory(client versioned.Interface, defaultResync time.Duration, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerFactory {
	opts := []SharedInformerOption{
		WithNamespace(namespace),
		WithTweakListOptions(tweakListOptions),
	}
	return NewSharedInformerFactoryWithOptions(client, defaultResync, opts...)
}
// NewSharedInformerFactoryWithOptions constructs a new instance of a SharedInformerFactory with
// additional options. The factory starts out covering all namespaces with empty informer,
// started-informer and custom-resync maps; the options are then applied in the order given.
func NewSharedInformerFactoryWithOptions(client versioned.Interface, defaultResync time.Duration, options ...SharedInformerOption) SharedInformerFactory {
	factory := new(sharedInformerFactory)
	factory.client = client
	factory.namespace = v1.NamespaceAll
	factory.defaultResync = defaultResync
	factory.informers = make(map[reflect.Type]cache.SharedIndexInformer)
	factory.startedInformers = make(map[reflect.Type]bool)
	factory.customResync = make(map[reflect.Type]time.Duration)

	// Each option returns the factory that subsequent options (and the caller) should use.
	for i := range options {
		factory = options[i](factory)
	}

	return factory
}
// Start launches, each in its own goroutine, every informer that has been requested but not yet
// started, and marks it as started. The goroutines run informer.Run(stopCh) and are tracked by
// f.wg. Start does nothing once Shutdown has been called.
func (f *sharedInformerFactory) Start(stopCh <-chan struct{}) {
	f.lock.Lock()
	defer f.lock.Unlock()

	if f.shuttingDown {
		return
	}

	for informerType, informer := range f.informers {
		if f.startedInformers[informerType] {
			continue
		}
		f.wg.Add(1)
		// The informer is passed as an argument so that each goroutine works on the value
		// of its own iteration rather than on the changing loop variable.
		go func(inf cache.SharedIndexInformer) {
			defer f.wg.Done()
			inf.Run(stopCh)
		}(informer)
		f.startedInformers[informerType] = true
	}
}
// Shutdown marks the factory as shutting down (under the lock) and then, without holding the
// lock, blocks until all goroutines started by Start have finished.
func (f *sharedInformerFactory) Shutdown() {
	func() {
		f.lock.Lock()
		defer f.lock.Unlock()
		f.shuttingDown = true
	}()

	// Will return immediately if there is nothing to wait for.
	f.wg.Wait()
}
// WaitForCacheSync snapshots the started informers under the lock and then, without holding
// the lock, calls cache.WaitForCacheSync for each of them with stopCh. The returned map holds
// the result of that call per informer type.
func (f *sharedInformerFactory) WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool {
	f.lock.Lock()
	started := make(map[reflect.Type]cache.SharedIndexInformer)
	for informerType, informer := range f.informers {
		if !f.startedInformers[informerType] {
			continue
		}
		started[informerType] = informer
	}
	f.lock.Unlock()

	synced := make(map[reflect.Type]bool)
	for informerType, informer := range started {
		synced[informerType] = cache.WaitForCacheSync(stopCh, informer.HasSynced)
	}
	return synced
}
// InformerFor returns the SharedIndexInformer for obj using an internal
// client. An informer already registered for obj's type is reused; otherwise newFunc is called
// with the factory's client and the resync period for that type (the custom one if configured,
// else the default), and the new informer is stored for later calls.
func (f *sharedInformerFactory) InformerFor(obj runtime.Object, newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer {
	f.lock.Lock()
	defer f.lock.Unlock()

	key := reflect.TypeOf(obj)
	if existing, ok := f.informers[key]; ok {
		return existing
	}

	period := f.defaultResync
	if custom, ok := f.customResync[key]; ok {
		period = custom
	}

	created := newFunc(f.client, period)
	f.informers[key] = created
	return created
}
// SharedInformerFactory provides shared informers for resources in all known
// API group versions.
//
// It is typically used like this:
//
//	ctx, cancel := context.WithCancel(context.Background())
//	defer cancel()
//	factory := NewSharedInformerFactory(client, resyncPeriod)
//	defer factory.Shutdown() // Returns immediately if nothing was started.
//	genericInformer := factory.ForResource(resource)
//	typedInformer := factory.SomeAPIGroup().V1().SomeType()
//	factory.Start(ctx.Done()) // Start processing these informers.
//	synced := factory.WaitForCacheSync(ctx.Done())
//	for v, ok := range synced {
//		if !ok {
//			fmt.Fprintf(os.Stderr, "caches failed to sync: %v", v)
//			return
//		}
//	}
//
//	// Informers can also be created after Start, but then
//	// Start must be called again:
//	anotherGenericInformer := factory.ForResource(resource)
//	factory.Start(ctx.Done())
type SharedInformerFactory interface {
internalinterfaces.SharedInformerFactory
// Start initializes all requested informers. They are handled in goroutines
// which run until the stop channel gets closed.
Start(stopCh <-chan struct{})
// Shutdown marks a factory as shutting down. At that point no new
// informers can be started anymore and Start will return without
// doing anything.
//
// In addition, Shutdown blocks until all goroutines have terminated. For that
// to happen, the close channel(s) that they were started with must be closed,
// either before Shutdown gets called or while it is waiting.
//
// Shutdown may be called multiple times, even concurrently. All such calls will
// block until all goroutines have terminated.
Shutdown()
// WaitForCacheSync blocks until all started informers' caches were synced
// or the stop channel gets closed.
WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool
// ForResource gives generic access to a shared informer of the matching type.
ForResource(resource schema.GroupVersionResource) (GenericInformer, error)
// InformerFor returns the SharedIndexInformer for obj using an internal
// client.
InformerFor(obj runtime.Object, newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer
// Neonvm returns the informer interface of the neonvm group.
Neonvm() neonvm.Interface
}
// Neonvm returns the neonvm group interface, built from this factory together with its
// namespace and tweakListOptions settings.
func (f *sharedInformerFactory) Neonvm() neonvm.Interface {
	group := neonvm.New(f, f.namespace, f.tweakListOptions)
	return group
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by informer-gen. DO NOT EDIT.
package externalversions
import (
"fmt"
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
schema "k8s.io/apimachinery/pkg/runtime/schema"
cache "k8s.io/client-go/tools/cache"
)
// GenericInformer is a type of SharedIndexInformer which will locate and delegate to other
// sharedInformers based on type.
type GenericInformer interface {
// Informer returns a cache.SharedIndexInformer.
Informer() cache.SharedIndexInformer
// Lister returns a cache.GenericLister.
Lister() cache.GenericLister
}
// genericInformer implements GenericInformer by pairing a shared informer with the
// group/resource it serves.
type genericInformer struct {
// informer is returned as-is by Informer and supplies the indexer used by Lister.
informer cache.SharedIndexInformer
// resource is passed to cache.NewGenericLister when building a lister.
resource schema.GroupResource
}
// Informer exposes the shared index informer held by this generic informer.
func (f *genericInformer) Informer() cache.SharedIndexInformer {
	shared := f.informer
	return shared
}
// Lister builds a GenericLister from the informer's indexer and the stored
// group/resource.
func (f *genericInformer) Lister() cache.GenericLister {
	indexer := f.Informer().GetIndexer()
	return cache.NewGenericLister(indexer, f.resource)
}
// ForResource gives generic access to a shared informer of the matching type.
// It returns an error for any resource that is not one of the neonvm/v1 kinds
// handled below.
// TODO extend this to unknown resources with a client pool
func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) {
	var shared cache.SharedIndexInformer

	switch resource {
	// Group=neonvm, Version=v1
	case v1.SchemeGroupVersion.WithResource("ippools"):
		shared = f.Neonvm().V1().IPPools().Informer()
	case v1.SchemeGroupVersion.WithResource("virtualmachines"):
		shared = f.Neonvm().V1().VirtualMachines().Informer()
	case v1.SchemeGroupVersion.WithResource("virtualmachinemigrations"):
		shared = f.Neonvm().V1().VirtualMachineMigrations().Informer()
	default:
		return nil, fmt.Errorf("no informer found for %v", resource)
	}

	return &genericInformer{resource: resource.GroupResource(), informer: shared}, nil
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by informer-gen. DO NOT EDIT.
package neonvm
import (
internalinterfaces "github.com/neondatabase/autoscaling/neonvm/client/informers/externalversions/internalinterfaces"
v1 "github.com/neondatabase/autoscaling/neonvm/client/informers/externalversions/neonvm/v1"
)
// Interface provides access to each of the neonvm group's versions.
type Interface interface {
// V1 provides access to shared informers for resources in V1.
V1() v1.Interface
}
// group implements Interface. It stores the arguments given to New so that V1 can
// forward them to the version-level constructor.
type group struct {
factory internalinterfaces.SharedInformerFactory
namespace string
tweakListOptions internalinterfaces.TweakListOptionsFunc
}
// New wraps the given factory, namespace and list-option tweak function in a
// group and returns it as an Interface.
func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface {
	g := new(group)
	g.factory = f
	g.namespace = namespace
	g.tweakListOptions = tweakListOptions
	return g
}
// V1 constructs a v1.Interface from the group's factory, namespace and
// list-option tweak function.
func (g *group) V1() v1.Interface {
	factory, ns, tweak := g.factory, g.namespace, g.tweakListOptions
	return v1.New(factory, ns, tweak)
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by informer-gen. DO NOT EDIT.
package v1
import (
internalinterfaces "github.com/neondatabase/autoscaling/neonvm/client/informers/externalversions/internalinterfaces"
)
// Interface provides access to all the informers in this group version.
type Interface interface {
// IPPools returns an IPPoolInformer.
IPPools() IPPoolInformer
// VirtualMachines returns a VirtualMachineInformer.
VirtualMachines() VirtualMachineInformer
// VirtualMachineMigrations returns a VirtualMachineMigrationInformer.
VirtualMachineMigrations() VirtualMachineMigrationInformer
}
// version implements Interface. Its fields are copied into every informer
// accessor it hands out.
type version struct {
factory internalinterfaces.SharedInformerFactory
namespace string
tweakListOptions internalinterfaces.TweakListOptionsFunc
}
// New wraps the given factory, namespace and list-option tweak function in a
// version and returns it as an Interface.
func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface {
	v := new(version)
	v.factory = f
	v.namespace = namespace
	v.tweakListOptions = tweakListOptions
	return v
}
// IPPools returns an IPPoolInformer carrying this version's factory, namespace
// and list-option tweak function.
func (v *version) IPPools() IPPoolInformer {
	informer := &iPPoolInformer{
		factory:          v.factory,
		namespace:        v.namespace,
		tweakListOptions: v.tweakListOptions,
	}
	return informer
}
// VirtualMachines returns a VirtualMachineInformer carrying this version's
// factory, namespace and list-option tweak function.
func (v *version) VirtualMachines() VirtualMachineInformer {
	informer := &virtualMachineInformer{
		factory:          v.factory,
		namespace:        v.namespace,
		tweakListOptions: v.tweakListOptions,
	}
	return informer
}
// VirtualMachineMigrations returns a VirtualMachineMigrationInformer carrying
// this version's factory, namespace and list-option tweak function.
func (v *version) VirtualMachineMigrations() VirtualMachineMigrationInformer {
	informer := &virtualMachineMigrationInformer{
		factory:          v.factory,
		namespace:        v.namespace,
		tweakListOptions: v.tweakListOptions,
	}
	return informer
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by informer-gen. DO NOT EDIT.
package v1
import (
"context"
time "time"
neonvmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
versioned "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
internalinterfaces "github.com/neondatabase/autoscaling/neonvm/client/informers/externalversions/internalinterfaces"
v1 "github.com/neondatabase/autoscaling/neonvm/client/listers/neonvm/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
watch "k8s.io/apimachinery/pkg/watch"
cache "k8s.io/client-go/tools/cache"
)
// IPPoolInformer provides access to a shared informer and lister for
// IPPools.
type IPPoolInformer interface {
// Informer returns the cache.SharedIndexInformer for IPPools.
Informer() cache.SharedIndexInformer
// Lister returns an IPPoolLister.
Lister() v1.IPPoolLister
}
// iPPoolInformer implements IPPoolInformer. The factory is asked for the shared
// informer; namespace and tweakListOptions are forwarded to
// NewFilteredIPPoolInformer when the default informer is constructed.
type iPPoolInformer struct {
factory internalinterfaces.SharedInformerFactory
tweakListOptions internalinterfaces.TweakListOptionsFunc
namespace string
}
// NewIPPoolInformer constructs a new informer for IPPool type, without any
// list-option tweak function.
// Always prefer using an informer factory to get a shared informer instead of getting an independent
// one. This reduces memory footprint and number of connections to the server.
func NewIPPoolInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer {
	// nil: list/watch options are used exactly as the informer supplies them.
	informer := NewFilteredIPPoolInformer(client, namespace, resyncPeriod, indexers, nil)
	return informer
}
// NewFilteredIPPoolInformer constructs a new informer for IPPool type. When
// tweakListOptions is non-nil it is applied to the options of every list and
// watch request before the request is sent.
// Always prefer using an informer factory to get a shared informer instead of getting an independent
// one. This reduces memory footprint and number of connections to the server.
func NewFilteredIPPoolInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer {
	// applyTweak runs the optional caller-supplied hook on the request options.
	applyTweak := func(opts *metav1.ListOptions) {
		if tweakListOptions != nil {
			tweakListOptions(opts)
		}
	}

	listFn := func(options metav1.ListOptions) (runtime.Object, error) {
		applyTweak(&options)
		return client.NeonvmV1().IPPools(namespace).List(context.TODO(), options)
	}
	watchFn := func(options metav1.ListOptions) (watch.Interface, error) {
		applyTweak(&options)
		return client.NeonvmV1().IPPools(namespace).Watch(context.TODO(), options)
	}

	source := &cache.ListWatch{ListFunc: listFn, WatchFunc: watchFn}
	return cache.NewSharedIndexInformer(source, &neonvmv1.IPPool{}, resyncPeriod, indexers)
}
// defaultInformer builds the IPPool informer handed to the factory: indexed by
// namespace, and using this accessor's namespace and list-option tweak function.
func (f *iPPoolInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer {
	byNamespace := cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}
	return NewFilteredIPPoolInformer(client, f.namespace, resyncPeriod, byNamespace, f.tweakListOptions)
}
// Informer asks the factory for the IPPool informer, passing defaultInformer as
// the constructor.
func (f *iPPoolInformer) Informer() cache.SharedIndexInformer {
	objType := &neonvmv1.IPPool{}
	return f.factory.InformerFor(objType, f.defaultInformer)
}
// Lister returns an IPPoolLister that reads from the informer's indexer.
func (f *iPPoolInformer) Lister() v1.IPPoolLister {
	indexer := f.Informer().GetIndexer()
	return v1.NewIPPoolLister(indexer)
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by informer-gen. DO NOT EDIT.
package v1
import (
"context"
time "time"
neonvmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
versioned "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
internalinterfaces "github.com/neondatabase/autoscaling/neonvm/client/informers/externalversions/internalinterfaces"
v1 "github.com/neondatabase/autoscaling/neonvm/client/listers/neonvm/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
watch "k8s.io/apimachinery/pkg/watch"
cache "k8s.io/client-go/tools/cache"
)
// VirtualMachineInformer provides access to a shared informer and lister for
// VirtualMachines.
type VirtualMachineInformer interface {
// Informer returns the cache.SharedIndexInformer for VirtualMachines.
Informer() cache.SharedIndexInformer
// Lister returns a VirtualMachineLister.
Lister() v1.VirtualMachineLister
}
// virtualMachineInformer implements VirtualMachineInformer. The factory is asked
// for the shared informer; namespace and tweakListOptions are forwarded to
// NewFilteredVirtualMachineInformer when the default informer is constructed.
type virtualMachineInformer struct {
factory internalinterfaces.SharedInformerFactory
tweakListOptions internalinterfaces.TweakListOptionsFunc
namespace string
}
// NewVirtualMachineInformer constructs a new informer for VirtualMachine type,
// without any list-option tweak function.
// Always prefer using an informer factory to get a shared informer instead of getting an independent
// one. This reduces memory footprint and number of connections to the server.
func NewVirtualMachineInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer {
	// nil: list/watch options are used exactly as the informer supplies them.
	informer := NewFilteredVirtualMachineInformer(client, namespace, resyncPeriod, indexers, nil)
	return informer
}
// NewFilteredVirtualMachineInformer constructs a new informer for VirtualMachine
// type. When tweakListOptions is non-nil it is applied to the options of every
// list and watch request before the request is sent.
// Always prefer using an informer factory to get a shared informer instead of getting an independent
// one. This reduces memory footprint and number of connections to the server.
func NewFilteredVirtualMachineInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer {
	// applyTweak runs the optional caller-supplied hook on the request options.
	applyTweak := func(opts *metav1.ListOptions) {
		if tweakListOptions != nil {
			tweakListOptions(opts)
		}
	}

	listFn := func(options metav1.ListOptions) (runtime.Object, error) {
		applyTweak(&options)
		return client.NeonvmV1().VirtualMachines(namespace).List(context.TODO(), options)
	}
	watchFn := func(options metav1.ListOptions) (watch.Interface, error) {
		applyTweak(&options)
		return client.NeonvmV1().VirtualMachines(namespace).Watch(context.TODO(), options)
	}

	source := &cache.ListWatch{ListFunc: listFn, WatchFunc: watchFn}
	return cache.NewSharedIndexInformer(source, &neonvmv1.VirtualMachine{}, resyncPeriod, indexers)
}
// defaultInformer builds the VirtualMachine informer handed to the factory:
// indexed by namespace, and using this accessor's namespace and list-option
// tweak function.
func (f *virtualMachineInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer {
	byNamespace := cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}
	return NewFilteredVirtualMachineInformer(client, f.namespace, resyncPeriod, byNamespace, f.tweakListOptions)
}
// Informer asks the factory for the VirtualMachine informer, passing
// defaultInformer as the constructor.
func (f *virtualMachineInformer) Informer() cache.SharedIndexInformer {
	objType := &neonvmv1.VirtualMachine{}
	return f.factory.InformerFor(objType, f.defaultInformer)
}
// Lister returns a VirtualMachineLister that reads from the informer's indexer.
func (f *virtualMachineInformer) Lister() v1.VirtualMachineLister {
	indexer := f.Informer().GetIndexer()
	return v1.NewVirtualMachineLister(indexer)
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by informer-gen. DO NOT EDIT.
package v1
import (
"context"
time "time"
neonvmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
versioned "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
internalinterfaces "github.com/neondatabase/autoscaling/neonvm/client/informers/externalversions/internalinterfaces"
v1 "github.com/neondatabase/autoscaling/neonvm/client/listers/neonvm/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
watch "k8s.io/apimachinery/pkg/watch"
cache "k8s.io/client-go/tools/cache"
)
// VirtualMachineMigrationInformer provides access to a shared informer and lister for
// VirtualMachineMigrations.
type VirtualMachineMigrationInformer interface {
// Informer returns the cache.SharedIndexInformer for VirtualMachineMigrations.
Informer() cache.SharedIndexInformer
// Lister returns a VirtualMachineMigrationLister.
Lister() v1.VirtualMachineMigrationLister
}
// virtualMachineMigrationInformer implements VirtualMachineMigrationInformer.
// The factory is asked for the shared informer; namespace and tweakListOptions
// are forwarded to NewFilteredVirtualMachineMigrationInformer when the default
// informer is constructed.
type virtualMachineMigrationInformer struct {
factory internalinterfaces.SharedInformerFactory
tweakListOptions internalinterfaces.TweakListOptionsFunc
namespace string
}
// NewVirtualMachineMigrationInformer constructs a new informer for
// VirtualMachineMigration type, without any list-option tweak function.
// Always prefer using an informer factory to get a shared informer instead of getting an independent
// one. This reduces memory footprint and number of connections to the server.
func NewVirtualMachineMigrationInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer {
	// nil: list/watch options are used exactly as the informer supplies them.
	informer := NewFilteredVirtualMachineMigrationInformer(client, namespace, resyncPeriod, indexers, nil)
	return informer
}
// NewFilteredVirtualMachineMigrationInformer constructs a new informer for
// VirtualMachineMigration type. When tweakListOptions is non-nil it is applied
// to the options of every list and watch request before the request is sent.
// Always prefer using an informer factory to get a shared informer instead of getting an independent
// one. This reduces memory footprint and number of connections to the server.
func NewFilteredVirtualMachineMigrationInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer {
	// applyTweak runs the optional caller-supplied hook on the request options.
	applyTweak := func(opts *metav1.ListOptions) {
		if tweakListOptions != nil {
			tweakListOptions(opts)
		}
	}

	listFn := func(options metav1.ListOptions) (runtime.Object, error) {
		applyTweak(&options)
		return client.NeonvmV1().VirtualMachineMigrations(namespace).List(context.TODO(), options)
	}
	watchFn := func(options metav1.ListOptions) (watch.Interface, error) {
		applyTweak(&options)
		return client.NeonvmV1().VirtualMachineMigrations(namespace).Watch(context.TODO(), options)
	}

	source := &cache.ListWatch{ListFunc: listFn, WatchFunc: watchFn}
	return cache.NewSharedIndexInformer(source, &neonvmv1.VirtualMachineMigration{}, resyncPeriod, indexers)
}
// defaultInformer builds the VirtualMachineMigration informer handed to the
// factory: indexed by namespace, and using this accessor's namespace and
// list-option tweak function.
func (f *virtualMachineMigrationInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer {
	byNamespace := cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}
	return NewFilteredVirtualMachineMigrationInformer(client, f.namespace, resyncPeriod, byNamespace, f.tweakListOptions)
}
// Informer asks the factory for the VirtualMachineMigration informer, passing
// defaultInformer as the constructor.
func (f *virtualMachineMigrationInformer) Informer() cache.SharedIndexInformer {
	objType := &neonvmv1.VirtualMachineMigration{}
	return f.factory.InformerFor(objType, f.defaultInformer)
}
// Lister returns a VirtualMachineMigrationLister that reads from the informer's
// indexer.
func (f *virtualMachineMigrationInformer) Lister() v1.VirtualMachineMigrationLister {
	indexer := f.Informer().GetIndexer()
	return v1.NewVirtualMachineMigrationLister(indexer)
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by lister-gen. DO NOT EDIT.
package v1
import (
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/tools/cache"
)
// IPPoolLister helps list IPPools.
// All objects returned here must be treated as read-only.
type IPPoolLister interface {
// List lists all IPPools in the indexer.
// Objects returned here must be treated as read-only.
List(selector labels.Selector) (ret []*v1.IPPool, err error)
// IPPools returns an object that can list and get IPPools.
IPPools(namespace string) IPPoolNamespaceLister
// IPPoolListerExpansion is embedded here; its declaration is not part of
// this file.
IPPoolListerExpansion
}
// iPPoolLister implements the IPPoolLister interface.
type iPPoolLister struct {
// indexer is the store that List and the namespace listers read from.
indexer cache.Indexer
}
// NewIPPoolLister returns an IPPoolLister that reads from the given indexer.
func NewIPPoolLister(indexer cache.Indexer) IPPoolLister {
	lister := new(iPPoolLister)
	lister.indexer = indexer
	return lister
}
// List returns every IPPool in the indexer that matches selector, together with
// any error reported by cache.ListAll.
func (s *iPPoolLister) List(selector labels.Selector) (ret []*v1.IPPool, err error) {
	collect := func(obj interface{}) {
		ret = append(ret, obj.(*v1.IPPool))
	}
	err = cache.ListAll(s.indexer, selector, collect)
	return ret, err
}
// IPPools returns a lister restricted to the given namespace, sharing this
// lister's indexer.
func (s *iPPoolLister) IPPools(namespace string) IPPoolNamespaceLister {
	scoped := iPPoolNamespaceLister{indexer: s.indexer, namespace: namespace}
	return scoped
}
// IPPoolNamespaceLister helps list and get IPPools.
// All objects returned here must be treated as read-only.
type IPPoolNamespaceLister interface {
// List lists all IPPools in the indexer for a given namespace.
// Objects returned here must be treated as read-only.
List(selector labels.Selector) (ret []*v1.IPPool, err error)
// Get retrieves the IPPool from the indexer for a given namespace and name.
// Objects returned here must be treated as read-only.
Get(name string) (*v1.IPPool, error)
// IPPoolNamespaceListerExpansion is embedded here; its declaration is not
// part of this file.
IPPoolNamespaceListerExpansion
}
// iPPoolNamespaceLister implements the IPPoolNamespaceLister
// interface.
type iPPoolNamespaceLister struct {
// indexer is the store that List and Get read from.
indexer cache.Indexer
// namespace scopes List, and is the prefix of the key that Get looks up.
namespace string
}
// List returns every IPPool in this lister's namespace that matches selector,
// together with any error reported by cache.ListAllByNamespace.
func (s iPPoolNamespaceLister) List(selector labels.Selector) (ret []*v1.IPPool, err error) {
	collect := func(obj interface{}) {
		ret = append(ret, obj.(*v1.IPPool))
	}
	err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, collect)
	return ret, err
}
// Get looks up the IPPool stored under "<namespace>/<name>" in the indexer.
// It returns the indexer's error if the lookup fails, and a NotFound error for
// the "ippool" resource if no such key exists.
func (s iPPoolNamespaceLister) Get(name string) (*v1.IPPool, error) {
	key := s.namespace + "/" + name
	obj, exists, err := s.indexer.GetByKey(key)
	switch {
	case err != nil:
		return nil, err
	case !exists:
		return nil, errors.NewNotFound(v1.Resource("ippool"), name)
	}
	return obj.(*v1.IPPool), nil
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by lister-gen. DO NOT EDIT.
package v1
import (
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/tools/cache"
)
// VirtualMachineLister helps list VirtualMachines.
// All objects returned here must be treated as read-only.
type VirtualMachineLister interface {
// List lists all VirtualMachines in the indexer.
// Objects returned here must be treated as read-only.
List(selector labels.Selector) (ret []*v1.VirtualMachine, err error)
// VirtualMachines returns an object that can list and get VirtualMachines.
VirtualMachines(namespace string) VirtualMachineNamespaceLister
// VirtualMachineListerExpansion is embedded here; its declaration is not
// part of this file.
VirtualMachineListerExpansion
}
// virtualMachineLister implements the VirtualMachineLister interface.
type virtualMachineLister struct {
// indexer is the store that List and the namespace listers read from.
indexer cache.Indexer
}
// NewVirtualMachineLister returns a VirtualMachineLister that reads from the
// given indexer.
func NewVirtualMachineLister(indexer cache.Indexer) VirtualMachineLister {
	lister := new(virtualMachineLister)
	lister.indexer = indexer
	return lister
}
// List returns every VirtualMachine in the indexer that matches selector,
// together with any error reported by cache.ListAll.
func (s *virtualMachineLister) List(selector labels.Selector) (ret []*v1.VirtualMachine, err error) {
	collect := func(obj interface{}) {
		ret = append(ret, obj.(*v1.VirtualMachine))
	}
	err = cache.ListAll(s.indexer, selector, collect)
	return ret, err
}
// VirtualMachines returns a lister restricted to the given namespace, sharing
// this lister's indexer.
func (s *virtualMachineLister) VirtualMachines(namespace string) VirtualMachineNamespaceLister {
	scoped := virtualMachineNamespaceLister{indexer: s.indexer, namespace: namespace}
	return scoped
}
// VirtualMachineNamespaceLister helps list and get VirtualMachines.
// All objects returned here must be treated as read-only.
type VirtualMachineNamespaceLister interface {
// List lists all VirtualMachines in the indexer for a given namespace.
// Objects returned here must be treated as read-only.
List(selector labels.Selector) (ret []*v1.VirtualMachine, err error)
// Get retrieves the VirtualMachine from the indexer for a given namespace and name.
// Objects returned here must be treated as read-only.
Get(name string) (*v1.VirtualMachine, error)
// VirtualMachineNamespaceListerExpansion is embedded here; its declaration
// is not part of this file.
VirtualMachineNamespaceListerExpansion
}
// virtualMachineNamespaceLister implements the VirtualMachineNamespaceLister
// interface.
type virtualMachineNamespaceLister struct {
// indexer is the store that List and Get read from.
indexer cache.Indexer
// namespace scopes List, and is the prefix of the key that Get looks up.
namespace string
}
// List returns every VirtualMachine in this lister's namespace that matches
// selector, together with any error reported by cache.ListAllByNamespace.
func (s virtualMachineNamespaceLister) List(selector labels.Selector) (ret []*v1.VirtualMachine, err error) {
	collect := func(obj interface{}) {
		ret = append(ret, obj.(*v1.VirtualMachine))
	}
	err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, collect)
	return ret, err
}
// Get looks up the VirtualMachine stored under "<namespace>/<name>" in the
// indexer. It returns the indexer's error if the lookup fails, and a NotFound
// error for the "virtualmachine" resource if no such key exists.
func (s virtualMachineNamespaceLister) Get(name string) (*v1.VirtualMachine, error) {
	key := s.namespace + "/" + name
	obj, exists, err := s.indexer.GetByKey(key)
	switch {
	case err != nil:
		return nil, err
	case !exists:
		return nil, errors.NewNotFound(v1.Resource("virtualmachine"), name)
	}
	return obj.(*v1.VirtualMachine), nil
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by lister-gen. DO NOT EDIT.
package v1
import (
v1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/tools/cache"
)
// VirtualMachineMigrationLister helps list VirtualMachineMigrations.
// All objects returned here must be treated as read-only.
type VirtualMachineMigrationLister interface {
// List lists all VirtualMachineMigrations in the indexer.
// Objects returned here must be treated as read-only.
List(selector labels.Selector) (ret []*v1.VirtualMachineMigration, err error)
// VirtualMachineMigrations returns an object that can list and get VirtualMachineMigrations.
VirtualMachineMigrations(namespace string) VirtualMachineMigrationNamespaceLister
// VirtualMachineMigrationListerExpansion is embedded here; its declaration
// is not part of this file.
VirtualMachineMigrationListerExpansion
}
// virtualMachineMigrationLister implements the VirtualMachineMigrationLister interface.
type virtualMachineMigrationLister struct {
// indexer is the store that List and the namespace listers read from.
indexer cache.Indexer
}
// NewVirtualMachineMigrationLister returns a VirtualMachineMigrationLister that
// reads from the given indexer.
func NewVirtualMachineMigrationLister(indexer cache.Indexer) VirtualMachineMigrationLister {
	lister := new(virtualMachineMigrationLister)
	lister.indexer = indexer
	return lister
}
// List returns every VirtualMachineMigration in the indexer that matches
// selector, together with any error reported by cache.ListAll.
func (s *virtualMachineMigrationLister) List(selector labels.Selector) (ret []*v1.VirtualMachineMigration, err error) {
	collect := func(obj interface{}) {
		ret = append(ret, obj.(*v1.VirtualMachineMigration))
	}
	err = cache.ListAll(s.indexer, selector, collect)
	return ret, err
}
// VirtualMachineMigrations returns a lister restricted to the given namespace,
// sharing this lister's indexer.
func (s *virtualMachineMigrationLister) VirtualMachineMigrations(namespace string) VirtualMachineMigrationNamespaceLister {
	scoped := virtualMachineMigrationNamespaceLister{indexer: s.indexer, namespace: namespace}
	return scoped
}
// VirtualMachineMigrationNamespaceLister helps list and get VirtualMachineMigrations.
// All objects returned here must be treated as read-only.
type VirtualMachineMigrationNamespaceLister interface {
// List lists all VirtualMachineMigrations in the indexer for a given namespace.
// Objects returned here must be treated as read-only.
List(selector labels.Selector) (ret []*v1.VirtualMachineMigration, err error)
// Get retrieves the VirtualMachineMigration from the indexer for a given namespace and name.
// Objects returned here must be treated as read-only.
Get(name string) (*v1.VirtualMachineMigration, error)
// VirtualMachineMigrationNamespaceListerExpansion is embedded here; its
// declaration is not part of this file.
VirtualMachineMigrationNamespaceListerExpansion
}
// virtualMachineMigrationNamespaceLister implements the VirtualMachineMigrationNamespaceLister
// interface.
type virtualMachineMigrationNamespaceLister struct {
// indexer is the store that List and Get read from.
indexer cache.Indexer
// namespace scopes List, and is the prefix of the key that Get looks up.
namespace string
}
// List returns every VirtualMachineMigration in this lister's namespace that
// matches selector, together with any error reported by
// cache.ListAllByNamespace.
func (s virtualMachineMigrationNamespaceLister) List(selector labels.Selector) (ret []*v1.VirtualMachineMigration, err error) {
	collect := func(obj interface{}) {
		ret = append(ret, obj.(*v1.VirtualMachineMigration))
	}
	err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, collect)
	return ret, err
}
// Get looks up the VirtualMachineMigration stored under "<namespace>/<name>" in
// the indexer. It returns the indexer's error if the lookup fails, and a
// NotFound error for the "virtualmachinemigration" resource if no such key
// exists.
func (s virtualMachineMigrationNamespaceLister) Get(name string) (*v1.VirtualMachineMigration, error) {
	key := s.namespace + "/" + name
	obj, exists, err := s.indexer.GetByKey(key)
	switch {
	case err != nil:
		return nil, err
	case !exists:
		return nil, errors.NewNotFound(v1.Resource("virtualmachinemigration"), name)
	}
	return obj.(*v1.VirtualMachineMigration), nil
}
package controllers
import (
"context"
"fmt"
"runtime/debug"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)
// catchPanicReconciler wraps another reconciler so that a panic raised during
// Reconcile is recovered and reported as an error.
type catchPanicReconciler struct {
// inner is the reconciler that Reconcile delegates to.
inner reconcile.Reconciler
}
// withCatchPanic wraps r in a catchPanicReconciler, so that panics from r's
// Reconcile are converted into returned errors.
func withCatchPanic(r reconcile.Reconciler) reconcile.Reconciler {
	wrapped := new(catchPanicReconciler)
	wrapped.inner = r
	return wrapped
}
// Reconcile delegates to the wrapped reconciler. If the wrapped call panics, the
// panic is recovered, logged with a stack trace through the logger taken from
// ctx, and returned as an error of the form "panicked with: <value>".
func (r *catchPanicReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) {
	logger := log.FromContext(ctx)

	defer func() {
		v := recover()
		if v == nil {
			return
		}
		// Overwrite the named result so the caller sees the panic as an error.
		err = fmt.Errorf("panicked with: %v", v)
		logger.Error(err, "Reconcile panicked", "stack", string(debug.Stack()))
	}()

	return r.inner.Reconcile(ctx, req)
}
package controllers
import (
"time"
"k8s.io/apimachinery/pkg/types"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
// ReconcilerConfig stores shared configuration for VirtualMachineReconciler and
// VirtualMachineMigrationReconciler.
type ReconcilerConfig struct {
// IsK3s is true iff the cluster is running k3s nodes.
//
// This is required because - unlike the other most common kubernetes distributions - k3s
// changes the location of the containerd socket.
// There unfortunately does not appear to be a way to disable this behavior.
IsK3s bool
// UseContainerMgr, if true, enables using container-mgr for new VM runner pods.
//
// This is defined as a config option so we can do a gradual rollout of this change.
UseContainerMgr bool
// DisableRunnerCgroup, if true, disables running QEMU in a cgroup in new VM runner pods.
// Fractional CPU scaling will continue to *pretend* to work, but it will not do anything in
// practice.
//
// Under the hood, this results in passing -skip-cgroup-management and -enable-dummy-cpu-server
// to neonvm-runner.
DisableRunnerCgroup bool
// MaxConcurrentReconciles is not consumed anywhere in this file.
// NOTE(review): presumably the cap on concurrent reconciles handed to the
// controller setup — confirm against the call sites.
MaxConcurrentReconciles int
// SkipUpdateValidationFor is the set of object names that we should ignore when doing webhook
// update validation.
SkipUpdateValidationFor map[types.NamespacedName]struct{}
// QEMUDiskCacheSettings sets the values of the 'cache.*' settings used for QEMU disks.
//
// This field is passed to neonvm-runner as the `-qemu-disk-cache-settings` arg, and is directly
// used in setting up the VM disks via QEMU's `-drive` flag.
QEMUDiskCacheSettings string
// DefaultMemoryProvider is the memory provider (dimm slots or virtio-mem) that will be used for
// new VMs (or, when old ones restart) if nothing is explicitly set.
DefaultMemoryProvider vmv1.MemoryProvider
// MemhpAutoMovableRatio specifies the value that new neonvm-runners will set as the
// kernel's 'memory_hotplug.auto_movable_ratio', iff the memory provider is virtio-mem.
//
// This value is passed directly to neonvm-runner as the '-memhp-auto-movable-ratio' flag.
// We've confirmed sensible values are from 301 to 801 (i.e. 3.01:1 through 8.01:1).
// The range of sensible values may extend further, but we have not tested that.
MemhpAutoMovableRatio string
// FailurePendingPeriod is the period for the propagation of
// reconciliation failures to the observability instruments.
FailurePendingPeriod time.Duration
// FailingRefreshInterval is the interval between consecutive
// updates of metrics and logs related to failing reconciliations.
FailingRefreshInterval time.Duration
}
// criEndpointSocketPath returns the path of the containerd socket: the
// k3s-specific location when IsK3s is set, and the standard containerd location
// otherwise.
func (c *ReconcilerConfig) criEndpointSocketPath() string {
	if c.IsK3s {
		return "/run/k3s/containerd/containerd.sock"
	}
	return "/run/containerd/containerd.sock"
}
package failurelag
import (
"sync"
"time"
)
// Tracker accumulates failure events for a given key and determines if
// the key is degraded. A key becomes degraded once it has seen only failures
// for at least the configured pending period. A success event clears both the
// degraded state and the pending period for that key.
//
// All exported methods take the internal lock before touching state.
type Tracker[T comparable] struct {
	// period is how long a key must stay failing before it counts as degraded.
	period time.Duration

	// pendingSince holds, per key, the time of the first failure recorded since
	// the key's last success (or since creation).
	pendingSince map[T]time.Time
	// degraded is the set of keys currently considered degraded.
	degraded map[T]struct{}
	// degradeAt is the queue of future checks, appended to in RecordFailure and
	// consumed from the front by forward.
	degradeAt []degradeAt[T]

	lock sync.Mutex

	// Now is the clock used for every timestamp; NewTracker sets it to time.Now.
	Now func() time.Time
}

// degradeAt is a scheduled check: at time ts, key becomes degraded if it has
// been pending for a full period by then.
type degradeAt[T comparable] struct {
	ts  time.Time
	key T
}

// NewTracker returns an empty Tracker that uses the given pending period and the
// wall clock.
func NewTracker[T comparable](period time.Duration) *Tracker[T] {
	t := &Tracker[T]{period: period, Now: time.Now}
	t.pendingSince = map[T]time.Time{}
	t.degraded = map[T]struct{}{}
	t.degradeAt = make([]degradeAt[T], 0)
	return t
}

// forward processes all the degradeAt events that are now in the past, marking
// keys as degraded where appropriate, and drops those events from the queue.
// The caller must hold t.lock.
func (t *Tracker[T]) forward(now time.Time) {
	processed := 0
	for _, ev := range t.degradeAt {
		if ev.ts.After(now) {
			// Everything from here on is still in the future.
			break
		}
		processed++

		// No pendingSince entry: a success arrived after this failure.
		// An entry younger than a full period: a success and a newer failure
		// arrived in between; that newer failure queued its own event.
		since, pending := t.pendingSince[ev.key]
		if pending && ev.ts.Sub(since) >= t.period {
			t.degraded[ev.key] = struct{}{}
		}
	}
	t.degradeAt = t.degradeAt[processed:]
}

// RecordSuccess clears the degraded and pending state of key, then processes any
// queued events that are due.
func (t *Tracker[T]) RecordSuccess(key T) {
	t.lock.Lock()
	defer t.lock.Unlock()

	delete(t.degraded, key)
	delete(t.pendingSince, key)

	t.forward(t.Now())
}

// RecordFailure notes a failure for key. It starts the pending period if key is
// not already pending, queues a check one period from now, and processes any
// queued events that are due.
func (t *Tracker[T]) RecordFailure(key T) {
	t.lock.Lock()
	defer t.lock.Unlock()

	now := t.Now()
	if _, pending := t.pendingSince[key]; !pending {
		t.pendingSince[key] = now
	}

	deadline := now.Add(t.period)
	t.degradeAt = append(t.degradeAt, degradeAt[T]{key: key, ts: deadline})

	t.forward(now)
}

// DegradedCount returns the number of keys that are degraded as of now.
func (t *Tracker[T]) DegradedCount() int {
	t.lock.Lock()
	defer t.lock.Unlock()

	t.forward(t.Now())
	return len(t.degraded)
}

// Degraded returns the keys that are degraded as of now, in unspecified order.
func (t *Tracker[T]) Degraded() []T {
	t.lock.Lock()
	defer t.lock.Unlock()

	t.forward(t.Now())

	out := make([]T, 0, len(t.degraded))
	for key := range t.degraded {
		out = append(out, key)
	}
	return out
}
package controllers
import (
"context"
"fmt"
"time"
"github.com/go-logr/logr"
"github.com/prometheus/client_golang/prometheus"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/metrics"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"k8s.io/apimachinery/pkg/api/errors"
"github.com/neondatabase/autoscaling/neonvm/controllers/failurelag"
"github.com/neondatabase/autoscaling/pkg/util"
)
// ReconcilerMetrics groups the Prometheus collectors that the reconcilers in
// this package report to. Instances are built by MakeReconcilerMetrics, which
// also registers every collector.
type ReconcilerMetrics struct {
// failing is a gauge of objects currently failing to reconcile, labelled by
// controller name and outcome.
failing *prometheus.GaugeVec
// vmCreationToRunnerCreationTime observes the time from
// VirtualMachine.CreationTimestamp to runner Pod.CreationTimestamp, in seconds.
vmCreationToRunnerCreationTime prometheus.Histogram
// runnerCreationToVMRunningTime observes the time from runner
// Pod.CreationTimestamp until the VM phase becomes Running, in seconds.
runnerCreationToVMRunningTime prometheus.Histogram
// vmCreationToVMRunningTime observes the time from
// VirtualMachine.CreationTimestamp until the VM phase becomes Running, in seconds.
vmCreationToVMRunningTime prometheus.Histogram
// vmRestartCounts counts VM restarts seen by the VirtualMachine reconciler.
vmRestartCounts prometheus.Counter
// reconcileDuration observes the duration of each reconcile, labelled by outcome.
reconcileDuration prometheus.HistogramVec
}
// OutcomeLabel is the name of the metric label that carries the
// ReconcileOutcome of a reconcile ("success", "failure" or "conflict").
const OutcomeLabel = "outcome"
// MakeReconcilerMetrics constructs all reconciler metrics and registers each of
// them with the controller-runtime metrics registry.
func MakeReconcilerMetrics() ReconcilerMetrics {
	// Bucket boundaries (in seconds) copied from the controller-runtime latency
	// metric. They can be tuned later if they turn out to be a poor fit.
	buckets := []float64{
		0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
		1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60,
	}

	// registerHistogram builds and registers a label-less histogram that uses
	// the shared bucket layout.
	registerHistogram := func(name, help string) prometheus.Histogram {
		opts := prometheus.HistogramOpts{
			Name:    name,
			Help:    help,
			Buckets: buckets,
		}
		return util.RegisterMetric(metrics.Registry, prometheus.NewHistogram(opts))
	}

	failingOpts := prometheus.GaugeOpts{
		Name: "reconcile_failing_objects",
		Help: "Number of objects that are failing to reconcile for each specific controller",
	}
	restartOpts := prometheus.CounterOpts{
		Name: "vm_restarts_count",
		Help: "Total number of VM restarts across the cluster captured by VirtualMachine reconciler",
	}
	durationOpts := prometheus.HistogramOpts{
		Name:    "reconcile_duration_seconds",
		Help:    "Time duration of reconciles",
		Buckets: buckets,
	}

	// The field initializers run top to bottom, so collectors are registered in
	// the order they are listed here.
	return ReconcilerMetrics{
		failing: util.RegisterMetric(
			metrics.Registry,
			prometheus.NewGaugeVec(failingOpts, []string{"controller", OutcomeLabel}),
		),
		vmCreationToRunnerCreationTime: registerHistogram(
			"vm_creation_to_runner_creation_duration_seconds",
			"Time duration from VirtualMachine.CreationTimestamp to runner Pod.CreationTimestamp",
		),
		runnerCreationToVMRunningTime: registerHistogram(
			"vm_runner_creation_to_vm_running_duration_seconds",
			"Time duration from runner Pod.CreationTimestamp to the moment when VirtualMachine.Status.Phase becomes Running",
		),
		vmCreationToVMRunningTime: registerHistogram(
			"vm_creation_to_vm_running_duration_seconds",
			"Time duration from VirtualMachine.CreationTimeStamp to the moment when VirtualMachine.Status.Phase becomes Running",
		),
		vmRestartCounts: util.RegisterMetric(
			metrics.Registry,
			prometheus.NewCounter(restartOpts),
		),
		reconcileDuration: *util.RegisterMetric(
			metrics.Registry,
			prometheus.NewHistogramVec(durationOpts, []string{OutcomeLabel}),
		),
	}
}
// ReconcileOutcome classifies the result of a single reconcile. Its string
// value is used as the value of the OutcomeLabel metric label.
type ReconcileOutcome string
const (
// SuccessOutcome: the reconcile returned no error.
SuccessOutcome ReconcileOutcome = "success"
// FailureOutcome: the reconcile returned an error that is not a conflict.
FailureOutcome ReconcileOutcome = "failure"
// ConflictOutcome: the reconcile returned a conflict error.
ConflictOutcome ReconcileOutcome = "conflict"
)
// ObserveReconcileDuration records how long one reconcile took, in seconds,
// in the reconcile duration histogram under the given outcome label.
func (m ReconcilerMetrics) ObserveReconcileDuration(
	outcome ReconcileOutcome,
	duration time.Duration,
) {
	observer := m.reconcileDuration.WithLabelValues(string(outcome))
	observer.Observe(duration.Seconds())
}
// wrappedReconciler decorates a Reconciler with metrics and with tracking of
// objects that keep failing to reconcile. It is constructed by WithMetrics.
type wrappedReconciler struct {
// ControllerName is used as the "controller" label value and in log messages.
ControllerName string
// Reconciler is the wrapped reconciler that does the actual work.
Reconciler reconcile.Reconciler
// Metrics holds the collectors updated on every reconcile.
Metrics ReconcilerMetrics
// refreshFailingInterval is the wait between periodic refreshes of the
// failing-objects gauge and logs (see runRefreshFailing).
refreshFailingInterval time.Duration
// failing tracks objects whose reconciles fail with non-conflict errors.
failing *failurelag.Tracker[client.ObjectKey]
// conflicting tracks objects whose reconciles fail with conflict errors.
conflicting *failurelag.Tracker[client.ObjectKey]
}
// ReconcilerWithMetrics is a Reconciler produced by WithMetrics that can return a snapshot of the
// state backing the metrics.
type ReconcilerWithMetrics interface {
reconcile.Reconciler
// Snapshot returns the objects currently considered failing or conflicting.
Snapshot() ReconcileSnapshot
// FailingRefresher returns a runnable that periodically refreshes the
// failing-objects metrics and logs.
FailingRefresher() FailingRefresher
}
// ReconcileSnapshot provides a glimpse into the current state of ongoing reconciles.
//
// This type is (transitively) returned by the controller's "dump state" HTTP endpoint, and exists
// to allow us to get deeper information on the metrics - we can't expose information for every
// VirtualMachine into the metrics (it'd be too high cardinality), but we *can* make it available
// when requested.
type ReconcileSnapshot struct {
// ControllerName is the name of the controller: virtualmachine or virtualmachinemigration.
ControllerName string `json:"controllerName"`
// Failing is the list of objects currently failing to reconcile with
// non-conflict errors. Each entry is the string form of the object's key.
Failing []string `json:"failing"`
// Conflicting is the list of objects currently failing to reconcile
// due to a conflict. Each entry is the string form of the object's key.
Conflicting []string `json:"conflicting"`
}
// WithMetrics decorates reconciler so that every reconcile is timed, counted
// by outcome, and tracked for persistent failures.
//
// An object is reported as failing (or conflicting) only after it has been
// failing continuously for failurePendingPeriod. refreshFailingInterval is the
// wait between the periodic refreshes performed by the runnable returned from
// FailingRefresher.
//
// The returned reconciler can also produce a snapshot of the state of ongoing
// reconciles, exposing the data that backs the metrics.
func WithMetrics(
	reconciler reconcile.Reconciler,
	rm ReconcilerMetrics,
	cntrlName string,
	failurePendingPeriod time.Duration,
	refreshFailingInterval time.Duration,
) ReconcilerWithMetrics {
	failingTracker := failurelag.NewTracker[client.ObjectKey](failurePendingPeriod)
	conflictTracker := failurelag.NewTracker[client.ObjectKey](failurePendingPeriod)

	wrapped := &wrappedReconciler{
		ControllerName:         cntrlName,
		Reconciler:             reconciler,
		Metrics:                rm,
		refreshFailingInterval: refreshFailingInterval,
		failing:                failingTracker,
		conflicting:            conflictTracker,
	}
	return wrapped
}
// refreshFailing publishes the number of degraded objects held by tracker to
// the failing gauge under the given outcome label, and logs every such object.
func (d *wrappedReconciler) refreshFailing(
	log logr.Logger,
	outcome ReconcileOutcome,
	tracker *failurelag.Tracker[client.ObjectKey],
) {
	objects := tracker.Degraded()

	gauge := d.Metrics.failing.WithLabelValues(d.ControllerName, string(outcome))
	gauge.Set(float64(len(objects)))

	// One log line per object rather than a single line carrying the whole list:
	//  1. it keeps lines short (very long lines upset log storage / querying), and
	//  2. Grafana Loki cannot process arrays.
	message := fmt.Sprintf("Currently failing to reconcile %v object", d.ControllerName)
	for i := range objects {
		log.Info(message, "outcome", outcome, "object", objects[i])
	}
}
// runRefreshFailing refreshes the failing and conflicting metrics/logs in a
// loop, waiting refreshFailingInterval before each refresh. It returns once
// ctx is done.
func (d *wrappedReconciler) runRefreshFailing(ctx context.Context) {
	logger := log.FromContext(ctx)

	for {
		wait := time.After(d.refreshFailingInterval)

		select {
		case <-wait:
			d.refreshFailing(logger, FailureOutcome, d.failing)
			d.refreshFailing(logger, ConflictOutcome, d.conflicting)
		case <-ctx.Done():
			return
		}
	}
}
// FailingRefresher returns a runnable bound to this reconciler that
// periodically refreshes the failing-objects metrics and logs once started.
func (d *wrappedReconciler) FailingRefresher() FailingRefresher {
	refresher := FailingRefresher{r: d}
	return refresher
}
// FailingRefresher is a wrapper around a wrappedReconciler which implements
// manager.Runnable.
type FailingRefresher struct {
	r *wrappedReconciler
}

// Start launches the periodic refresh loop in a background goroutine and
// returns immediately with a nil error. The loop stops when ctx is done.
func (f FailingRefresher) Start(ctx context.Context) error {
	reconciler := f.r
	go reconciler.runRefreshFailing(ctx)
	return nil
}
// Reconcile runs the wrapped reconciler, then classifies the result as
// success, conflict or failure, updates the failure trackers accordingly, logs
// the result and records the duration and failing-object metrics. The wrapped
// reconciler's result and error are returned unchanged.
func (d *wrappedReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	log := log.FromContext(ctx)
	key := req.NamespacedName

	startedAt := time.Now()
	res, err := d.Reconciler.Reconcile(ctx, req)
	duration := time.Since(startedAt)

	var outcome ReconcileOutcome
	switch {
	case err == nil:
		outcome = SuccessOutcome
		d.failing.RecordSuccess(key)
		d.conflicting.RecordSuccess(key)
		log.Info("Successful reconciliation", "duration", duration.String(), "requeueAfter", res.RequeueAfter)

	case errors.IsConflict(err):
		outcome = ConflictOutcome
		d.conflicting.RecordFailure(key)

	default:
		outcome = FailureOutcome
		d.failing.RecordFailure(key)
		// A non-conflict error most likely means that any transient conflicts
		// have been resolved, so the conflict tracker is cleared for this object.
		//
		// The reverse does not hold: a conflict error says nothing about
		// non-conflict errors being resolved, since those are more likely to be
		// persistent. That is why the conflict case above leaves d.failing alone.
		d.conflicting.RecordSuccess(key)
	}

	if err != nil {
		log.Error(err, "Failed to reconcile VirtualMachine",
			"duration", duration.String(), "outcome", outcome)
	}

	d.Metrics.ObserveReconcileDuration(outcome, duration)

	failingGauge := d.Metrics.failing.WithLabelValues(d.ControllerName, string(FailureOutcome))
	failingGauge.Set(float64(d.failing.DegradedCount()))

	conflictGauge := d.Metrics.failing.WithLabelValues(d.ControllerName, string(ConflictOutcome))
	conflictGauge.Set(float64(d.conflicting.DegradedCount()))

	return res, err
}
// toStringSlice converts object keys to their string form, preserving order.
// The result is always a non-nil slice, even for empty input.
func toStringSlice(s []client.ObjectKey) []string {
	out := make([]string, len(s))
	for i := range s {
		out[i] = s[i].String()
	}
	return out
}
// Snapshot reports which objects are currently degraded in the failing and
// conflicting trackers, as strings, together with the controller name.
func (d *wrappedReconciler) Snapshot() ReconcileSnapshot {
	return ReconcileSnapshot{
		ControllerName: d.ControllerName,
		Failing:        toStringSlice(d.failing.Degraded()),
		Conflicting:    toStringSlice(d.conflicting.Degraded()),
	}
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controllers
import (
"bytes"
"context"
"crypto/ed25519"
"crypto/rand"
"encoding/base64"
"encoding/json"
"encoding/pem"
"fmt"
"io"
"net/http"
"os"
"reflect"
"strconv"
"time"
nadapiv1 "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/apis/k8s.cni.cncf.io/v1"
"github.com/samber/lo"
"golang.org/x/crypto/ssh"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/log"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apiserver/pkg/storage/names"
"k8s.io/client-go/tools/record"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/neonvm/controllers/buildtag"
"github.com/neondatabase/autoscaling/neonvm/pkg/ipam"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util/patch"
)
const (
// virtualmachineFinalizer is the finalizer that Reconcile adds to every
// VirtualMachine that is not being deleted, and removes again once the
// finalizer operations have run for a VirtualMachine under deletion.
virtualmachineFinalizer = "vm.neon.tech/finalizer"
)
// Condition types used in VirtualMachine.Status.Conditions.
const (
// typeAvailableVirtualMachine is the condition type set while reconciling to
// report whether the VirtualMachine's runner pod is available (running,
// succeeded, or in an unknown phase).
typeAvailableVirtualMachine = "Available"
// typeDegradedVirtualMachine is the condition type set when the
// VirtualMachine's runner pod has failed or cannot be found.
typeDegradedVirtualMachine = "Degraded"
)
// Inclusive range of runner protocol versions this controller accepts;
// see runnerVersionIsSupported.
const (
minSupportedRunnerVersion api.RunnerProtoVersion = api.RunnerProtoV1
maxSupportedRunnerVersion api.RunnerProtoVersion = api.RunnerProtoV1
)
// VMReconciler reconciles a VirtualMachine object
type VMReconciler struct {
// Client is the Kubernetes API client used to read and write VirtualMachines,
// runner pods and SSH secrets.
client.Client
// Scheme is the runtime scheme for the reconciled types.
// NOTE(review): presumably used when setting owner references — confirm.
Scheme *runtime.Scheme
// Recorder emits Kubernetes events about the VirtualMachine.
Recorder record.EventRecorder
// Config holds the controller-wide reconciler settings.
Config *ReconcilerConfig
// Metrics receives VM startup latency observations.
Metrics ReconcilerMetrics `exhaustruct:"optional"`
}
// The following markers are used to generate the rules permissions (RBAC) on config/rbac using controller-gen
// when controller-gen (used by 'make generate') is executed.
// To know more about markers see: https://book.kubebuilder.io/reference/markers.html
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachines,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachines/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachines/finalizers,verbs=update
//+kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
//+kubebuilder:rbac:groups=core,resources=nodes,verbs=list
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=pods/status,verbs=get;list;watch
//+kubebuilder:rbac:groups=vm.neon.tech,resources=ippools,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=vm.neon.tech,resources=ippools/finalizers,verbs=update
//+kubebuilder:rbac:groups=k8s.cni.cncf.io,resources=network-attachment-definitions,verbs=get;list;watch
// Reconcile is part of the main Kubernetes reconciliation loop, which moves the
// current state of the cluster closer to the desired state.
//
// For a single VirtualMachine it:
//   - returns quietly if the object no longer exists;
//   - if the object is being deleted, runs the finalizer operations and removes
//     our finalizer;
//   - otherwise makes sure our finalizer is present (requeueing right after
//     adding it), runs doReconcile, and writes the status back if it changed.
//
// The reconcile function must be idempotent; breaking that goes against the
// design principles of controller-runtime and may leave resources stuck and in
// need of manual intervention.
//
// For further info:
// - About Operator Pattern: https://kubernetes.io/docs/concepts/extend-kubernetes/operator/
// - About Controllers: https://kubernetes.io/docs/concepts/architecture/controller/
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.13.0/pkg/reconcile
func (r *VMReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	log := log.FromContext(ctx)

	var vm vmv1.VirtualMachine
	if err := r.Get(ctx, req.NamespacedName, &vm); err != nil {
		if client.IgnoreNotFound(err) != nil {
			// Error reading the object - requeue the request.
			log.Error(err, "Unable to fetch VirtualMachine")
			return ctrl.Result{}, client.IgnoreNotFound(err)
		}
		log.Info("virtualmachine resource not found. Ignoring since object must be deleted")
		return ctrl.Result{}, nil
	}

	hasFinalizer := controllerutil.ContainsFinalizer(&vm, virtualmachineFinalizer)

	// A non-zero DeletionTimestamp means the object is under deletion.
	if !vm.ObjectMeta.DeletionTimestamp.IsZero() {
		if !hasFinalizer {
			// Nothing left for us to clean up; stop reconciling the deleted item.
			return ctrl.Result{}, nil
		}

		// Our finalizer is still present, so handle external dependencies first.
		log.Info("Performing Finalizer Operations for VirtualMachine before delete it")
		r.doFinalizerOperationsForVirtualMachine(ctx, &vm)

		// Then drop the finalizer and persist that change.
		log.Info("Removing Finalizer for VirtualMachine after successfully perform the operations")
		if removed := controllerutil.RemoveFinalizer(&vm, virtualmachineFinalizer); !removed {
			log.Info("Failed to remove finalizer from VirtualMachine")
			return ctrl.Result{Requeue: true}, nil
		}
		if err := r.tryUpdateVM(ctx, &vm); err != nil {
			log.Error(err, "Failed to update status about removing finalizer from VirtualMachine")
			return ctrl.Result{}, err
		}

		// Stop reconciliation as the item is being deleted.
		return ctrl.Result{}, nil
	}

	// The object is not being deleted: register our finalizer if it is missing,
	// persist the change and come back for the actual reconcile.
	if !hasFinalizer {
		log.Info("Adding Finalizer for VirtualMachine")
		if added := controllerutil.AddFinalizer(&vm, virtualmachineFinalizer); !added {
			log.Info("Failed to add finalizer from VirtualMachine")
			return ctrl.Result{Requeue: true}, nil
		}
		if err := r.tryUpdateVM(ctx, &vm); err != nil {
			log.Error(err, "Failed to update status about adding finalizer to VirtualMachine")
			return ctrl.Result{}, err
		}
		return ctrl.Result{Requeue: true}, nil
	}

	statusBefore := vm.Status.DeepCopy()

	if err := r.doReconcile(ctx, &vm); err != nil {
		r.Recorder.Eventf(&vm, corev1.EventTypeWarning, "Failed",
			"Failed to reconcile (%s): %s", vm.Name, err)
		return ctrl.Result{}, err
	}

	// Only write the status back when the reconcile actually changed it.
	if statusChanged := !DeepEqual(statusBefore, vm.Status); statusChanged {
		if err := r.Status().Update(ctx, &vm); err != nil {
			log.Error(err, "Failed to update VirtualMachine status after reconcile loop",
				"virtualmachine", vm.Name)
			return ctrl.Result{}, err
		}
	}

	// Only requeue quickly when scaling or migrating. In the Pending and Running
	// phases we are not expecting changes from QEMU, and checking repeatedly
	// would be wasteful.
	requeueAfter := time.Second
	switch vm.Status.Phase {
	case vmv1.VmPending, vmv1.VmRunning:
		requeueAfter = 15 * time.Second
	}
	return ctrl.Result{RequeueAfter: requeueAfter}, nil
}
// doFinalizerOperationsForVirtualMachine performs the cleanup required before
// the VirtualMachine custom resource is deleted: it emits a "Deleting" event
// and, when the VM has an ExtraNetwork spec, releases its overlay IP address.
//
// Cleanup is best-effort: every error is logged and then ignored, so this
// function never blocks removal of the finalizer.
//
// Note: finalizers are not meant for deleting resources that are created and
// managed by the reconciliation itself (such as the runner Pod). Those are
// dependents of the custom resource via their owner reference and are removed
// by the Kubernetes API.
// More info: https://kubernetes.io/docs/tasks/administer-cluster/use-cascading-deletion/
func (r *VMReconciler) doFinalizerOperationsForVirtualMachine(ctx context.Context, vm *vmv1.VirtualMachine) {
	log := log.FromContext(ctx)

	// Announce the deletion with an event.
	deleting := fmt.Sprintf("Custom Resource %s is being deleted from the namespace %s",
		vm.Name,
		vm.Namespace)
	r.Recorder.Event(vm, "Warning", "Deleting", deleting)

	// Without an extra network there is no overlay IP address to release.
	if vm.Spec.ExtraNetwork == nil {
		return
	}

	// Locate the network attachment definition backing the IPAM.
	nadName, err := nadIpamName()
	if err != nil {
		log.Error(err, "ignored error")
		return
	}
	nadNamespace, err := nadIpamNamespace()
	if err != nil {
		log.Error(err, "ignored error")
		return
	}

	ipamClient, err := ipam.New(ctx, nadName, nadNamespace)
	if err != nil {
		log.Error(err, "ignored error")
		return
	}
	defer ipamClient.Close()

	released, err := ipamClient.ReleaseIP(ctx, vm.Name, vm.Namespace)
	if err != nil {
		log.Error(err, "fail to release IP, error ignored")
		return
	}

	message := fmt.Sprintf("Released IP %s", released.String())
	log.Info(message)
	r.Recorder.Event(vm, "Normal", "OverlayNet", message)
}
// getRunnerVersion reads the runner protocol version from the pod's
// RunnerPodVersionLabel label.
//
// A pod without the label yields version 0 and no error. A label value that
// does not parse as a base-10 unsigned 32-bit integer yields an error.
func getRunnerVersion(pod *corev1.Pod) (api.RunnerProtoVersion, error) {
	label, present := pod.Labels[vmv1.RunnerPodVersionLabel]
	if !present {
		return api.RunnerProtoVersion(0), nil
	}

	parsed, err := strconv.ParseUint(label, 10, 32)
	if err != nil {
		return 0, fmt.Errorf("failed to parse label value as integer: %w", err)
	}

	return api.RunnerProtoVersion(parsed), nil
}
// runnerVersionIsSupported reports whether version lies within the inclusive
// range [minSupportedRunnerVersion, maxSupportedRunnerVersion].
func runnerVersionIsSupported(version api.RunnerProtoVersion) bool {
	if version < minSupportedRunnerVersion {
		return false
	}
	return version <= maxSupportedRunnerVersion
}
func (r *VMReconciler) updateVMStatusCPU(
ctx context.Context,
vm *vmv1.VirtualMachine,
vmRunner *corev1.Pod,
qmpPluggedCPUs uint32,
cgroupUsage *api.VCPUCgroup,
) {
log := log.FromContext(ctx)
// We expect:
// - vm.Status.CPUs = cgroupUsage.VCPUs
// - vm.Status.CPUs.RoundUp() == qmpPluggedCPUs
// Otherwise, we update the status.
var currentCPUUsage vmv1.MilliCPU
if cgroupUsage != nil {
if cgroupUsage.VCPUs.RoundedUp() != qmpPluggedCPUs {
// This is not expected but it's fine. We only report the
// mismatch here and will resolve it in the next reconcile
// iteration loops by comparing these values to spec CPU use
// and moving to the scaling phase.
log.Error(nil, "Mismatch in the number of VM's plugged CPUs and runner pod's cgroup vCPUs",
"VirtualMachine", vm.Name,
"Runner Pod", vmRunner.Name,
"plugged CPUs", qmpPluggedCPUs,
"cgroup vCPUs", cgroupUsage.VCPUs)
}
currentCPUUsage = min(cgroupUsage.VCPUs, vmv1.MilliCPU(1000*qmpPluggedCPUs))
} else {
currentCPUUsage = vmv1.MilliCPU(1000 * qmpPluggedCPUs)
}
if vm.Status.CPUs == nil || *vm.Status.CPUs != currentCPUUsage {
vm.Status.CPUs = ¤tCPUUsage
r.Recorder.Event(vm, "Normal", "CpuInfo",
fmt.Sprintf("VirtualMachine %s uses %v cpu cores",
vm.Name,
vm.Status.CPUs))
}
}
// updateVMStatusMemory stores qmpMemorySize in vm.Status.MemorySize and emits
// a "MemoryInfo" event, unless the status already holds an equal quantity.
// Only the in-memory vm object is modified here.
func (r *VMReconciler) updateVMStatusMemory(
	vm *vmv1.VirtualMachine,
	qmpMemorySize *resource.Quantity,
) {
	if known := vm.Status.MemorySize; known != nil && qmpMemorySize.Equal(*known) {
		// Already up to date; nothing to record.
		return
	}

	vm.Status.MemorySize = qmpMemorySize

	message := fmt.Sprintf("VirtualMachine %s uses %v memory",
		vm.Name,
		vm.Status.MemorySize)
	r.Recorder.Event(vm, "Normal", "MemoryInfo", message)
}
func (r *VMReconciler) doReconcile(ctx context.Context, vm *vmv1.VirtualMachine) error {
log := log.FromContext(ctx)
// Let's check and just set the condition status as Unknown when no status are available
if len(vm.Status.Conditions) == 0 {
// set Unknown condition status for AvailableVirtualMachine
meta.SetStatusCondition(&vm.Status.Conditions, metav1.Condition{Type: typeAvailableVirtualMachine, Status: metav1.ConditionUnknown, Reason: "Reconciling", Message: "Starting reconciliation"})
}
// NB: .Spec.EnableSSH guaranteed non-nil because the k8s API server sets the default for us.
enableSSH := *vm.Spec.EnableSSH
// Generate ssh secret name
if enableSSH && len(vm.Status.SSHSecretName) == 0 {
vm.Status.SSHSecretName = fmt.Sprintf("ssh-neonvm-%s", vm.Name)
}
// Set memory provider for old VMs that don't have it in the Status.
if vm.Status.PodName != "" && vm.Status.MemoryProvider == nil {
oldMemProvider := vmv1.MemoryProviderDIMMSlots
log.Error(nil, "Setting default MemoryProvider for VM", "MemoryProvider", oldMemProvider)
vm.Status.MemoryProvider = lo.ToPtr(oldMemProvider)
}
switch vm.Status.Phase {
case "":
// Acquire overlay IP address
if vm.Spec.ExtraNetwork != nil &&
vm.Spec.ExtraNetwork.Enable &&
len(vm.Status.ExtraNetIP) == 0 {
// Create IPAM object
nadName, err := nadIpamName()
if err != nil {
return err
}
nadNamespace, err := nadIpamNamespace()
if err != nil {
return err
}
ipam, err := ipam.New(ctx, nadName, nadNamespace)
if err != nil {
log.Error(err, "failed to create IPAM")
return err
}
defer ipam.Close()
ip, err := ipam.AcquireIP(ctx, vm.Name, vm.Namespace)
if err != nil {
log.Error(err, "fail to acquire IP")
return err
}
message := fmt.Sprintf("Acquired IP %s for overlay network interface", ip.String())
log.Info(message)
vm.Status.ExtraNetIP = ip.IP.String()
vm.Status.ExtraNetMask = fmt.Sprintf("%d.%d.%d.%d", ip.Mask[0], ip.Mask[1], ip.Mask[2], ip.Mask[3])
r.Recorder.Event(vm, "Normal", "OverlayNet", message)
}
// VirtualMachine just created, change Phase to "Pending"
vm.Status.Phase = vmv1.VmPending
case vmv1.VmPending:
// Generate runner pod name and set desired memory provider.
// Together with Status.MemoryProvider set for PodName != "" above,
// It is now guaranteed to have Status.MemoryProvider != nil
if len(vm.Status.PodName) == 0 {
vm.Status.PodName = names.SimpleNameGenerator.GenerateName(fmt.Sprintf("%s-", vm.Name))
if vm.Status.MemoryProvider == nil {
vm.Status.MemoryProvider = lo.ToPtr(pickMemoryProvider(r.Config, vm))
}
// Update the .Status on API Server to avoid creating multiple pods for a single VM
// See https://github.com/neondatabase/autoscaling/issues/794 for the context
if err := r.Status().Update(ctx, vm); err != nil {
return fmt.Errorf("Failed to update VirtualMachine status: %w", err)
}
}
memoryProvider := *vm.Status.MemoryProvider
// Check if the runner pod already exists, if not create a new one
vmRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: vm.Status.PodName, Namespace: vm.Namespace}, vmRunner)
if err != nil && apierrors.IsNotFound(err) {
var sshSecret *corev1.Secret
if enableSSH {
// Check if the ssh secret already exists, if not create a new one
sshSecret = &corev1.Secret{}
err := r.Get(ctx, types.NamespacedName{
Name: vm.Status.SSHSecretName,
Namespace: vm.Namespace,
}, sshSecret)
if err != nil && apierrors.IsNotFound(err) {
// Define a new ssh secret
sshSecret, err = r.sshSecretForVirtualMachine(vm)
if err != nil {
log.Error(err, "Failed to define new SSH Secret for VirtualMachine")
return err
}
log.Info("Creating a new SSH Secret", "Secret.Namespace", sshSecret.Namespace, "Secret.Name", sshSecret.Name)
if err = r.Create(ctx, sshSecret); err != nil {
log.Error(err, "Failed to create new SSH secret", "Secret.Namespace", sshSecret.Namespace, "Secret.Name", sshSecret.Name)
return err
}
log.Info("SSH Secret was created", "Secret.Namespace", sshSecret.Namespace, "Secret.Name", sshSecret.Name)
} else if err != nil {
log.Error(err, "Failed to get SSH Secret")
return err
}
}
// Define a new pod
pod, err := r.podForVirtualMachine(vm, memoryProvider, sshSecret)
if err != nil {
log.Error(err, "Failed to define new Pod resource for VirtualMachine")
return err
}
log.Info("Creating a new Pod", "Pod.Namespace", pod.Namespace, "Pod.Name", pod.Name)
if err = r.Create(ctx, pod); err != nil {
log.Error(err, "Failed to create new Pod", "Pod.Namespace", pod.Namespace, "Pod.Name", pod.Name)
return err
}
log.Info("Runner Pod was created", "Pod.Namespace", pod.Namespace, "Pod.Name", pod.Name)
msg := fmt.Sprintf("VirtualMachine %s created, Pod %s", vm.Name, pod.Name)
if sshSecret != nil {
msg = fmt.Sprintf("%s, SSH Secret %s", msg, sshSecret.Name)
}
r.Recorder.Event(vm, "Normal", "Created", msg)
if !vm.HasRestarted() {
d := pod.CreationTimestamp.Time.Sub(vm.CreationTimestamp.Time)
r.Metrics.vmCreationToRunnerCreationTime.Observe(d.Seconds())
}
} else if err != nil {
log.Error(err, "Failed to get vm-runner Pod")
return err
}
// runner pod found, check phase
switch runnerStatus(vmRunner) {
case runnerRunning:
vm.Status.PodIP = vmRunner.Status.PodIP
vm.Status.Phase = vmv1.VmRunning
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeAvailableVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) created successfully", vm.Status.PodName, vm.Name)})
{
// Calculating VM startup latency metrics
now := time.Now()
d := now.Sub(vmRunner.CreationTimestamp.Time)
r.Metrics.runnerCreationToVMRunningTime.Observe(d.Seconds())
if !vm.HasRestarted() {
d := now.Sub(vm.CreationTimestamp.Time)
r.Metrics.vmCreationToVMRunningTime.Observe(d.Seconds())
log.Info("VM creation to VM running time", "duration(sec)", d.Seconds())
}
}
case runnerSucceeded:
vm.Status.Phase = vmv1.VmSucceeded
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeAvailableVirtualMachine,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) succeeded", vm.Status.PodName, vm.Name)})
case runnerFailed:
vm.Status.Phase = vmv1.VmFailed
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeDegradedVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) failed", vm.Status.PodName, vm.Name)})
case runnerUnknown:
vm.Status.Phase = vmv1.VmPending
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeAvailableVirtualMachine,
Status: metav1.ConditionUnknown,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) in Unknown phase", vm.Status.PodName, vm.Name)})
default:
// do nothing
}
case vmv1.VmRunning:
// Check if the runner pod exists
vmRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: vm.Status.PodName, Namespace: vm.Namespace}, vmRunner)
if err != nil && apierrors.IsNotFound(err) {
// lost runner pod for running VirtualMachine ?
r.Recorder.Event(vm, "Warning", "NotFound",
fmt.Sprintf("runner pod %s not found",
vm.Status.PodName))
vm.Status.Phase = vmv1.VmFailed
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeDegradedVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) not found", vm.Status.PodName, vm.Name)})
} else if err != nil {
log.Error(err, "Failed to get runner Pod")
return err
}
// Update the metadata (including "usage" annotation) before anything else, so that it
// will be correctly set even if the rest of the reconcile operation fails.
if err := updatePodMetadataIfNecessary(ctx, r.Client, vm, vmRunner); err != nil {
log.Error(err, "Failed to sync pod labels and annotations", "VirtualMachine", vm.Name)
}
// runner pod found, check/update phase now
switch runnerStatus(vmRunner) {
case runnerRunning:
// update status by IP of runner pod
vm.Status.PodIP = vmRunner.Status.PodIP
// update phase
vm.Status.Phase = vmv1.VmRunning
// update Node name where runner working
vm.Status.Node = vmRunner.Spec.NodeName
runnerVersion, err := getRunnerVersion(vmRunner)
if err != nil {
log.Error(err, "Failed to get runner version of VM runner pod", "VirtualMachine", vm.Name)
return err
}
if !runnerVersionIsSupported(runnerVersion) {
err := fmt.Errorf("runner version %v is not supported", runnerVersion)
log.Error(err, "VM runner pod has unsupported version", "VirtualMachine", vm.Name)
return err
}
// get CPU details from QEMU
cpuSlotsPlugged, _, err := QmpGetCpus(QmpAddr(vm))
if err != nil {
log.Error(err, "Failed to get CPU details from VirtualMachine", "VirtualMachine", vm.Name)
return err
}
pluggedCPU := uint32(len(cpuSlotsPlugged))
// get cgroups CPU details from runner pod
cgroupUsage, err := getRunnerCgroup(ctx, vm)
if err != nil {
log.Error(err, "Failed to get CPU details from runner", "VirtualMachine", vm.Name)
return err
}
// update status by CPUs used in the VM
r.updateVMStatusCPU(ctx, vm, vmRunner, pluggedCPU, cgroupUsage)
// get Memory details from hypervisor and update VM status
memorySize, err := QmpGetMemorySize(QmpAddr(vm))
if err != nil {
log.Error(err, "Failed to get Memory details from VirtualMachine", "VirtualMachine", vm.Name)
return err
}
// update status by memory sizes used in the VM
r.updateVMStatusMemory(vm, memorySize)
// check if need hotplug/unplug CPU or memory
// compare guest spec and count of plugged
specUseCPU := vm.Spec.Guest.CPUs.Use
scaleCgroupCPU := specUseCPU != cgroupUsage.VCPUs
scaleQemuCPU := specUseCPU.RoundedUp() != pluggedCPU
if scaleCgroupCPU || scaleQemuCPU {
log.Info("VM goes into scaling mode, CPU count needs to be changed",
"CPUs on runner pod cgroup", cgroupUsage.VCPUs,
"CPUs on board", pluggedCPU,
"CPUs in spec", vm.Spec.Guest.CPUs.Use)
vm.Status.Phase = vmv1.VmScaling
}
memorySizeFromSpec := resource.NewQuantity(int64(vm.Spec.Guest.MemorySlots.Use)*vm.Spec.Guest.MemorySlotSize.Value(), resource.BinarySI)
if !memorySize.Equal(*memorySizeFromSpec) {
log.Info("VM goes into scale mode, need to resize Memory",
"Memory on board", memorySize,
"Memory in spec", memorySizeFromSpec)
vm.Status.Phase = vmv1.VmScaling
}
case runnerSucceeded:
vm.Status.Phase = vmv1.VmSucceeded
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeAvailableVirtualMachine,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) succeeded", vm.Status.PodName, vm.Name)})
case runnerFailed:
vm.Status.Phase = vmv1.VmFailed
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeDegradedVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) failed", vm.Status.PodName, vm.Name)})
case runnerUnknown:
vm.Status.Phase = vmv1.VmPending
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeAvailableVirtualMachine,
Status: metav1.ConditionUnknown,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) in Unknown phase", vm.Status.PodName, vm.Name)})
default:
// do nothing
}
case vmv1.VmScaling:
// Check that runner pod is still ok
vmRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: vm.Status.PodName, Namespace: vm.Namespace}, vmRunner)
if err != nil && apierrors.IsNotFound(err) {
// lost runner pod for running VirtualMachine ?
r.Recorder.Event(vm, "Warning", "NotFound",
fmt.Sprintf("runner pod %s not found",
vm.Status.PodName))
vm.Status.Phase = vmv1.VmFailed
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeDegradedVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) not found", vm.Status.PodName, vm.Name)})
} else if err != nil {
log.Error(err, "Failed to get runner Pod")
return err
}
// Update the metadata (including "usage" annotation) before anything else, so that it
// will be correctly set even if the rest of the reconcile operation fails.
if err := updatePodMetadataIfNecessary(ctx, r.Client, vm, vmRunner); err != nil {
log.Error(err, "Failed to sync pod labels and annotations", "VirtualMachine", vm.Name)
}
// runner pod found, check that it's still up:
switch runnerStatus(vmRunner) {
case runnerSucceeded:
vm.Status.Phase = vmv1.VmSucceeded
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeAvailableVirtualMachine,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) succeeded", vm.Status.PodName, vm.Name)})
return nil
case runnerFailed:
vm.Status.Phase = vmv1.VmFailed
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeDegradedVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) failed", vm.Status.PodName, vm.Name)})
return nil
case runnerUnknown:
vm.Status.Phase = vmv1.VmPending
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeAvailableVirtualMachine,
Status: metav1.ConditionUnknown,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) in Unknown phase", vm.Status.PodName, vm.Name)})
return nil
default:
// do nothing
}
runnerVersion, err := getRunnerVersion(vmRunner)
if err != nil {
log.Error(err, "Failed to get runner version of VM runner pod", "VirtualMachine", vm.Name)
return err
}
if !runnerVersionIsSupported(runnerVersion) {
err := fmt.Errorf("runner version %v is not supported", runnerVersion)
log.Error(err, "VM runner pod has unsupported version", "VirtualMachine", vm.Name)
return err
}
cpuScaled := false
ramScaled := false
// do hotplug/unplug CPU
// firstly get current state from QEMU
cpuSlotsPlugged, _, err := QmpGetCpus(QmpAddr(vm))
if err != nil {
log.Error(err, "Failed to get CPU details from VirtualMachine", "VirtualMachine", vm.Name)
return err
}
specCPU := vm.Spec.Guest.CPUs.Use
pluggedCPU := uint32(len(cpuSlotsPlugged))
cgroupUsage, err := getRunnerCgroup(ctx, vm)
if err != nil {
log.Error(err, "Failed to get CPU details from runner", "VirtualMachine", vm.Name)
return err
}
// compare guest spec to count of plugged and runner pod cgroups
if specCPU.RoundedUp() > pluggedCPU {
// going to plug one CPU
log.Info("Plug one more CPU into VM")
if err := QmpPlugCpu(QmpAddr(vm)); err != nil {
return err
}
r.Recorder.Event(vm, "Normal", "ScaleUp",
fmt.Sprintf("One more CPU was plugged into VM %s",
vm.Name))
} else if specCPU.RoundedUp() < pluggedCPU {
// going to unplug one CPU
log.Info("Unplug one CPU from VM")
if err := QmpUnplugCpu(QmpAddr(vm)); err != nil {
return err
}
r.Recorder.Event(vm, "Normal", "ScaleDown",
fmt.Sprintf("One CPU was unplugged from VM %s",
vm.Name))
} else if specCPU != cgroupUsage.VCPUs {
log.Info("Update runner pod cgroups", "runner", cgroupUsage.VCPUs, "spec", specCPU)
if err := setRunnerCgroup(ctx, vm, specCPU); err != nil {
return err
}
reason := "ScaleDown"
if specCPU > cgroupUsage.VCPUs {
reason = "ScaleUp"
}
r.Recorder.Event(vm, "Normal", reason,
fmt.Sprintf("Runner pod cgroups was updated on VM %s",
vm.Name))
} else {
// seems already plugged correctly
cpuScaled = true
}
// update status by CPUs used in the VM
r.updateVMStatusCPU(ctx, vm, vmRunner, pluggedCPU, cgroupUsage)
// do hotplug/unplug Memory
switch *vm.Status.MemoryProvider {
case vmv1.MemoryProviderVirtioMem:
ramScaled, err = r.doVirtioMemScaling(vm)
if err != nil {
return err
}
case vmv1.MemoryProviderDIMMSlots:
ramScaled, err = r.doDIMMSlotsScaling(ctx, vm)
if err != nil {
return err
}
default:
panic(fmt.Errorf("unexpected vm.status.memoryProvider %q", *vm.Status.MemoryProvider))
}
// set VM phase to running if everything scaled
if cpuScaled && ramScaled {
vm.Status.Phase = vmv1.VmRunning
}
case vmv1.VmSucceeded, vmv1.VmFailed:
// Always delete runner pod. Otherwise, we could end up with one container succeeded/failed
// but the other one still running (meaning that the pod still ends up Running).
vmRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: vm.Status.PodName, Namespace: vm.Namespace}, vmRunner)
if err == nil {
// delete current runner
if err := r.deleteRunnerPodIfEnabled(ctx, vm, vmRunner); err != nil {
return err
}
} else if !apierrors.IsNotFound(err) {
return err
}
// We must keep the VM status the same until we know the neonvm-runner container has been
// terminated, otherwise we could end up starting a new runner pod while the VM in the old
// one is still running.
//
// Note that this is required because 'VmSucceeded' and 'VmFailed' are true if *at least
// one* container inside the runner pod has finished; the VM itself may still be running.
if apierrors.IsNotFound(err) || runnerContainerStopped(vmRunner) {
// NB: Cleanup() leaves status .Phase and .RestartCount (+ some others) but unsets other fields.
vm.Cleanup()
var shouldRestart bool
switch vm.Spec.RestartPolicy {
case vmv1.RestartPolicyAlways:
shouldRestart = true
case vmv1.RestartPolicyOnFailure:
shouldRestart = vm.Status.Phase == vmv1.VmFailed
case vmv1.RestartPolicyNever:
shouldRestart = false
}
if shouldRestart {
log.Info("Restarting VM runner pod", "VM.Phase", vm.Status.Phase, "RestartPolicy", vm.Spec.RestartPolicy)
vm.Status.Phase = vmv1.VmPending // reset to trigger restart
vm.Status.RestartCount += 1 // increment restart count
r.Metrics.vmRestartCounts.Inc()
}
// TODO for RestartPolicyNever: implement TTL or do nothing
}
default:
// do nothing
}
// Propagate TargetRevision to CurrentRevision. This is done only if the VM is fully
// reconciled and running.
if vm.Status.Phase == vmv1.VmRunning {
propagateRevision(vm)
}
return nil
}
// propagateRevision copies vm.Spec.TargetRevision into vm.Status.CurrentRevision, stamped with the
// current time.
//
// It is a no-op when no target revision is set, or when the current revision already carries the
// same Revision value as the target.
func propagateRevision(vm *vmv1.VirtualMachine) {
	target := vm.Spec.TargetRevision
	if target == nil {
		return
	}

	current := vm.Status.CurrentRevision
	upToDate := current != nil && current.Revision == target.Revision
	if upToDate {
		return
	}

	stamped := target.WithTime(time.Now())
	vm.Status.CurrentRevision = &stamped
}
// pickMemoryProvider selects the memory provider to use for vm.
//
// In order of precedence:
//
//  1. vm.Spec.Guest.MemoryProvider, if set
//  2. vm.Status.MemoryProvider, if set
//  3. config.DefaultMemoryProvider, if the guest spec validates for it
//  4. vmv1.MemoryProviderDIMMSlots otherwise
func pickMemoryProvider(config *ReconcilerConfig, vm *vmv1.VirtualMachine) vmv1.MemoryProvider {
	switch {
	case vm.Spec.Guest.MemoryProvider != nil:
		return *vm.Spec.Guest.MemoryProvider
	case vm.Status.MemoryProvider != nil:
		return *vm.Status.MemoryProvider
	}

	// Not all configurations are valid for virtio-mem. Only switch to the default as long as it
	// won't be invalid:
	fallback := config.DefaultMemoryProvider
	if vm.Spec.Guest.ValidateForMemoryProvider(fallback) != nil {
		return vmv1.MemoryProviderDIMMSlots
	}
	return fallback
}
// doVirtioMemScaling requests the virtio-mem size implied by the VM spec and reports whether the
// VM's total memory already matches the spec.
//
// The requested virtio-mem size covers only the slots above .spec.guest.memorySlots.min, i.e.
// (use - min) * memorySlotSize. An event is recorded whenever the requested size differs from the
// previously requested one. The VM status is updated with the memory size currently reported by
// QEMU, and done is true iff that size equals use * memorySlotSize.
func (r *VMReconciler) doVirtioMemScaling(vm *vmv1.VirtualMachine) (done bool, _ error) {
	slotBytes := vm.Spec.Guest.MemorySlotSize.Value()
	extraSlots := int(vm.Spec.Guest.MemorySlots.Use - vm.Spec.Guest.MemorySlots.Min)
	wantVirtioMem := int64(extraSlots) * slotBytes

	prevVirtioMem, err := QmpSetVirtioMem(vm, wantVirtioMem)
	if err != nil {
		return false, err
	}

	wantTotal := resource.NewQuantity(int64(vm.Spec.Guest.MemorySlots.Use)*slotBytes, resource.BinarySI)

	// The requested size changed, so make an event for it.
	if prevVirtioMem != wantVirtioMem {
		var reason string
		if wantVirtioMem < prevVirtioMem {
			reason = "ScaleDown"
		} else {
			reason = "ScaleUp"
		}
		r.Recorder.Eventf(vm, "Normal", reason, "Set virtio-mem size for %v total memory", wantTotal)
	}

	// The VM may already be using the amount we want. Reflect the current size in the status; if
	// it matches the goal, ram scaling is done.
	haveTotal, err := QmpGetMemorySize(QmpAddr(vm))
	if err != nil {
		return false, err
	}

	done = haveTotal.Value() == wantTotal.Value()
	r.updateVMStatusMemory(vm, haveTotal)
	return done, nil
}
// doDIMMSlotsScaling tries to bring the number of plugged memory slots in line with the VM spec,
// and reports whether the desired slot count was reached.
//
// The desired count passed to QmpSetMemorySlots only covers slots above
// .spec.guest.memorySlots.min. If QmpSetMemorySlots returns a negative slot count, its error is
// returned as-is. If it returns a different count than requested, the VM is re-fetched and
// .spec.guest.memorySlots.use is rewritten to match what was actually achieved. In every
// non-error case the VM status is refreshed with the memory size reported by QEMU.
func (r *VMReconciler) doDIMMSlotsScaling(ctx context.Context, vm *vmv1.VirtualMachine) (done bool, _ error) {
	logger := log.FromContext(ctx)

	minSlots := vm.Spec.Guest.MemorySlots.Min
	wantExtraSlots := int(vm.Spec.Guest.MemorySlots.Use - minSlots)

	gotExtraSlots, err := QmpSetMemorySlots(ctx, vm, wantExtraSlots, r.Recorder)
	if gotExtraSlots < 0 {
		return false, err
	}

	if gotExtraSlots == wantExtraSlots {
		done = true
	} else {
		logger.Info("Couldn't achieve desired memory slot count, will modify .spec.guest.memorySlots.use instead", "details", err)

		// Re-fetch the VM first, so the spec update below is applied to a fresh object.
		key := types.NamespacedName{Name: vm.Name, Namespace: vm.Namespace}
		if err := r.Get(ctx, key, vm); err != nil {
			logger.Error(err, "Unable to re-fetch VirtualMachine")
			return false, err
		}

		previousUse := vm.Spec.Guest.MemorySlots.Use
		pluggedSlots := minSlots + int32(gotExtraSlots)
		vm.Spec.Guest.MemorySlots.Use = pluggedSlots
		if err := r.tryUpdateVM(ctx, vm); err != nil {
			logger.Error(err, "Failed to update .spec.guest.memorySlots.use",
				"old value", previousUse,
				"new value", pluggedSlots)
			return false, err
		}
	}

	// Ask the hypervisor for the current memory size and record it in the VM status.
	currentSize, err := QmpGetMemorySize(QmpAddr(vm))
	if err != nil {
		logger.Error(err, "Failed to get Memory details from VirtualMachine", "VirtualMachine", vm.Name)
		return false, err
	}
	r.updateVMStatusMemory(vm, currentSize)

	return done, nil
}
// runnerStatusKind is the status of a VM runner pod, as returned by runnerStatus.
type runnerStatusKind string

// The possible runnerStatusKind values. The doc comment on runnerStatus lists when each one is
// returned.
const (
runnerUnknown runnerStatusKind = "Unknown"
runnerPending runnerStatusKind = "Pending"
runnerRunning runnerStatusKind = "Running"
runnerFailed runnerStatusKind = "Failed"
runnerSucceeded runnerStatusKind = "Succeeded"
)
// runnerStatus returns a description of the status of the VM inside the runner pod.
//
// This is *similar* to the value of pod.Status.Phase, but takes into consideration the statuses of
// the individual containers within the pod. This is because Kubernetes sets the pod phase to Failed
// or Succeeded only if *all* containers have exited, whereas we'd like to consider the VM to be
// Failed or Succeeded if *any* container has exited.
//
// The full set of outputs is:
//
//   - runnerUnknown, if pod.Status.Phase is Unknown
//   - runnerPending, if pod.Status.Phase is "" or Pending
//   - runnerRunning, if pod.Status.Phase is Running, and no containers have exited
//   - runnerFailed, if pod.Status.Phase is Failed, or if the pod is Running and either any
//     container has exited with a non-zero code, or a container other than neonvm-runner has
//     exited while neonvm-runner has not
//   - runnerSucceeded, if pod.Status.Phase is Succeeded, or if the pod is Running, neonvm-runner
//     has exited successfully and no container has exited with a non-zero code
//
// Any other pod phase causes a panic.
func runnerStatus(pod *corev1.Pod) runnerStatusKind {
	phase := pod.Status.Phase

	switch phase {
	case corev1.PodRunning:
		// Needs a look at the individual containers; handled below.
	case "", corev1.PodPending:
		return runnerPending
	case corev1.PodSucceeded:
		return runnerSucceeded
	case corev1.PodFailed:
		return runnerFailed
	case corev1.PodUnknown:
		return runnerUnknown
	default:
		panic(fmt.Errorf("unknown pod phase: %q", phase))
	}

	var runnerExitedOK, otherExitedOK bool
	for i := range pod.Status.ContainerStatuses {
		container := &pod.Status.ContainerStatuses[i]

		terminated := container.State.Terminated
		if terminated == nil {
			continue // still going (or not yet started)
		}

		// The "runner" has failed if any container has.
		if terminated.ExitCode != 0 {
			return runnerFailed
		}

		if container.Name == "neonvm-runner" {
			runnerExitedOK = true
		} else {
			otherExitedOK = true
		}
	}

	switch {
	case runnerExitedOK:
		// neonvm-runner succeeded, and no container failed.
		return runnerSucceeded
	case otherExitedOK:
		// Some other container exited while neonvm-runner is still going.
		return runnerFailed
	default:
		return runnerRunning
	}
}
// runnerContainerStopped returns true iff the neonvm-runner container has exited.
//
// That is the case when the pod phase is Succeeded or Failed, or when the pod's container status
// for neonvm-runner reports a terminated state. If there is no status entry for neonvm-runner,
// the result is false.
//
// The guarantee is simple: It is only safe to start a new runner pod for a VM if
// runnerContainerStopped returns true (otherwise, we may end up with >1 instance of the same VM).
func runnerContainerStopped(pod *corev1.Pod) bool {
	switch pod.Status.Phase {
	case corev1.PodSucceeded, corev1.PodFailed:
		return true
	}

	for i := range pod.Status.ContainerStatuses {
		container := &pod.Status.ContainerStatuses[i]
		if container.Name != "neonvm-runner" {
			continue
		}
		return container.State.Terminated != nil
	}

	return false
}
// deleteRunnerPodIfEnabled deletes the runner pod unless buildtag.NeverDeleteRunnerPods is set.
//
// Either way (deleted or skipped), it writes a log line and records a "Normal" event on the VM
// describing what happened. If the deletion itself fails, the error is returned and nothing is
// logged or recorded.
func (r *VMReconciler) deleteRunnerPodIfEnabled(
	ctx context.Context,
	vm *vmv1.VirtualMachine,
	runner *corev1.Pod,
) error {
	logger := log.FromContext(ctx)

	// Assume the common case (pod gets deleted); overridden below if deletion is disabled.
	msg, eventReason := "VM runner pod was deleted", "Deleted"

	if buildtag.NeverDeleteRunnerPods {
		msg = fmt.Sprintf("VM runner pod deletion was skipped due to '%s' build tag", buildtag.TagnameNeverDeleteRunnerPods)
		eventReason = "DeleteSkipped"
	} else if err := r.Delete(ctx, runner); err != nil {
		return err
	}

	logger.Info(msg, "Pod.Namespace", runner.Namespace, "Pod.Name", runner.Name)
	r.Recorder.Event(vm, "Normal", eventReason, msg+": "+runner.Name)
	return nil
}
// updatePodMetadataIfNecessary patches the runner pod's labels and annotations so that they are
// exactly equal to the set we expect for the VM - minus some that are deliberately ignored.
//
// Unrecognized labels/annotations are removed as well, so that deleting a label/annotation from
// the VM itself is accurately reflected in the pod.
//
// If nothing differs, no request is made. Otherwise a single JSON patch is sent via c.Patch, and
// its error (if any) is returned.
func updatePodMetadataIfNecessary(ctx context.Context, c client.Client, vm *vmv1.VirtualMachine, runnerPod *corev1.Pod) error {
	logger := log.FromContext(ctx)

	type metaSpec struct {
		metaField   string
		expected    map[string]string
		actual      map[string]string
		ignoreExtra map[string]bool // use bool here so `if ignoreExtra[key] { ... }` works
	}

	specs := []metaSpec{
		{
			metaField: "labels",
			expected:  labelsForVirtualMachine(vm, nil), // don't include runner version
			actual:    runnerPod.Labels,
			ignoreExtra: map[string]bool{
				// Don't override the runner pod version - we need to keep it around without
				// changing it; otherwise it's not useful!
				vmv1.RunnerPodVersionLabel: true,
			},
		},
		{
			metaField: "annotations",
			expected:  annotationsForVirtualMachine(vm),
			actual:    runnerPod.Annotations,
			ignoreExtra: map[string]bool{
				"k8s.v1.cni.cncf.io/networks":        true,
				"k8s.v1.cni.cncf.io/network-status":  true,
				"k8s.v1.cni.cncf.io/networks-status": true,
			},
		},
	}

	var ops []patch.Operation
	var removedMessageParts []string

	for _, spec := range specs {
		// pathFor builds the JSON patch path of a single label/annotation key.
		pathFor := func(key string) string {
			return fmt.Sprintf("/metadata/%s/%s", spec.metaField, patch.PathEscape(key))
		}

		// Add/update the entries we're expecting to be there.
		//
		// Per RFC 6902 (JSON patch), "add" on an object member creates the member if it does not
		// exist and replaces its value if it does. So a single "add" covers both the missing and
		// the different-value case.
		for key, want := range spec.expected {
			have, present := spec.actual[key]
			if present && have == want {
				continue
			}
			ops = append(ops, patch.Operation{
				Op:    patch.OpAdd,
				Path:  pathFor(key),
				Value: want,
			})
		}

		// Remove the entries we aren't expecting to be there.
		var removed []string
		for key := range spec.actual {
			if _, wanted := spec.expected[key]; wanted || spec.ignoreExtra[key] {
				continue
			}
			removed = append(removed, key)
			ops = append(ops, patch.Operation{
				Op:   patch.OpRemove,
				Path: pathFor(key),
			})
		}

		if len(removed) != 0 {
			// Formatting a []string with %q prints the slice as usual, but quotes and escapes each
			// element, e.g. ["foo" "bar" "escaped\nstring"]. So a message part might look like
			// `labels ["foo" "test-label"]`.
			removedMessageParts = append(removedMessageParts, fmt.Sprintf("%s %q", spec.metaField, removed))
		}
	}

	if len(ops) == 0 {
		return nil
	}

	patchData, err := json.Marshal(ops)
	if err != nil {
		panic(fmt.Errorf("error marshalling JSON patch: %w", err))
	}

	// Log something when labels/annotations are removed: the ignoreExtra values above might be
	// incomplete, and that would be hard to debug without any logs for the change.
	if n := len(removedMessageParts); n != 0 {
		msg := "removing runner pod " + removedMessageParts[0]
		if n != 1 { // i.e. both labels and annotations had removals
			msg += " and " + removedMessageParts[1]
		}
		logger.Info(msg, "VirtualMachine", vm.Name, "Pod", runnerPod.Name)
	}

	// NOTE: We don't need to update the data in runnerPod ourselves because c.Patch will update it
	// with what we get back from the k8s API after the patch completes.
	return c.Patch(ctx, runnerPod, client.RawPatch(types.JSONPatchType, patchData))
}
// extractVirtualMachineUsageJSON returns the JSON encoding of the vmv1.VirtualMachineUsage derived
// from spec: CPU is .guest.cpus.use, and memory is .guest.memorySlots.use times
// .guest.memorySlotSize.
//
// It panics if the value cannot be marshalled.
func extractVirtualMachineUsageJSON(spec vmv1.VirtualMachineSpec) string {
	cpuInUse := spec.Guest.CPUs.Use
	memoryBytes := spec.Guest.MemorySlotSize.Value() * int64(spec.Guest.MemorySlots.Use)

	encoded, err := json.Marshal(vmv1.VirtualMachineUsage{
		CPU:    cpuInUse.ToResourceQuantity(),
		Memory: resource.NewQuantity(memoryBytes, resource.BinarySI),
	})
	if err != nil {
		panic(fmt.Errorf("error marshalling JSON: %w", err))
	}
	return string(encoded)
}
// extractVirtualMachineResourcesJSON returns the JSON encoding of spec.Resources().
//
// It panics if the value cannot be marshalled.
func extractVirtualMachineResourcesJSON(spec vmv1.VirtualMachineSpec) string {
	resources := spec.Resources()

	encoded, err := json.Marshal(resources)
	if err != nil {
		panic(fmt.Errorf("error marshalling JSON: %w", err))
	}
	return string(encoded)
}
// podForVirtualMachine builds the runner pod for vm (see podSpec) and sets vm as the pod's
// controller reference.
func (r *VMReconciler) podForVirtualMachine(
	vm *vmv1.VirtualMachine,
	memoryProvider vmv1.MemoryProvider,
	sshSecret *corev1.Secret,
) (*corev1.Pod, error) {
	runnerPod, err := podSpec(vm, memoryProvider, sshSecret, r.Config)
	if err != nil {
		return nil, err
	}

	// Set the ownerRef for the Pod
	// More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/owners-dependents/
	err = ctrl.SetControllerReference(vm, runnerPod, r.Scheme)
	if err != nil {
		return nil, err
	}

	return runnerPod, nil
}
// sshSecretForVirtualMachine builds the SSH key secret for vm (see sshSecretSpec) and sets vm as
// the secret's controller reference.
func (r *VMReconciler) sshSecretForVirtualMachine(vm *vmv1.VirtualMachine) (*corev1.Secret, error) {
	sshSecret, err := sshSecretSpec(vm)
	if err != nil {
		return nil, err
	}

	// Set the ownerRef for the Secret
	// More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/owners-dependents/
	err = ctrl.SetControllerReference(vm, sshSecret, r.Scheme)
	if err != nil {
		return nil, err
	}

	return sshSecret, nil
}
// sshSecretSpec generates a fresh SSH key pair and returns an immutable secret of type
// corev1.SecretTypeSSHAuth holding it, named vm.Status.SSHSecretName in the VM's namespace.
//
// The keys are stored under "ssh-publickey" and "ssh-privatekey".
func sshSecretSpec(vm *vmv1.VirtualMachine) (*corev1.Secret, error) {
	// using ed25519 signatures it takes ~16us to finish
	pub, priv, err := sshKeygen()
	if err != nil {
		return nil, err
	}

	keyData := map[string][]byte{
		"ssh-publickey":  pub,
		"ssh-privatekey": priv,
	}

	return &corev1.Secret{
		ObjectMeta: metav1.ObjectMeta{
			Name:      vm.Status.SSHSecretName,
			Namespace: vm.Namespace,
		},
		Immutable: lo.ToPtr(true),
		Type:      corev1.SecretTypeSSHAuth,
		Data:      keyData,
	}, nil
}
// labelsForVirtualMachine returns the labels for selecting the resources belonging to vm.
// More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/common-labels/
//
// The result is a copy of vm.Labels with "app.kubernetes.io/name" and the VM name label set on
// top. If runnerVersion is non-nil, the runner pod version label is included as well.
func labelsForVirtualMachine(vm *vmv1.VirtualMachine, runnerVersion *api.RunnerProtoVersion) map[string]string {
	const fixedLabels = 3 // upper bound on the labels added below

	labels := make(map[string]string, len(vm.Labels)+fixedLabels)
	for key, value := range vm.Labels {
		labels[key] = value
	}

	labels["app.kubernetes.io/name"] = "NeonVM"
	labels[vmv1.VirtualMachineNameLabel] = vm.Name

	if runnerVersion == nil {
		return labels
	}
	labels[vmv1.RunnerPodVersionLabel] = fmt.Sprintf("%d", *runnerVersion)
	return labels
}
// annotationsForVirtualMachine returns the annotations to set on the runner pod for vm.
//
// Every annotation on the VM is copied over, except
// "kubectl.kubernetes.io/last-applied-configuration". On top of that, three annotations are always
// set (replacing any same-named annotation copied from the VM): kubectl's default container, and
// the JSON-encoded usage and resources derived from vm.Spec.
func annotationsForVirtualMachine(vm *vmv1.VirtualMachine) map[string]string {
	// use bool here so `if ignored[key] { ... }` works
	ignored := map[string]bool{
		"kubectl.kubernetes.io/last-applied-configuration": true,
	}

	// Three fixed annotations are added below; size the map for all of them so that filling it
	// never has to grow past the initial hint.
	a := make(map[string]string, len(vm.Annotations)+3)

	for k, v := range vm.Annotations {
		if !ignored[k] {
			a[k] = v
		}
	}

	a["kubectl.kubernetes.io/default-container"] = "neonvm-runner"
	a[vmv1.VirtualMachineUsageAnnotation] = extractVirtualMachineUsageJSON(vm.Spec)
	a[vmv1.VirtualMachineResourcesAnnotation] = extractVirtualMachineResourcesJSON(vm.Spec)
	return a
}
// affinityForVirtualMachine returns the affinity to use for the VM's runner pod.
//
// The result is based on vm.Spec.Affinity, with the node affinity's required node selector filled
// in if it has no terms: in that case a single term requiring kubernetes.io/arch In [amd64] and
// kubernetes.io/os In [linux] is added.
//
// The returned value is an independent copy; vm.Spec.Affinity is never modified.
func affinityForVirtualMachine(vm *vmv1.VirtualMachine) *corev1.Affinity {
	// Work on a deep copy so that the defaults filled in below don't leak back into the VM's spec
	// through the shared pointer. DeepCopy on a nil *corev1.Affinity returns nil.
	a := vm.Spec.Affinity.DeepCopy()
	if a == nil {
		a = &corev1.Affinity{}
	}
	if a.NodeAffinity == nil {
		a.NodeAffinity = &corev1.NodeAffinity{}
	}
	if a.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
		a.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution = &corev1.NodeSelector{}
	}

	// if NodeSelectorTerms list is empty - add default values (arch==amd64 and os==linux; both
	// expressions live in the same term)
	required := a.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution
	if len(required.NodeSelectorTerms) == 0 {
		required.NodeSelectorTerms = append(
			required.NodeSelectorTerms,
			corev1.NodeSelectorTerm{
				MatchExpressions: []corev1.NodeSelectorRequirement{
					{
						Key:      "kubernetes.io/arch",
						Operator: "In",
						Values:   []string{"amd64"},
					},
					{
						Key:      "kubernetes.io/os",
						Operator: "In",
						Values:   []string{"linux"},
					},
				},
			})
	}
	return a
}
// setRunnerCgroup asks the runner pod to change its CPU allocation to cpu, by POSTing a JSON
// api.VCPUChange to http://<pod IP>:<runner port>/cpu_change.
//
// The whole request is bounded by a 5 second timeout. Any response status other than 200 is
// returned as an error.
func setRunnerCgroup(ctx context.Context, vm *vmv1.VirtualMachine, cpu vmv1.MilliCPU) error {
	ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()

	endpoint := fmt.Sprintf("http://%s:%d/cpu_change", vm.Status.PodIP, vm.Spec.RunnerPort)

	payload, err := json.Marshal(api.VCPUChange{VCPUs: cpu})
	if err != nil {
		return err
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(payload))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusOK {
		return nil
	}
	return fmt.Errorf("unexpected status %s", resp.Status)
}
// getRunnerCgroup fetches the runner pod's current CPU cgroup state, by GETting
// http://<pod IP>:<runner port>/cpu_current and decoding the JSON response body into an
// api.VCPUCgroup.
//
// The whole request is bounded by a 5 second timeout. Any response status other than 200 is
// returned as an error.
func getRunnerCgroup(ctx context.Context, vm *vmv1.VirtualMachine) (*api.VCPUCgroup, error) {
	ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()

	url := fmt.Sprintf("http://%s:%d/cpu_current", vm.Status.PodIP, vm.Spec.RunnerPort)

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	// Register the close before any early return, so the body is also released when the status
	// check below fails.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status %s", resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	var result api.VCPUCgroup
	if err := json.Unmarshal(body, &result); err != nil {
		return nil, err
	}
	return &result, nil
}
// imageForVmRunner returns the runner image managed by this controller, read from the
// VM_RUNNER_IMAGE environment variable (defined in config/manager/manager.yaml).
//
// An error is returned if the variable is not present in the environment. A variable that is set
// but empty is returned as "" without error.
func imageForVmRunner() (string, error) {
	const envName = "VM_RUNNER_IMAGE"

	if image, ok := os.LookupEnv(envName); ok {
		return image, nil
	}
	return "", fmt.Errorf("unable to find %s environment variable with the image", envName)
}
func podSpec(
vm *vmv1.VirtualMachine,
memoryProvider vmv1.MemoryProvider,
sshSecret *corev1.Secret,
config *ReconcilerConfig,
) (*corev1.Pod, error) {
runnerVersion := api.RunnerProtoV1
labels := labelsForVirtualMachine(vm, &runnerVersion)
annotations := annotationsForVirtualMachine(vm)
affinity := affinityForVirtualMachine(vm)
// Get the Operand image
image, err := imageForVmRunner()
if err != nil {
return nil, err
}
vmSpecJson, err := json.Marshal(vm.Spec)
if err != nil {
return nil, fmt.Errorf("marshal VM Spec: %w", err)
}
vmStatusJson, err := json.Marshal(vm.Status)
if err != nil {
return nil, fmt.Errorf("marshal VM Status: %w", err)
}
delegatedCPULimits := lo.FromPtr(vm.Spec.DelegatedCPULimits)
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: vm.Status.PodName,
Namespace: vm.Namespace,
Labels: labels,
Annotations: annotations,
},
Spec: corev1.PodSpec{
EnableServiceLinks: vm.Spec.ServiceLinks,
AutomountServiceAccountToken: lo.ToPtr(false),
RestartPolicy: corev1.RestartPolicyNever,
TerminationGracePeriodSeconds: vm.Spec.TerminationGracePeriodSeconds,
NodeSelector: vm.Spec.NodeSelector,
ImagePullSecrets: vm.Spec.ImagePullSecrets,
Tolerations: vm.Spec.Tolerations,
ServiceAccountName: vm.Spec.ServiceAccountName,
SchedulerName: vm.Spec.SchedulerName,
Affinity: affinity,
InitContainers: []corev1.Container{
{
Image: vm.Spec.Guest.RootDisk.Image,
Name: "init",
ImagePullPolicy: vm.Spec.Guest.RootDisk.ImagePullPolicy,
VolumeMounts: []corev1.VolumeMount{{
Name: "virtualmachineimages",
MountPath: "/vm/images",
}},
Command: []string{
"sh", "-c",
"cp /disk.qcow2 /vm/images/rootdisk.qcow2 && " +
/* uid=36(qemu) gid=34(kvm) groups=34(kvm) */
"chown 36:34 /vm/images/rootdisk.qcow2 && " +
"sysctl -w net.ipv4.ip_forward=1",
},
SecurityContext: &corev1.SecurityContext{
Privileged: lo.ToPtr(true),
},
},
},
// generate containers as an inline function so the context isn't isolated
Containers: func() []corev1.Container {
runner := corev1.Container{
Image: image,
Name: "neonvm-runner",
ImagePullPolicy: corev1.PullIfNotPresent,
// Ensure restrictive context for the container
// More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
SecurityContext: &corev1.SecurityContext{
Privileged: lo.ToPtr(false),
Capabilities: &corev1.Capabilities{
Add: []corev1.Capability{
"NET_ADMIN",
"SYS_ADMIN",
"SYS_RESOURCE",
},
},
},
Ports: []corev1.ContainerPort{{
ContainerPort: vm.Spec.QMP,
Name: "qmp",
}, {
ContainerPort: vm.Spec.QMPManual,
Name: "qmp-manual",
}},
Command: func() []string {
cmd := []string{"runner"}
if delegatedCPULimits {
cmd = append(cmd, "-delegated-cgroup")
} else if config.UseContainerMgr || config.DisableRunnerCgroup {
cmd = append(cmd, "-skip-cgroup-management")
}
if config.DisableRunnerCgroup {
// cgroup management disabled, but we still need something to provide
// the server, so the runner will just provide a dummy implementation.
cmd = append(cmd, "-enable-dummy-cpu-server")
}
cmd = append(
cmd,
"-qemu-disk-cache-settings", config.QEMUDiskCacheSettings,
"-memory-provider", string(memoryProvider),
)
if memoryProvider == vmv1.MemoryProviderVirtioMem {
cmd = append(cmd, "-memhp-auto-movable-ratio", config.MemhpAutoMovableRatio)
}
// put these last, so that the earlier args are easier to see (because these
// can get quite large)
cmd = append(
cmd,
"-vmspec", base64.StdEncoding.EncodeToString(vmSpecJson),
"-vmstatus", base64.StdEncoding.EncodeToString(vmStatusJson),
)
return cmd
}(),
Env: []corev1.EnvVar{{
Name: "K8S_POD_NAME",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.name",
},
},
}},
VolumeMounts: func() []corev1.VolumeMount {
images := corev1.VolumeMount{
Name: "virtualmachineimages",
MountPath: "/vm/images",
}
cgroups := corev1.VolumeMount{
Name: "sysfscgroup",
MountPath: "/sys/fs/cgroup",
// MountPropagationNone means that the volume in a container will
// not receive new mounts from the host or other containers, and filesystems
// mounted inside the container won't be propagated to the host or other
// containers.
// Note that this mode corresponds to "private" in Linux terminology.
MountPropagation: lo.ToPtr(corev1.MountPropagationNone),
}
if config.UseContainerMgr || config.DisableRunnerCgroup || delegatedCPULimits {
return []corev1.VolumeMount{images}
} else {
// the /sys/fs/cgroup mount is only necessary if neonvm-runner has to
// do its own cpu limiting
return []corev1.VolumeMount{images, cgroups}
}
}(),
Resources: vm.Spec.PodResources,
}
containerMgr := corev1.Container{
Image: image,
Name: "neonvm-container-mgr",
Command: []string{
"container-mgr",
"-port", strconv.Itoa(int(vm.Spec.RunnerPort)),
"-init-milli-cpu", strconv.Itoa(int(vm.Spec.Guest.CPUs.Use)),
},
Env: []corev1.EnvVar{
{
Name: "K8S_POD_UID",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
},
},
{
Name: "CRI_ENDPOINT",
Value: fmt.Sprintf("unix://%s", config.criEndpointSocketPath()),
},
},
LivenessProbe: &corev1.Probe{
InitialDelaySeconds: 10,
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/healthz",
Port: intstr.FromInt(int(vm.Spec.RunnerPort)),
},
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("50m"),
corev1.ResourceMemory: resource.MustParse("50Mi"),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"), // cpu limit > request, because usage is spiky
corev1.ResourceMemory: resource.MustParse("50Mi"),
},
},
// socket for crictl to connect to
VolumeMounts: []corev1.VolumeMount{
{
Name: "containerdsock",
MountPath: config.criEndpointSocketPath(),
},
},
}
if config.UseContainerMgr && !delegatedCPULimits {
return []corev1.Container{runner, containerMgr}
} else {
// Return only the runner if we aren't supposed to use container-mgr
return []corev1.Container{runner}
}
}(),
Volumes: func() []corev1.Volume {
images := corev1.Volume{
Name: "virtualmachineimages",
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{},
},
}
cgroup := corev1.Volume{
Name: "sysfscgroup",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/sys/fs/cgroup",
Type: lo.ToPtr(corev1.HostPathDirectory),
},
},
}
containerdSock := corev1.Volume{
Name: "containerdsock",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: config.criEndpointSocketPath(),
Type: lo.ToPtr(corev1.HostPathSocket),
},
},
}
if delegatedCPULimits {
return []corev1.Volume{images}
} else if config.UseContainerMgr {
return []corev1.Volume{images, containerdSock}
} else if config.DisableRunnerCgroup {
return []corev1.Volume{images}
} else {
return []corev1.Volume{images, cgroup}
}
}(),
},
}
if sshSecret != nil {
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts,
corev1.VolumeMount{
Name: "ssh-privatekey",
MountPath: "/mnt/ssh",
},
corev1.VolumeMount{
Name: "ssh-publickey",
MountPath: "/vm/ssh",
},
)
pod.Spec.Volumes = append(pod.Spec.Volumes,
corev1.Volume{
Name: "ssh-privatekey",
VolumeSource: corev1.VolumeSource{
Secret: &corev1.SecretVolumeSource{
SecretName: sshSecret.Name,
Items: []corev1.KeyToPath{
{
Key: "ssh-privatekey",
Path: "id_ed25519",
Mode: lo.ToPtr[int32](0600),
},
},
},
},
},
corev1.Volume{
Name: "ssh-publickey",
VolumeSource: corev1.VolumeSource{
Secret: &corev1.SecretVolumeSource{
SecretName: sshSecret.Name,
Items: []corev1.KeyToPath{
{
Key: "ssh-publickey",
Path: "authorized_keys",
Mode: lo.ToPtr[int32](0644),
},
},
},
},
},
)
}
// If a custom neonvm-runner image is requested, use that instead:
if vm.Spec.RunnerImage != nil {
pod.Spec.Containers[0].Image = *vm.Spec.RunnerImage
if config.UseContainerMgr && !delegatedCPULimits {
pod.Spec.Containers[1].Image = *vm.Spec.RunnerImage
}
}
// If a custom kernel is used, add that image:
if vm.Spec.Guest.KernelImage != nil {
pod.Spec.Containers[0].Args = append(pod.Spec.Containers[0].Args, "-kernelpath=/vm/images/vmlinuz")
pod.Spec.InitContainers = append(pod.Spec.InitContainers, corev1.Container{
Image: *vm.Spec.Guest.KernelImage,
Name: "init-kernel",
ImagePullPolicy: vm.Spec.Guest.RootDisk.ImagePullPolicy,
Args: []string{"cp", "/vmlinuz", "/vm/images/vmlinuz"},
VolumeMounts: []corev1.VolumeMount{{
Name: "virtualmachineimages",
MountPath: "/vm/images",
}},
SecurityContext: &corev1.SecurityContext{
// uid=36(qemu) gid=34(kvm) groups=34(kvm)
RunAsUser: lo.ToPtr[int64](36),
RunAsGroup: lo.ToPtr[int64](34),
},
})
}
if vm.Spec.Guest.AppendKernelCmdline != nil {
pod.Spec.Containers[0].Args = append(pod.Spec.Containers[0].Args, fmt.Sprintf("-appendKernelCmdline=%s", *vm.Spec.Guest.AppendKernelCmdline))
}
// Add any InitContainers that were specified by the spec
pod.Spec.InitContainers = append(pod.Spec.InitContainers, vm.Spec.ExtraInitContainers...)
// allow access to /dev/kvm and /dev/vhost-net devices by generic-device-plugin for kubelet
if pod.Spec.Containers[0].Resources.Limits == nil {
pod.Spec.Containers[0].Resources.Limits = corev1.ResourceList{}
}
pod.Spec.Containers[0].Resources.Limits["neonvm/vhost-net"] = resource.MustParse("1")
// NB: EnableAcceleration guaranteed non-nil because the k8s API server sets the default for us.
if *vm.Spec.EnableAcceleration {
pod.Spec.Containers[0].Resources.Limits["neonvm/kvm"] = resource.MustParse("1")
}
for _, port := range vm.Spec.Guest.Ports {
cPort := corev1.ContainerPort{
ContainerPort: int32(port.Port),
}
if len(port.Name) != 0 {
cPort.Name = port.Name
}
if len(port.Protocol) != 0 {
cPort.Protocol = corev1.Protocol(port.Protocol)
}
pod.Spec.Containers[0].Ports = append(pod.Spec.Containers[0].Ports, cPort)
}
if settings := vm.Spec.Guest.Settings; settings != nil {
if swapSize := settings.Swap; swapSize != nil {
diskName := "swapdisk"
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts, corev1.VolumeMount{
Name: diskName,
MountPath: fmt.Sprintf("/vm/mounts/%s", diskName),
})
pod.Spec.Volumes = append(pod.Spec.Volumes, corev1.Volume{
Name: diskName,
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{
SizeLimit: swapSize,
},
},
})
}
}
for _, disk := range vm.Spec.Disks {
mnt := corev1.VolumeMount{
Name: disk.Name,
MountPath: fmt.Sprintf("/vm/mounts%s", disk.MountPath),
}
if disk.ReadOnly != nil {
mnt.ReadOnly = *disk.ReadOnly
}
switch {
case disk.ConfigMap != nil:
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts, mnt)
pod.Spec.Volumes = append(pod.Spec.Volumes, corev1.Volume{
Name: disk.Name,
VolumeSource: corev1.VolumeSource{
ConfigMap: &corev1.ConfigMapVolumeSource{
LocalObjectReference: corev1.LocalObjectReference{
Name: disk.ConfigMap.Name,
},
Items: disk.ConfigMap.Items,
},
},
})
case disk.Secret != nil:
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts, mnt)
pod.Spec.Volumes = append(pod.Spec.Volumes, corev1.Volume{
Name: disk.Name,
VolumeSource: corev1.VolumeSource{
Secret: &corev1.SecretVolumeSource{
SecretName: disk.Secret.SecretName,
Items: disk.Secret.Items,
},
},
})
case disk.EmptyDisk != nil:
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts, mnt)
pod.Spec.Volumes = append(pod.Spec.Volumes, corev1.Volume{
Name: disk.Name,
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{
SizeLimit: &disk.EmptyDisk.Size,
},
},
})
default:
// do nothing
}
}
// use multus network to add extra network interface
if vm.Spec.ExtraNetwork != nil && vm.Spec.ExtraNetwork.Enable {
var nadNetwork string
if len(vm.Spec.ExtraNetwork.MultusNetwork) > 0 { // network specified in spec
nadNetwork = vm.Spec.ExtraNetwork.MultusNetwork
} else { // get network from env variables
nadName, err := nadRunnerName()
if err != nil {
return nil, err
}
nadNamespace, err := nadRunnerNamespace()
if err != nil {
return nil, err
}
nadNetwork = fmt.Sprintf("%s/%s", nadNamespace, nadName)
}
pod.ObjectMeta.Annotations[nadapiv1.NetworkAttachmentAnnot] = fmt.Sprintf("%s@%s", nadNetwork, vm.Spec.ExtraNetwork.Interface)
}
return pod, nil
}
// SetupWithManager registers the "virtualmachine" controller with mgr.
//
// The reconciler is first wrapped by withCatchPanic and WithMetrics; that
// wrapped value is what the controller runs. The controller reconciles
// VirtualMachine objects and also watches the Pods they own, so that a change
// to a runner Pod triggers reconciliation of its VirtualMachine.
//
// It returns the wrapped reconciler together with any error produced while
// building the controller.
func (r *VMReconciler) SetupWithManager(mgr ctrl.Manager) (ReconcilerWithMetrics, error) {
	const controllerName = "virtualmachine"

	wrapped := WithMetrics(
		withCatchPanic(r),
		r.Metrics,
		controllerName,
		r.Config.FailurePendingPeriod,
		r.Config.FailingRefreshInterval,
	)

	opts := controller.Options{MaxConcurrentReconciles: r.Config.MaxConcurrentReconciles}
	bldr := ctrl.NewControllerManagedBy(mgr).
		For(&vmv1.VirtualMachine{}).
		Owns(&corev1.Pod{}).
		WithOptions(opts).
		Named(controllerName)

	err := bldr.Complete(wrapped)
	return wrapped, err
}
// DeepEqual reports whether v1 and v2 are equal, either directly or after
// both have been normalised through a JSON round trip.
//
// The values are first compared with reflect.DeepEqual. If that fails, each
// value is marshalled to JSON and decoded back into a generic value, and the
// two generic values are compared. This makes values of different Go types
// with the same JSON representation compare as equal.
//
// If either value cannot be marshalled to (or decoded from) JSON, the values
// are reported as not equal. Previously such failures were ignored, which made
// any two unmarshalable values compare as equal.
func DeepEqual(v1, v2 interface{}) bool {
	if reflect.DeepEqual(v1, v2) {
		return true
	}

	// normalize converts v to its generic JSON form; ok is false when the
	// value has no JSON representation.
	normalize := func(v interface{}) (out interface{}, ok bool) {
		raw, err := json.Marshal(v)
		if err != nil {
			return nil, false
		}
		if err := json.Unmarshal(raw, &out); err != nil {
			return nil, false
		}
		return out, true
	}

	x1, ok := normalize(v1)
	if !ok {
		return false
	}
	x2, ok := normalize(v2)
	if !ok {
		return false
	}
	return reflect.DeepEqual(x1, x2)
}
// tryUpdateVM writes vm back through the reconciler's Update method and
// returns that call's error unchanged.
//
// TODO: reimplement to r.Patch()
func (r *VMReconciler) tryUpdateVM(ctx context.Context, vm *vmv1.VirtualMachine) error {
	if err := r.Update(ctx, vm); err != nil {
		return err
	}
	return nil
}
// nadIpamName returns the name of the Network Attachment Definition that holds
// the IPAM settings, as read from the NAD_IPAM_NAME environment variable.
func nadIpamName() (string, error) {
	const envVar = "NAD_IPAM_NAME"
	return getEnvVarValue(envVar)
}
// nadIpamNamespace returns the namespace of the Network Attachment Definition
// that holds the IPAM settings, as read from the NAD_IPAM_NAMESPACE
// environment variable.
func nadIpamNamespace() (string, error) {
	const envVar = "NAD_IPAM_NAMESPACE"
	return getEnvVarValue(envVar)
}
// nadRunnerName returns the name of the Network Attachment Definition used for
// the second interface in the runner, as read from the NAD_RUNNER_NAME
// environment variable.
func nadRunnerName() (string, error) {
	const envVar = "NAD_RUNNER_NAME"
	return getEnvVarValue(envVar)
}
// nadRunnerNamespace returns the namespace of the Network Attachment
// Definition used for the second interface in the runner, as read from the
// NAD_RUNNER_NAMESPACE environment variable.
func nadRunnerNamespace() (string, error) {
	const envVar = "NAD_RUNNER_NAMESPACE"
	return getEnvVarValue(envVar)
}
// getEnvVarValue looks up the environment variable envVarName.
//
// It returns the variable's value (which may be empty) when the variable is
// set, and an error naming the variable when it is not set at all.
func getEnvVarValue(envVarName string) (string, error) {
	if value, ok := os.LookupEnv(envVarName); ok {
		return value, nil
	}
	return "", fmt.Errorf("unable to find %s environment variable", envVarName)
}
// sshKeygen creates a fresh ed25519 key pair and returns both halves in
// encoded form: the public key as produced by encodePublicKey and the private
// key as produced by encodePrivateKey.
//
// If key generation or either encoding step fails, both byte slices are nil
// and the error is returned.
func sshKeygen() (publicKeyBytes []byte, privateKeyBytes []byte, err error) {
	pub, priv, genErr := ed25519.GenerateKey(rand.Reader)
	if genErr != nil {
		return nil, nil, genErr
	}

	encodedPub, pubErr := encodePublicKey(pub)
	if pubErr != nil {
		return nil, nil, pubErr
	}

	encodedPriv, privErr := encodePrivateKey(priv)
	if privErr != nil {
		return nil, nil, privErr
	}

	return encodedPub, encodedPriv, nil
}
// encodePrivateKey marshals privateKey with ssh.MarshalPrivateKey (passing an
// empty string as the second argument) and returns the resulting block
// PEM-encoded in memory.
func encodePrivateKey(privateKey ed25519.PrivateKey) ([]byte, error) {
	block, err := ssh.MarshalPrivateKey(privateKey, "")
	if err != nil {
		return nil, err
	}
	return pem.EncodeToMemory(block), nil
}
// encodePublicKey converts publicKey into an SSH public key and returns it
// serialised by ssh.MarshalAuthorizedKey.
func encodePublicKey(publicKey ed25519.PublicKey) ([]byte, error) {
	converted, err := ssh.NewPublicKey(publicKey)
	if err != nil {
		return nil, err
	}
	return ssh.MarshalAuthorizedKey(converted), nil
}
package controllers
import (
"context"
"encoding/json"
"errors"
"fmt"
"strconv"
"strings"
"time"
"github.com/digitalocean/go-qemu/qmp"
"github.com/go-logr/logr"
"sigs.k8s.io/controller-runtime/pkg/log"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/client-go/tools/record"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
// QmpCpus is the decoded reply to the QMP "query-hotpluggable-cpus" command
// (see QmpGetCpus). Each entry of Return describes one CPU slot.
type QmpCpus struct {
	Return []struct {
		Props struct {
			CoreId   int32 `json:"core-id"`
			ThreadId int32 `json:"thread-id"`
			SocketId int32 `json:"socket-id"`
		} `json:"props"`
		VcpusCount int32 `json:"vcpus-count"`
		// QomPath is nil when the reply carries no "qom-path" for the slot;
		// QmpGetCpus treats such slots as empty.
		QomPath *string `json:"qom-path"`
		Type    string  `json:"type"`
	} `json:"return"`
}
// QmpMemorySize is the decoded reply to the QMP "query-memory-size-summary"
// command (see QmpGetMemorySize, which reports the sum of both fields).
type QmpMemorySize struct {
	Return struct {
		BaseMemory    int64 `json:"base-memory"`
		PluggedMemory int64 `json:"plugged-memory"`
	} `json:"return"`
}
// QmpCpuSlot describes a single CPU slot as returned by QmpGetCpus.
//
// Core is the slot's core id, QOM is the slot's QOM path (empty for a slot
// with no CPU plugged in) and Type is the CPU type reported for the slot,
// which is also used as the "driver" when hot-plugging into it.
type QmpCpuSlot struct {
	Core int32  `json:"core"`
	QOM  string `json:"qom"`
	Type string `json:"type"`
}
// QmpMemoryDevices is the decoded reply to the QMP "query-memory-devices"
// command (see QmpMonQueryMemoryDevices).
type QmpMemoryDevices struct {
	Return []QmpMemoryDevice `json:"return"`
}
// QmpMemoryDevice is one entry of a "query-memory-devices" reply.
//
// Data.Memdev names the memory backend the device is attached to; callers in
// this file parse the slot index out of it with MemslotIdxFromName.
type QmpMemoryDevice struct {
	Type string `json:"type"`
	Data struct {
		Memdev     string `json:"memdev"`
		Hotplugged bool   `json:"hotplugged"`
		Addr       int64  `json:"addr"`
		// NOTE: the Go field name is misspelled, but it is exported and so
		// kept as is; the JSON key is the correct "hotpluggable".
		Hotplugguble bool   `json:"hotpluggable"`
		Size         int64  `json:"size"`
		Slot         int64  `json:"slot"`
		Node         int64  `json:"node"`
		Id           string `json:"id"`
	} `json:"data"`
}
// QmpObjects is the decoded reply to the QMP "qom-list" command (see
// QmpQueryMemoryBackendIds, which lists the "/objects" path).
type QmpObjects struct {
	Return []QmpObject `json:"return"`
}
// QmpObject is one entry of a "qom-list" reply: the name of a QOM child and
// its type string (for example "child<memory-backend-ram>").
type QmpObject struct {
	Name string `json:"name"`
	Type string `json:"type"`
}
// QmpMigrationInfo is the decoded reply to the QMP "query-migrate" command
// (see QmpGetMigrationInfo).
type QmpMigrationInfo struct {
	Return MigrationInfo `json:"return"`
}
// MigrationInfo is the payload of a "query-migrate" reply: the migration
// status string, timing figures and RAM/compression counters.
//
// The *Ms field names suggest the timing values are in milliseconds
// (presumably matching QEMU's units; confirm against the QMP reference).
type MigrationInfo struct {
	Status      string `json:"status"`
	TotalTimeMs int64  `json:"total-time"`
	SetupTimeMs int64  `json:"setup-time"`
	DowntimeMs  int64  `json:"downtime"`
	Ram         struct {
		Transferred    int64 `json:"transferred"`
		Remaining      int64 `json:"remaining"`
		Total          int64 `json:"total"`
		Duplicate      int64 `json:"duplicate"`
		Normal         int64 `json:"normal"`
		NormalBytes    int64 `json:"normal-bytes"`
		DirtySyncCount int64 `json:"dirty-sync-count"`
	} `json:"ram"`
	Compression struct {
		CompressedSize  int64   `json:"compressed-size"`
		CompressionRate float64 `json:"compression-rate"`
	} `json:"compression"`
}
// QmpAddr returns the address used to reach the VM's QMP endpoint: the pod IP
// from the VM status and the QMP port from the VM spec.
func QmpAddr(vm *vmv1.VirtualMachine) (ip string, port int32) {
	ip = vm.Status.PodIP
	port = vm.Spec.QMP
	return ip, port
}
// QmpConnect opens a QMP socket monitor to ip:port over TCP, using a
// two-second timeout, and performs the monitor's Connect step.
//
// The caller owns the returned monitor and is expected to Disconnect it.
//
// NOTE(review): the address is built as "%s:%d", which does not bracket IPv6
// literals; confirm whether IPv6 pod IPs need to be supported here.
func QmpConnect(ip string, port int32) (*qmp.SocketMonitor, error) {
	const dialTimeout = 2 * time.Second

	addr := fmt.Sprintf("%s:%d", ip, port)
	mon, err := qmp.NewSocketMonitor("tcp", addr, dialTimeout)
	if err != nil {
		return nil, err
	}

	if err = mon.Connect(); err != nil {
		return nil, err
	}
	return mon, nil
}
// QmpGetCpus queries the VM at ip:port with "query-hotpluggable-cpus" and
// splits the reported CPU slots into two lists.
//
// The first list holds slots that have a QOM path (a CPU is plugged in); the
// second holds slots without one (empty). Both lists are non-nil on success
// and keep the order of the QMP reply.
func QmpGetCpus(ip string, port int32) ([]QmpCpuSlot, []QmpCpuSlot, error) {
	mon, err := QmpConnect(ip, port)
	if err != nil {
		return nil, nil, err
	}
	defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?

	raw, err := mon.Run([]byte(`{"execute": "query-hotpluggable-cpus"}`))
	if err != nil {
		return nil, nil, err
	}

	var reply QmpCpus
	if err := json.Unmarshal(raw, &reply); err != nil {
		return nil, nil, fmt.Errorf("error unmarshaling json: %w", err)
	}

	plugged, empty := []QmpCpuSlot{}, []QmpCpuSlot{}
	for _, cpu := range reply.Return {
		slot := QmpCpuSlot{Core: cpu.Props.CoreId, QOM: "", Type: cpu.Type}
		if cpu.QomPath == nil {
			empty = append(empty, slot)
			continue
		}
		slot.QOM = *cpu.QomPath
		plugged = append(plugged, slot)
	}
	return plugged, empty, nil
}
// QmpPlugCpu hot-plugs one CPU into the VM reachable at ip:port.
//
// It looks up the empty CPU slots and issues "device_add" for the last entry
// of that list, using the slot's type as the driver and "cpu<core>" as the
// device id. An error is returned when there is no empty slot or when any QMP
// step fails.
func QmpPlugCpu(ip string, port int32) error {
	_, freeSlots, err := QmpGetCpus(ip, port)
	if err != nil {
		return err
	}
	if len(freeSlots) == 0 {
		return errors.New("no empty slots for CPU hotplug")
	}

	mon, err := QmpConnect(ip, port)
	if err != nil {
		return err
	}
	defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?

	// The empty list is reversed: the first CPU slot sits at the end of the
	// list, so take the last element.
	chosen := freeSlots[len(freeSlots)-1]
	request := fmt.Sprintf(`{
"execute": "device_add",
"arguments": {
"id": "cpu%d",
"driver": %q,
"core-id": %d,
"socket-id": 0,
"thread-id": 0
}
}`, chosen.Core, chosen.Type, chosen.Core)

	_, err = mon.Run([]byte(request))
	return err
}
// QmpUnplugCpu hot-unplugs one CPU from the VM reachable at ip:port.
//
// It picks the first plugged CPU whose QOM path contains
// "machine/peripheral/cpu", issues "device_del" for it and then sleeps for
// 500ms to give the hypervisor time to complete the unplug. An error is
// returned when no such CPU exists or when any QMP step fails.
func QmpUnplugCpu(ip string, port int32) error {
	plugged, _, err := QmpGetCpus(ip, port)
	if err != nil {
		return err
	}

	var victim *QmpCpuSlot
	for i := range plugged {
		if strings.Contains(plugged[i].QOM, "machine/peripheral/cpu") {
			victim = &plugged[i]
			break
		}
	}
	if victim == nil {
		return errors.New("there are no unpluggable CPUs")
	}

	mon, err := QmpConnect(ip, port)
	if err != nil {
		return err
	}
	defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?

	request := fmt.Sprintf(`{"execute": "device_del", "arguments": {"id": %q}}`, victim.QOM)
	if _, err := mon.Run([]byte(request)); err != nil {
		return err
	}

	// small pause to let hypervisor do unplug
	time.Sleep(500 * time.Millisecond)
	return nil
}
// QmpSyncCpuToTarget makes the migration target's plugged CPUs match those of
// the source VM.
//
// It reads the plugged CPU slots of the source (at the VM's QMP address) and
// of the target pod. If both lists have the same length nothing is done.
// Otherwise every source slot that has no identical entry on the target is
// hot-plugged into the target with "device_add". The first QMP error aborts
// the sync and is returned.
func QmpSyncCpuToTarget(vm *vmv1.VirtualMachine, migration *vmv1.VirtualMachineMigration) error {
	sourceCPUs, _, err := QmpGetCpus(QmpAddr(vm))
	if err != nil {
		return err
	}

	targetIP, qmpPort := migration.Status.TargetPodIP, vm.Spec.QMP
	targetCPUs, _, err := QmpGetCpus(targetIP, qmpPort)
	if err != nil {
		return err
	}
	if len(sourceCPUs) == len(targetCPUs) {
		// Same number of CPUs on both sides: nothing to plug.
		return nil
	}

	target, err := QmpConnect(targetIP, qmpPort)
	if err != nil {
		return err
	}
	defer target.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?

	// onTarget reports whether an identical slot is already plugged in the target.
	onTarget := func(cpu QmpCpuSlot) bool {
		for _, existing := range targetCPUs {
			if existing == cpu {
				return true
			}
		}
		return false
	}

	for _, cpu := range sourceCPUs {
		if onTarget(cpu) {
			continue
		}
		request := fmt.Sprintf(`{
"execute": "device_add",
"arguments": {
"id": "cpu%d",
"driver": %q,
"core-id": %d,
"socket-id": 0,
"thread-id": 0
}
}`, cpu.Core, cpu.Type, cpu.Core)
		if _, err := target.Run([]byte(request)); err != nil {
			return err
		}
	}
	return nil
}
// QmpQueryMemoryDevices connects to the VM at ip:port, lists its memory
// devices via QmpMonQueryMemoryDevices and disconnects again.
func QmpQueryMemoryDevices(ip string, port int32) ([]QmpMemoryDevice, error) {
	mon, err := QmpConnect(ip, port)
	if err != nil {
		return nil, err
	}
	defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?

	devices, err := QmpMonQueryMemoryDevices(mon)
	return devices, err
}
// QmpMonQueryMemoryDevices runs "query-memory-devices" on an already connected
// monitor and returns the decoded list of memory devices.
func QmpMonQueryMemoryDevices(mon *qmp.SocketMonitor) ([]QmpMemoryDevice, error) {
	raw, err := mon.Run([]byte(`{"execute": "query-memory-devices"}`))
	if err != nil {
		return nil, err
	}

	var reply QmpMemoryDevices
	if err := json.Unmarshal(raw, &reply); err != nil {
		return nil, fmt.Errorf("error unmarshaling json: %w", err)
	}
	return reply.Return, nil
}
// MemslotIdxFromName extracts the slot index from a memory backend name.
//
// It accepts either the QOM path form "/objects/memslot3" or the bare id
// "memslot3" and returns 3 for both. The name must carry the "memslot" prefix
// followed by an unsigned decimal number; anything else (a different object
// name, a missing prefix, a signed or empty index) yields an error.
func MemslotIdxFromName(name string) (int, error) {
	name = strings.TrimPrefix(name, "/objects/")

	idxStr, ok := strings.CutPrefix(name, "memslot")
	if !ok {
		return 0, fmt.Errorf("failed to parse memory device id: %q", name)
	}

	// ParseUint rejects sign prefixes; 31 bits keeps the result within int on
	// every platform.
	idx, err := strconv.ParseUint(idxStr, 10, 31)
	if err != nil {
		// doesn't reference `err`, because we don't know the actual issue
		return 0, fmt.Errorf("failed to parse memory device id: %q", name)
	}
	return int(idx), nil
}
// QmpQueryMemoryBackendIds lists the children of the "/objects" QOM path and
// returns the set of memory slot indices found among them.
//
// Only objects of type "child<memory-backend-ram>" are considered, and the
// object named "pc.ram" (non-hotplugged memory) is always skipped. If a
// remaining object's name cannot be parsed by MemslotIdxFromName, the whole
// call fails with that error.
func QmpQueryMemoryBackendIds(mon *qmp.SocketMonitor) (map[int]struct{}, error) {
	raw, err := mon.Run([]byte(`{"execute": "qom-list", "arguments": {"path": "/objects"}}`))
	if err != nil {
		return nil, err
	}

	var listing QmpObjects
	if err := json.Unmarshal(raw, &listing); err != nil {
		return nil, fmt.Errorf("error unmarshaling json: %w", err)
	}

	ids := map[int]struct{}{}
	for _, obj := range listing.Return {
		// Skip non-hotplugged memory and anything that is not a RAM backend.
		if obj.Name == "pc.ram" || obj.Type != "child<memory-backend-ram>" {
			continue
		}
		slot, err := MemslotIdxFromName(obj.Name)
		if err != nil {
			return nil, err
		}
		ids[slot] = struct{}{}
	}
	return ids, nil
}
// QMPRunner is the minimal interface needed to issue a QMP command: Run takes
// the raw command bytes and returns the raw reply or an error.
//
// *qmp.SocketMonitor values are passed where a QMPRunner is expected (see
// QmpSyncMemoryToTarget calling QmpAddMemoryBackend).
type QMPRunner interface {
	Run([]byte) ([]byte, error)
}
// QmpSetVirtioMem sets the virtio-mem requested size of the VM to
// targetVirtioMemSize and returns the size that was requested before.
//
// When the previous requested size already equals the target, the function
// only queries it and changes nothing.
//
// When the VM's min and max memory slots are equal, no QMP traffic happens at
// all and (0, nil) is returned; a non-zero target in that situation is treated
// as a broken invariant and panics.
func QmpSetVirtioMem(vm *vmv1.VirtualMachine, targetVirtioMemSize int64) (previous int64, _ error) {
	// Note: The virtio-mem device only exists when max mem != min mem.
	// Refer to the instantiation in neonvm-runner for more.
	slots := vm.Spec.Guest.MemorySlots
	if slots.Min == slots.Max {
		if targetVirtioMemSize == 0 {
			// Nothing to do; behave as though we had talked to the VM.
			return 0, nil
		}
		// A non-zero target although min == max means something went very wrong.
		panic(fmt.Sprintf(
			"VM min mem slots == max mem slots, but target virtio-mem size %d != 0",
			targetVirtioMemSize,
		))
	}

	mon, err := QmpConnect(QmpAddr(vm))
	if err != nil {
		return 0, err
	}
	defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?

	// Step 1: read the currently requested virtio-mem size.
	getCmd := []byte(`{"execute": "qom-get", "arguments": {"path": "vm0", "property": "requested-size"}}`)
	raw, err := mon.Run(getCmd)
	if err != nil {
		return 0, err
	}

	var reply struct {
		Return int64 `json:"return"`
	}
	if err := json.Unmarshal(raw, &reply); err != nil {
		return 0, fmt.Errorf("error unmarshaling json: %w", err)
	}

	previous = reply.Return
	if previous == targetVirtioMemSize {
		return previous, nil
	}

	// Step 2: the requested size differs from the target, so update it.
	setCmd := []byte(fmt.Sprintf(
		`{"execute": "qom-set", "arguments": {"path": "vm0", "property": "requested-size", "value": %d}}`,
		targetVirtioMemSize,
	))
	if _, err := mon.Run(setCmd); err != nil {
		return 0, err
	}
	return previous, nil
}
// QmpAddMemoryBackend creates the memory backend object "memslot<idx>" of
// sizeBytes bytes on the VM behind mon, using the "object-add" QMP command
// with qom-type "memory-backend-ram".
//
// A backend on its own does nothing until the matching device is added for
// the same slot; see QmpAddMemoryDevice. When unplugging,
// QmpDelMemoryDevice must be called before QmpDelMemoryBackend.
func QmpAddMemoryBackend(mon QMPRunner, idx int, sizeBytes int64) error {
	const cmdFormat = `{"execute": "object-add",
"arguments": {"id": "memslot%d",
"size": %d,
"qom-type": "memory-backend-ram"}}`

	request := fmt.Sprintf(cmdFormat, idx, sizeBytes)
	_, err := mon.Run([]byte(request))
	return err
}
// QmpDelMemoryBackend removes the memory backend object "memslot<idx>" from
// the VM behind mon, using the "object-del" QMP command.
func QmpDelMemoryBackend(mon *qmp.SocketMonitor, idx int) error {
	const cmdFormat = `{"execute": "object-del",
"arguments": {"id": "memslot%d"}}`

	request := fmt.Sprintf(cmdFormat, idx)
	_, err := mon.Run([]byte(request))
	return err
}
// QmpAddMemoryDevice adds the "pc-dimm" device "dimm<idx>" to the VM behind
// mon, attached to the memory backend "memslot<idx>", using the "device_add"
// QMP command.
func QmpAddMemoryDevice(mon *qmp.SocketMonitor, idx int) error {
	const cmdFormat = `{"execute": "device_add",
"arguments": {"id": "dimm%d",
"driver": "pc-dimm",
"memdev": "memslot%d"}}`

	request := fmt.Sprintf(cmdFormat, idx, idx)
	_, err := mon.Run([]byte(request))
	return err
}
// QmpDelMemoryDevice removes the device "dimm<idx>" from the VM behind mon,
// using the "device_del" QMP command.
func QmpDelMemoryDevice(mon *qmp.SocketMonitor, idx int) error {
	const cmdFormat = `{"execute": "device_del",
"arguments": {"id": "dimm%d"}}`

	request := fmt.Sprintf(cmdFormat, idx)
	_, err := mon.Run([]byte(request))
	return err
}
// QmpMemorySetter carries the state of one attempt to bring a VM's hot-plugged
// memory slots to targetCnt (see QmpSetMemorySlots).
//
// buildState fills in the current view of the VM; the Add*/Remove* phases
// then mutate both the VM (over mon) and this view, collecting errors in errs.
type QmpMemorySetter struct {
	vm        *vmv1.VirtualMachine
	targetCnt int // desired number of plugged memory slots
	recorder  record.EventRecorder
	log       logr.Logger

	mon *qmp.SocketMonitor // connected QMP monitor; closed by Disconnect

	// memBackends maps slot idx -> whether a memory device currently uses that
	// backend. A key that is present with value false is a backend without a
	// device.
	memBackends map[int]bool
	// maxBackend is the largest slot idx seen so far (discovered or added).
	// The removal phases start iterating from it.
	maxBackend int
	// memDevCount is the number of memory devices currently plugged.
	memDevCount int
	// errs accumulates the errors of all phases; run joins them.
	errs []error
}
// buildState queries the VM for its current memory devices and memory
// backends and records them in r.
//
// Backends that have a memory device are marked active (true); backends that
// exist without a device are recorded as inactive (false). Memory devices
// whose memdev name cannot be parsed still count towards memDevCount but add
// no backend entry. maxBackend is raised to the largest index seen.
func (r *QmpMemorySetter) buildState() error {
	devices, err := QmpMonQueryMemoryDevices(r.mon)
	if err != nil {
		return err
	}
	r.memDevCount = len(devices)
	for _, dev := range devices {
		if slot, parseErr := MemslotIdxFromName(dev.Data.Memdev); parseErr == nil {
			r.memBackends[slot] = true
		}
	}

	backendIDs, err := QmpQueryMemoryBackendIds(r.mon)
	if err != nil {
		return err
	}
	for id := range backendIDs {
		if _, known := r.memBackends[id]; known {
			continue
		}
		// Backend exists but no device uses it.
		r.memBackends[id] = false
	}

	for slot := range r.memBackends {
		if slot > r.maxBackend {
			r.maxBackend = slot
		}
	}

	r.log.Info("QMP memory state", "backends", r.memBackends, "maxBackend", r.maxBackend)
	return nil
}
// Disconnect closes the setter's QMP monitor, if there is one. A failure to
// disconnect is logged and otherwise ignored.
func (r *QmpMemorySetter) Disconnect() {
	if r.mon == nil {
		return
	}
	if err := r.mon.Disconnect(); err != nil {
		r.log.Error(err, "Failed to disconnect QMP")
	}
}
// attemptsCounter caps the total number of operations performed in one phase.
//
// If QMP keeps timing out while the operation silently succeeds, we must not
// keep issuing QMP actions until enough positive results come back; counting
// attempts rather than successes prevents that.
type attemptsCounter struct {
	target int // maximum number of attempts allowed
	done   int // attempts registered so far
}

// newAttemptsCounter returns a counter that allows up to target attempts.
func newAttemptsCounter(target int) *attemptsCounter {
	counter := new(attemptsCounter)
	counter.target = target
	return counter
}

// attempt registers one attempt and reports whether it is allowed to proceed.
// Once the target has been reached it returns false and counts nothing.
func (t *attemptsCounter) attempt() bool {
	if t.done >= t.target {
		return false
	}
	t.done++
	return true
}

// didSomething reports whether at least one attempt has been registered.
func (t *attemptsCounter) didSomething() bool {
	return 0 < t.done
}
// AddBackends creates memory backends until the number of known backends
// reaches targetCnt.
//
// It walks slot indices 1..targetCnt, adds a backend for every index that has
// none, and stops once as many additions have been attempted as were missing
// at the start. Each outcome is reported as a "ScaleUp" event; failures are
// also appended to errs. Newly added backends are recorded as inactive. If
// anything was attempted, the method sleeps for a second afterwards, since
// QEMU might need time to allocate the memory.
func (r *QmpMemorySetter) AddBackends() {
	missing := r.targetCnt - len(r.memBackends)
	if missing <= 0 {
		return
	}

	budget := newAttemptsCounter(missing)
	for slot := 1; slot <= r.targetCnt; slot++ {
		if _, exists := r.memBackends[slot]; exists {
			continue
		}
		if !budget.attempt() {
			break
		}

		if err := QmpAddMemoryBackend(r.mon, slot, r.vm.Spec.Guest.MemorySlotSize.Value()); err != nil {
			r.errs = append(r.errs, err)
			failure := fmt.Sprintf("Failed to add memslot%d: %s", slot, err.Error())
			r.recorder.Event(r.vm, "Warning", "ScaleUp", failure)
			continue
		}

		r.recorder.Event(r.vm, "Normal", "ScaleUp", fmt.Sprintf("Added memslot%d", slot))
		r.memBackends[slot] = false
		// The backend just added might be the new maximum.
		if slot > r.maxBackend {
			r.maxBackend = slot
		}
	}

	if budget.didSomething() {
		// might need to wait for QEMU to allocate the memory
		time.Sleep(time.Second)
	}
}
// AddDevices plugs memory devices into unused backends until memDevCount
// reaches targetCnt.
//
// It walks slot indices 1..maxBackend, and for each backend that exists but
// is inactive it adds the matching device, stopping once as many additions
// have been attempted as were missing at the start. Each outcome is reported
// as a "ScaleUp" event; failures are also appended to errs. On success the
// backend is marked active and memDevCount is incremented.
func (r *QmpMemorySetter) AddDevices() {
	missing := r.targetCnt - r.memDevCount
	if missing <= 0 {
		return
	}

	budget := newAttemptsCounter(missing)
	for slot := 1; slot <= r.maxBackend; slot++ {
		inUse, exists := r.memBackends[slot]
		if !exists || inUse {
			continue
		}
		// This backend is free to receive a device.
		if !budget.attempt() {
			break
		}

		if err := QmpAddMemoryDevice(r.mon, slot); err != nil {
			r.errs = append(r.errs, err)
			failure := fmt.Sprintf("Failed to add dimm%d to VM %s: %s", slot, r.vm.Name, err.Error())
			r.recorder.Event(r.vm, "Warning", "ScaleUp", failure)
			continue
		}

		r.recorder.Event(r.vm, "Normal", "ScaleUp", fmt.Sprintf("Added dimm%d", slot))
		r.memBackends[slot] = true
		r.memDevCount++
	}
}
// RemoveDevices unplugs memory devices until memDevCount drops to targetCnt.
//
// It walks slot indices from maxBackend down to 1, so that the lowest slots
// (memslot1, memslot2, ...) are the ones kept, and removes the device of every
// active backend, stopping once as many removals have been attempted as were
// in excess at the start. Each outcome is reported as a "ScaleDown" event;
// failures are also appended to errs. On success the backend is marked
// inactive and memDevCount is decremented. If anything was attempted, the
// method sleeps for a second afterwards to let the guest kernel remove the
// memory block.
func (r *QmpMemorySetter) RemoveDevices() {
	excess := r.memDevCount - r.targetCnt
	if excess <= 0 {
		return
	}

	budget := newAttemptsCounter(excess)
	for slot := r.maxBackend; slot >= 1; slot-- {
		inUse, exists := r.memBackends[slot]
		if !exists || !inUse {
			continue
		}
		// This backend has a device that can be removed.
		if !budget.attempt() {
			break
		}

		if err := QmpDelMemoryDevice(r.mon, slot); err != nil {
			r.errs = append(r.errs, err)
			failure := fmt.Sprintf("Failed to remove dimm%d: %s", slot, err.Error())
			r.recorder.Event(r.vm, "Warning", "ScaleDown", failure)
			continue
		}

		r.recorder.Event(r.vm, "Normal", "ScaleDown", fmt.Sprintf("Removed dimm%d", slot))
		r.memBackends[slot] = false
		r.memDevCount--
	}

	if budget.didSomething() {
		// wait a bit to allow guest kernel remove memory block
		time.Sleep(time.Second)
	}
}
// RemoveBackends deletes unused memory backends until the number of known
// backends drops to targetCnt.
//
// It walks slot indices from maxBackend down to 1 and deletes every backend
// that exists but is inactive, stopping once as many deletions have been
// attempted as were in excess at the start. Each outcome is reported as a
// "ScaleDown" event; failures are also appended to errs. Successfully deleted
// backends are dropped from memBackends.
func (r *QmpMemorySetter) RemoveBackends() {
	excess := len(r.memBackends) - r.targetCnt
	if excess <= 0 {
		return
	}

	budget := newAttemptsCounter(excess)
	for slot := r.maxBackend; slot >= 1; slot-- {
		inUse, exists := r.memBackends[slot]
		if !exists || inUse {
			continue
		}
		if !budget.attempt() {
			break
		}

		if err := QmpDelMemoryBackend(r.mon, slot); err != nil {
			r.errs = append(r.errs, err)
			failure := fmt.Sprintf("Failed to remove memslot%d: %s", slot, err.Error())
			r.recorder.Event(r.vm, "Warning", "ScaleDown", failure)
			continue
		}

		r.recorder.Event(r.vm, "Normal", "ScaleDown", fmt.Sprintf("Removed memslot%d", slot))
		delete(r.memBackends, slot)
	}
}
// run executes the four phases in fixed order and returns the resulting
// number of plugged memory devices together with all collected errors joined
// into one (nil when there were none).
//
// Usually only the first two or the last two phases have work to do. With
// leftover slots, phases 2 and 4 may both run. After errors, the last two
// phases act as cleanup.
func (r *QmpMemorySetter) run() (int, error) {
	r.AddBackends()    // phase 1
	r.AddDevices()     // phase 2
	r.RemoveDevices()  // phase 3
	r.RemoveBackends() // phase 4

	return r.memDevCount, errors.Join(r.errs...)
}
// QmpSetMemorySlots attempts to plug or unplug memory slots so that the VM
// ends up with targetCnt of them.
//
// It returns the number of slots that are plugged afterwards. Ideally that
// equals targetCnt, but it can be lower or higher when errors occur.
//
// It returns -1 if it failed to connect or to gather the current memory state;
// otherwise the returned count is valid even when the error is non-nil.
//
// Hot-plugging a slot takes two steps:
//  1. Add the memory backend (memslot<n>), a QEMU object that allocates the
//     memory.
//  2. Add the DIMM device (dimm<n>) that exposes that memory. dimm<n> is
//     always plugged into memslot<n> with the same n.
//
// Hot-unplugging performs the same steps in reverse order.
func QmpSetMemorySlots(
	ctx context.Context,
	vm *vmv1.VirtualMachine,
	targetCnt int,
	recorder record.EventRecorder,
) (int, error) {
	logger := log.FromContext(ctx)

	mon, err := QmpConnect(QmpAddr(vm))
	if err != nil {
		return -1, err
	}

	setter := &QmpMemorySetter{
		vm:          vm,
		targetCnt:   targetCnt,
		recorder:    recorder,
		log:         logger,
		mon:         mon,
		memBackends: map[int]bool{},
		maxBackend:  0,
		memDevCount: 0,
		errs:        []error{},
	}
	defer setter.Disconnect()

	if err := setter.buildState(); err != nil {
		return -1, err
	}
	return setter.run()
}
// QmpSyncMemoryToTarget makes sure every memory device of the source VM also
// exists on the migration target.
//
// It lists the memory devices on both sides. For each source device that has
// no equal (per DeepEqual) device on the target, it adds the memory backend
// and then the pc-dimm device on the target. If adding the device fails, the
// backend that was just created is removed again on a best-effort basis. The
// first error aborts the sync and is returned.
func QmpSyncMemoryToTarget(vm *vmv1.VirtualMachine, migration *vmv1.VirtualMachineMigration) error {
	sourceDevices, err := QmpQueryMemoryDevices(QmpAddr(vm))
	if err != nil {
		return err
	}

	targetIP, qmpPort := migration.Status.TargetPodIP, vm.Spec.QMP
	targetDevices, err := QmpQueryMemoryDevices(targetIP, qmpPort)
	if err != nil {
		return err
	}

	target, err := QmpConnect(targetIP, qmpPort)
	if err != nil {
		return err
	}
	defer target.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?

	// presentOnTarget reports whether the target already has an equal device.
	presentOnTarget := func(dev QmpMemoryDevice) bool {
		for _, existing := range targetDevices {
			if DeepEqual(dev, existing) {
				return true
			}
		}
		return false
	}

	for _, dev := range sourceDevices {
		if presentOnTarget(dev) {
			continue
		}

		// Create the memdev object first ...
		slot, err := MemslotIdxFromName(dev.Data.Memdev)
		if err != nil {
			return err
		}
		if err := QmpAddMemoryBackend(target, slot, dev.Data.Size); err != nil {
			return err
		}

		// ... then the pc-dimm device on top of it.
		if err := QmpAddMemoryDevice(target, slot); err != nil {
			// device_add failed, so try to remove the object we just created.
			_ = QmpDelMemoryBackend(target, slot)
			return err
		}
	}
	return nil
}
// QmpGetMemorySize asks the VM at ip:port for its memory size summary and
// returns base memory plus plugged memory as a BinarySI quantity.
func QmpGetMemorySize(ip string, port int32) (*resource.Quantity, error) {
	mon, err := QmpConnect(ip, port)
	if err != nil {
		return nil, err
	}
	defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?

	raw, err := mon.Run([]byte(`{"execute": "query-memory-size-summary"}`))
	if err != nil {
		return nil, err
	}

	var summary QmpMemorySize
	if err := json.Unmarshal(raw, &summary); err != nil {
		return nil, fmt.Errorf("error unmarshaling json: %w", err)
	}

	totalBytes := summary.Return.BaseMemory + summary.Return.PluggedMemory
	return resource.NewQuantity(totalBytes, resource.BinarySI), nil
}
// QmpStartMigration configures and starts a live migration of virtualmachine
// as described by virtualmachinemigration.
//
// It connects to the QMP endpoints of the source and target runner pods (IPs
// taken from the migration status, port from the VM spec), applies the same
// migration capabilities and parameters first to the source and then to the
// target, and finally issues "migrate" on the source pointing at the target's
// migration port. When the migration spec allows post-copy,
// "migrate-start-postcopy" is sent to the source right afterwards.
//
// The first failing step aborts the sequence and its error is returned.
func QmpStartMigration(virtualmachine *vmv1.VirtualMachine, virtualmachinemigration *vmv1.VirtualMachineMigration) error {
	// QMP port shared by both runners.
	port := virtualmachine.Spec.QMP
	spec := &virtualmachinemigration.Spec

	// connect dials the QMP endpoint of the runner at ip and performs the
	// monitor's Connect step.
	connect := func(ip string) (*qmp.SocketMonitor, error) {
		mon, err := qmp.NewSocketMonitor("tcp", fmt.Sprintf("%s:%d", ip, port), 2*time.Second)
		if err != nil {
			return nil, err
		}
		if err := mon.Connect(); err != nil {
			return nil, err
		}
		return mon, nil
	}

	// Source runner.
	source, err := connect(virtualmachinemigration.Status.SourcePodIP)
	if err != nil {
		return err
	}
	defer source.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?

	// Target runner.
	targetIP := virtualmachinemigration.Status.TargetPodIP
	target, err := connect(targetIP)
	if err != nil {
		return err
	}
	defer target.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?

	cache := resource.MustParse("256Mi")

	capabilities := []byte(fmt.Sprintf(`{
"execute": "migrate-set-capabilities",
"arguments":
{
"capabilities": [
{"capability": "postcopy-ram", "state": %t},
{"capability": "xbzrle", "state": true},
{"capability": "compress", "state": true},
{"capability": "auto-converge", "state": %t},
{"capability": "zero-blocks", "state": true}
]
}
}`, spec.AllowPostCopy, spec.AutoConverge))

	parameters := []byte(fmt.Sprintf(`{
"execute": "migrate-set-parameters",
"arguments":
{
"xbzrle-cache-size": %d,
"max-bandwidth": %d,
"multifd-compression": "zstd"
}
}`, cache.Value(), spec.MaxBandwidth.Value()))

	// Set up migration on the source runner first, then on the target runner;
	// each receives the capabilities followed by the parameters.
	for _, mon := range []*qmp.SocketMonitor{source, target} {
		if _, err := mon.Run(capabilities); err != nil {
			return err
		}
		if _, err := mon.Run(parameters); err != nil {
			return err
		}
	}

	// Trigger the migration from the source.
	migrate := []byte(fmt.Sprintf(`{
"execute": "migrate",
"arguments":
{
"uri": "tcp:%s:%d",
"inc": %t,
"blk": %t
}
}`, targetIP, vmv1.MigrationPort, spec.Incremental, !spec.Incremental))
	if _, err := source.Run(migrate); err != nil {
		return err
	}

	if spec.AllowPostCopy {
		if _, err := source.Run([]byte(`{"execute": "migrate-start-postcopy"}`)); err != nil {
			return err
		}
	}
	return nil
}
// QmpGetMigrationInfo runs "query-migrate" against the VM at ip:port and
// returns the decoded migration information.
func QmpGetMigrationInfo(ip string, port int32) (*MigrationInfo, error) {
	mon, err := QmpConnect(ip, port)
	if err != nil {
		return nil, err
	}
	defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?

	raw, err := mon.Run([]byte(`{"execute": "query-migrate"}`))
	if err != nil {
		return nil, err
	}

	var reply QmpMigrationInfo
	if err := json.Unmarshal(raw, &reply); err != nil {
		return nil, fmt.Errorf("error unmarshaling json: %w", err)
	}

	info := &reply.Return
	return info, nil
}
// QmpCancelMigration sends "migrate_cancel" to the VM at ip:port and returns
// any connection or command error.
func QmpCancelMigration(ip string, port int32) error {
	mon, err := QmpConnect(ip, port)
	if err != nil {
		return err
	}
	defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?

	_, err = mon.Run([]byte(`{"execute": "migrate_cancel"}`))
	return err
}
// QmpQuit sends the "quit" command to the VM at ip:port and returns any
// connection or command error.
func QmpQuit(ip string, port int32) error {
	mon, err := QmpConnect(ip, port)
	if err != nil {
		return err
	}
	defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?

	_, err = mon.Run([]byte(`{"execute": "quit"}`))
	return err
}
/*
Copyright 2023.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controllers
import (
"context"
"errors"
"fmt"
"math"
"time"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/log"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apiserver/pkg/storage/names"
"k8s.io/client-go/tools/record"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/neonvm/controllers/buildtag"
)
// virtualmachinemigrationFinalizer is the finalizer name associated with
// VirtualMachineMigration objects.
// NOTE(review): presumably added and removed by the reconciler below; the
// code that uses it is outside this view, so confirm there.
const virtualmachinemigrationFinalizer = "vm.neon.tech/finalizer"
// Condition types used when managing the status conditions of a
// VirtualMachineMigration.
const (
	// typeAvailableVirtualMachineMigration is the "Available" condition type.
	// NOTE(review): the scaffolded comment described it as the status of the
	// "Deployment reconciliation"; confirm the intended meaning for migrations.
	typeAvailableVirtualMachineMigration = "Available"
	// typeDegradedVirtualMachineMigration is the "Degraded" condition type,
	// used when the custom resource is deleted and the finalizer operations
	// must still occur.
	typeDegradedVirtualMachineMigration = "Degraded"
)
// VirtualMachineMigrationReconciler reconciles a VirtualMachineMigration object.
//
// It embeds a controller-runtime client.Client, so the client's methods (such
// as Get) are available directly on the reconciler.
type VirtualMachineMigrationReconciler struct {
	client.Client
	Scheme   *runtime.Scheme
	Recorder record.EventRecorder // event recorder available to the reconciler
	Config   *ReconcilerConfig

	Metrics ReconcilerMetrics
}
// The following markers are used to generate the rules permissions (RBAC) on config/rbac using controller-gen
// when controller-gen (used by 'make generate') is executed.
// To know more about markers see: https://book.kubebuilder.io/reference/markers.html
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachinemigrations,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachinemigrations/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachinemigrations/finalizers,verbs=update
//+kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=pods/status,verbs=get;list;watch
// Reconcile is part of the main kubernetes reconciliation loop. Each call
// moves the VirtualMachineMigration named by req one step closer to its
// desired state and then returns; follow-up steps happen in later calls,
// triggered either by the object updates made here or by an explicit requeue.
//
// The steps, in order:
//
//  1. Register the finalizer, or, if the object is being deleted, run the
//     finalizer operations and remove it.
//  2. Look up the VM named in the spec (the migration is marked Failed if the
//     VM does not exist) and make the VM the controller owner of the migration.
//  3. Initialise the status conditions and pick a name for the target pod.
//  4. Act on migration.Status.Phase:
//     - "":        move a Running VM to PreMigrating and the migration to Pending.
//     - Pending:   create the target runner pod, sync CPU/memory to it once it
//     is running, then start the migration and move to Running.
//     - Running:   poll QEMU for migration info; on "completed" switch the VM
//     over to the target pod, on "failed" roll the VM back to Running.
//     - Succeeded: remove the source runner pod.
//     - Failed:    make sure the VM is not left in the Migrating phase.
//
// A non-nil error makes controller-runtime retry the request.
//
// For further info:
// - About Operator Pattern: https://kubernetes.io/docs/concepts/extend-kubernetes/operator/
// - About Controllers: https://kubernetes.io/docs/concepts/architecture/controller/
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.13.0/pkg/reconcile
func (r *VirtualMachineMigrationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	log := log.FromContext(ctx)

	// Fetch the VirtualMachineMigration instance.
	migration := new(vmv1.VirtualMachineMigration)
	if err := r.Get(ctx, req.NamespacedName, migration); err != nil {
		// ignore error and stop reconcile loop if object not found (already deleted?)
		if apierrors.IsNotFound(err) {
			return ctrl.Result{}, nil
		}
		log.Error(err, "Unable to fetch Migration")
		return ctrl.Result{}, err
	}

	// examine DeletionTimestamp to determine if object is under deletion
	if migration.ObjectMeta.DeletionTimestamp.IsZero() {
		// The object is not being deleted, so if it does not have our finalizer,
		// then lets add the finalizer and update the object. This is equivalent
		// registering our finalizer.
		if !controllerutil.ContainsFinalizer(migration, virtualmachinemigrationFinalizer) {
			log.Info("Adding Finalizer to Migration")
			if !controllerutil.AddFinalizer(migration, virtualmachinemigrationFinalizer) {
				return ctrl.Result{}, errors.New("Failed to add finalizer to Migration")
			}
			if err := r.Update(ctx, migration); err != nil {
				return ctrl.Result{}, err
			}
			// stop this reconciliation cycle, new will be triggered as Migration updated
			return ctrl.Result{}, nil
		}
	} else {
		// The object is being deleted
		if controllerutil.ContainsFinalizer(migration, virtualmachinemigrationFinalizer) {
			// our finalizer is present, so lets handle any external dependency
			log.Info("Performing Finalizer Operations for Migration")
			vm := new(vmv1.VirtualMachine)
			err := r.Get(ctx, types.NamespacedName{Name: migration.Spec.VmName, Namespace: migration.Namespace}, vm)
			if err != nil {
				// Best effort: the error is only logged, and the finalizer
				// operations below run against a zero-valued VM.
				log.Error(err, "Failed to get VM", "VmName", migration.Spec.VmName)
			}
			if err := r.doFinalizerOperationsForVirtualMachineMigration(ctx, migration, vm); err != nil {
				// if fail to delete the external dependency here, return with error
				// so that it can be retried
				return ctrl.Result{}, err
			}
			// remove our finalizer from the list and update it.
			log.Info("Removing Finalizer from Migration")
			if !controllerutil.RemoveFinalizer(migration, virtualmachinemigrationFinalizer) {
				return ctrl.Result{}, errors.New("Failed to remove finalizer from Migration")
			}
			if err := r.Update(ctx, migration); err != nil {
				return ctrl.Result{}, err
			}
		}
		// Stop reconciliation as the item is being deleted
		return ctrl.Result{}, nil
	}

	// Fetch the corresponding VirtualMachine instance
	vm := new(vmv1.VirtualMachine)
	err := r.Get(ctx, types.NamespacedName{Name: migration.Spec.VmName, Namespace: migration.Namespace}, vm)
	if err != nil {
		log.Error(err, "Failed to get VM", "VmName", migration.Spec.VmName)
		if apierrors.IsNotFound(err) {
			// stop reconcile loop if vm not found (already deleted?)
			message := fmt.Sprintf("VM (%s) not found", migration.Spec.VmName)
			r.Recorder.Event(migration, "Warning", "Failed", message)
			meta.SetStatusCondition(&migration.Status.Conditions,
				metav1.Condition{Type: typeDegradedVirtualMachineMigration,
					Status:  metav1.ConditionTrue,
					Reason:  "Reconciling",
					Message: message})
			migration.Status.Phase = vmv1.VmmFailed
			return r.updateMigrationStatus(ctx, migration)
		}
		// return err and try reconcile again
		return ctrl.Result{}, err
	}

	// Set owner for VM migration object
	if !metav1.IsControlledBy(migration, vm) {
		log.Info("Set VM as owner for Migration", "vm.Name", vm.Name)
		if err := ctrl.SetControllerReference(vm, migration, r.Scheme); err != nil {
			return ctrl.Result{}, err
		}
		if err := r.Update(ctx, migration); err != nil {
			log.Error(err, "Failed to add owner to Migration")
			return ctrl.Result{}, err
		}
		// stop this reconciliation cycle, new will be triggered as Migration updated
		return ctrl.Result{}, nil
	}

	// MAIN RECONCILE LOOP START

	// Let's check and just set the condition status as Unknown when no status are available
	if len(migration.Status.Conditions) == 0 {
		log.Info("Set initial Unknown condition status")
		meta.SetStatusCondition(&migration.Status.Conditions, metav1.Condition{Type: typeAvailableVirtualMachineMigration, Status: metav1.ConditionUnknown, Reason: "Reconciling", Message: "Starting reconciliation"})
		return r.updateMigrationStatus(ctx, migration)
	}

	// target runner pod details - generate name
	if len(migration.Status.TargetPodName) == 0 {
		targetPodName := names.SimpleNameGenerator.GenerateName(fmt.Sprintf("%s-", vm.Name))
		log.Info("Set Target Pod Name", "TargetPod.Name", targetPodName)
		migration.Status.TargetPodName = targetPodName
		return r.updateMigrationStatus(ctx, migration)
	}

	switch migration.Status.Phase {

	case "":
		// need change VM status asap to prevent autoscaler change CPU/RAM in VM
		// but only if VM running
		if vm.Status.Phase == vmv1.VmRunning {
			vm.Status.Phase = vmv1.VmPreMigrating
			if err := r.Status().Update(ctx, vm); err != nil {
				log.Error(err, "Failed to update VM status to PreMigrating", "Status", vm.Status.Phase)
				return ctrl.Result{}, err
			}
			// Migration just created, change Phase to "Pending"
			migration.Status.Phase = vmv1.VmmPending
			return r.updateMigrationStatus(ctx, migration)
		}
		// some other VM status (Scaling may be), requeue after second
		return ctrl.Result{RequeueAfter: time.Second}, nil

	case vmv1.VmmPending:
		// Check if the target runner pod already exists,
		// if not create a new one using source pod as template
		targetRunner := &corev1.Pod{}
		err := r.Get(ctx, types.NamespacedName{Name: migration.Status.TargetPodName, Namespace: vm.Namespace}, targetRunner)
		if err != nil && apierrors.IsNotFound(err) {
			// NB: .Spec.EnableSSH guaranteed non-nil because the k8s API server sets the default for us.
			enableSSH := *vm.Spec.EnableSSH
			var sshSecret *corev1.Secret
			if enableSSH {
				// We require the SSH secret to exist because we cannot unmount and
				// mount the new secret into the VM after the live migration. If a
				// VM's SSH secret is deleted accidentally then live migration is
				// not possible.
				if len(vm.Status.SSHSecretName) == 0 {
					err := errors.New("VM has .Spec.EnableSSH but its .Status.SSHSecretName is empty")
					log.Error(err, "Failed to get VM's SSH Secret")
					r.Recorder.Event(migration, "Warning", "Failed", err.Error())
					return ctrl.Result{}, err
				}
				sshSecret = &corev1.Secret{}
				err := r.Get(ctx, types.NamespacedName{Name: vm.Status.SSHSecretName, Namespace: vm.Namespace}, sshSecret)
				if err != nil {
					log.Error(err, "Failed to get VM's SSH Secret")
					r.Recorder.Event(migration, "Warning", "Failed", fmt.Sprintf("Failed to get VM's SSH Secret: %v", err))
					return ctrl.Result{}, err
				}
			}

			// Define a new target pod
			tpod, err := r.targetPodForVirtualMachine(vm, migration, sshSecret)
			if err != nil {
				log.Error(err, "Failed to generate Target Pod spec")
				return ctrl.Result{}, err
			}
			log.Info("Creating a Target Pod", "Pod.Namespace", tpod.Namespace, "Pod.Name", tpod.Name)
			if err = r.Create(ctx, tpod); err != nil {
				log.Error(err, "Failed to create Target Pod", "Pod.Namespace", tpod.Namespace, "Pod.Name", tpod.Name)
				return ctrl.Result{}, err
			}
			log.Info("Target runner Pod was created", "Pod.Namespace", tpod.Namespace, "Pod.Name", tpod.Name)
			// add event with some info
			r.Recorder.Event(migration, "Normal", "Created",
				fmt.Sprintf("VM (%s) ready migrate to target pod (%s)",
					vm.Name, tpod.Name))
			// target pod was just created, so requeue reconcile
			return ctrl.Result{RequeueAfter: time.Second}, nil
		} else if err != nil {
			log.Error(err, "Failed to get Target Pod")
			return ctrl.Result{}, err
		}

		// Update the metadata (including "usage" annotation) before anything else, so that it
		// will be correctly set even if the rest of the reconcile operation fails.
		if err := updatePodMetadataIfNecessary(ctx, r.Client, vm, targetRunner); err != nil {
			log.Error(err, "Failed to sync pod labels and annotations", "TargetPod.Name", targetRunner.Name)
		}

		// If not already, set an additional (non-controller) owner reference for the source pod:
		sourceRunner := &corev1.Pod{}
		err = r.Get(ctx, types.NamespacedName{Name: vm.Status.PodName, Namespace: vm.Namespace}, sourceRunner)
		if err != nil {
			log.Error(err, "Failed to get migration source pod")
			return ctrl.Result{}, err
		}
		ownedByMigration := false
		for _, ref := range sourceRunner.OwnerReferences {
			if ref.UID == migration.UID {
				ownedByMigration = true
				break
			}
		}
		if !ownedByMigration {
			if err = controllerutil.SetOwnerReference(migration, sourceRunner, r.Scheme); err != nil {
				log.Error(err, "Failed to set owner reference for source pod")
				return ctrl.Result{}, err
			}
			if err = r.Update(ctx, sourceRunner); err != nil {
				log.Error(err, "Failed to update owner of source runner")
				// Returning the error requeues the request, so that we try again even
				// though we're not an owner of the source runner. (A RequeueAfter set
				// alongside a non-nil error would be ignored.)
				return ctrl.Result{}, err
			}
		}

		// now inspect target pod status and update migration
		switch runnerStatus(targetRunner) {
		case runnerRunning:
			// update migration status
			migration.Status.SourcePodName = vm.Status.PodName
			migration.Status.SourcePodIP = vm.Status.PodIP
			migration.Status.TargetPodIP = targetRunner.Status.PodIP

			// do hotplugCPU in targetRunner before migration
			log.Info("Syncing CPUs in Target runner", "TargetPod.Name", migration.Status.TargetPodName)
			if err := QmpSyncCpuToTarget(vm, migration); err != nil {
				return ctrl.Result{}, err
			}
			log.Info("CPUs in Target runner synced", "TargetPod.Name", migration.Status.TargetPodName)

			// do hotplug Memory in targetRunner -- only needed for dimm slots; virtio-mem Just Works™
			switch *vm.Status.MemoryProvider {
			case vmv1.MemoryProviderVirtioMem:
				// ref "Migration works out of the box" - https://lwn.net/Articles/755423/
				log.Info(
					"No need to sync memory in Target runner because MemoryProvider is VirtioMem",
					"TargetPod.Name", migration.Status.TargetPodName,
				)
			case vmv1.MemoryProviderDIMMSlots:
				log.Info("Syncing Memory in Target runner", "TargetPod.Name", migration.Status.TargetPodName)
				if err := QmpSyncMemoryToTarget(vm, migration); err != nil {
					return ctrl.Result{}, err
				}
				log.Info("Memory in Target runner synced", "TargetPod.Name", migration.Status.TargetPodName)
			default:
				panic(fmt.Errorf("unexpected vm.status.memoryProvider %q", *vm.Status.MemoryProvider))
			}

			// Migrate only running VMs to target with plugged devices
			if vm.Status.Phase == vmv1.VmPreMigrating {
				// update VM status
				vm.Status.Phase = vmv1.VmMigrating
				if err := r.Status().Update(ctx, vm); err != nil {
					log.Error(err, "Failed to update VirtualMachine status to 'Migrating'")
					return ctrl.Result{}, err
				}
				// trigger migration
				if err := QmpStartMigration(vm, migration); err != nil {
					// The VM is already stored as Migrating, so a later reconcile would
					// skip this branch and never retry the start. Persist the Failed
					// phase so that the Failed handler can move the VM back to Running.
					message := fmt.Sprintf("Failed to start migration to target runner (%s): %v", targetRunner.Name, err)
					log.Error(err, "Failed to start migration", "TargetPod.Name", targetRunner.Name)
					r.Recorder.Event(migration, "Warning", "Failed", message)
					meta.SetStatusCondition(&migration.Status.Conditions,
						metav1.Condition{Type: typeDegradedVirtualMachineMigration,
							Status:  metav1.ConditionTrue,
							Reason:  "Reconciling",
							Message: message})
					migration.Status.Phase = vmv1.VmmFailed
					// A failure of the status update is logged by updateMigrationStatus;
					// the original error is returned either way so the request is retried.
					_, _ = r.updateMigrationStatus(ctx, migration)
					return ctrl.Result{}, err
				}
				message := fmt.Sprintf("Migration was started to target runner (%s)", targetRunner.Name)
				log.Info(message)
				r.Recorder.Event(migration, "Normal", "Started", message)
				meta.SetStatusCondition(&migration.Status.Conditions,
					metav1.Condition{Type: typeAvailableVirtualMachineMigration,
						Status:  metav1.ConditionTrue,
						Reason:  "Reconciling",
						Message: message})
				// finally update migration phase to Running
				migration.Status.Phase = vmv1.VmmRunning
				return r.updateMigrationStatus(ctx, migration)
			}
		case runnerSucceeded:
			// target runner pod finished without error? but it shouldn't finish
			message := fmt.Sprintf("Target Pod (%s) completed suddenly", targetRunner.Name)
			log.Info(message)
			r.Recorder.Event(migration, "Warning", "Failed", message)
			meta.SetStatusCondition(&migration.Status.Conditions,
				metav1.Condition{Type: typeDegradedVirtualMachineMigration,
					Status:  metav1.ConditionTrue,
					Reason:  "Reconciling",
					Message: message})
			migration.Status.Phase = vmv1.VmmFailed
			return r.updateMigrationStatus(ctx, migration)
		case runnerFailed:
			message := fmt.Sprintf("Target Pod (%s) failed", targetRunner.Name)
			log.Info(message)
			r.Recorder.Event(migration, "Warning", "Failed", message)
			meta.SetStatusCondition(&migration.Status.Conditions,
				metav1.Condition{Type: typeDegradedVirtualMachineMigration,
					Status:  metav1.ConditionTrue,
					Reason:  "Reconciling",
					Message: message})
			migration.Status.Phase = vmv1.VmmFailed
			return r.updateMigrationStatus(ctx, migration)
		case runnerUnknown:
			message := fmt.Sprintf("Target Pod (%s) in Unknown phase", targetRunner.Name)
			log.Info(message)
			r.Recorder.Event(migration, "Warning", "Unknown", message)
			meta.SetStatusCondition(&migration.Status.Conditions,
				metav1.Condition{Type: typeAvailableVirtualMachineMigration,
					Status:  metav1.ConditionUnknown,
					Reason:  "Reconciling",
					Message: message})
			migration.Status.Phase = vmv1.VmmPending
			return r.updateMigrationStatus(ctx, migration)
		default:
			// not sure what to do, so try requeue
			return ctrl.Result{RequeueAfter: time.Second}, nil
		}

	case vmv1.VmmRunning:
		// retrieve target pod details
		targetRunner := &corev1.Pod{}
		err := r.Get(ctx, types.NamespacedName{Name: migration.Status.TargetPodName, Namespace: migration.Namespace}, targetRunner)
		if err != nil && apierrors.IsNotFound(err) {
			// lost target pod for running Migration ?
			message := fmt.Sprintf("Target Pod (%s) disappeared", migration.Status.TargetPodName)
			// Only "Normal" and "Warning" are valid event types; any other type
			// is rejected by the event recorder and the event would be lost.
			r.Recorder.Event(migration, "Warning", "NotFound", message)
			meta.SetStatusCondition(&migration.Status.Conditions,
				metav1.Condition{Type: typeDegradedVirtualMachineMigration,
					Status:  metav1.ConditionTrue,
					Reason:  "Reconciling",
					Message: message})
			migration.Status.Phase = vmv1.VmmFailed
			return r.updateMigrationStatus(ctx, migration)
		} else if err != nil {
			log.Error(err, "Failed to get target runner Pod")
			return ctrl.Result{}, err
		}

		// Update the metadata (including "usage" annotation) before anything else, so that it
		// will be correctly set even if the rest of the reconcile operation fails.
		if err := updatePodMetadataIfNecessary(ctx, r.Client, vm, targetRunner); err != nil {
			log.Error(err, "Failed to sync pod labels and annotations", "TargetPod.Name", targetRunner.Name)
		}

		// retrieve migration statistics
		migrationInfo, err := QmpGetMigrationInfo(QmpAddr(vm))
		if err != nil {
			log.Error(err, "Failed to get migration info")
			return ctrl.Result{}, err
		}

		// check if migration done
		if migrationInfo.Status == "completed" {
			message := fmt.Sprintf("Migration finished with success to target pod (%s)",
				targetRunner.Name)
			log.Info(message)
			r.Recorder.Event(migration, "Normal", "Finished", message)

			// re-fetch the vm
			err := r.Get(ctx, types.NamespacedName{Name: migration.Spec.VmName, Namespace: migration.Namespace}, vm)
			if err != nil {
				log.Error(err, "Failed to re-fetch VM", "VmName", migration.Spec.VmName)
				return ctrl.Result{}, err
			}

			// Redefine runner Pod for VM
			vm.Status.PodName = migration.Status.TargetPodName
			vm.Status.PodIP = migration.Status.TargetPodIP
			vm.Status.Phase = vmv1.VmRunning
			// update VM status
			if err := r.Status().Update(ctx, vm); err != nil {
				log.Error(err, "Failed to redefine runner pod in VM")
				return ctrl.Result{}, err
			}

			// Redefine ownerRef for the target Pod
			targetRunner.OwnerReferences = []metav1.OwnerReference{}
			if err := ctrl.SetControllerReference(vm, targetRunner, r.Scheme); err != nil {
				return ctrl.Result{}, err
			}
			if err := r.Update(ctx, targetRunner); err != nil {
				log.Error(err, "Failed to update ownerRef for target runner pod")
				return ctrl.Result{}, err
			}

			// Redefine ownerRef for the source Pod
			sourceRunner := &corev1.Pod{}
			err = r.Get(ctx, types.NamespacedName{Name: migration.Status.SourcePodName, Namespace: migration.Namespace}, sourceRunner)
			if err == nil {
				sourceRunner.OwnerReferences = []metav1.OwnerReference{}
				if err := ctrl.SetControllerReference(migration, sourceRunner, r.Scheme); err != nil {
					return ctrl.Result{}, err
				}
				if err := r.Update(ctx, sourceRunner); err != nil {
					log.Error(err, "Failed to update ownerRef for source runner pod")
					return ctrl.Result{}, err
				}
			} else if !apierrors.IsNotFound(err) {
				return ctrl.Result{}, err
			}

			// try to stop hypervisor in source runner if it running still
			// (if the source pod was not found, its phase is empty and we skip)
			if sourceRunner.Status.Phase == corev1.PodRunning {
				if err := QmpQuit(migration.Status.SourcePodIP, vm.Spec.QMP); err != nil {
					log.Error(err, "Failed stop hypervisor in source runner pod")
				} else {
					log.Info("Hypervisor in source runner pod stopped")
				}
			} else {
				log.Info("Skip stopping hypervisor in source runner pod", "pod.Status.Phase", sourceRunner.Status.Phase)
			}

			// finally update migration phase to Succeeded
			migration.Status.Phase = vmv1.VmmSucceeded
			migration.Status.Info.Status = migrationInfo.Status
			return r.updateMigrationStatus(ctx, migration)
		}

		// check if migration failed
		if migrationInfo.Status == "failed" {
			// oops, migration failed
			message := fmt.Sprintf("Migration to target pod (%s) was failed",
				targetRunner.Name)
			log.Info(message)
			r.Recorder.Event(migration, "Warning", "Failed", message)

			// try to stop hypervisor in target runner
			if targetRunner.Status.Phase == corev1.PodRunning {
				if err := QmpQuit(migration.Status.TargetPodIP, vm.Spec.QMP); err != nil {
					log.Error(err, "Failed stop hypervisor in target runner pod")
				} else {
					log.Info("Hypervisor in target runner pod stopped")
				}
			} else {
				log.Info("Skip stopping hypervisor in target runner pod", "pod.Status.Phase", targetRunner.Status.Phase)
			}

			// change VM status to Running
			vm.Status.Phase = vmv1.VmRunning
			if err := r.Status().Update(ctx, vm); err != nil {
				log.Error(err, "Failed to update VM status from Migrating back to Running as Migration was failed")
				return ctrl.Result{}, err
			}

			// finally update migration phase to Failed
			migration.Status.Phase = vmv1.VmmFailed
			migration.Status.Info.Status = migrationInfo.Status
			return r.updateMigrationStatus(ctx, migration)
		}

		// seems migration still going on, just update status with migration progress once per second
		time.Sleep(time.Second)

		// re-retrieve migration statistics
		migrationInfo, err = QmpGetMigrationInfo(QmpAddr(vm))
		if err != nil {
			log.Error(err, "Failed to re-get migration info")
			return ctrl.Result{}, err
		}

		// re-fetch the vm
		err = r.Get(ctx, types.NamespacedName{Name: migration.Spec.VmName, Namespace: migration.Namespace}, vm)
		if err != nil {
			log.Error(err, "Failed to re-fetch VM before Mgration progress update", "VmName", migration.Spec.VmName)
			return ctrl.Result{}, err
		}

		migration.Status.Info.Status = migrationInfo.Status
		migration.Status.Info.TotalTimeMs = migrationInfo.TotalTimeMs
		migration.Status.Info.SetupTimeMs = migrationInfo.SetupTimeMs
		migration.Status.Info.DowntimeMs = migrationInfo.DowntimeMs
		migration.Status.Info.Ram.Transferred = migrationInfo.Ram.Transferred
		migration.Status.Info.Ram.Remaining = migrationInfo.Ram.Remaining
		migration.Status.Info.Ram.Total = migrationInfo.Ram.Total
		migration.Status.Info.Compression.CompressedSize = migrationInfo.Compression.CompressedSize
		migration.Status.Info.Compression.CompressionRate = int64(math.Round(migrationInfo.Compression.CompressionRate))
		return r.updateMigrationStatus(ctx, migration)

	case vmv1.VmmSucceeded:
		// do additional VM status checks
		if vm.Status.Phase == vmv1.VmMigrating {
			// migration Succeeded and VM should have status Running
			vm.Status.Phase = vmv1.VmRunning
			// update VM status
			if err := r.Status().Update(ctx, vm); err != nil {
				log.Error(err, "Failed to update VM status from Migrating to Running as Migration succeeded")
				return ctrl.Result{}, err
			}
		}

		if len(migration.Status.SourcePodName) > 0 {
			// try to find and remove source runner Pod
			sourcePodName := migration.Status.SourcePodName
			sourceRunner := &corev1.Pod{}
			err := r.Get(ctx, types.NamespacedName{Name: sourcePodName, Namespace: migration.Namespace}, sourceRunner)
			if err != nil && !apierrors.IsNotFound(err) {
				log.Error(err, "Failed to get source runner Pod for deletion")
				return ctrl.Result{}, err
			}
			// When the pod is already gone, sourceRunner is still an empty object;
			// sending it to Delete would fail on every retry, so skip the delete.
			sourceRunnerFound := err == nil

			var msg, eventReason string
			if buildtag.NeverDeleteRunnerPods {
				msg = fmt.Sprintf("Source runner pod deletion was skipped due to '%s' build tag", buildtag.TagnameNeverDeleteRunnerPods)
				eventReason = "DeleteSkipped"
			} else if !sourceRunnerFound {
				msg = "Source runner was already deleted"
				eventReason = "Deleted"
			} else {
				// NotFound here means somebody else deleted the pod in the meantime,
				// which is the outcome we want anyway.
				if err := r.Delete(ctx, sourceRunner); err != nil && !apierrors.IsNotFound(err) {
					log.Error(err, "Failed to delete source runner Pod")
					return ctrl.Result{}, err
				}
				msg = "Source runner was deleted"
				eventReason = "Deleted"
			}
			log.Info(msg, "Pod.Namespace", migration.Namespace, "Pod.Name", sourcePodName)
			r.Recorder.Event(migration, "Normal", eventReason, fmt.Sprintf("%s: %s", msg, sourcePodName))
			migration.Status.SourcePodName = ""
			migration.Status.SourcePodIP = ""
			return r.updateMigrationStatus(ctx, migration)
		}

		// all done, stop reconciliation
		return ctrl.Result{}, nil

	case vmv1.VmmFailed:
		// do additional VM status checks
		if vm.Status.Phase == vmv1.VmMigrating {
			// migration Failed and VM should back to Running state
			vm.Status.Phase = vmv1.VmRunning
			if err := r.Status().Update(ctx, vm); err != nil {
				log.Error(err, "Failed to update VM status from Migrating back to Running as Migration was failed")
				return ctrl.Result{}, err
			}
		}
		// all done, stop reconciliation
		return ctrl.Result{}, nil

	default:
		// not sure what to do, so try requeue
		log.Info("Requeuing current request")
		return ctrl.Result{RequeueAfter: time.Second}, nil
	}
	// MAIN RECONCILE LOOP END

	// Reached when the Pending phase finds a running target pod but the VM is
	// not in the PreMigrating phase: nothing to do in this pass.
	return ctrl.Result{}, nil
}
// updateMigrationStatus writes migration's status through the status
// client. A failed update is logged and returned; in both cases the
// returned ctrl.Result is empty.
func (r *VirtualMachineMigrationReconciler) updateMigrationStatus(ctx context.Context, migration *vmv1.VirtualMachineMigration) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	err := r.Status().Update(ctx, migration)
	if err == nil {
		return ctrl.Result{}, nil
	}

	logger.Error(err, "Failed update Migration status")
	return ctrl.Result{}, err
}
// doFinalizerOperationsForVirtualMachineMigration performs the cleanup that
// has to happen before a VirtualMachineMigration is removed.
//
// It only acts when the migration is in the Running phase or the VM is in the
// PreMigrating phase. In that case it:
//   - asks QEMU to cancel the migration (a failure is logged, not returned),
//   - moves a Migrating/PreMigrating VM back to the Running phase,
//   - deletes the target runner pod, if one was named and still exists.
//
// An error is returned when updating the VM status, fetching the target pod
// (other than NotFound) or deleting it fails.
func (r *VirtualMachineMigrationReconciler) doFinalizerOperationsForVirtualMachineMigration(ctx context.Context, migration *vmv1.VirtualMachineMigration, vm *vmv1.VirtualMachine) error {
	logger := log.FromContext(ctx)

	needsCleanup := migration.Status.Phase == vmv1.VmmRunning || vm.Status.Phase == vmv1.VmPreMigrating
	if !needsCleanup {
		return nil
	}

	deletingMsg := fmt.Sprintf("Running Migration (%s) is being deleted", migration.Name)
	logger.Info(deletingMsg)
	r.Recorder.Event(migration, "Warning", "Deleting", deletingMsg)

	// try to cancel migration
	logger.Info("Canceling migration")
	if cancelErr := QmpCancelMigration(QmpAddr(vm)); cancelErr != nil {
		// inform about error but not return error to avoid stuckness in reconciliation cycle
		logger.Error(cancelErr, "Migration canceling failed")
	}

	// migration being deleted and VM should have status Running
	switch vm.Status.Phase {
	case vmv1.VmMigrating, vmv1.VmPreMigrating:
		vm.Status.Phase = vmv1.VmRunning
		if updateErr := r.Status().Update(ctx, vm); updateErr != nil {
			logger.Error(updateErr, "Failed to update VM status from Migrating to Running on Migration deletion")
			return updateErr
		}
	}

	// try to remove target runner pod
	if len(migration.Status.TargetPodName) == 0 {
		return nil
	}

	targetKey := types.NamespacedName{Name: migration.Status.TargetPodName, Namespace: migration.Namespace}
	targetPod := &corev1.Pod{}
	if getErr := r.Get(ctx, targetKey, targetPod); getErr != nil {
		if apierrors.IsNotFound(getErr) {
			// pod already deleted ?
			return nil
		}
		logger.Error(getErr, "Failed to get target runner Pod for deletion")
		return getErr
	}

	// NB: here, we ignore buildtag.NeverDeleteRunnerPods because we delete runner pods on
	// VM object deletion with the tag anyways, so it's more consistent to keep the same
	// behavior for VMMs.
	if deleteErr := r.Delete(ctx, targetPod); deleteErr != nil {
		logger.Error(deleteErr, "Failed to delete target runner Pod")
		return deleteErr
	}

	deletedMsg := fmt.Sprintf("Target runner (%s) was deleted", targetPod.Name)
	logger.Info(deletedMsg)
	r.Recorder.Event(migration, "Normal", "Deleted", deletedMsg)
	return nil
}
// SetupWithManager registers the migration controller with mgr.
//
// The reconciler is wrapped with panic catching and metrics before being
// registered. The controller is set up for VirtualMachineMigration objects
// and for the Pods they own. The wrapped reconciler is returned together
// with any error from completing the registration.
func (r *VirtualMachineMigrationReconciler) SetupWithManager(mgr ctrl.Manager) (ReconcilerWithMetrics, error) {
	controllerName := "virtualmachinemigration"

	wrapped := WithMetrics(
		withCatchPanic(r),
		r.Metrics,
		controllerName,
		r.Config.FailurePendingPeriod,
		r.Config.FailingRefreshInterval,
	)

	options := controller.Options{MaxConcurrentReconciles: r.Config.MaxConcurrentReconciles}
	builder := ctrl.NewControllerManagedBy(mgr).
		For(&vmv1.VirtualMachineMigration{}).
		Owns(&corev1.Pod{}).
		WithOptions(options).
		Named(controllerName)

	setupErr := builder.Complete(wrapped)
	return wrapped, setupErr
}
// targetPodForVirtualMachine builds the Pod that will receive the migrated VM.
//
// The pod is produced by podSpec for the VM's current memory provider and then
// adjusted: it gets the name stored in migration.Status.TargetPodName, the
// RECEIVE_MIGRATION=true environment variable on its first container, an
// optional hostname-level anti-affinity against pods carrying the VM's name
// label (when PreventMigrationToSameHost is set), and the migration as its
// controller owner.
//
// An error is returned when the VM has no memory provider in its status, or
// when podSpec or setting the owner reference fails.
func (r *VirtualMachineMigrationReconciler) targetPodForVirtualMachine(
	vm *vmv1.VirtualMachine,
	migration *vmv1.VirtualMachineMigration,
	sshSecret *corev1.Secret,
) (*corev1.Pod, error) {
	provider := vm.Status.MemoryProvider
	if provider == nil {
		return nil, errors.New("cannot create target pod because vm.status.memoryProvider is not set")
	}

	// TODO: this is technically racy because target pod creation happens before we set the
	// migration source pod, so in between reading this and starting the migration, it's
	// *technically* possible that we create a target pod with a different memory provider than a
	// newer source pod.
	// Given that this requires (a) restart *during* initial live migration, and (b) that restart to
	// change the memory provider, this is low enough risk that it's ok to leave to a follow-up.
	pod, err := podSpec(vm, *provider, sshSecret, r.Config)
	if err != nil {
		return nil, err
	}

	// override pod name
	pod.Name = migration.Status.TargetPodName

	// add env variable to turn on migration receiver
	receiveMigration := corev1.EnvVar{Name: "RECEIVE_MIGRATION", Value: "true"}
	runner := &pod.Spec.Containers[0]
	runner.Env = append(runner.Env, receiveMigration)

	// add podAntiAffinity to schedule target pod to another k8s node
	if migration.Spec.PreventMigrationToSameHost {
		if pod.Spec.Affinity == nil {
			pod.Spec.Affinity = &corev1.Affinity{}
		}
		if pod.Spec.Affinity.PodAntiAffinity == nil {
			pod.Spec.Affinity.PodAntiAffinity = &corev1.PodAntiAffinity{}
		}
		antiAffinity := pod.Spec.Affinity.PodAntiAffinity

		notOnSameHost := corev1.PodAffinityTerm{
			LabelSelector: &metav1.LabelSelector{
				MatchLabels: map[string]string{
					vmv1.VirtualMachineNameLabel: migration.Spec.VmName,
				},
			},
			TopologyKey: "kubernetes.io/hostname",
		}
		// append works on a nil slice too, so no explicit initialisation is needed
		antiAffinity.RequiredDuringSchedulingIgnoredDuringExecution = append(
			antiAffinity.RequiredDuringSchedulingIgnoredDuringExecution,
			notOnSameHost,
		)
	}

	// Set the ownerRef for the Pod
	if ownerErr := ctrl.SetControllerReference(migration, pod, r.Scheme); ownerErr != nil {
		return nil, ownerErr
	}
	return pod, nil
}
package controllers
// Wrapper around the default VirtualMachine/VirtualMachineMigration webhook interfaces so that the
// controller has a bit more control over them, without needing to actually implement that control
// inside of the apis package.
import (
"context"
"fmt"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/webhook"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/util/stack"
)
// validateUpdate runs newObj.ValidateUpdate(oldObj) on behalf of the webhooks.
//
// When newObj's namespaced name is listed in cfg.SkipUpdateValidationFor, a
// validation failure is not fatal: a panic inside the validation is recovered
// and turned into an error, and any resulting error is reported as a Warning
// event plus a log line and then dropped, so the update is let through.
// For every other object the validation result is returned unchanged (and
// panics are not recovered here).
func validateUpdate(
	ctx context.Context,
	cfg *ReconcilerConfig,
	recorder record.EventRecorder,
	oldObj runtime.Object,
	newObj interface {
		webhook.Validator
		metav1.Object
	},
) (admission.Warnings, error) {
	logger := log.FromContext(ctx)

	_, skipValidation := cfg.SkipUpdateValidationFor[client.ObjectKeyFromObject(newObj)]

	runValidation := func() (warnings admission.Warnings, validationErr error) {
		// if we plan to skip validation, catch any panics so that they can be ignored.
		if skipValidation {
			defer func() {
				panicked := recover()
				if panicked == nil {
					return
				}
				validationErr = fmt.Errorf("validation panicked with: %v", panicked)
				trace := stack.GetStackTrace(nil, 1).String()
				logger.Error(validationErr, "webhook update validation panicked", "stack", trace)
			}()
		}
		return newObj.ValidateUpdate(oldObj)
	}

	warnings, err := runValidation()
	if err == nil || !skipValidation {
		return warnings, err
	}

	recorder.Event(
		newObj,
		"Warning",
		"SkippedValidation",
		"Ignoring failed webhook validation because of controller's '--skip-update-validation-for' flag",
	)
	logger.Error(err, "Ignoring failed webhook validation")
	return warnings, nil
}
// VMWebhook is the defaulting and validating webhook for VirtualMachine
// objects. It delegates to the methods defined on the VirtualMachine type.
type VMWebhook struct {
// Recorder is passed to validateUpdate, which uses it to emit an event when a
// failed update validation is ignored.
Recorder record.EventRecorder
// Config is passed to validateUpdate, which reads SkipUpdateValidationFor from it.
Config *ReconcilerConfig
}
// SetupWithManager registers w with mgr as both the defaulter and the
// validator for VirtualMachine objects.
func (w *VMWebhook) SetupWithManager(mgr ctrl.Manager) error {
	builder := ctrl.NewWebhookManagedBy(mgr).For(&vmv1.VirtualMachine{})
	builder = builder.WithDefaulter(w).WithValidator(w)
	return builder.Complete()
}
// Compile-time check that VMWebhook satisfies webhook.CustomDefaulter.
var _ webhook.CustomDefaulter = (*VMWebhook)(nil)

// Default implements webhook.CustomDefaulter by calling the VirtualMachine's
// own Default method. obj must be a *vmv1.VirtualMachine; any other type
// makes the type assertion panic.
func (w *VMWebhook) Default(ctx context.Context, obj runtime.Object) error {
	obj.(*vmv1.VirtualMachine).Default()
	return nil
}
// Compile-time check that VMWebhook satisfies webhook.CustomValidator.
var _ webhook.CustomValidator = (*VMWebhook)(nil)

// ValidateCreate implements webhook.CustomValidator by delegating to the
// VirtualMachine's own ValidateCreate method. obj must be a
// *vmv1.VirtualMachine; any other type makes the type assertion panic.
func (w *VMWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	return obj.(*vmv1.VirtualMachine).ValidateCreate()
}
// ValidateUpdate implements webhook.CustomValidator. The check itself is done
// by validateUpdate, which receives this webhook's Config and Recorder.
// newObj must be a *vmv1.VirtualMachine; any other type makes the type
// assertion panic.
func (w *VMWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
	updated := newObj.(*vmv1.VirtualMachine)
	warnings, err := validateUpdate(ctx, w.Config, w.Recorder, oldObj, updated)
	return warnings, err
}
// ValidateDelete implements webhook.CustomValidator by delegating to the
// VirtualMachine's own ValidateDelete method. obj must be a
// *vmv1.VirtualMachine; any other type makes the type assertion panic.
func (w *VMWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	return obj.(*vmv1.VirtualMachine).ValidateDelete()
}
// VMMigrationWebhook is the defaulting and validating webhook for
// VirtualMachineMigration objects. It delegates to the methods defined on the
// VirtualMachineMigration type.
type VMMigrationWebhook struct {
// Recorder is passed to validateUpdate, which uses it to emit an event when a
// failed update validation is ignored.
Recorder record.EventRecorder
// Config is passed to validateUpdate, which reads SkipUpdateValidationFor from it.
Config *ReconcilerConfig
}
// SetupWithManager registers w with mgr as both the defaulter and the
// validator for VirtualMachineMigration objects.
func (w *VMMigrationWebhook) SetupWithManager(mgr ctrl.Manager) error {
	builder := ctrl.NewWebhookManagedBy(mgr).For(&vmv1.VirtualMachineMigration{})
	builder = builder.WithDefaulter(w).WithValidator(w)
	return builder.Complete()
}
// Compile-time check that VMMigrationWebhook satisfies webhook.CustomDefaulter.
var _ webhook.CustomDefaulter = (*VMMigrationWebhook)(nil)

// Default implements webhook.CustomDefaulter by calling the
// VirtualMachineMigration's own Default method. obj must be a
// *vmv1.VirtualMachineMigration; any other type makes the type assertion
// panic.
func (w *VMMigrationWebhook) Default(ctx context.Context, obj runtime.Object) error {
	vmm := obj.(*vmv1.VirtualMachineMigration)
	vmm.Default()
	return nil
}
// Compile-time check that VMMigrationWebhook satisfies webhook.CustomValidator.
var _ webhook.CustomValidator = (*VMMigrationWebhook)(nil)

// ValidateCreate implements webhook.CustomValidator by delegating to the
// VirtualMachineMigration's own ValidateCreate method. obj must be a
// *vmv1.VirtualMachineMigration; any other type makes the type assertion
// panic.
func (w *VMMigrationWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	vmm := obj.(*vmv1.VirtualMachineMigration)
	return vmm.ValidateCreate()
}
// ValidateUpdate implements webhook.CustomValidator. The check itself is done
// by validateUpdate, which receives this webhook's Config and Recorder.
// newObj must be a *vmv1.VirtualMachineMigration; any other type makes the
// type assertion panic.
func (w *VMMigrationWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
	updated := newObj.(*vmv1.VirtualMachineMigration)
	warnings, err := validateUpdate(ctx, w.Config, w.Recorder, oldObj, updated)
	return warnings, err
}
// ValidateDelete implements webhook.CustomValidator by delegating to the
// VirtualMachineMigration's own ValidateDelete method. obj must be a
// *vmv1.VirtualMachineMigration; any other type makes the type assertion
// panic.
func (w *VMMigrationWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	return obj.(*vmv1.VirtualMachineMigration).ValidateDelete()
}
package main
import (
"errors"
"flag"
"fmt"
"io"
"net/http"
"os"
"strconv"
"strings"
"time"
"go.uber.org/zap"
)
// cpuPeriodMicroseconds is the period, in microseconds, that CPU quota is
// measured against.
//
// The default period is 100000 (i.e. 100 milliseconds). We use 5 milliseconds here because
// running out of quota can result in stalling until the end of the period, and a shorter period
// *generally* helps keep latencies more consistent (at the cost of using more CPU for scheduling).
const cpuPeriodMicroseconds = 5000
// main parses the -addr and -cgroup flags, sets up an info-level production
// logger, and then serves the CPU limit API on the requested address. It exits
// with status 1 when -addr is missing.
func main() {
	var bindAddr, cgroupName string
	flag.StringVar(&bindAddr, "addr", "", `address to bind for HTTP requests`)
	flag.StringVar(&cgroupName, "cgroup", "", `cgroup for CPU limits`)
	flag.Parse()

	if bindAddr == "" {
		fmt.Println("neonvm-daemon missing -addr flag")
		os.Exit(1)
	}

	zapCfg := zap.NewProductionConfig()
	zapCfg.Sampling = nil                 // Disable sampling, which the production config enables by default.
	zapCfg.Level.SetLevel(zap.InfoLevel) // Only "info" level and above (i.e. not debug logs)
	logger := zap.Must(zapCfg.Build()).Named("neonvm-daemon")
	defer logger.Sync() //nolint:errcheck // what are we gonna do, log something about it?

	logger.Info("Starting neonvm-daemon", zap.String("addr", bindAddr), zap.String("cgroup", cgroupName))

	server := cpuServer{cgroup: cgroupName}
	server.run(logger, bindAddr)
}
// cpuServer serves the HTTP API for reading and updating the CPU limit of a cgroup.
type cpuServer struct {
// cgroup is the cgroup name; it is interpolated into the
// /sys/fs/cgroup/<cgroup>/cpu.max path by cpuMaxPath.
cgroup string
}
// run serves the CPU limit API on addr and only returns if the HTTP server stops.
//
// The single endpoint, /cpu, supports:
//   - GET: responds with the current CPU limit in milli-CPUs as a decimal string.
//   - PUT: parses the request body as a decimal uint32 number of milli-CPUs and applies it as
//     the new limit. A body that cannot be read or parsed yields 400; a limit that cannot be
//     applied yields 500.
//
// Any other method yields 404.
func (s *cpuServer) run(logger *zap.Logger, addr string) {
	logger = logger.Named("cpu-srv")

	mux := http.NewServeMux()
	mux.HandleFunc("/cpu", func(w http.ResponseWriter, r *http.Request) {
		switch r.Method {
		case http.MethodGet:
			_ = r.Body.Close()
			cpu, err := s.getCPU(logger)
			if err != nil {
				// getCPU has already logged the cause.
				w.WriteHeader(http.StatusInternalServerError)
				return
			}
			w.WriteHeader(http.StatusOK)
			if _, err := w.Write([]byte(strconv.FormatUint(uint64(cpu), 10))); err != nil {
				logger.Error("could not write response body", zap.Error(err))
			}
		case http.MethodPut:
			body, err := io.ReadAll(r.Body)
			if err != nil {
				logger.Error("could not read request body", zap.Error(err))
				w.WriteHeader(http.StatusBadRequest)
				return
			}

			milliCPU, err := strconv.ParseUint(string(body), 10, 32)
			if err != nil {
				logger.Error("could not parse request body as uint32", zap.Error(err))
				w.WriteHeader(http.StatusBadRequest)
				return
			}

			// Report a failure to apply the limit instead of silently answering 200;
			// setCPU has already logged the cause.
			if err := s.setCPU(logger, uint32(milliCPU)); err != nil {
				w.WriteHeader(http.StatusInternalServerError)
				return
			}
			w.WriteHeader(http.StatusOK)
		default:
			// unknown method
			w.WriteHeader(http.StatusNotFound)
		}
	})

	timeout := 5 * time.Second
	server := http.Server{
		Addr:              addr,
		Handler:           mux,
		ReadTimeout:       timeout,
		ReadHeaderTimeout: timeout,
		WriteTimeout:      timeout,
	}

	err := server.ListenAndServe()
	if err != nil {
		logger.Fatal("CPU server exited with error", zap.Error(err))
	}
	logger.Info("CPU server exited without error")
}
// cpuMaxPath returns the path of the cpu.max file of the configured cgroup.
func (s *cpuServer) cpuMaxPath() string {
	return "/sys/fs/cgroup/" + s.cgroup + "/cpu.max"
}
// setCPU writes a new CPU limit of milliCPU milli-CPUs to the cgroup's cpu.max file, using
// cpuPeriodMicroseconds as the accounting period.
//
// Failures are logged and returned. The file is always closed before returning.
func (s *cpuServer) setCPU(logger *zap.Logger, milliCPU uint32) error {
	path := s.cpuMaxPath()

	// Quota in microseconds per period. Computed in 64 bits so that very large milliCPU
	// values cannot wrap around.
	quota := uint64(milliCPU) * (cpuPeriodMicroseconds / 1000)
	fileContents := fmt.Sprintf("%d %d", quota, cpuPeriodMicroseconds)

	file, err := os.OpenFile(path, os.O_WRONLY, 0)
	if err != nil {
		logger.Error("could not open cgroup cpu.max file for writing", zap.Error(err))
		return err
	}

	_, writeErr := file.WriteString(fileContents)
	// Close unconditionally so the descriptor is not leaked on either path.
	closeErr := file.Close()

	if writeErr != nil {
		logger.Error("could not write to cgroup cpu.max", zap.Error(writeErr))
		return writeErr
	}
	if closeErr != nil {
		logger.Error("could not close cgroup cpu.max", zap.Error(closeErr))
		return closeErr
	}

	return nil
}
// getCPU reads the cgroup's cpu.max file and returns the current CPU limit, measured in
// milli-CPUs. Read and parse failures are logged and returned. A cgroup without a limit
// ("max") is reported as 0.
func (s *cpuServer) getCPU(logger *zap.Logger) (uint32, error) {
	raw, readErr := os.ReadFile(s.cpuMaxPath())
	if readErr != nil {
		logger.Error("could not read cgroup cpu.max", zap.Error(readErr))
		return 0, readErr
	}

	limit, parseErr := parseCgroupCPUMax(string(raw))
	if parseErr != nil {
		logger.Error("could not parse cgroup cpu.max", zap.Error(parseErr))
		return 0, parseErr
	}

	if limit.quota == nil {
		// "0" isn't quite correct here (maybe it should be 1<<32 - 1), but zero is a more typical
		// sentinel value, and will still produce the same results.
		return 0, nil
	}

	milliCPU := 1000 * (*limit.quota) / limit.period
	return uint32(milliCPU), nil
}
// cpuMax is the parsed content of a cgroup v2 cpu.max file.
type cpuMax struct {
	// quota is the number of microseconds of CPU time allowed per period, or nil when the file
	// contains the literal "max" (no limit).
	quota *uint64
	// period is the accounting period in microseconds; parseCgroupCPUMax guarantees it is non-zero.
	period uint64
}

// parseCgroupCPUMax parses the contents of a cgroup cpu.max file.
//
// The contents of cpu.max are "$MAX $PERIOD", where:
//   - $MAX is either a number of microseconds or the literal string "max" (meaning no limit), and
//   - $PERIOD is a number of microseconds over which to account $MAX
//
// Fields may be separated and surrounded by arbitrary whitespace. An error is returned when there
// are not exactly two fields, when a field is not a valid unsigned integer, or when the period is
// zero (which callers would otherwise divide by).
func parseCgroupCPUMax(data string) (*cpuMax, error) {
	arr := strings.Fields(data)
	if len(arr) != 2 {
		return nil, errors.New("unexpected contents of cgroup cpu.max")
	}

	var quota *uint64
	if arr[0] != "max" {
		q, err := strconv.ParseUint(arr[0], 10, 64)
		if err != nil {
			return nil, fmt.Errorf("could not parse cpu quota: %w", err)
		}
		quota = &q
	}

	period, err := strconv.ParseUint(arr[1], 10, 64)
	if err != nil {
		return nil, fmt.Errorf("could not parse cpu period: %w", err)
	}
	if period == 0 {
		return nil, errors.New("could not parse cpu period: period must not be zero")
	}

	return &cpuMax{quota: quota, period: period}, nil
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"net/http"
"os"
"os/signal"
"strings"
"syscall"
"time"
"github.com/go-logr/zapr"
"github.com/tychoish/fun/srv"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/manager"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/client-go/kubernetes"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
// to ensure that exec-entrypoint and run can make use of them.
_ "k8s.io/client-go/plugin/pkg/client/auth"
"k8s.io/client-go/rest"
"k8s.io/klog/v2"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/neonvm/controllers"
"github.com/neondatabase/autoscaling/pkg/util"
)
var (
// scheme is the runtime scheme handed to the controller manager; init registers types into it.
scheme = runtime.NewScheme()
// setupLog is the logger used while setting up and running the manager.
setupLog = ctrl.Log.WithName("setup")
)
// init registers the client-go built-in types and the NeonVM v1 API types with scheme.
// Each registration is wrapped in utilruntime.Must, so a failure aborts start-up.
func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
utilruntime.Must(vmv1.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme
}
// run starts mgr and blocks until it stops, returning any error from starting the pprof
// service or from the manager itself.
//
// The context passed to the manager is cancelled on os.Interrupt or SIGTERM. Before the manager
// starts, a pprof HTTP service on 0.0.0.0:7777 is added to the srv orchestrator stored in that
// context.
func run(mgr manager.Manager) error {
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
defer cancel()
ctx = srv.SetShutdownSignal(ctx)
ctx = srv.SetBaseContext(ctx)
ctx = srv.WithOrchestrator(ctx)
orca := srv.GetOrchestrator(ctx)
// On return, wait for the orchestrator before logging that we are exiting; a failure to
// shut it down is only logged, not returned.
defer func() {
if err := orca.Wait(); err != nil {
setupLog.Error(err, "failed to shut down orchestrator")
}
setupLog.Info("main loop returned, exiting")
}()
if err := orca.Add(srv.HTTP("pprof", time.Second, util.MakePPROF("0.0.0.0:7777"))); err != nil {
return fmt.Errorf("failed to add pprof service: %w", err)
}
setupLog.Info("starting manager")
if err := mgr.Start(ctx); err != nil {
return fmt.Errorf("problem running manager: %w", err)
}
return nil
}
// main is the entry point of the NeonVM controller manager.
//
// It parses the command-line flags, configures logging, creates the controller-runtime manager,
// registers the VirtualMachine and VirtualMachineMigration reconcilers and webhooks, the health
// and readiness checks, the debug server and the failing-reconciliation refresher, and finally
// runs the manager until it stops. Any setup failure is logged and terminates the process with
// exit status 1.
func main() {
	var metricsAddr string
	var enableLeaderElection bool
	var probeAddr string
	var concurrencyLimit int
	var skipUpdateValidationFor map[types.NamespacedName]struct{}
	var enableContainerMgr bool
	var disableRunnerCgroup bool
	var qemuDiskCacheSettings string
	var defaultMemoryProvider vmv1.MemoryProvider
	var memhpAutoMovableRatio string
	var failurePendingPeriod time.Duration
	var failingRefreshInterval time.Duration
	flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
	flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
	flag.BoolVar(&enableLeaderElection, "leader-elect", false,
		"Enable leader election for controller manager. "+
			"Enabling this will ensure there is only one active controller manager.")
	flag.IntVar(&concurrencyLimit, "concurrency-limit", 1, "Maximum number of concurrent reconcile operations")
	flag.Func(
		"skip-update-validation-for",
		"Comma-separated list of object names to skip webhook validation, like 'foo' or 'default/bar'",
		func(value string) error {
			objSet := make(map[types.NamespacedName]struct{})

			if value != "" {
				for _, name := range strings.Split(value, ",") {
					if name == "" {
						return errors.New("name must not be empty")
					}

					// A bare name belongs to the "default" namespace; "namespace/name" is split
					// at the first slash.
					namespacedName := types.NamespacedName{Namespace: "default", Name: name}
					if namespace, objName, found := strings.Cut(name, "/"); found {
						namespacedName = types.NamespacedName{Namespace: namespace, Name: objName}
					}

					objSet[namespacedName] = struct{}{}
				}
			}

			skipUpdateValidationFor = objSet
			return nil
		},
	)
	// note: cannot have both -enable-container-mgr and -disable-runner-cgroup.
	flag.BoolVar(&enableContainerMgr, "enable-container-mgr", false, "Enable crictl-based container-mgr alongside each VM")
	flag.BoolVar(&disableRunnerCgroup, "disable-runner-cgroup", false, "Disable creation of a cgroup in neonvm-runner for fractional CPU limiting")
	flag.StringVar(&qemuDiskCacheSettings, "qemu-disk-cache-settings", "cache=none", "Set neonvm-runner's QEMU disk cache settings")
	flag.Func("default-memory-provider", "Set default memory provider to use for new VMs", defaultMemoryProvider.FlagFunc)
	flag.StringVar(&memhpAutoMovableRatio, "memhp-auto-movable-ratio", "301", "For virtio-mem, set VM kernel's memory_hotplug.auto_movable_ratio")
	flag.DurationVar(&failurePendingPeriod, "failure-pending-period", 1*time.Minute,
		"the period for the propagation of reconciliation failures to the observability instruments")
	flag.DurationVar(&failingRefreshInterval, "failing-refresh-interval", 1*time.Minute,
		"the interval between consecutive updates of metrics and logs, related to failing reconciliations")
	flag.Parse()

	if defaultMemoryProvider == "" {
		fmt.Fprintln(os.Stderr, "missing required flag '-default-memory-provider'")
		os.Exit(1)
	}
	if disableRunnerCgroup && enableContainerMgr {
		fmt.Fprintln(os.Stderr, "Cannot have both '-enable-container-mgr' and '-disable-runner-cgroup'")
		os.Exit(1)
	}

	logConfig := zap.NewProductionConfig()
	logConfig.Sampling = nil // Disabling sampling; it's enabled by default for zap's production configs.
	logConfig.Level.SetLevel(zap.InfoLevel)
	logConfig.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder
	logger := zapr.NewLogger(zap.Must(logConfig.Build(zap.AddStacktrace(zapcore.PanicLevel))))

	ctrl.SetLogger(logger)
	// define klog settings (used in LeaderElector)
	klog.SetLogger(logger.V(2))

	// tune k8s client for manager
	cfg := ctrl.GetConfigOrDie()
	cfg.QPS = 1000
	cfg.Burst = 2000

	// fetch node info to determine if we're running in k3s
	isK3s, err := checkIfRunningInK3sCluster(cfg)
	if err != nil {
		setupLog.Error(err, "unable to check if running in k3s")
		os.Exit(1)
	}

	mgr, err := ctrl.NewManager(cfg, ctrl.Options{
		Scheme: scheme,
		Metrics: metricsserver.Options{
			BindAddress: metricsAddr,
		},
		HealthProbeBindAddress: probeAddr,
		LeaderElection:         enableLeaderElection,
		LeaderElectionID:       "a3b22509.neon.tech",
		// LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily
		// when the Manager ends. This requires the binary to immediately end when the
		// Manager is stopped, otherwise, this setting is unsafe. Setting this significantly
		// speeds up voluntary leader transitions as the new leader don't have to wait
		// LeaseDuration time first.
		//
		// In the default scaffold provided, the program ends immediately after
		// the manager stops, so would be fine to enable this option. However,
		// if you are doing or is intended to do any operation such as perform cleanups
		// after the manager stops then its usage might be unsafe.
		// LeaderElectionReleaseOnCancel: true,
	})
	if err != nil {
		setupLog.Error(err, "unable to start manager")
		os.Exit(1)
	}

	reconcilerMetrics := controllers.MakeReconcilerMetrics()

	rc := &controllers.ReconcilerConfig{
		IsK3s:                   isK3s,
		UseContainerMgr:         enableContainerMgr,
		DisableRunnerCgroup:     disableRunnerCgroup,
		MaxConcurrentReconciles: concurrencyLimit,
		SkipUpdateValidationFor: skipUpdateValidationFor,
		QEMUDiskCacheSettings:   qemuDiskCacheSettings,
		DefaultMemoryProvider:   defaultMemoryProvider,
		MemhpAutoMovableRatio:   memhpAutoMovableRatio,
		FailurePendingPeriod:    failurePendingPeriod,
		FailingRefreshInterval:  failingRefreshInterval,
	}

	vmReconciler := &controllers.VMReconciler{
		Client:   mgr.GetClient(),
		Scheme:   mgr.GetScheme(),
		Recorder: mgr.GetEventRecorderFor("virtualmachine-controller"),
		Config:   rc,
		Metrics:  reconcilerMetrics,
	}
	vmReconcilerMetrics, err := vmReconciler.SetupWithManager(mgr)
	if err != nil {
		setupLog.Error(err, "unable to create controller", "controller", "VirtualMachine")
		os.Exit(1)
	}
	vmWebhook := &controllers.VMWebhook{
		Recorder: mgr.GetEventRecorderFor("virtualmachine-webhook"),
		Config:   rc,
	}
	if err := vmWebhook.SetupWithManager(mgr); err != nil {
		setupLog.Error(err, "unable to create webhook", "webhook", "VirtualMachine")
		os.Exit(1)
	}

	migrationReconciler := &controllers.VirtualMachineMigrationReconciler{
		Client:   mgr.GetClient(),
		Scheme:   mgr.GetScheme(),
		Recorder: mgr.GetEventRecorderFor("virtualmachinemigration-controller"),
		Config:   rc,
		Metrics:  reconcilerMetrics,
	}
	migrationReconcilerMetrics, err := migrationReconciler.SetupWithManager(mgr)
	if err != nil {
		setupLog.Error(err, "unable to create controller", "controller", "VirtualMachineMigration")
		os.Exit(1)
	}
	migrationWebhook := &controllers.VMMigrationWebhook{
		Recorder: mgr.GetEventRecorderFor("virtualmachinemigration-webhook"),
		Config:   rc,
	}
	if err := migrationWebhook.SetupWithManager(mgr); err != nil {
		setupLog.Error(err, "unable to create webhook", "webhook", "VirtualMachineMigration")
		os.Exit(1)
	}

	//+kubebuilder:scaffold:builder

	if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
		setupLog.Error(err, "unable to set up health check")
		os.Exit(1)
	}
	if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
		setupLog.Error(err, "unable to set up ready check")
		os.Exit(1)
	}

	dbgSrv := debugServerFunc(vmReconcilerMetrics, migrationReconcilerMetrics)
	if err := mgr.Add(dbgSrv); err != nil {
		setupLog.Error(err, "unable to set up debug server")
		os.Exit(1)
	}

	if err := mgr.Add(vmReconcilerMetrics.FailingRefresher()); err != nil {
		setupLog.Error(err, "unable to set up failing refresher")
		os.Exit(1)
	}

	if err := run(mgr); err != nil {
		setupLog.Error(err, "run manager error")
		os.Exit(1)
	}
}
// checkIfRunningInK3sCluster lists the cluster's nodes (with a 10 second timeout) and reports
// whether any node's OS image name starts with "K3s".
func checkIfRunningInK3sCluster(cfg *rest.Config) (bool, error) {
	clientset, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		return false, fmt.Errorf("failed to create new k8s client: %w", err)
	}

	listCtx, cancelList := context.WithTimeout(context.TODO(), 10*time.Second)
	defer cancelList()

	nodeList, err := clientset.CoreV1().Nodes().List(listCtx, metav1.ListOptions{})
	if err != nil {
		return false, err
	}

	for idx := range nodeList.Items {
		osImage := nodeList.Items[idx].Status.NodeInfo.OSImage
		if strings.HasPrefix(osImage, "K3s") {
			return true, nil
		}
	}
	return false, nil
}
// debugServerFunc returns a manager runnable that serves a debug HTTP endpoint on 0.0.0.0:7778.
//
// A GET request to "/" responds with a JSON array holding one snapshot per reconciler, in the
// order the reconcilers were passed. Other methods get 405.
//
// The server is shut down when the runnable's context is cancelled. That graceful shutdown is
// reported as success (nil); any other server error is returned.
func debugServerFunc(reconcilers ...controllers.ReconcilerWithMetrics) manager.RunnableFunc {
	return manager.RunnableFunc(func(ctx context.Context) error {
		mux := http.NewServeMux()
		mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
			defer r.Body.Close()

			if r.Method != http.MethodGet {
				w.WriteHeader(http.StatusMethodNotAllowed)
				_, _ = w.Write([]byte(fmt.Sprintf("request method must be %s", http.MethodGet)))
				return
			}

			response := make([]controllers.ReconcileSnapshot, 0, len(reconcilers))
			for _, rec := range reconcilers {
				response = append(response, rec.Snapshot())
			}

			responseBody, err := json.Marshal(&response)
			if err != nil {
				w.WriteHeader(http.StatusInternalServerError)
				_, _ = w.Write([]byte(fmt.Sprintf("failed to marshal JSON response: %s", err)))
				return
			}

			w.WriteHeader(http.StatusOK)
			_, _ = w.Write(responseBody)
		})

		server := &http.Server{
			Addr:    "0.0.0.0:7778",
			Handler: mux,
		}

		ctx, cancel := context.WithCancel(ctx)
		defer cancel()

		// Stop the server once the context is done (either cancelled by the manager or by the
		// deferred cancel above when ListenAndServe fails).
		go func() {
			<-ctx.Done()
			_ = server.Shutdown(context.TODO())
		}()

		// ListenAndServe always returns a non-nil error; http.ErrServerClosed only means that
		// Shutdown was called, which is the expected way for this runnable to end.
		if err := server.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
			return err
		}
		return nil
	})
}
package main
import (
"context"
"flag"
"fmt"
"os"
"sync"
"time"
"go.uber.org/zap/zapcore"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"k8s.io/klog/v2"
"github.com/neondatabase/autoscaling/neonvm/pkg/ipam"
)
var (
// Command-line flags selecting the Network Attachment Definition that is passed to ipam.New.
nadName = flag.String("nad-name", "ipam-demo", "Network Attachment Definition name")
nadNs = flag.String("nad-namespace", "default", "Network Attachment Definition namespace")
// demoLoggerName is the name given to the demo's logger.
demoLoggerName = "ipam-demo"
// demoNamespace is the namespace passed to AcquireIP and ReleaseIP for every demo ID.
demoNamespace = "default"
// demoCount is the number of IPs that are acquired and then released.
demoCount = 10
)
// main runs the IPAM demo: it builds an IPAM object for the configured Network Attachment
// Definition, acquires demoCount IPs concurrently (starting one goroutine every 200ms), waits
// for all of them, and then releases the same IDs the same way, logging each outcome.
func main() {
	zapOpts := zap.Options{ //nolint:exhaustruct // typical options struct; not all fields expected to be filled.
		Development:     true,
		StacktraceLevel: zapcore.Level(zapcore.PanicLevel),
		TimeEncoder:     zapcore.ISO8601TimeEncoder,
	}
	zapOpts.BindFlags(flag.CommandLine)
	flag.Parse()

	// logger for the demo; also used by klog (used in LeaderElector)
	logger := zap.New(zap.UseFlagOptions(&zapOpts)).WithName(demoLoggerName)
	klog.SetLogger(logger.V(2))

	// context carrying the logger
	ctx := log.IntoContext(context.Background(), logger)

	// IPAM object used for all operations below
	allocator, err := ipam.New(ctx, *nadName, *nadNs)
	if err != nil {
		logger.Error(err, "failed to create IPAM")
		os.Exit(1)
	}
	defer allocator.Close()

	var wg sync.WaitGroup

	// acquire IPs in parallel
	for n := 1; n <= demoCount; n++ {
		wg.Add(1)
		go func(seq int) {
			defer wg.Done()
			began := time.Now()
			id := fmt.Sprintf("demo-ipam-%d", seq)
			logger.Info("try to lease", "id", id)
			ip, err := allocator.AcquireIP(ctx, id, demoNamespace)
			if err != nil {
				logger.Error(err, "lease failed", "id", id)
				return
			}
			logger.Info("acquired", "id", id, "ip", ip.String(), "acquired in", time.Since(began))
		}(n)
		time.Sleep(time.Millisecond * 200)
	}
	wg.Wait()

	// release IPs in parallel
	for n := 1; n <= demoCount; n++ {
		wg.Add(1)
		go func(seq int) {
			defer wg.Done()
			began := time.Now()
			id := fmt.Sprintf("demo-ipam-%d", seq)
			logger.Info("try to release", "id", id)
			ip, err := allocator.ReleaseIP(ctx, id, demoNamespace)
			if err != nil {
				logger.Error(err, "release failed", "id", id)
				return
			}
			logger.Info("released", "id", id, "ip", ip.String(), "released in", time.Since(began))
		}(n)
		time.Sleep(time.Millisecond * 200)
	}
	wg.Wait()
}
package ipam
import (
"context"
"fmt"
"net"
whereaboutsallocate "github.com/k8snetworkplumbingwg/whereabouts/pkg/allocate"
whereaboutslogging "github.com/k8snetworkplumbingwg/whereabouts/pkg/logging"
whereaboutstypes "github.com/k8snetworkplumbingwg/whereabouts/pkg/types"
)
// doAcquire returns the IP reserved for the VM vmNamespace/vmName in ipRange.
//
// If reservation already contains an entry for the VM, that IP is returned and reservation is
// returned unchanged. Otherwise a new IP is assigned by whereabouts and the updated reservation
// list is returned. The returned net.IPNet carries the mask of ipRange.Range.
//
// An error is returned when ipRange.Range is not a valid CIDR or when no IP could be assigned.
func doAcquire(
	_ context.Context,
	ipRange RangeConfiguration,
	reservation []whereaboutstypes.IPReservation,
	vmName string,
	vmNamespace string,
) (net.IPNet, []whereaboutstypes.IPReservation, error) {
	// reduce whereabouts logging
	whereaboutslogging.SetLogLevel("error")

	vmID := fmt.Sprintf("%s/%s", vmNamespace, vmName)
	_, ipnet, err := net.ParseCIDR(ipRange.Range)
	if err != nil {
		// without this check a malformed range would dereference a nil ipnet below
		return net.IPNet{}, nil, fmt.Errorf("invalid CIDR %q: %w", ipRange.Range, err)
	}

	// check if IP reserved for VM already
	foundidx := getMatchingIPReservationIndex(reservation, vmID)
	if foundidx >= 0 {
		return net.IPNet{IP: reservation[foundidx].IP, Mask: ipnet.Mask}, reservation, nil
	}

	// try to reserve new IP for given VM
	ip, newReservation, err := whereaboutsallocate.IterateForAssignment(*ipnet,
		ipRange.RangeStart, ipRange.RangeEnd,
		reservation, ipRange.OmitRanges, vmID, "")
	if err != nil {
		return net.IPNet{}, nil, err
	}

	return net.IPNet{IP: ip, Mask: ipnet.Mask}, newReservation, nil
}
// doRelease removes the reservation of the VM vmNamespace/vmName from reservation.
//
// On success it returns the released IP (with the mask of ipRange.Range) and the updated
// reservation list. An error is returned when ipRange.Range is not a valid CIDR or when
// whereabouts could not deallocate an IP for the VM.
func doRelease(
	_ context.Context,
	ipRange RangeConfiguration,
	reservation []whereaboutstypes.IPReservation,
	vmName string,
	vmNamespace string,
) (net.IPNet, []whereaboutstypes.IPReservation, error) {
	// reduce whereabouts logging
	whereaboutslogging.SetLogLevel("error")

	vmID := fmt.Sprintf("%s/%s", vmNamespace, vmName)
	_, ipnet, err := net.ParseCIDR(ipRange.Range)
	if err != nil {
		// without this check a malformed range would dereference a nil ipnet below
		return net.IPNet{}, nil, fmt.Errorf("invalid CIDR %q: %w", ipRange.Range, err)
	}

	// try to release IP for given VM
	newReservation, ip, err := whereaboutsallocate.IterateForDeallocation(reservation, vmID, getMatchingIPReservationIndex)
	if err != nil {
		return net.IPNet{}, nil, err
	}

	return net.IPNet{IP: ip, Mask: ipnet.Mask}, newReservation, nil
}
// getMatchingIPReservationIndex returns the index of the first reservation whose ContainerID
// equals id, or -1 when there is none.
func getMatchingIPReservationIndex(reservation []whereaboutstypes.IPReservation, id string) int {
	for idx := range reservation {
		if reservation[idx].ContainerID == id {
			return idx
		}
	}
	return -1
}
package ipam
import (
nad "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/clientset/versioned"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
neonvm "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
)
// Client is the set of Kubernetes clients built from one rest.Config by NewKubeClient.
type Client struct {
// kubeClient is the core Kubernetes clientset.
kubeClient kubernetes.Interface
// vmClient is the NeonVM clientset.
vmClient neonvm.Interface
// nadClient is the Network Attachment Definition clientset.
nadClient nad.Interface
}
// NewKubeClient builds the core Kubernetes, NeonVM and Network Attachment Definition clientsets
// from cfg. It returns the first construction error encountered, unwrapped.
func NewKubeClient(cfg *rest.Config) (*Client, error) {
	var (
		clients Client
		err     error
	)

	if clients.kubeClient, err = kubernetes.NewForConfig(cfg); err != nil {
		return nil, err
	}
	if clients.vmClient, err = neonvm.NewForConfig(cfg); err != nil {
		return nil, err
	}
	if clients.nadClient, err = nad.NewForConfig(cfg); err != nil {
		return nil, err
	}

	return &clients, nil
}
package ipam
import (
"context"
"encoding/json"
"errors"
"fmt"
"net"
"strconv"
"strings"
"sync"
"time"
whereaboutsallocate "github.com/k8snetworkplumbingwg/whereabouts/pkg/allocate"
whereaboutstypes "github.com/k8snetworkplumbingwg/whereabouts/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client/config"
"sigs.k8s.io/controller-runtime/pkg/log"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/leaderelection"
"k8s.io/client-go/tools/leaderelection/resourcelock"
neonvmapiv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
neonvm "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
)
const (
// Acquire is the operation identifier for acquiring an IP.
Acquire = 0
// Release is the operation identifier for releasing an IP.
Release = 1
// UnnamedNetwork is the NetworkName value of an IPAM config without a network name;
// pool names of such networks carry no name prefix.
UnnamedNetwork string = ""
// Kubernetes client-go rate limiter settings, applied to the rest.Config in New.
// https://pkg.go.dev/k8s.io/client-go@v0.27.2/rest#Config
KubernetesClientQPS = 100
KubernetesClientBurst = 200
// IpamRequestTimeout is the timeout for IPAM queries. The leader election that wraps them
// is given three times this value.
IpamRequestTimeout = 10 * time.Second
// DatastoreRetries defines how many retries are attempted when reading/updating the IP Pool
DatastoreRetries = 5
// DatastoreRetriesDelay is the pause between two such retries.
DatastoreRetriesDelay = 100 * time.Millisecond
// Leader election timings, in milliseconds.
DefaultLeaderLeaseDurationMs = 3000
DefaultLeaderRenewDeadlineMs = 2500
DefaultLeaderRetryPeriodMs = 2000
)
// Temporary is implemented by errors that can report whether they are temporary.
// runIPAM retries a pool read or update when the error implements Temporary and
// Temporary() returns true.
type Temporary interface {
Temporary() bool
}
// IPAM acquires and releases IP addresses for VMs, storing the allocations in IPPool resources.
type IPAM struct {
// Client is the embedded set of Kubernetes clients used for all API requests.
Client
// Config is the IPAM configuration parsed from the Network Attachment Definition.
Config IPAMConfig
}
// AcquireIP acquires an IP for the VM vmNamespace/vmName by running acquireORrelease
// with the Acquire action.
func (i *IPAM) AcquireIP(ctx context.Context, vmName string, vmNamespace string) (net.IPNet, error) {
return i.acquireORrelease(ctx, vmName, vmNamespace, Acquire)
}
// ReleaseIP releases the IP of the VM vmNamespace/vmName by running acquireORrelease
// with the Release action.
func (i *IPAM) ReleaseIP(ctx context.Context, vmName string, vmNamespace string) (net.IPNet, error) {
return i.acquireORrelease(ctx, vmName, vmNamespace, Release)
}
// New returns a new IPAM object holding the IPAM config and the k8s/crd clients.
//
// The Kubernetes client configuration is discovered via controller-runtime and tuned with the
// package's QPS/burst settings. The IPAM config is parsed from the config of the
// NetworkAttachmentDefinition nadName in namespace nadNamespace.
func New(ctx context.Context, nadName string, nadNamespace string) (*IPAM, error) {
	// Kubernetes client config, with tuned client performance
	restCfg, err := config.GetConfig()
	if err != nil {
		return nil, fmt.Errorf("error building kubernetes configuration: %w", err)
	}
	restCfg.QPS = KubernetesClientQPS
	restCfg.Burst = KubernetesClientBurst

	clients, err := NewKubeClient(restCfg)
	if err != nil {
		return nil, fmt.Errorf("error creating kubernetes client: %w", err)
	}

	// fetch the network-attachment-definition from Kubernetes
	nadObj, err := clients.nadClient.K8sCniCncfIoV1().NetworkAttachmentDefinitions(nadNamespace).Get(ctx, nadName, metav1.GetOptions{})
	if err != nil {
		return nil, err
	}
	if len(nadObj.Spec.Config) == 0 {
		return nil, fmt.Errorf("network-attachment-definition %s hasn't IPAM config section", nadObj.Name)
	}

	parsedConfig, err := LoadFromNad(nadObj.Spec.Config, nadNamespace)
	if err != nil {
		return nil, fmt.Errorf("network-attachment-definition IPAM config parse error: %w", err)
	}

	result := &IPAM{
		Client: *clients,
		Config: *parsedConfig,
	}
	return result, nil
}
// LoadFromNad parses the JSON config of a Network Attachment Definition and returns its IPAM
// config, with NetworkNamespace set to nadNamespace.
//
// An old-style single range (range / range_start / range_end / exclude at the top level) is
// converted into the first entry of IPRanges and the old-style fields are cleared. Every range
// is validated: its CIDR must parse (and is normalized to the network address form), a missing
// RangeStart defaults to the first IP of the CIDR, RangeStart and RangeEnd must lie inside the
// CIDR, and every excluded range must be a valid CIDR.
func LoadFromNad(nadConfig string, nadNamespace string) (*IPAMConfig, error) {
	var n Nad
	if err := json.Unmarshal([]byte(nadConfig), &n); err != nil {
		return nil, fmt.Errorf("json parsing error: %w", err)
	}

	if n.IPAM == nil {
		return nil, errors.New("missing 'ipam' key")
	}

	// process old-style Range to Ranges array
	if n.IPAM.Range != "" {
		oldRange := RangeConfiguration{
			OmitRanges: n.IPAM.OmitRanges,
			Range:      n.IPAM.Range,
			RangeStart: n.IPAM.RangeStart,
			RangeEnd:   n.IPAM.RangeEnd,
		}
		n.IPAM.IPRanges = append([]RangeConfiguration{oldRange}, n.IPAM.IPRanges...)
	}

	// check IP ranges
	for idx := range n.IPAM.IPRanges {
		ipRange := &n.IPAM.IPRanges[idx]

		firstip, ipNet, err := net.ParseCIDR(ipRange.Range)
		if err != nil {
			return nil, fmt.Errorf("invalid CIDR %s: %w", ipRange.Range, err)
		}
		ipRange.Range = ipNet.String()

		if ipRange.RangeStart == nil {
			firstip = net.ParseIP(firstip.Mask(ipNet.Mask).String()) // get real first IP from cidr
			ipRange.RangeStart = firstip
		}
		if ipRange.RangeStart != nil && !ipNet.Contains(ipRange.RangeStart) {
			return nil, fmt.Errorf("range_start IP %s not in IP Range %s",
				ipRange.RangeStart.String(),
				ipRange.Range)
		}
		if ipRange.RangeEnd != nil && !ipNet.Contains(ipRange.RangeEnd) {
			return nil, fmt.Errorf("range_end IP %s not in IP Range %s",
				ipRange.RangeEnd.String(),
				ipRange.Range)
		}

		// check excluded IP ranges; this has to happen per range, because the old-style
		// top-level list has been moved into the range above and is cleared below
		for _, omit := range ipRange.OmitRanges {
			if _, _, err := net.ParseCIDR(omit); err != nil {
				return nil, fmt.Errorf("invalid exclude CIDR %s: %w", omit, err)
			}
		}
	}

	// delete old style settings
	n.IPAM.OmitRanges = nil
	n.IPAM.Range = ""
	n.IPAM.RangeStart = nil
	n.IPAM.RangeEnd = nil

	// set network namespace
	n.IPAM.NetworkNamespace = nadNamespace

	return n.IPAM, nil
}
// acquireORrelease performs the IPAM action (Acquire or Release) for the VM vmNamespace/vmName
// under leader election, to avoid duplicate allocations.
//
// A lease named "neonvmipam" in the network namespace serves as the lock. Once this call becomes
// leader it runs runIPAM, then cancels the election (releasing the lease) and returns runIPAM's
// result. If leadership is not obtained and the work not finished within 3*IpamRequestTimeout
// (or ctx is cancelled first), a timeout error is returned.
func (i *IPAM) acquireORrelease(ctx context.Context, vmName string, vmNamespace string, action int) (net.IPNet, error) {
	// ip and ipamerr are written by the OnStartedLeading callback before it closes done, and
	// must only be read after done has been observed closed.
	var ip net.IPNet
	var ipamerr error

	leOverallTimeout := IpamRequestTimeout * 3
	lockName := "neonvmipam"
	lockIdentity := fmt.Sprintf("%s/%s", vmNamespace, vmName)

	// define resource lock
	lock := &resourcelock.LeaseLock{
		LeaseMeta: metav1.ObjectMeta{
			Name:      lockName,
			Namespace: i.Config.NetworkNamespace,
		},
		Client: i.kubeClient.CoordinationV1(),
		LockConfig: resourcelock.ResourceLockConfig{
			Identity: lockIdentity,
		},
	}

	done := make(chan struct{})

	// define leader elector
	le, err := leaderelection.NewLeaderElector(leaderelection.LeaderElectionConfig{
		Lock:            lock,
		ReleaseOnCancel: true,
		LeaseDuration:   time.Millisecond * time.Duration(DefaultLeaderLeaseDurationMs),
		RenewDeadline:   time.Millisecond * time.Duration(DefaultLeaderRenewDeadlineMs),
		RetryPeriod:     time.Millisecond * time.Duration(DefaultLeaderRetryPeriodMs),
		Callbacks: leaderelection.LeaderCallbacks{
			OnStartedLeading: func(c context.Context) {
				ip, ipamerr = i.runIPAM(ctx, vmName, vmNamespace, action)
				close(done)
				<-c.Done()
			},
			OnStoppedLeading: func() {
				// Nothing to do. In particular this must not touch variables of the enclosing
				// function: it runs on the elector's goroutine.
			},
		},
	})
	if err != nil {
		return ip, err
	}

	// context with timeout for leader elector
	leCtx, leCancel := context.WithTimeout(ctx, leOverallTimeout)
	defer leCancel()

	// run election in background
	var wg sync.WaitGroup
	wg.Add(1)
	go func() { defer wg.Done(); le.Run(leCtx) }()

	// wait until job was done and then cancel election context
	// or exit with error when context got timeout
	select {
	case <-done:
		leCancel()
	case <-leCtx.Done():
		// The callback may still be running, so ip/ipamerr must not be read here.
		return net.IPNet{}, errors.New("context got timeout while waiting to become leader")
	}

	wg.Wait()

	// Prefer the actual IPAM error over the generic one below.
	if ipamerr != nil {
		return ip, ipamerr
	}

	// ip.String() returns string "<nil>" on errors in ip struct parsing or if *ip is nil
	if ip.String() == "<nil>" {
		return ip, errors.New("something wrong, probably with leader election")
	}

	return ip, nil
}
// runIPAM performs the IPAM action (Acquire or Release) for the VM vmNamespace/vmName against
// the configured IP ranges, trying them in order until one yields an IP.
//
// Any other action value is rejected with an error. All Kubernetes requests share one context
// limited to IpamRequestTimeout. For each range, reading and updating the pool is attempted up
// to DatastoreRetries times; errors that report themselves as Temporary cause a retry after
// DatastoreRetriesDelay, other errors abort immediately. A failed acquire/release in one range
// moves on to the next range. If no IP could be acquired from any range an error is returned;
// otherwise the IP is returned together with the last acquire/release error, if any.
func (i *IPAM) runIPAM(ctx context.Context, vmName string, vmNamespace string, action int) (net.IPNet, error) {
var ip net.IPNet
var ipamerr error
// check action: only Acquire and Release are supported
switch action {
case Acquire, Release:
default:
return ip, fmt.Errorf("got an unknown action: %v", action)
}
ctxWithTimeout, ctxCancel := context.WithTimeout(ctx, IpamRequestTimeout)
defer ctxCancel()
// Check connectivity to kubernetes
if err := i.Status(ctxWithTimeout); err != nil {
return ip, fmt.Errorf("connectivity error: %w", err)
}
// handle the ip add/del until successful
for _, ipRange := range i.Config.IPRanges {
// retry loop used to retry CRUD operations against Kubernetes
// if we meet some issue then just do another attempt
RETRY:
for retry := 0; retry < DatastoreRetries; retry++ {
select {
case <-ctx.Done():
// the caller's context is done: stop retrying this range
break RETRY
default:
// stay in the retry loop as long as the context is not cancelled
}
// read IPPool from the ippools.vm.neon.tech custom resource
pool, err := i.getNeonvmIPPool(ctxWithTimeout, ipRange.Range)
if err != nil {
if e, ok := err.(Temporary); ok && e.Temporary() {
// retry attempt to read IPPool
time.Sleep(DatastoreRetriesDelay)
continue
}
return ip, fmt.Errorf("error reading IP pool: %w", err)
}
currentReservation := pool.Allocations(ctx)
var newReservation []whereaboutstypes.IPReservation
switch action {
case Acquire:
ip, newReservation, ipamerr = doAcquire(ctx, ipRange, currentReservation, vmName, vmNamespace)
if ipamerr != nil {
// no space in the pool ? try another pool
break RETRY
}
case Release:
ip, newReservation, ipamerr = doRelease(ctx, ipRange, currentReservation, vmName, vmNamespace)
if ipamerr != nil {
// not found in the pool ? try another pool
break RETRY
}
}
// update IPPool with newReservation
err = pool.Update(ctxWithTimeout, newReservation)
if err != nil {
if e, ok := err.(Temporary); ok && e.Temporary() {
// retry attempt to update IPPool
time.Sleep(DatastoreRetriesDelay)
continue
}
return ip, fmt.Errorf("error updating IP pool: %w", err)
}
// pool was read, acquire or release was processed, pool was updated
// now we can break retry loop
break
}
// break ipRanges loop if ip was acquired/released
if ip.IP != nil {
break
}
}
if ip.IP == nil && action == Acquire {
return ip, errors.New("can not acquire IP, probably there are no space in IP pools")
}
return ip, ipamerr
}
// Status checks NeonVM client connectivity by performing a List() request for the IPPools in
// the configured network namespace. It returns the error of that request, if any.
func (i *IPAM) Status(ctx context.Context) error {
_, err := i.vmClient.NeonvmV1().IPPools(i.Config.NetworkNamespace).List(ctx, metav1.ListOptions{})
return err
}
// Close currently does nothing and always returns nil.
//
// TODO: think about what, if anything, needs to be released here.
func (i *IPAM) Close() error {
return nil
}
// NeonvmIPPool represents an IPPool resource and its parsed set of allocations
type NeonvmIPPool struct {
// vmClient is the NeonVM clientset used to write the pool back.
vmClient neonvm.Interface
// pool is the IPPool object as retrieved from the API.
pool *neonvmapiv1.IPPool
// firstip is the IP parsed from the pool's range; allocation offsets are relative to it.
firstip net.IP
}
// Allocations returns the initially retrieved set of allocations for this pool, converted to
// whereabouts IP reservations. ctx is only used to obtain the logger for conversion errors.
func (p *NeonvmIPPool) Allocations(ctx context.Context) []whereaboutstypes.IPReservation {
return toIPReservation(ctx, p.pool.Spec.Allocations, p.firstip)
}
// getNeonvmIPPool returns the NeonVM IPPool for the given IP range.
//
// If the pool does not exist yet it is created, and a temporary error is returned so that the
// caller retries and then reads the freshly created pool. Losing the creation race against
// another writer (AlreadyExists) is reported as a temporary error as well.
func (i *IPAM) getNeonvmIPPool(ctx context.Context, ipRange string) (*NeonvmIPPool, error) {
	// for IP range 10.11.22.0/24 pool name will be
	// "10.11.22.0-24" if no network name in ipam spec, or
	// "samplenet-10.11.22.0-24" if network name is `samplenet`
	poolName := strings.ReplaceAll(ipRange, "/", "-")
	if i.Config.NetworkName != UnnamedNetwork {
		poolName = fmt.Sprintf("%s-%s", i.Config.NetworkName, poolName)
	}

	pool, err := i.vmClient.NeonvmV1().IPPools(i.Config.NetworkNamespace).Get(ctx, poolName, metav1.GetOptions{})
	if err != nil {
		if !apierrors.IsNotFound(err) {
			return nil, err
		}

		// pool does not exist, create it
		freshPool := &neonvmapiv1.IPPool{
			ObjectMeta: metav1.ObjectMeta{
				Name:      poolName,
				Namespace: i.Config.NetworkNamespace,
			},
			Spec: neonvmapiv1.IPPoolSpec{
				Range:       ipRange,
				Allocations: make(map[string]neonvmapiv1.IPAllocation),
			},
		}
		_, createErr := i.vmClient.NeonvmV1().IPPools(i.Config.NetworkNamespace).Create(ctx, freshPool, metav1.CreateOptions{})
		switch {
		case createErr == nil:
			// if the pool was created for the first time, trigger another retry of the allocation loop
			return nil, &temporaryError{errors.New("NeonvmIPPool was initialized")}
		case apierrors.IsAlreadyExists(createErr):
			// the pool was just created -- allow retry
			return nil, &temporaryError{createErr}
		default:
			return nil, createErr
		}
	}

	// get first IP in the pool
	firstIP, _, err := net.ParseCIDR(pool.Spec.Range)
	if err != nil {
		return nil, err
	}

	result := &NeonvmIPPool{
		vmClient: i.Client.vmClient,
		pool:     pool,
		firstip:  firstIP,
	}
	return result, nil
}
// Update replaces the pool's allocations with the given reservations and writes the IPPool
// back through the API.
//
// A conflict reported by the API is wrapped in a temporaryError so that the caller can retry;
// any other error is returned as-is.
func (p *NeonvmIPPool) Update(ctx context.Context, reservation []whereaboutstypes.IPReservation) error {
	p.pool.Spec.Allocations = toAllocations(reservation, p.firstip)

	_, updateErr := p.vmClient.NeonvmV1().IPPools(p.pool.Namespace).Update(ctx, p.pool, metav1.UpdateOptions{})
	switch {
	case updateErr == nil:
		return nil
	case apierrors.IsConflict(updateErr):
		return &temporaryError{updateErr}
	default:
		return updateErr
	}
}
// toIPReservation converts the offset-keyed allocation map of an IPPool into a list of IP
// reservations, resolving each offset against firstip.
//
// Taken from the whereabouts code, as it is not exported there.
func toIPReservation(ctx context.Context, allocations map[string]neonvmapiv1.IPAllocation, firstip net.IP) []whereaboutstypes.IPReservation {
	logger := log.FromContext(ctx)

	reservations := make([]whereaboutstypes.IPReservation, 0, len(allocations))
	for key, alloc := range allocations {
		offset, parseErr := strconv.ParseInt(key, 10, 64)
		if parseErr != nil {
			// Keys that are not valid int64s are ignored. toAllocations should be the only
			// writer of offsets, via `fmt.Sprintf("%d", ...)`.
			logger.Error(parseErr, "error decoding ip offset")
			continue
		}

		reservations = append(reservations, whereaboutstypes.IPReservation{
			IP:          whereaboutsallocate.IPAddOffset(firstip, uint64(offset)),
			ContainerID: alloc.ContainerID,
			PodRef:      alloc.PodRef,
			IsAllocated: false,
		})
	}
	return reservations
}
// toAllocations converts a list of IP reservations into the offset-keyed allocation map stored
// in an IPPool. Each key is the decimal offset of the reserved IP from firstip.
//
// Taken from the whereabouts code, as it is not exported there.
func toAllocations(reservelist []whereaboutstypes.IPReservation, firstip net.IP) map[string]neonvmapiv1.IPAllocation {
	result := make(map[string]neonvmapiv1.IPAllocation, len(reservelist))
	for idx := range reservelist {
		entry := &reservelist[idx]
		key := fmt.Sprintf("%d", whereaboutsallocate.IPGetOffset(entry.IP, firstip))
		result[key] = neonvmapiv1.IPAllocation{ContainerID: entry.ContainerID, PodRef: entry.PodRef}
	}
	return result
}
package ipam
import (
"net"
cnitypes "github.com/containernetworking/cni/pkg/types"
)
// temporaryError wraps an error to mark it as transient: callers that look for a
// `Temporary() bool` method can use it to decide that the failed operation may be retried.
type temporaryError struct {
	error
}

// Temporary reports that the wrapped error is transient. It always returns true.
func (t *temporaryError) Temporary() bool {
	return true
}

// Unwrap returns the wrapped error.
//
// Embedding the `error` interface only promotes Error(), so without this method errors.Is and
// errors.As could not see through the wrapper to the underlying cause.
func (t *temporaryError) Unwrap() error {
	return t.error
}
// RangeConfiguration describes one IP range entry of the IPAM configuration (it is the element
// type of IPAMConfig.IPRanges).
type RangeConfiguration struct {
	// OmitRanges is read from the "exclude" key — presumably ranges that must not be allocated
	// from; TODO confirm against the allocator.
	OmitRanges []string `json:"exclude,omitempty"`
	// Range is read from the "range" key — presumably in CIDR notation; TODO confirm.
	Range string `json:"range"`
	// RangeStart and RangeEnd are optional bounds read from "range_start" / "range_end".
	RangeStart net.IP `json:"range_start,omitempty"`
	RangeEnd net.IP `json:"range_end,omitempty"`
}
// Nad is a JSON document that carries an IPAM configuration under its "ipam" key.
//
// NOTE(review): the name suggests a NetworkAttachmentDefinition config — confirm against callers.
type Nad struct {
	IPAM *IPAMConfig `json:"ipam"`
}
// IPAMConfig describes the expected json configuration for this plugin
type IPAMConfig struct {
	Routes []*cnitypes.Route `json:"routes"`
	// IPRanges lists per-range configurations; the top-level Range/RangeStart/RangeEnd/OmitRanges
	// fields below have the same shape as one RangeConfiguration entry.
	IPRanges []RangeConfiguration `json:"ipRanges"`
	OmitRanges []string `json:"exclude,omitempty"`
	DNS cnitypes.DNS `json:"dns"`
	Range string `json:"range"`
	RangeStart net.IP `json:"range_start,omitempty"`
	RangeEnd net.IP `json:"range_end,omitempty"`
	// NetworkNamespace is the namespace in which IPPool objects are looked up and created.
	// It has no json tag — presumably it is filled in by code rather than read from the config;
	// TODO confirm.
	NetworkNamespace string
	// NetworkName, when set to something other than UnnamedNetwork, is used as a prefix for
	// IPPool names.
	NetworkName string `json:"network_name,omitempty"`
}
package main
// crictl abstraction and commands
import (
"encoding/json"
"fmt"
"os"
"os/exec"
"strconv"
"go.uber.org/zap"
)
// Crictl is a thin wrapper around the crictl command-line tool.
type Crictl struct {
	// endpoint is passed to every crictl invocation as the value of --runtime-endpoint.
	endpoint string
}
// Pods runs 'crictl pods -o json' and returns the parsed output on success.
//
// This command lists all running pods.
func (c *Crictl) Pods(logger *zap.Logger) (*CrictlPods, error) {
	result := new(CrictlPods)
	err := c.run(logger, result, "pods", "-o", "json")
	if err != nil {
		return nil, err
	}
	return result, nil
}
// CrictlPods represents the JSON output of 'crictl pods', limited to the subset we care about.
type CrictlPods struct {
	Items []CrictlPod `json:"items"`
}

// CrictlPod is a single entry of CrictlPods.
type CrictlPod struct {
	// ID is the pod's identifier as reported by crictl; it is what Ps expects as podID.
	ID string `json:"id"`
	Metadata CrictlPodMetadata `json:"metadata"`
}

// CrictlPodMetadata is the subset of a pod's metadata that we read.
type CrictlPodMetadata struct {
	// UID is compared against the Kubernetes pod UID to find our own pod.
	UID string `json:"uid"`
}
// Ps runs 'crictl ps -p <podID> -o json' and returns the parsed output on success.
//
// This command lists the containers in the pod.
func (c *Crictl) Ps(logger *zap.Logger, podID string) (*CrictlContainers, error) {
	result := new(CrictlContainers)
	err := c.run(logger, result, "ps", "-p", podID, "-o", "json")
	if err != nil {
		return nil, err
	}
	return result, nil
}
// CrictlContainers represents the JSON output of 'crictl ps', limited to the subset we care about.
type CrictlContainers struct {
	Containers []CrictlContainer `json:"containers"`
}

// CrictlContainer is a single entry of CrictlContainers.
type CrictlContainer struct {
	// ID is the container's identifier as reported by crictl; it is what Update and Inspect
	// expect as containerID.
	ID string `json:"id"`
	Metadata CrictlContainerMetadata `json:"metadata"`
}

// CrictlContainerMetadata is the subset of a container's metadata that we read.
type CrictlContainerMetadata struct {
	Name string `json:"name"`
}
// Update runs 'crictl update' for the given container, setting its CPU shares, quota and period
// to the provided values. The command's output is not parsed.
func (c *Crictl) Update(logger *zap.Logger, containerID string, values CrictlContainerUpdate) error {
	args := []string{
		"update",
		"--cpu-share", strconv.Itoa(values.cpuShares),
		"--cpu-quota", strconv.FormatInt(values.cpuQuota, 10),
		"--cpu-period", strconv.FormatInt(values.cpuPeriod, 10),
		containerID,
	}
	return c.run(logger, nil, args...)
}
// CrictlContainerUpdate holds the values passed to 'crictl update' by Crictl.Update.
type CrictlContainerUpdate struct {
	cpuShares int // value for --cpu-share
	cpuQuota int64 // value for --cpu-quota
	cpuPeriod int64 // value for --cpu-period
}
// Inspect runs 'crictl inspect <containerID>' and returns the parsed output on success.
func (c *Crictl) Inspect(logger *zap.Logger, containerID string) (*CrictlContainerInspect, error) {
	result := new(CrictlContainerInspect)
	err := c.run(logger, result, "inspect", containerID)
	if err != nil {
		return nil, err
	}
	return result, nil
}
// CrictlContainerInspect represents the JSON output of 'crictl inspect', limited to the subset we
// care about.
//
// The nested types below mirror the JSON path info.runtimeSpec.linux.resources.cpu.
type CrictlContainerInspect struct {
	Info CrictlContainerInfo `json:"info"`
}

// CrictlContainerInfo is the "info" object of the inspect output.
type CrictlContainerInfo struct {
	RuntimeSpec CrictlContainerRuntimeSpec `json:"runtimeSpec"`
}

// CrictlContainerRuntimeSpec is the "runtimeSpec" object of the inspect output.
type CrictlContainerRuntimeSpec struct {
	Linux CrictlContainerRuntimeSpecLinux `json:"linux"`
}

// CrictlContainerRuntimeSpecLinux is the "linux" object of the runtime spec.
type CrictlContainerRuntimeSpecLinux struct {
	Resources CrictlContainerResources `json:"resources"`
}

// CrictlContainerResources is the "resources" object of the runtime spec.
type CrictlContainerResources struct {
	CPU CrictlContainerResourcesCPU `json:"cpu"`
}

// CrictlContainerResourcesCPU holds the container's CPU settings.
//
// NOTE(review): Quota is unsigned here; if the runtime can report a negative quota (e.g. -1 for
// "unlimited"), JSON decoding of the inspect output would fail — confirm.
type CrictlContainerResourcesCPU struct {
	Period uint64 `json:"period"`
	Quota uint64 `json:"quota"`
	Shares uint64 `json:"shares"`
}
// run invokes /usr/bin/crictl with '--runtime-endpoint <endpoint>' followed by args.
//
// The command's stderr is forwarded to /dev/stderr. If output is nil the command's stdout is
// discarded; otherwise stdout is parsed as JSON into output, which must be a pointer suitable
// for json.Unmarshal.
//
// An error is returned if /dev/stderr cannot be opened, if the command fails, or if its output
// cannot be parsed.
func (c *Crictl) run(logger *zap.Logger, output any, args ...string) error {
	actualArgs := []string{
		"--runtime-endpoint",
		c.endpoint,
	}
	actualArgs = append(actualArgs, args...)

	logger.Info("running crictl", zap.Strings("args", actualArgs))

	cmd := exec.Command("/usr/bin/crictl", actualArgs...)

	stderr, err := os.OpenFile("/dev/stderr", os.O_RDWR, 0 /* unused */)
	if err != nil {
		// Report the failure to the caller rather than panicking: every caller already handles
		// errors from crictl invocations.
		return fmt.Errorf("failed to open /dev/stderr: %w", err)
	}
	// A new handle is opened on every call, so it must be released once the command is done.
	// Otherwise each crictl invocation leaks a file descriptor.
	defer stderr.Close()
	cmd.Stderr = stderr

	if output == nil {
		if err := cmd.Run(); err != nil {
			return fmt.Errorf("failed to run command: %w", err)
		}
		return nil
	}

	out, err := cmd.Output()
	if err != nil {
		return fmt.Errorf("failed to run command: %w", err)
	}
	if err := json.Unmarshal(out, output); err != nil {
		return fmt.Errorf("error parsing JSON: %w", err)
	}
	return nil
}
package main
// container-mgr ('neonvm-container-runner') runs in a container alongside neonvm-runner, and is
// responsible for updating the CPU shares & quotas for neonvm-runner by communicating directly with
// containerd (via CRI API, over containerd's socket).
//
// We have to go this back-channel route because kubernetes <1.27 doesn't support updating container
// resources without restarting, and with cgroups v2, kubernetes 1.25+ uses cgroup namespaces which
// prevents our ability to interact with cgroups inside the container *except by* going through the
// containerd API.
//
// We use a separate container to limit possibilities for privilege escalation.
import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"io"
"net/http"
"os"
"sync/atomic"
"time"
"go.uber.org/zap"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
)
const (
	// runnerContainerName is the name of the container (within our own pod) whose CPU settings
	// we manage.
	runnerContainerName = "neonvm-runner"
	// retryInitEvery is how long we wait between attempts to find the runner container on startup.
	retryInitEvery = 5 * time.Second
	// cpuLimitOvercommitFactor sets the amount above the VM's spec.guest.cpus.use that we set the
	// QEMU cgroup's CPU limit to. e.g. if cpuLimitOvercommitFactor = 3 and the VM is using 0.5
	// CPUs, we set the cgroup to limit QEMU+VM to 1.5 CPUs.
	//
	// This exists because setting the cgroup exactly equal to the VM's CPU value is overly
	// pessimistic, results in a lot of unused capacity on the host, and particularly impacts
	// operations that parallelize between the VM and QEMU, like heavy disk access.
	//
	// See also: https://neondb.slack.com/archives/C03TN5G758R/p1693462680623239
	cpuLimitOvercommitFactor = 4
	// number of CPU shares per vCPU to use when updating a container
	sharesPerCPU = 1024
	// cpuPeriod is the value passed as --cpu-period when updating a container; the CPU quota is
	// computed relative to it.
	cpuPeriod = 100000
)
// main is the entry point of container-mgr.
//
// It reads K8S_POD_UID and CRI_ENDPOINT from the environment and the 'port' and
// 'init-milli-cpu' flags from the command line, finds the CRI IDs of its own pod and of the
// runner container inside it, applies the initial CPU setting to that container, and then
// serves the CPU HTTP endpoints. Any failure during setup is fatal.
func main() {
	logger := zap.Must(zap.NewProduction()).Named("neonvm-container-mgr")
	selfPodUID, ok := os.LookupEnv("K8S_POD_UID")
	if !ok {
		logger.Fatal("environment variable K8S_POD_UID missing")
	}
	logger.Info("Got pod UID", zap.String("uid", selfPodUID))
	containerRuntimeEndpoint, ok := os.LookupEnv("CRI_ENDPOINT")
	if !ok {
		logger.Fatal("environment variable CRI_ENDPOINT missing")
	}
	logger.Info("Got CRI endpoint", zap.String("endpoint", containerRuntimeEndpoint))
	crictl := &Crictl{
		endpoint: containerRuntimeEndpoint,
	}
	// Both flags default to -1 so that we can tell "not provided" apart from a real value.
	var httpPort int
	var initMilliCPU int
	flag.IntVar(&httpPort, "port", -1, "Port for the CPU http server")
	flag.IntVar(&initMilliCPU, "init-milli-cpu", -1, "Initial milli-CPU to use for the VM")
	flag.Parse()
	if httpPort < 0 {
		logger.Fatal("missing 'port' flag")
	} else if initMilliCPU < 0 {
		logger.Fatal("missing 'init-milli-cpu' flag")
	}
	pods, err := crictl.Pods(logger)
	if err != nil {
		logger.Fatal("failed to run crictl command to get CRI ID for pod UID", zap.String("uid", selfPodUID), zap.Error(err))
	}
	// find pod with matching uid
	var criPodID string
	for _, p := range pods.Items {
		if p.Metadata.UID == selfPodUID {
			criPodID = p.ID
			break
		}
	}
	if criPodID == "" {
		logger.Fatal("could not find CRI pod with matching UID", zap.String("uid", selfPodUID))
	}
	logger.Info("Got CRI ID for pod", zap.String("podID", criPodID))
	// Find the runner container within our pod. A missing container is not fatal: we keep
	// retrying every retryInitEvery until it shows up. Only a failing crictl call is fatal.
	var criRunnerContainerID string
	for criRunnerContainerID == "" {
		containers, err := crictl.Ps(logger, criPodID)
		if err != nil {
			logger.Fatal(
				"failed to run crictl command to get CRI ID for container",
				zap.String("podID", criPodID),
				zap.String("name", runnerContainerName),
				zap.Error(err),
			)
		}
		for _, c := range containers.Containers {
			if c.Metadata.Name == runnerContainerName {
				criRunnerContainerID = c.ID
				break
			}
		}
		if criRunnerContainerID == "" {
			logger.Error(
				"could not find CRI container with matching name",
				zap.String("podID", criPodID),
				zap.String("name", runnerContainerName),
			)
			time.Sleep(retryInitEvery)
		}
	}
	logger.Info(
		fmt.Sprintf("Got CRI ID for %s container", runnerContainerName),
		zap.String("containerID", criRunnerContainerID),
	)
	// Set the CPU to initMilliCPU:
	err = updateContainerCPU(logger, crictl, criRunnerContainerID, vmv1.MilliCPU(initMilliCPU))
	if err != nil {
		logger.Fatal("could not set initial runner container CPU", zap.Error(err))
	}
	srvState := cpuServerState{
		podID: criPodID,
		containerID: criRunnerContainerID,
		lastMilliCPU: atomic.Uint32{},
	}
	// Remember what we just applied, so /cpu_current can report it exactly.
	srvState.lastMilliCPU.Store(uint32(initMilliCPU))
	// Blocks until the HTTP server exits.
	srvState.listenForCPUChanges(context.TODO(), logger, crictl, int32(httpPort))
}
// cpuServerState is the state shared by the HTTP handlers of the CPU server.
type cpuServerState struct {
	// podID is the CRI ID of our own pod.
	podID string
	// containerID is the CRI ID of the runner container whose CPU settings we update and inspect.
	containerID string
	// lastMilliCPU is the milli-CPU value most recently applied to the container. It lets
	// handleCPUCurrent report the exact value despite the rounding in the shares conversion.
	lastMilliCPU atomic.Uint32
}
// listenForCPUChanges serves the CPU HTTP API on 0.0.0.0:<port> and blocks until the server
// stops or ctx is cancelled.
//
// Endpoints:
//   - /healthz     always responds with 200
//   - /cpu_change  handled by handleCPUChange
//   - /cpu_current handled by handleCPUCurrent
//
// If the server exits with an error other than http.ErrServerClosed, the process is terminated
// via logger.Fatal. When ctx is cancelled the server is shut down gracefully.
func (s *cpuServerState) listenForCPUChanges(ctx context.Context, logger *zap.Logger, crictl *Crictl, port int32) {
	mux := http.NewServeMux()
	loggerHandlers := logger.Named("http-handlers")
	cpuChangeLogger := loggerHandlers.Named("cpu_change")
	mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
		_ = r.Body.Close()
		w.WriteHeader(200)
	})
	mux.HandleFunc("/cpu_change", func(w http.ResponseWriter, r *http.Request) {
		s.handleCPUChange(cpuChangeLogger, crictl, w, r)
	})
	cpuCurrentLogger := loggerHandlers.Named("cpu_current")
	mux.HandleFunc("/cpu_current", func(w http.ResponseWriter, r *http.Request) {
		s.handleCPUCurrent(cpuCurrentLogger, crictl, w, r)
	})
	server := http.Server{
		Addr:              fmt.Sprintf("0.0.0.0:%d", port),
		Handler:           mux,
		ReadTimeout:       5 * time.Second,
		ReadHeaderTimeout: 5 * time.Second,
		WriteTimeout:      5 * time.Second,
	}

	// Buffered so the serving goroutine can always deliver its result and exit. With an
	// unbuffered channel it would block forever on the send after the ctx.Done() branch below
	// was taken, because nobody receives from the channel anymore.
	errChan := make(chan error, 1)
	go func() {
		errChan <- server.ListenAndServe()
	}()

	select {
	case err := <-errChan:
		if errors.Is(err, http.ErrServerClosed) {
			logger.Info("cpu_change server closed")
		} else if err != nil {
			logger.Fatal("cpu_change server exited with error", zap.Error(err))
		}
	case <-ctx.Done():
		err := server.Shutdown(context.Background())
		logger.Info("shut down cpu_change server", zap.Error(err))
	}
}
// handleCPUChange handles POST /cpu_change: it parses the request body as an api.VCPUChange and
// applies the requested vCPU amount to the runner container.
//
// Responses: 400 for a non-POST method or an unreadable/unparseable body, 500 if the container
// could not be updated, and 200 on success.
func (s *cpuServerState) handleCPUChange(logger *zap.Logger, crictl *Crictl, w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		logger.Error("unexpected method", zap.String("method", r.Method))
		w.WriteHeader(http.StatusBadRequest)
		return
	}

	rawBody, readErr := io.ReadAll(r.Body)
	if readErr != nil {
		logger.Error("could not read body", zap.Error(readErr))
		w.WriteHeader(http.StatusBadRequest)
		return
	}

	var change api.VCPUChange
	if parseErr := json.Unmarshal(rawBody, &change); parseErr != nil {
		logger.Error("could not parse body", zap.Error(parseErr))
		w.WriteHeader(http.StatusBadRequest)
		return
	}

	requested := change.VCPUs
	logger.Info("got CPU update", zap.Float64("CPU", requested.AsFloat64()))

	updateErr := updateContainerCPU(logger, crictl, s.containerID, requested)
	if updateErr != nil {
		logger.Error("could not update container CPU", zap.String("id", s.containerID), zap.Error(updateErr))
		w.WriteHeader(http.StatusInternalServerError)
		return
	}

	// Record the milli CPU only after it has been applied, so that handleCPUCurrent can
	// compensate for rounding based on the last operation we actually performed.
	s.lastMilliCPU.Store(uint32(requested))
	w.WriteHeader(http.StatusOK)
}
// updateContainerCPU applies CPU settings for the given vCPU amount to the container: shares
// come from sharesForCPU, and the quota is cpuLimitOvercommitFactor times the vCPU amount,
// expressed relative to cpuPeriod.
func updateContainerCPU(logger *zap.Logger, crictl *Crictl, containerID string, cpu vmv1.MilliCPU) error {
	limit := vmv1.MilliCPU(cpuLimitOvercommitFactor * cpu)
	update := CrictlContainerUpdate{
		cpuShares: sharesForCPU(cpu),
		cpuQuota:  int64(limit.AsFloat64() * float64(cpuPeriod)),
		cpuPeriod: cpuPeriod,
	}

	logger.Info(
		"calculated CPU quantities for vCPU",
		zap.Int("shares", update.cpuShares),
		zap.Int64("quota", update.cpuQuota),
		zap.Int("period", cpuPeriod),
	)

	// update container
	return crictl.Update(logger, containerID, update)
}
// handleCPUCurrent handles GET /cpu_current: it inspects the runner container and responds with
// its current vCPU amount as a JSON-encoded api.VCPUCgroup.
//
// The vCPU amount is derived from the container's CPU shares. Because converting milli-CPU to
// shares and back loses precision, the last value we applied is returned verbatim whenever the
// container's shares still match it.
//
// Responses: 400 for a non-GET method, 500 if the container could not be inspected or the
// response could not be encoded, and 200 with the JSON body otherwise.
func (s *cpuServerState) handleCPUCurrent(logger *zap.Logger, crictl *Crictl, w http.ResponseWriter, r *http.Request) {
	if r.Method != "GET" {
		logger.Error("unexpected method", zap.String("method", r.Method))
		w.WriteHeader(400)
		return
	}
	logger.Info("got CPU current request")

	container, err := crictl.Inspect(logger, s.containerID)
	if err != nil {
		logger.Error("could not inspect container", zap.String("id", s.containerID), zap.Error(err))
		// Inspect returns a nil container on error, so we must stop here: carrying on would
		// dereference a nil pointer below.
		w.WriteHeader(500)
		return
	}

	shares := int(container.Info.RuntimeSpec.Linux.Resources.CPU.Shares)
	logger.Info(
		"fetched current CPU shares",
		zap.Int("shares", shares),
	)

	last := vmv1.MilliCPU(s.lastMilliCPU.Load())
	expectedIfNoChange := sharesForCPU(last)

	var resp api.VCPUCgroup
	if shares == expectedIfNoChange {
		// Nothing changed since our last update: report exactly what we set.
		resp = api.VCPUCgroup{VCPUs: last}
	} else {
		resp = api.VCPUCgroup{VCPUs: cpuForShares(shares)}
	}
	logger.Info("responding with current CPU", zap.Float64("cpu", resp.VCPUs.AsFloat64()))

	body, err := json.Marshal(resp)
	if err != nil {
		logger.Error("could not marshal body", zap.Error(err))
		w.WriteHeader(500)
		return
	}
	w.Header().Add("Content-Type", "application/json")
	w.Write(body) //nolint:errcheck // Not much to do with the error here. TODO: log it?
}
// sharesForCPU converts a milli-CPU amount into CPU shares, at sharesPerCPU shares per 1000
// milli-CPU. The integer division truncates.
func sharesForCPU(cpu vmv1.MilliCPU) int {
	return sharesPerCPU * int(cpu) / 1000
}
// cpuForShares converts CPU shares back into a milli-CPU amount, at 1000 milli-CPU per
// sharesPerCPU shares. The integer division truncates, so this is not always the exact inverse
// of sharesForCPU.
func cpuForShares(shares int) vmv1.MilliCPU {
	return vmv1.MilliCPU(shares * 1000 / sharesPerCPU)
}
package main
import (
"bufio"
"bytes"
"context"
"crypto/sha256"
"encoding/base64"
"encoding/hex"
"encoding/json"
"errors"
"flag"
"fmt"
"io"
"math"
"net"
"net/http"
"os"
"os/exec"
"os/signal"
"path/filepath"
"regexp"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/alessio/shellescape"
"github.com/cilium/cilium/pkg/mac"
"github.com/containerd/cgroups/v3"
"github.com/containerd/cgroups/v3/cgroup1"
"github.com/containerd/cgroups/v3/cgroup2"
"github.com/digitalocean/go-qemu/qmp"
"github.com/docker/libnetwork/types"
"github.com/jpillora/backoff"
"github.com/kdomanski/iso9660"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/samber/lo"
"github.com/vishvananda/netlink"
"go.uber.org/zap"
"k8s.io/apimachinery/pkg/api/resource"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util/taskgroup"
)
const (
	// Names of the QEMU binaries that we execute.
	QEMU_BIN = "qemu-system-x86_64"
	QEMU_IMG_BIN = "qemu-img"
	// Paths of the kernel, disks and sockets used for the VM.
	defaultKernelPath = "/vm/kernel/vmlinuz"
	rootDiskPath = "/vm/images/rootdisk.qcow2"
	runtimeDiskPath = "/vm/images/runtime.iso"
	mountedDiskPath = "/vm/images"
	qmpUnixSocketForSigtermHandler = "/vm/qmp-sigterm.sock"
	logSerialSocket = "/vm/log.sock"
	bufferedReaderSize = 4096
	sshAuthorizedKeysDiskPath = "/vm/images/ssh-authorized-keys.iso"
	sshAuthorizedKeysMountPoint = "/vm/ssh"
	// swapName is the label given to the swap disk; it is also used to look the disk up again
	// (via blkid -L) inside the VM.
	swapName = "swapdisk"
	// Names of the network devices, and the CIDR of the default network.
	defaultNetworkBridgeName = "br-def"
	defaultNetworkTapName = "tap-def"
	defaultNetworkCIDR = "169.254.254.252/30"
	overlayNetworkBridgeName = "br-overlay"
	overlayNetworkTapName = "tap-overlay"
	// resolveDefaultPath is the default path to the resolv.conf that contains information to
	// resolve DNS. See resolvePath().
	resolveDefaultPath = "/etc/resolv.conf"
	// resolveAlternatePath is a path different from resolveDefaultPath, that may be used to
	// resolve DNS. See resolvePath().
	resolveAlternatePath = "/run/systemd/resolve/resolv.conf"
	// cgroupPeriod is the period for evaluating cgroup quota
	// in microseconds. Min 1000 microseconds, max 1 second
	cgroupPeriod = uint64(100000)
	cgroupMountPoint = "/sys/fs/cgroup"
	// cpuLimitOvercommitFactor sets the amount above the VM's spec.guest.cpus.use that we set the
	// QEMU cgroup's CPU limit to. e.g. if cpuLimitOvercommitFactor = 3 and the VM is using 0.5
	// CPUs, we set the cgroup to limit QEMU+VM to 1.5 CPUs.
	//
	// This exists because setting the cgroup exactly equal to the VM's CPU value is overly
	// pessimistic, results in a lot of unused capacity on the host, and particularly impacts
	// operations that parallelize between the VM and QEMU, like heavy disk access.
	//
	// See also: https://neondb.slack.com/archives/C03TN5G758R/p1693462680623239
	cpuLimitOvercommitFactor = 4
)
var (
	// Building blocks for the address patterns used by the regular expressions below.
	ipv4NumBlock = `(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)`
	ipv4Address = `(` + ipv4NumBlock + `\.){3}` + ipv4NumBlock
	ipv6Address = `([0-9A-Fa-f]{0,4}:){2,7}([0-9A-Fa-f]{0,4})(%\w+)?`
	// nsRegexp matches a resolv.conf "nameserver" line with an IPv4 or IPv6 address; the
	// IPv4/IPv6 variants match only the respective address family. In all three, the first
	// capture group is the address.
	nsRegexp = regexp.MustCompile(`^\s*nameserver\s*((` + ipv4Address + `)|(` + ipv6Address + `))\s*$`)
	nsIPv4Regexpmatch = regexp.MustCompile(`^\s*nameserver\s*((` + ipv4Address + `))\s*$`)
	nsIPv6Regexpmatch = regexp.MustCompile(`^\s*nameserver\s*((` + ipv6Address + `))\s*$`)
	// searchRegexp matches a resolv.conf "search" line; the first capture group is the list of
	// domains.
	searchRegexp = regexp.MustCompile(`^\s*search\s*(([^\s]+\s*)*)$`)
	// detectSystemdResolvConfOnce guards the one-time detection performed by resolvePath.
	detectSystemdResolvConfOnce sync.Once
	// pathAfterSystemdDetection is the resolv.conf path returned by resolvePath. It starts out
	// as the default path and may be switched to the alternate path by the detection.
	pathAfterSystemdDetection = resolveDefaultPath
)
// resolveFile contains the resolv.conf content and its hash
type resolveFile struct {
	// Content is the raw content of the file.
	Content []byte
	// Hash is the digest of Content as produced by hashData ("sha256:" followed by the hex digest).
	Hash string
}
// getResolvConf returns the contents of the resolv.conf file selected by resolvePath (normally
// /etc/resolv.conf) and its hash
func getResolvConf() (*resolveFile, error) {
	return getSpecific(resolvePath())
}
// hashData reads src until EOF and returns the SHA-256 digest of everything it read, formatted
// as "sha256:" followed by the lowercase hex digest. A read error is returned as-is.
//
// from https://github.com/moby/moby/blob/v20.10.24/pkg/ioutils/readers.go#L52-L59
func hashData(src io.Reader) (string, error) {
	digest := sha256.New()
	_, err := io.Copy(digest, src)
	if err != nil {
		return "", err
	}
	sum := digest.Sum(nil)
	return "sha256:" + hex.EncodeToString(sum), nil
}
// getSpecific reads the resolv.conf file at the given path and returns its contents together
// with their hash (see hashData).
func getSpecific(path string) (*resolveFile, error) {
	content, readErr := os.ReadFile(path)
	if readErr != nil {
		return nil, readErr
	}

	digest, hashErr := hashData(bytes.NewReader(content))
	if hashErr != nil {
		return nil, hashErr
	}

	result := &resolveFile{Content: content, Hash: digest}
	return result, nil
}
// GetNameservers returns nameservers (if any) listed in /etc/resolv.conf
func getNameservers(resolvConf []byte, kind int) []string {
nameservers := []string{}
for _, line := range getLines(resolvConf, []byte("#")) {
var ns [][]byte
if kind == types.IP {
ns = nsRegexp.FindSubmatch(line)
} else if kind == types.IPv4 {
ns = nsIPv4Regexpmatch.FindSubmatch(line)
} else if kind == types.IPv6 {
ns = nsIPv6Regexpmatch.FindSubmatch(line)
}
if len(ns) > 0 {
nameservers = append(nameservers, string(ns[1]))
}
}
return nameservers
}
// getSearchDomains returns the search domains (if any) listed in the given resolv.conf content.
// If more than one search line is encountered, only the contents of the last one are returned.
// Comments (starting with "#") are ignored.
func getSearchDomains(resolvConf []byte) []string {
	result := []string{}
	for _, line := range getLines(resolvConf, []byte("#")) {
		if match := searchRegexp.FindSubmatch(line); match != nil {
			result = strings.Fields(string(match[1]))
		}
	}
	return result
}
// getLines splits input on newlines and, on each line, drops everything from the first
// occurrence of commentMarker onwards. Lines without the marker are returned unchanged.
func getLines(input []byte, commentMarker []byte) [][]byte {
	var stripped [][]byte
	for _, line := range bytes.Split(input, []byte("\n")) {
		// bytes.Cut yields the part before the marker, or the whole line if the marker is absent.
		content, _, _ := bytes.Cut(line, commentMarker)
		stripped = append(stripped, content)
	}
	return stripped
}
func resolvePath() string {
detectSystemdResolvConfOnce.Do(func() {
candidateResolvConf, err := os.ReadFile(resolveDefaultPath)
if err != nil {
// silencing error as it will resurface at next calls trying to read defaultPath
return
}
ns := getNameservers(candidateResolvConf, types.IP)
if len(ns) == 1 && ns[0] == "127.0.0.53" {
pathAfterSystemdDetection = resolveAlternatePath
}
})
return pathAfterSystemdDetection
}
// createISO9660runtime writes an ISO9660 image (volume label "vmruntime") to diskPath with the
// runtime configuration for the VM. Depending on the arguments, the image contains:
//
//   - sysctl.conf: the sysctl entries, one per line (only if sysctl is non-empty)
//   - command.sh / args.sh: the shell-quoted command and args (only if non-empty)
//   - env.sh: one `export NAME=value` line per environment variable (only if env is non-empty)
//   - mounts.sh: a script that mounts the SSH keys disk (if enableSSH), sets up swap (if
//     swapSize is non-nil), mounts the given disks, and resizes /dev/shm (if shmsize is non-nil)
//   - resize-swap-internal.sh: a helper script to (re)create the swap (only if swapSize is non-nil)
//
// The resulting file is chown'ed to uid 36 / gid 34 (qemu/kvm).
func createISO9660runtime(
	diskPath string,
	command []string,
	args []string,
	sysctl []string,
	env []vmv1.EnvVar,
	disks []vmv1.Disk,
	enableSSH bool,
	swapSize *resource.Quantity,
	shmsize *resource.Quantity,
) error {
	writer, err := iso9660.NewWriter()
	if err != nil {
		return err
	}
	defer writer.Cleanup() //nolint:errcheck // Nothing to do with the error, maybe log it ? TODO

	if len(sysctl) != 0 {
		err = writer.AddFile(bytes.NewReader([]byte(strings.Join(sysctl, "\n"))), "sysctl.conf")
		if err != nil {
			return err
		}
	}

	if len(command) != 0 {
		err = writer.AddFile(bytes.NewReader([]byte(shellescape.QuoteCommand(command))), "command.sh")
		if err != nil {
			return err
		}
	}

	if len(args) != 0 {
		err = writer.AddFile(bytes.NewReader([]byte(shellescape.QuoteCommand(args))), "args.sh")
		if err != nil {
			return err
		}
	}

	if len(env) != 0 {
		envstring := []string{}
		for _, e := range env {
			envstring = append(envstring, fmt.Sprintf(`export %s=%s`, e.Name, shellescape.Quote(e.Value)))
		}
		envstring = append(envstring, "")
		err = writer.AddFile(bytes.NewReader([]byte(strings.Join(envstring, "\n"))), "env.sh")
		if err != nil {
			return err
		}
	}

	mounts := []string{
		"set -euxo pipefail",
	}
	if enableSSH {
		mounts = append(mounts, "/neonvm/bin/mkdir -p /mnt/ssh")
		mounts = append(mounts, "/neonvm/bin/mount -t iso9660 -o ro,mode=0644 $(/neonvm/bin/blkid -L ssh-authorized-keys) /mnt/ssh")
	}

	if swapSize != nil {
		mounts = append(mounts, fmt.Sprintf("/neonvm/bin/sh /neonvm/runtime/resize-swap-internal.sh %d", swapSize.Value()))
	}

	if len(disks) != 0 {
		for _, disk := range disks {
			if disk.MountPath != "" {
				mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/mkdir -p %s`, disk.MountPath))
			}
			switch {
			case disk.EmptyDisk != nil:
				opts := ""
				if disk.EmptyDisk.Discard {
					opts = "-o discard"
				}

				if disk.EmptyDisk.EnableQuotas {
					mounts = append(mounts, fmt.Sprintf(`tune2fs -Q prjquota $(/neonvm/bin/blkid -L %s)`, disk.Name))
					mounts = append(mounts, fmt.Sprintf(`tune2fs -E mount_opts=prjquota $(/neonvm/bin/blkid -L %s)`, disk.Name))
				}

				mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/mount %s $(/neonvm/bin/blkid -L %s) %s`, opts, disk.Name, disk.MountPath))
				// Note: chmod must be after mount, otherwise it gets overwritten by mount.
				mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/chmod 0777 %s`, disk.MountPath))
			case disk.ConfigMap != nil || disk.Secret != nil:
				mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/mount -t iso9660 -o ro,mode=0644 $(/neonvm/bin/blkid -L %s) %s`, disk.Name, disk.MountPath))
			case disk.Tmpfs != nil:
				mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/chmod 0777 %s`, disk.MountPath))
				mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/mount -t tmpfs -o size=%d %s %s`, disk.Tmpfs.Size.Value(), disk.Name, disk.MountPath))
			default:
				// do nothing
			}
		}
	}

	if shmsize != nil {
		mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/mount -o remount,size=%d /dev/shm`, shmsize.Value()))
	}

	mounts = append(mounts, "")
	err = writer.AddFile(bytes.NewReader([]byte(strings.Join(mounts, "\n"))), "mounts.sh")
	if err != nil {
		return err
	}

	if swapSize != nil {
		lines := []string{
			`#!/neonvm/bin/sh`,
			`set -euxo pipefail`,
			// this script may be run as root, so we should avoid potentially-malicious path
			// injection
			`export PATH="/neonvm/bin"`,
			fmt.Sprintf(`swapdisk="$(/neonvm/bin/blkid -L %s)"`, swapName),
			// disable swap. Allow it to fail if it's already disabled.
			`swapoff "$swapdisk" || true`,
			// if the requested size is zero, then... just exit. There's nothing we need to do.
			`new_size="$1"`,
			`if [ "$new_size" = '0' ]; then exit 0; fi`,
			// re-make the swap.
			// mkswap expects the size to be given in KiB, so divide the new size by 1K
			fmt.Sprintf(`mkswap -L %s "$swapdisk" $(( new_size / 1024 ))`, swapName),
			// ... and then re-enable the swap
			//
			// nb: busybox swapon only supports '-d', not its long form '--discard'.
			`swapon -d "$swapdisk"`,
		}
		err = writer.AddFile(bytes.NewReader([]byte(strings.Join(lines, "\n"))), "resize-swap-internal.sh")
		if err != nil {
			return err
		}
	}

	outputFile, err := os.OpenFile(diskPath, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0644)
	if err != nil {
		return err
	}
	// Make sure the file does not stay open when one of the steps below fails. On success the
	// file is closed explicitly at the end (so that a close error is reported), which turns this
	// deferred close into a harmless no-op.
	defer outputFile.Close() //nolint:errcheck // best-effort cleanup for the error paths

	// uid=36(qemu) gid=34(kvm) groups=34(kvm)
	err = outputFile.Chown(36, 34)
	if err != nil {
		return err
	}

	err = writer.WriteTo(outputFile, "vmruntime")
	if err != nil {
		return err
	}

	return outputFile.Close()
}
// calcDirUsage returns the sum of the sizes (as reported by Lstat) of dirPath itself and, if it
// is a directory, of everything below it, recursively. Symlinks are not followed.
//
// If an error occurs part-way through, it is returned together with the size accumulated so far
// at the level where it occurred.
func calcDirUsage(dirPath string) (int64, error) {
	info, err := os.Lstat(dirPath)
	if err != nil {
		return 0, err
	}

	total := info.Size()
	if !info.IsDir() {
		return total, nil
	}

	handle, err := os.Open(dirPath)
	if err != nil {
		return total, err
	}
	defer handle.Close()

	entries, err := handle.Readdir(-1)
	if err != nil {
		return total, err
	}

	for i := range entries {
		name := entries[i].Name()
		switch name {
		case ".", "..":
			continue
		}

		childSize, childErr := calcDirUsage(dirPath + "/" + name)
		if childErr != nil {
			return total, childErr
		}
		total += childSize
	}
	return total, nil
}
// createSwap creates a qcow2 swap disk of the given size at diskPath.
//
// It creates a raw image in the current working directory, formats it as swap with the label
// swapName, converts it to qcow2 at diskPath, removes the raw image and finally chowns the
// result to uid 36 / gid 34 (qemu/kvm). The first failing step aborts the sequence.
func createSwap(diskPath string, swapSize *resource.Quantity) error {
	const rawImage = "swap.raw"

	steps := [][]string{
		{QEMU_IMG_BIN, "create", "-q", "-f", "raw", rawImage, fmt.Sprintf("%d", swapSize.Value())},
		{"mkswap", "-L", swapName, rawImage},
		{QEMU_IMG_BIN, "convert", "-q", "-f", "raw", "-O", "qcow2", "-o", "cluster_size=2M,lazy_refcounts=on", rawImage, diskPath},
		{"rm", "-f", rawImage},
		// uid=36(qemu) gid=34(kvm) groups=34(kvm)
		{"chown", "36:34", diskPath},
	}

	for _, step := range steps {
		if err := execFg(step[0], step[1:]...); err != nil {
			return err
		}
	}
	return nil
}
// createQCOW2 creates an ext4-formatted qcow2 disk at diskPath, labelled diskName.
//
// The filesystem size is taken from diskSize if it is given. Otherwise it is derived from the
// disk usage of contentPath plus a fixed reserve of 64 blocks. If contentPath is given, the
// filesystem is populated from it (mkfs.ext4 -d). At least one of diskSize and contentPath must
// be non-nil.
//
// The filesystem is first built as a raw image ("ext4.raw" in the current working directory),
// then converted to qcow2; the result is chown'ed to uid 36 / gid 34 (qemu/kvm).
func createQCOW2(diskName string, diskPath string, diskSize *resource.Quantity, contentPath *string) error {
	ext4blocksMin := int64(64)
	ext4blockSize := int64(4096)
	ext4blockCount := int64(0)

	if diskSize != nil {
		ext4blockCount = diskSize.Value() / ext4blockSize
	} else if contentPath != nil {
		dirSize, err := calcDirUsage(*contentPath)
		if err != nil {
			return err
		}
		// Round the content size UP to whole blocks. The division has to be done in floating
		// point for math.Ceil to have any effect: dividing the integers first would truncate,
		// leaving the filesystem up to one block short.
		contentBlocks := int64(math.Ceil(float64(dirSize) / float64(ext4blockSize)))
		ext4blockCount = ext4blocksMin + contentBlocks
	} else {
		return errors.New("diskSize or contentPath should be specified")
	}

	mkfsArgs := []string{
		"-q", // quiet
		"-L", // volume-label
		diskName,
	}

	if contentPath != nil {
		// [ -d root-directory|tarball ]
		mkfsArgs = append(mkfsArgs, "-d", *contentPath)
	}

	mkfsArgs = append(
		mkfsArgs,
		"-b", // block-size
		fmt.Sprintf("%d", ext4blockSize),
		"ext4.raw",                         // device
		fmt.Sprintf("%d", ext4blockCount), // fs-size
	)

	if err := execFg("mkfs.ext4", mkfsArgs...); err != nil {
		return err
	}

	if err := execFg(QEMU_IMG_BIN, "convert", "-q", "-f", "raw", "-O", "qcow2", "-o", "cluster_size=2M,lazy_refcounts=on", "ext4.raw", diskPath); err != nil {
		return err
	}

	if err := execFg("rm", "-f", "ext4.raw"); err != nil {
		return err
	}

	// uid=36(qemu) gid=34(kvm) groups=34(kvm)
	if err := execFg("chown", "36:34", diskPath); err != nil {
		return err
	}

	return nil
}
// createISO9660FromPath writes an ISO9660 image with volume label diskName to diskPath,
// containing the files found directly inside contentPath.
//
// Directories, and symlinks that resolve to directories, are skipped; there is no recursion.
// Symlinks to files are added under the name of the link. The resulting file is chown'ed to
// uid 36 / gid 34 (qemu/kvm).
func createISO9660FromPath(logger *zap.Logger, diskName string, diskPath string, contentPath string) error {
	writer, err := iso9660.NewWriter()
	if err != nil {
		return err
	}
	defer writer.Cleanup() //nolint:errcheck // Nothing to do with the error, maybe log it ? TODO

	dir, err := os.Open(contentPath)
	if err != nil {
		return err
	}
	dirEntrys, err := dir.ReadDir(0)
	// The handle is only needed for the listing, so release it right away instead of leaking it.
	_ = dir.Close()
	if err != nil {
		return err
	}

	for _, file := range dirEntrys {
		if file.IsDir() {
			continue
		}

		fileName := fmt.Sprintf("%s/%s", contentPath, file.Name())
		outputPath := file.Name()

		// try to resolve symlink and check resolved file IsDir
		resolved, err := filepath.EvalSymlinks(fileName)
		if err != nil {
			return err
		}
		// Stat by path instead of opening the file: opening would require a Close for every
		// entry, which was missing and leaked one file descriptor per entry.
		resolvedStat, err := os.Stat(resolved)
		if err != nil {
			return err
		}
		if resolvedStat.IsDir() {
			continue
		}

		// run the file handling logic in a closure, so the defers happen within the loop body,
		// rather than the outer function.
		err = func() error {
			logger.Info("adding file to ISO9660 disk", zap.String("path", outputPath))
			fileToAdd, err := os.Open(fileName)
			if err != nil {
				return err
			}
			defer fileToAdd.Close()

			return writer.AddFile(fileToAdd, outputPath)
		}()
		if err != nil {
			return err
		}
	}

	outputFile, err := os.OpenFile(diskPath, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0644)
	if err != nil {
		return err
	}
	// Make sure the file does not stay open when one of the steps below fails. On success the
	// file is closed explicitly at the end (so that a close error is reported), which turns this
	// deferred close into a harmless no-op.
	defer outputFile.Close() //nolint:errcheck // best-effort cleanup for the error paths

	// uid=36(qemu) gid=34(kvm) groups=34(kvm)
	err = outputFile.Chown(36, 34)
	if err != nil {
		return err
	}

	err = writer.WriteTo(outputFile, diskName)
	if err != nil {
		return err
	}

	return outputFile.Close()
}
// checkKVM reports whether /dev/kvm exists and is a character device.
func checkKVM() bool {
	if info, err := os.Stat("/dev/kvm"); err == nil {
		return info.Mode()&os.ModeCharDevice != 0
	}
	return false
}
// checkDevTun reports whether /dev/net/tun exists and is a character device.
func checkDevTun() bool {
	if info, err := os.Stat("/dev/net/tun"); err == nil {
		return info.Mode()&os.ModeCharDevice != 0
	}
	return false
}
// runInitScript writes script to a temporary file and runs it with /bin/sh. An empty script is
// a no-op. The temporary file is removed when the function returns.
func runInitScript(logger *zap.Logger, script string) error {
	if script == "" {
		return nil
	}

	// creates a tmp file with the script content
	scriptFile, err := os.CreateTemp(os.TempDir(), "init-script-")
	if err != nil {
		return err
	}
	scriptPath := scriptFile.Name()
	defer os.Remove(scriptPath) // clean up

	if _, err = scriptFile.WriteString(script); err != nil {
		return err
	}
	if err = scriptFile.Close(); err != nil {
		return err
	}

	logger.Info("running init script", zap.String("path", scriptPath))
	return execFg("/bin/sh", scriptPath)
}
// Config holds the command-line configuration of neonvm-runner. It is populated from flags by
// newConfig.
type Config struct {
	// vmSpecDump is the base64-encoded JSON VirtualMachine spec (-vmspec).
	vmSpecDump string
	// vmStatusDump is the base64-encoded JSON VirtualMachine status (-vmstatus).
	vmStatusDump string
	// kernelPath is the path of the kernel to use (-kernelpath).
	kernelPath string
	// appendKernelCmdline holds additional kernel command line arguments (-appendKernelCmdline).
	appendKernelCmdline string
	// skipCgroupManagement disables CPU management by the runner (-skip-cgroup-management).
	skipCgroupManagement bool
	// enableDummyCPUServer provides a CPU server that doesn't actually do anything
	// (-enable-dummy-cpu-server). Requires skipCgroupManagement.
	enableDummyCPUServer bool
	// delegatedCgroup forwards CPU requests to neonvm-daemon inside the VM (-delegated-cgroup).
	// Incompatible with skipCgroupManagement and enableDummyCPUServer.
	delegatedCgroup bool
	// diskCacheSettings holds the cache settings added to -drive args for VM disks
	// (-qemu-disk-cache-settings).
	diskCacheSettings string
	// memoryProvider selects the provider for memory hotplug (-memory-provider). Required.
	memoryProvider vmv1.MemoryProvider
	// autoMovableRatio is the value for the kernel's memory_hotplug.auto_movable_ratio
	// (-memhp-auto-movable-ratio). Required if memoryProvider is virtio-mem.
	autoMovableRatio string
}
// newConfig registers the runner's command-line flags, parses them, and returns the resulting
// configuration.
//
// Invalid configurations are fatal (the process exits via logger.Fatal):
//   - '-memory-provider' is missing
//   - '-memhp-auto-movable-ratio' is missing while the memory provider is virtio-mem
//   - '-enable-dummy-cpu-server' is set without '-skip-cgroup-management'
//   - '-delegated-cgroup' is set together with '-skip-cgroup-management'
//   - '-delegated-cgroup' is set together with '-enable-dummy-cpu-server'
func newConfig(logger *zap.Logger) *Config {
	cfg := &Config{
		vmSpecDump:           "",
		vmStatusDump:         "",
		kernelPath:           defaultKernelPath,
		appendKernelCmdline:  "",
		skipCgroupManagement: false,
		enableDummyCPUServer: false,
		delegatedCgroup:      false,
		diskCacheSettings:    "cache=none",
		memoryProvider:       "", // Require that this is explicitly set. We'll check later.
		autoMovableRatio:     "", // Require that this is explicitly set IFF memoryProvider is VirtioMem. We'll check later.
	}
	flag.StringVar(&cfg.vmSpecDump, "vmspec", cfg.vmSpecDump,
		"Base64 encoded VirtualMachine json specification")
	flag.StringVar(&cfg.vmStatusDump, "vmstatus", cfg.vmStatusDump,
		"Base64 encoded VirtualMachine json status")
	flag.StringVar(&cfg.kernelPath, "kernelpath", cfg.kernelPath,
		"Override path for kernel to use")
	flag.StringVar(&cfg.appendKernelCmdline, "appendKernelCmdline",
		cfg.appendKernelCmdline, "Additional kernel command line arguments")
	flag.BoolVar(&cfg.skipCgroupManagement, "skip-cgroup-management",
		cfg.skipCgroupManagement,
		"Don't try to manage CPU (use if running alongside container-mgr, or if dummy CPU server is enabled)")
	// The default must come from the flag's own field. It used to be taken from
	// cfg.skipCgroupManagement, which only worked because both defaults happen to be false.
	flag.BoolVar(&cfg.enableDummyCPUServer, "enable-dummy-cpu-server",
		cfg.enableDummyCPUServer,
		"Provide a CPU server (unlike -skip-cgroup-management) but don't actually do anything with it")
	flag.BoolVar(&cfg.delegatedCgroup, "delegated-cgroup",
		cfg.delegatedCgroup,
		"Forward CPU requests to neonvm-daemon inside the VM (requires -skip-cgroup-management=false)")
	flag.StringVar(&cfg.diskCacheSettings, "qemu-disk-cache-settings",
		cfg.diskCacheSettings, "Cache settings to add to -drive args for VM disks")
	flag.Func("memory-provider", "Set provider for memory hotplug", cfg.memoryProvider.FlagFunc)
	flag.StringVar(&cfg.autoMovableRatio, "memhp-auto-movable-ratio",
		cfg.autoMovableRatio, "Set value of kernel's memory_hotplug.auto_movable_ratio [virtio-mem only]")
	flag.Parse()

	if cfg.memoryProvider == "" {
		logger.Fatal("missing required flag '-memory-provider'")
	}
	if cfg.memoryProvider == vmv1.MemoryProviderVirtioMem && cfg.autoMovableRatio == "" {
		logger.Fatal("missing required flag '-memhp-auto-movable-ratio'")
	}
	if cfg.enableDummyCPUServer && !cfg.skipCgroupManagement {
		logger.Fatal("flag '-enable-dummy-cpu-server' requires '-skip-cgroup-management'")
	}
	if cfg.delegatedCgroup && cfg.skipCgroupManagement {
		// The two flags are mutually exclusive. The message used to claim that
		// '-delegated-cgroup' REQUIRES '-skip-cgroup-management', i.e. the opposite of what is
		// being checked (and of what the flag's help text says).
		logger.Fatal("flag '-delegated-cgroup' requires '-skip-cgroup-management=false'")
	}
	if cfg.delegatedCgroup && cfg.enableDummyCPUServer {
		logger.Fatal("cannot have both '-delegated-cgroup' and '-enable-dummy-cpu-server'")
	}

	return cfg
}
// main builds the production logger for the runner and delegates all real work to run,
// terminating the process via logger.Fatal if run reports an error.
func main() {
	logger := zap.Must(zap.NewProduction()).Named("neonvm-runner")

	err := run(logger)
	if err == nil {
		return
	}
	logger.Fatal("Failed to run", zap.Error(err))
}
// run is the body of the program: it decodes the VirtualMachine spec and status passed via
// flags, prepares everything QEMU needs (init script, runtime ISO, root disk, command line)
// in a task group, and finally runs QEMU until it exits.
func run(logger *zap.Logger) error {
	cfg := newConfig(logger)

	// Spec and status are passed on the command line as base64-encoded JSON.
	specBytes, err := base64.StdEncoding.DecodeString(cfg.vmSpecDump)
	if err != nil {
		return fmt.Errorf("failed to decode VirtualMachine Spec dump: %w", err)
	}
	statusBytes, err := base64.StdEncoding.DecodeString(cfg.vmStatusDump)
	if err != nil {
		return fmt.Errorf("failed to decode VirtualMachine Status dump: %w", err)
	}

	vmSpec := new(vmv1.VirtualMachineSpec)
	if err := json.Unmarshal(specBytes, vmSpec); err != nil {
		return fmt.Errorf("failed to unmarshal VM spec: %w", err)
	}
	vmStatus := vmv1.VirtualMachineStatus{}
	if err := json.Unmarshal(statusBytes, &vmStatus); err != nil {
		return fmt.Errorf("failed to unmarshal VM Status: %w", err)
	}

	enableSSH := vmSpec.EnableSSH != nil && *vmSpec.EnableSSH

	// The VM's hostname is the pod's hostname with a "vm-" prefix, so that the two are easy to
	// tell apart when ssh'ing into different computes. The hostname is not used for anything
	// else; if it cannot be read we just carry on with an empty one.
	hostname, err := os.Hostname()
	if err == nil {
		hostname = "vm-" + hostname
	} else {
		logger.Warn("could not read pod's hostname", zap.Error(err))
	}

	// Runtime options that end up on the iso9660 disk (command, args, envs, mounts).
	sysctl := []string{"kernel.core_pattern=core", "kernel.core_uses_pid=1"}
	var shmSize, swapSize *resource.Quantity
	if settings := vmSpec.Guest.Settings; settings != nil {
		sysctl = append(sysctl, settings.Sysctl...)
		swapSize = settings.Swap

		// Linux sizes /dev/shm to half of physical memory by default. When swap is configured
		// and exceeds that, use the swap size instead, because memory can be scaled up later.
		//
		// See https://github.com/neondatabase/autoscaling/issues/800
		initialMemorySize := vmSpec.Guest.MemorySlotSize.Value() * int64(vmSpec.Guest.MemorySlots.Min)
		if swapSize != nil && swapSize.Value() > initialMemorySize/2 {
			shmSize = swapSize
		}
	}

	// The preparation steps are submitted to one task group and awaited together.
	tg := taskgroup.NewGroup(logger)

	tg.Go("init-script", func(logger *zap.Logger) error {
		return runInitScript(logger, vmSpec.InitScript)
	})

	tg.Go("iso9660-runtime", func(_ *zap.Logger) error {
		return createISO9660runtime(
			runtimeDiskPath,
			vmSpec.Guest.Command,
			vmSpec.Guest.Args,
			sysctl,
			vmSpec.Guest.Env,
			vmSpec.Disks,
			enableSSH,
			swapSize,
			shmSize,
		)
	})

	tg.Go("rootDisk", func(logger *zap.Logger) error {
		// grows the root disk image if a larger size was requested
		return resizeRootDisk(logger, vmSpec)
	})

	// Written by the "qemu-cmd" task, read only after tg.Wait() has returned.
	var qemuCmd []string
	tg.Go("qemu-cmd", func(logger *zap.Logger) error {
		built, err := buildQEMUCmd(cfg, logger, vmSpec, &vmStatus, enableSSH, swapSize, hostname)
		qemuCmd = built
		return err
	})

	if err := tg.Wait(); err != nil {
		return err
	}

	if err := runQEMU(cfg, logger, vmSpec, qemuCmd); err != nil {
		return fmt.Errorf("failed to run QEMU: %w", err)
	}
	return nil
}
// resizeRootDisk grows the root disk image to vmSpec.Guest.RootDisk.Size when that size is set
// and strictly larger than the image's current virtual size (as reported by `qemu-img info`).
// The image is never shrunk: if the requested size is not larger than the current one, this
// only logs and returns nil.
//
// An error is returned if the current size cannot be determined or if the resize command fails.
func resizeRootDisk(logger *zap.Logger, vmSpec *vmv1.VirtualMachineSpec) error {
	// Only the field we need out of qemu-img's JSON output.
	type qemuImgInfo struct {
		VirtualSize int64 `json:"virtual-size"`
	}

	// get current disk size by qemu-img info command
	infoJSON, err := exec.Command(QEMU_IMG_BIN, "info", "--output=json", rootDiskPath).Output()
	if err != nil {
		return fmt.Errorf("could not get root image size: %w", err)
	}
	var info qemuImgInfo
	if err := json.Unmarshal(infoJSON, &info); err != nil {
		return fmt.Errorf("failed to unmarshal QEMU image size: %w", err)
	}
	imageSizeQuantity := resource.NewQuantity(info.VirtualSize, resource.BinarySI)

	// No explicit size requested: keep the image as it is.
	if vmSpec.Guest.RootDisk.Size.IsZero() {
		return nil
	}

	// Cmp returns 1 only when the requested size is strictly greater than the current size.
	if vmSpec.Guest.RootDisk.Size.Cmp(*imageSizeQuantity) != 1 {
		logger.Info(fmt.Sprintf(
			"rootDisk.size (%s) is not greater than image size (%s), skipping resize",
			vmSpec.Guest.RootDisk.Size.String(), imageSizeQuantity.String(),
		))
		return nil
	}

	logger.Info(fmt.Sprintf("resizing rootDisk from %s to %s", imageSizeQuantity.String(), vmSpec.Guest.RootDisk.Size.String()))
	if err := execFg(QEMU_IMG_BIN, "resize", rootDiskPath, fmt.Sprintf("%d", vmSpec.Guest.RootDisk.Size.Value())); err != nil {
		return fmt.Errorf("failed to resize rootDisk: %w", err)
	}
	return nil
}
// buildQEMUCmd assembles the list of command-line arguments to pass to QEMU for this VM.
//
// Besides building the argument list, it has side effects: it creates the disk images that
// the arguments refer to (the ssh authorized-keys ISO, the swap disk, empty disks, and ISOs
// for ConfigMap/Secret disks) and calls defaultNetwork / overlayNetwork to obtain the MAC
// addresses used for the network devices.
//
// Parameters:
//   - cfg: runner configuration (disk cache settings, memory provider, kernel path, ...)
//   - logger: logger used for progress messages
//   - vmSpec / vmStatus: the VirtualMachine spec and status
//   - enableSSH: whether to attach the ssh authorized-keys disk
//   - swapSize: if non-nil, a swap disk of this size is created and attached
//   - hostname: passed through to makeKernelCmdline
//
// Returns the argument list, or an error if any image creation or network setup step fails.
func buildQEMUCmd(
cfg *Config,
logger *zap.Logger,
vmSpec *vmv1.VirtualMachineSpec,
vmStatus *vmv1.VirtualMachineStatus,
enableSSH bool,
swapSize *resource.Quantity,
hostname string,
) ([]string, error) {
// prepare qemu command line
qemuCmd := []string{
"-runas", "qemu",
"-machine", "q35",
"-nographic",
"-no-reboot",
"-nodefaults",
"-only-migratable",
"-audiodev", "none,id=noaudio",
"-serial", "pty",
"-serial", "stdio",
"-msg", "timestamp=on",
// Three QMP endpoints: two TCP ports taken from the VM spec, plus a unix socket that
// terminateQemuOnSigterm connects to in order to send system_powerdown.
"-qmp", fmt.Sprintf("tcp:0.0.0.0:%d,server,wait=off", vmSpec.QMP),
"-qmp", fmt.Sprintf("tcp:0.0.0.0:%d,server,wait=off", vmSpec.QMPManual),
"-qmp", fmt.Sprintf("unix:%s,server,wait=off", qmpUnixSocketForSigtermHandler),
// virtio-serial port backed by the unix socket that forwardLogs dials.
"-device", "virtio-serial",
"-chardev", fmt.Sprintf("socket,path=%s,server=on,wait=off,id=log", logSerialSocket),
"-device", "virtserialport,chardev=log,name=tech.neon.log.0",
}
// disk details
qemuCmd = append(qemuCmd, "-drive", fmt.Sprintf("id=rootdisk,file=%s,if=virtio,media=disk,index=0,%s", rootDiskPath, cfg.diskCacheSettings))
qemuCmd = append(qemuCmd, "-drive", fmt.Sprintf("id=runtime,file=%s,if=virtio,media=cdrom,readonly=on,cache=none", runtimeDiskPath))
if enableSSH {
name := "ssh-authorized-keys"
if err := createISO9660FromPath(logger, name, sshAuthorizedKeysDiskPath, sshAuthorizedKeysMountPoint); err != nil {
return nil, fmt.Errorf("Failed to create ISO9660 image: %w", err)
}
qemuCmd = append(qemuCmd, "-drive", fmt.Sprintf("id=%s,file=%s,if=virtio,media=cdrom,cache=none", name, sshAuthorizedKeysDiskPath))
}
if swapSize != nil {
dPath := fmt.Sprintf("%s/swapdisk.qcow2", mountedDiskPath)
logger.Info("creating QCOW2 image for swap", zap.String("diskPath", dPath))
if err := createSwap(dPath, swapSize); err != nil {
return nil, fmt.Errorf("Failed to create swap disk: %w", err)
}
qemuCmd = append(qemuCmd, "-drive", fmt.Sprintf("id=%s,file=%s,if=virtio,media=disk,%s,discard=unmap", swapName, dPath, cfg.diskCacheSettings))
}
// Extra disks from the spec: empty disks become QCOW2 images, ConfigMap/Secret disks become
// read-only ISO images; any other kind of disk is ignored here.
for _, disk := range vmSpec.Disks {
switch {
case disk.EmptyDisk != nil:
logger.Info("creating QCOW2 image with empty ext4 filesystem", zap.String("diskName", disk.Name))
dPath := fmt.Sprintf("%s/%s.qcow2", mountedDiskPath, disk.Name)
if err := createQCOW2(disk.Name, dPath, &disk.EmptyDisk.Size, nil); err != nil {
return nil, fmt.Errorf("Failed to create QCOW2 image: %w", err)
}
discard := ""
if disk.EmptyDisk.Discard {
discard = ",discard=unmap"
}
qemuCmd = append(qemuCmd, "-drive", fmt.Sprintf("id=%s,file=%s,if=virtio,media=disk,%s%s", disk.Name, dPath, cfg.diskCacheSettings, discard))
case disk.ConfigMap != nil || disk.Secret != nil:
dPath := fmt.Sprintf("%s/%s.iso", mountedDiskPath, disk.Name)
mnt := fmt.Sprintf("/vm/mounts%s", disk.MountPath)
logger.Info("creating iso9660 image", zap.String("diskPath", dPath), zap.String("diskName", disk.Name), zap.String("mountPath", mnt))
if err := createISO9660FromPath(logger, disk.Name, dPath, mnt); err != nil {
return nil, fmt.Errorf("Failed to create ISO9660 image: %w", err)
}
qemuCmd = append(qemuCmd, "-drive", fmt.Sprintf("id=%s,file=%s,if=virtio,media=cdrom,cache=none", disk.Name, dPath))
default:
// do nothing
}
}
// cpu details
// NB: EnableAcceleration guaranteed non-nil because the k8s API server sets the default for us.
if *vmSpec.EnableAcceleration && checkKVM() {
logger.Info("using KVM acceleration")
qemuCmd = append(qemuCmd, "-enable-kvm")
} else {
logger.Warn("not using KVM acceleration")
}
qemuCmd = append(qemuCmd, "-cpu", "max")
// Start with CPUs.Min (rounded up) CPUs; maxcpus and cores are both CPUs.Max (rounded up).
qemuCmd = append(qemuCmd, "-smp", fmt.Sprintf(
"cpus=%d,maxcpus=%d,sockets=1,cores=%d,threads=1",
vmSpec.Guest.CPUs.Min.RoundedUp(),
vmSpec.Guest.CPUs.Max.RoundedUp(),
vmSpec.Guest.CPUs.Max.RoundedUp(),
))
// memory details
logger.Info(fmt.Sprintf("Using memory provider %s", cfg.memoryProvider))
// Initial size is Min slots; the number of hotplug slots is Max-Min; maxmem is Max slots.
qemuCmd = append(qemuCmd, "-m", fmt.Sprintf(
"size=%db,slots=%d,maxmem=%db",
vmSpec.Guest.MemorySlotSize.Value()*int64(vmSpec.Guest.MemorySlots.Min),
vmSpec.Guest.MemorySlots.Max-vmSpec.Guest.MemorySlots.Min,
vmSpec.Guest.MemorySlotSize.Value()*int64(vmSpec.Guest.MemorySlots.Max),
))
if cfg.memoryProvider == vmv1.MemoryProviderVirtioMem {
// we don't actually have any slots because it's virtio-mem, but we're still using the API
// designed around DIMM slots, so we need to use them to calculate how much memory we expect
// to be able to plug in.
numSlots := vmSpec.Guest.MemorySlots.Max - vmSpec.Guest.MemorySlots.Min
virtioMemSize := int64(numSlots) * vmSpec.Guest.MemorySlotSize.Value()
// We can add virtio-mem if it actually needs to be a non-zero size.
// Otherwise, QEMU fails with:
// property 'size' of memory-backend-ram doesn't take value '0'
if virtioMemSize != 0 {
qemuCmd = append(qemuCmd, "-object", fmt.Sprintf("memory-backend-ram,id=vmem0,size=%db", virtioMemSize))
qemuCmd = append(qemuCmd, "-device", "virtio-mem-pci,id=vm0,memdev=vmem0,block-size=8M,requested-size=0")
}
}
// default (pod) net details
macDefault, err := defaultNetwork(logger, defaultNetworkCIDR, vmSpec.Guest.Ports)
if err != nil {
return nil, fmt.Errorf("Failed to set up default network: %w", err)
}
qemuCmd = append(qemuCmd, "-netdev", fmt.Sprintf("tap,id=default,ifname=%s,script=no,downscript=no,vhost=on", defaultNetworkTapName))
qemuCmd = append(qemuCmd, "-device", fmt.Sprintf("virtio-net-pci,netdev=default,mac=%s", macDefault.String()))
// overlay (multus) net details
if vmSpec.ExtraNetwork != nil && vmSpec.ExtraNetwork.Enable {
macOverlay, err := overlayNetwork(vmSpec.ExtraNetwork.Interface)
if err != nil {
return nil, fmt.Errorf("Failed to set up overlay network: %w", err)
}
qemuCmd = append(qemuCmd, "-netdev", fmt.Sprintf("tap,id=overlay,ifname=%s,script=no,downscript=no,vhost=on", overlayNetworkTapName))
qemuCmd = append(qemuCmd, "-device", fmt.Sprintf("virtio-net-pci,netdev=overlay,mac=%s", macOverlay.String()))
}
// kernel details
qemuCmd = append(
qemuCmd,
"-kernel", cfg.kernelPath,
"-append", makeKernelCmdline(cfg, vmSpec, vmStatus, hostname),
)
// should runner receive migration ?
// (decided by the RECEIVE_MIGRATION environment variable being exactly "true")
if os.Getenv("RECEIVE_MIGRATION") == "true" {
qemuCmd = append(qemuCmd, "-incoming", fmt.Sprintf("tcp:0:%d", vmv1.MigrationPort))
}
return qemuCmd, nil
}
// baseKernelCmdline is the part of the kernel command line that every VM gets, regardless of
// memory provider or networking options.
const baseKernelCmdline = "panic=-1 init=/neonvm/bin/init console=ttyS1 loglevel=7 root=/dev/vda rw"

// kernelCmdlineDIMMSlots is appended to the kernel command line when the memory provider is
// DIMM slots.
const kernelCmdlineDIMMSlots = "memhp_default_state=online_movable"

// kernelCmdlineVirtioMemTmpl is a format string, appended to the kernel command line when the
// memory provider is virtio-mem. Its single %s verb is the value for
// memory_hotplug.auto_movable_ratio.
const kernelCmdlineVirtioMemTmpl = "memhp_default_state=online memory_hotplug.online_policy=auto-movable memory_hotplug.auto_movable_ratio=%s"
// makeKernelCmdline builds the kernel command line passed to QEMU via -append.
//
// The result is a space-separated concatenation of, in order: the base command line, the
// memory-hotplug options for the configured memory provider, the static IP configuration for
// eth1 (only when the extra network is enabled), the hostname (only when non-empty), and any
// additional arguments from cfg.appendKernelCmdline.
//
// It panics if cfg.memoryProvider is not one of the known providers.
func makeKernelCmdline(cfg *Config, vmSpec *vmv1.VirtualMachineSpec, vmStatus *vmv1.VirtualMachineStatus, hostname string) string {
	var memoryHotplugArgs string
	switch cfg.memoryProvider {
	case vmv1.MemoryProviderDIMMSlots:
		memoryHotplugArgs = kernelCmdlineDIMMSlots
	case vmv1.MemoryProviderVirtioMem:
		memoryHotplugArgs = fmt.Sprintf(kernelCmdlineVirtioMemTmpl, cfg.autoMovableRatio)
	default:
		panic(fmt.Errorf("unknown memory provider %s", cfg.memoryProvider))
	}

	parts := []string{baseKernelCmdline, memoryHotplugArgs}

	if extraNet := vmSpec.ExtraNetwork; extraNet != nil && extraNet.Enable {
		parts = append(parts, fmt.Sprintf(
			"ip=%s:::%s:%s:eth1:off",
			vmStatus.ExtraNetIP, vmStatus.ExtraNetMask, vmStatus.PodName,
		))
	}

	if hostname != "" {
		parts = append(parts, fmt.Sprintf("hostname=%s", hostname))
	}

	if extraArgs := cfg.appendKernelCmdline; extraArgs != "" {
		parts = append(parts, extraArgs)
	}

	return strings.Join(parts, " ")
}
// runQEMU starts QEMU with the given arguments and blocks until it exits.
//
// Around the QEMU process it also:
//   - (when cgroup management is neither skipped nor delegated) determines a cgroup path for
//     QEMU, sets its initial CPU limit from vmSpec.Guest.CPUs.Use and runs QEMU via cgexec
//   - starts terminateQemuOnSigterm, which asks QEMU to power down on SIGTERM / interrupt
//   - starts the CPU HTTP server (listenForCPUChanges) with callbacks chosen from cfg
//   - starts forwardLogs
//
// Once QEMU has exited, the background goroutines are canceled and waited for.
// The environment variable K8S_POD_NAME must be set.
//
// Returns an error if setup fails or if running QEMU returned an error.
func runQEMU(
cfg *Config,
logger *zap.Logger,
vmSpec *vmv1.VirtualMachineSpec,
qemuCmd []string,
) error {
selfPodName, ok := os.LookupEnv("K8S_POD_NAME")
if !ok {
return fmt.Errorf("environment variable K8S_POD_NAME missing")
}
// cgroupPath stays empty unless this process manages QEMU's cgroup itself.
var cgroupPath string
if !cfg.skipCgroupManagement && !cfg.delegatedCgroup {
selfCgroupPath, err := getSelfCgroupPath(logger)
if err != nil {
return fmt.Errorf("Failed to get self cgroup path: %w", err)
}
// Sometimes we'll get just '/' as our cgroup path. If that's the case, we should reset it so
// that the cgroup '/neonvm-qemu-...' still works.
if selfCgroupPath == "/" {
selfCgroupPath = ""
}
// ... but also we should have some uniqueness just in case, so we're not sharing a root level
// cgroup if that *is* what's happening. This *should* only be relevant for local clusters.
//
// We don't want to just use the VM spec's .status.PodName because during migrations that will
// be equal to the source pod, not this one, which may be... somewhat confusing.
cgroupPath = fmt.Sprintf("%s/neonvm-qemu-%s", selfCgroupPath, selfPodName)
logger.Info("Determined QEMU cgroup path", zap.String("path", cgroupPath))
useCPU := vmSpec.Guest.CPUs.Use
if err := setCgroupLimit(logger, useCPU, cgroupPath); err != nil {
return fmt.Errorf("Failed to set cgroup limit: %w", err)
}
}
// ctx is canceled after QEMU exits; wg tracks the background goroutines started below so
// that we can wait for them before returning.
ctx, cancel := context.WithCancel(context.Background())
wg := sync.WaitGroup{}
wg.Add(1)
go terminateQemuOnSigterm(ctx, logger, &wg)
// The CPU server is only skipped when cgroup management is skipped AND neither the dummy
// server nor delegation to neonvm-daemon was requested.
if !cfg.skipCgroupManagement || cfg.enableDummyCPUServer || cfg.delegatedCgroup {
var callbacks cpuServerCallbacks
if cfg.enableDummyCPUServer {
// Dummy implementation: remember the last value that was set and report it back,
// without applying it anywhere. The initial value is vmSpec.Guest.CPUs.Min.
lastValue := &atomic.Uint32{}
lastValue.Store(uint32(vmSpec.Guest.CPUs.Min))
callbacks = cpuServerCallbacks{
get: func(logger *zap.Logger) (*vmv1.MilliCPU, error) {
return lo.ToPtr(vmv1.MilliCPU(lastValue.Load())), nil
},
set: func(logger *zap.Logger, cpu vmv1.MilliCPU) error {
lastValue.Store(uint32(cpu))
return nil
},
}
} else if cfg.delegatedCgroup {
// cgroup IS delegated -- we're not handling it, and instead need to pass it off to
// neonvm-daemon inside the VM.
callbacks = cpuServerCallbacks{
get: func(logger *zap.Logger) (*vmv1.MilliCPU, error) {
return getNeonvmDaemonCPU()
},
set: func(logger *zap.Logger, cpu vmv1.MilliCPU) error {
return setNeonvmDaemonCPU(cpu)
},
}
} else {
// Standard implementation -- we're handling it ourselves, and QEMU is running in a
// local cgroup.
callbacks = cpuServerCallbacks{
get: func(logger *zap.Logger) (*vmv1.MilliCPU, error) {
return getCgroupQuota(cgroupPath)
},
set: func(logger *zap.Logger, cpu vmv1.MilliCPU) error {
return setCgroupLimit(logger, cpu, cgroupPath)
},
}
}
wg.Add(1)
go listenForCPUChanges(ctx, logger, vmSpec.RunnerPort, callbacks, &wg)
}
wg.Add(1)
go forwardLogs(ctx, logger, &wg)
// Same condition as the cgroup setup above: when we manage the cgroup, QEMU is started
// inside it through cgexec; otherwise QEMU is executed directly.
var bin string
var cmd []string
if !cfg.skipCgroupManagement && !cfg.delegatedCgroup {
bin = "cgexec"
cmd = append([]string{"-g", fmt.Sprintf("cpu:%s", cgroupPath), QEMU_BIN}, qemuCmd...)
} else {
bin = QEMU_BIN
cmd = qemuCmd
}
logger.Info(fmt.Sprintf("calling %s", bin), zap.Strings("args", cmd))
// Blocks until the command has finished.
err := execFg(bin, cmd...)
if err != nil {
msg := "QEMU exited with error" // TODO: technically this might not be accurate. This can also happen if it fails to start.
logger.Error(msg, zap.Error(err))
err = fmt.Errorf("%s: %w", msg, err)
} else {
logger.Info("QEMU exited without error")
}
// Stop the background goroutines and wait for them before reporting the result.
cancel()
wg.Wait()
return err
}
// handleCPUChange serves a request to change the CPU value.
//
// The request must be a POST whose body is a JSON-encoded api.VCPUChange; the requested
// value is handed to set. Responses carry only a status code:
//   - 400 for a wrong method, an unreadable body, or a body that does not parse
//   - 500 if set returns an error
//   - 200 on success
func handleCPUChange(
	logger *zap.Logger,
	w http.ResponseWriter,
	r *http.Request,
	set func(*zap.Logger, vmv1.MilliCPU) error,
) {
	if r.Method != http.MethodPost {
		logger.Error("unexpected method", zap.String("method", r.Method))
		w.WriteHeader(http.StatusBadRequest)
		return
	}

	payload, err := io.ReadAll(r.Body)
	if err != nil {
		logger.Error("could not read body", zap.Error(err))
		w.WriteHeader(http.StatusBadRequest)
		return
	}

	var change api.VCPUChange
	if err := json.Unmarshal(payload, &change); err != nil {
		logger.Error("could not parse body", zap.Error(err))
		w.WriteHeader(http.StatusBadRequest)
		return
	}

	// apply the requested value through the callback
	logger.Info("got CPU update", zap.Float64("CPU", change.VCPUs.AsFloat64()))
	if err := set(logger, change.VCPUs); err != nil {
		logger.Error("could not set cgroup limit", zap.Error(err))
		w.WriteHeader(http.StatusInternalServerError)
		return
	}

	w.WriteHeader(http.StatusOK)
}
// handleCPUCurrent serves a request for the current CPU value.
//
// The request must be a GET. The value returned by get is written back as a JSON-encoded
// api.VCPUCgroup. Failure responses carry only a status code: 400 for a wrong method, 500 if
// get fails or the response cannot be marshaled.
func handleCPUCurrent(
	logger *zap.Logger,
	w http.ResponseWriter,
	r *http.Request,
	get func(*zap.Logger) (*vmv1.MilliCPU, error),
) {
	if r.Method != http.MethodGet {
		logger.Error("unexpected method", zap.String("method", r.Method))
		w.WriteHeader(http.StatusBadRequest)
		return
	}

	current, err := get(logger)
	if err != nil {
		logger.Error("could not get cgroup quota", zap.Error(err))
		w.WriteHeader(http.StatusInternalServerError)
		return
	}

	payload, err := json.Marshal(api.VCPUCgroup{VCPUs: *current})
	if err != nil {
		logger.Error("could not marshal body", zap.Error(err))
		w.WriteHeader(http.StatusInternalServerError)
		return
	}

	w.Header().Add("Content-Type", "application/json")
	// Not much to do with a write error here, so it is deliberately dropped. TODO: log it?
	_, _ = w.Write(payload)
}
// cpuServerCallbacks holds the two operations behind the CPU HTTP server started by
// listenForCPUChanges, so that the same server can be backed by different implementations.
type cpuServerCallbacks struct {
// get returns the current CPU value; it is called for requests to /cpu_current.
get func(*zap.Logger) (*vmv1.MilliCPU, error)
// set applies a new CPU value; it is called for requests to /cpu_change.
set func(*zap.Logger, vmv1.MilliCPU) error
}
// listenForCPUChanges runs an HTTP server on 0.0.0.0:port with two endpoints:
//
//   - /cpu_change, handled by handleCPUChange using callbacks.set
//   - /cpu_current, handled by handleCPUCurrent using callbacks.get
//
// It blocks until either the server stops on its own or ctx is canceled, in which case the
// server is shut down. wg.Done() is called on return.
//
// If the server exits with an error other than http.ErrServerClosed, the process is
// terminated via logger.Fatal.
func listenForCPUChanges(
	ctx context.Context,
	logger *zap.Logger,
	port int32,
	callbacks cpuServerCallbacks,
	wg *sync.WaitGroup,
) {
	defer wg.Done()
	mux := http.NewServeMux()
	loggerHandlers := logger.Named("http-handlers")
	cpuChangeLogger := loggerHandlers.Named("cpu_change")
	mux.HandleFunc("/cpu_change", func(w http.ResponseWriter, r *http.Request) {
		handleCPUChange(cpuChangeLogger, w, r, callbacks.set)
	})
	cpuCurrentLogger := loggerHandlers.Named("cpu_current")
	mux.HandleFunc("/cpu_current", func(w http.ResponseWriter, r *http.Request) {
		handleCPUCurrent(cpuCurrentLogger, w, r, callbacks.get)
	})
	server := http.Server{
		Addr:              fmt.Sprintf("0.0.0.0:%d", port),
		Handler:           mux,
		ReadTimeout:       5 * time.Second,
		ReadHeaderTimeout: 5 * time.Second,
		WriteTimeout:      5 * time.Second,
	}

	// Buffered with capacity 1 so that the serving goroutine can always deliver its result and
	// exit. With an unbuffered channel it would block forever on the send whenever we leave
	// through the ctx.Done() branch below, because nobody receives from errChan after that.
	errChan := make(chan error, 1)
	go func() {
		errChan <- server.ListenAndServe()
	}()

	select {
	case err := <-errChan:
		if errors.Is(err, http.ErrServerClosed) {
			logger.Info("cpu_change server closed")
		} else if err != nil {
			logger.Fatal("cpu_change exited with error", zap.Error(err))
		}
	case <-ctx.Done():
		err := server.Shutdown(context.Background())
		logger.Info("shut down cpu_change server", zap.Error(err))
	}
}
// printWithNewline writes slice to stdout, followed by a newline if slice does not already
// end with one. An empty slice produces no output at all.
//
// The first write error encountered is returned.
func printWithNewline(slice []byte) error {
	if len(slice) == 0 {
		return nil
	}

	if _, err := os.Stdout.Write(slice); err != nil {
		return err
	}

	if last := slice[len(slice)-1]; last != '\n' {
		_, err := os.Stdout.WriteString("\n")
		return err
	}
	return nil
}
// drainLogsReader copies data from reader to stdout, one newline-delimited chunk at a time,
// until reading or printing fails.
//
// Hitting the read deadline is the normal way for a drain to end and yields a nil error.
// Any other failure is logged (EOF as a warning, everything else as an error) and returned.
func drainLogsReader(reader *bufio.Reader, logger *zap.Logger) error {
	for {
		// ReadSlice returns at most bufferedReaderSize bytes. When it fails, the chunk may
		// lack the trailing '\n'; printWithNewline adds it in that case.
		chunk, readErr := reader.ReadSlice('\n')
		printErr := printWithNewline(chunk)

		err := errors.Join(readErr, printErr)
		switch {
		case err == nil:
			continue
		case errors.Is(err, os.ErrDeadlineExceeded):
			return nil
		case errors.Is(err, io.EOF):
			logger.Warn("EOF while reading from log serial")
		default:
			logger.Error("failed to read from log serial", zap.Error(err))
		}
		return err
	}
}
// forwardLogs writes from socket to stdout line by line.
//
// It repeatedly connects to logSerialSocket (if not already connected) and drains it to
// stdout for up to `delay` at a time. After a failed attempt, the wait before the next one
// grows with a backoff; after a successful read session the backoff is reset.
//
// When ctx is canceled, the connection is closed, whatever is still buffered is drained, and
// the function returns. wg.Done() is called on return.
func forwardLogs(ctx context.Context, logger *zap.Logger, wg *sync.WaitGroup) {
	defer wg.Done()
	delay := 3 * time.Second
	var conn net.Conn
	var reader *bufio.Reader
	b := &backoff.Backoff{
		Min:    100 * time.Millisecond,
		Max:    delay,
		Factor: 2,
		Jitter: true,
	}

	// Wait a bit to reduce the chance we attempt dialing before
	// QEMU is started
	select {
	case <-time.After(200 * time.Millisecond):
	case <-ctx.Done():
		logger.Warn("QEMU shut down too soon to start forwarding logs")
	}

	// dropConn closes and forgets the current connection so that the next iteration dials a
	// fresh one. Without the Close, every failed connection would leak a file descriptor.
	// The reader is forgotten too: it is only dropped after a drain that ended with an error,
	// at which point ReadSlice has already handed out everything that was buffered.
	dropConn := func() {
		if conn != nil {
			_ = conn.Close() // the connection is already considered broken
			conn = nil
		}
		reader = nil
	}

	for {
		func() {
			if conn == nil {
				var err error
				conn, err = net.Dial("unix", logSerialSocket)
				if err != nil {
					logger.Error("failed to dial to logSerialSocket", zap.Error(err))
					return
				}
				reader = bufio.NewReaderSize(conn, bufferedReaderSize)
			}
			b.Attempt()
			err := conn.SetReadDeadline(time.Now().Add(delay))
			if err != nil {
				logger.Error("failed to set read deadline", zap.Error(err))
				dropConn()
				return
			}
			err = drainLogsReader(reader, logger)
			// drainLogsReader reports a hit deadline as a nil error, so nil is what a
			// successful reading session looks like. The explicit deadline check is kept in
			// case the error is ever passed through unchanged.
			if err == nil || errors.Is(err, os.ErrDeadlineExceeded) {
				b.Reset()
				return
			}
			dropConn()
		}()

		select {
		case <-ctx.Done():
			if conn != nil {
				conn.Close()
			}
			if reader != nil {
				_ = drainLogsReader(reader, logger)
			}
			return
		case <-time.After(b.Duration()):
		}
	}
}
// getSelfCgroupPath returns the path of the cgroup this process is running in, as listed in
// /proc/self/cgroup. The cgroup v1 hierarchy with the 'cpu' controller is preferred; the
// cgroup v2 (unified) entry is used only if there is no such v1 entry.
//
// An error is returned if the file cannot be read, if a line does not have three
// colon-delimited fields, or if there is not exactly one applicable entry.
func getSelfCgroupPath(logger *zap.Logger) (string, error) {
	// There's some fun stuff here. For general information, refer to `man 7 cgroups` - specifically
	// the section titled "/proc files" - for "/proc/cgroups" and "/proc/pid/cgroup".
	//
	// In general, the idea is this: If we start QEMU outside of the cgroup for the container we're
	// running in, we run into multiple problems - it won't show up in metrics, and we'll have to
	// clean up the cgroup ourselves. (not good!).
	//
	// So we'd like to start it in the same cgroup - the question is just how to find the name of
	// the cgroup we're running in. Thankfully, this is visible in `/proc/self/cgroup`!
	// The only difficulty is the file format.
	//
	// In cgroup v1 (which is what we have on EKS [as of 2023-07]), the contents of
	// /proc/<pid>/cgroup tend to look like:
	//
	//   11:cpuset:/path/to/cgroup
	//   10:perf_event:/path/to/cgroup
	//   9:hugetlb:/path/to/cgroup
	//   8:blkio:/path/to/cgroup
	//   7:pids:/path/to/cgroup
	//   6:freezer:/path/to/cgroup
	//   5:memory:/path/to/cgroup
	//   4:net_cls,net_prio:/path/to/cgroup
	//   3:cpu,cpuacct:/path/to/cgroup
	//   2:devices:/path/to/cgroup
	//   1:name=systemd:/path/to/cgroup
	//
	// For cgroup v2, we have:
	//
	//   0::/path/to/cgroup
	//
	// The file format is defined to have 3 fields, separated by colons. The first field gives the
	// Hierarchy ID, which is guaranteed to be 0 if the cgroup is part of a cgroup v2 ("unified")
	// hierarchy.
	// The second field is a comma-separated list of the controllers. Or, if it's cgroup v2, nothing.
	// The third field is the "pathname" of the cgroup *in its hierarchy*, relative to the mount
	// point of the hierarchy.
	//
	// So we're looking for EITHER:
	//  1. an entry like '<N>:<controller...>,cpu,<controller...>:/path/to/cgroup' (cgroup v1); OR
	//  2. an entry like '0::/path/to/cgroup', and we'll return the path (cgroup v2)
	// We primarily care about the 'cpu' controller, so for cgroup v1, we'll search for that instead
	// of e.g. "name=systemd", although it *really* shouldn't matter because the paths will be the
	// same anyways.
	//
	// Now: Technically it's possible to run a "hybrid" system with both cgroup v1 and v2
	// hierarchies. If this is the case, it's possible for /proc/self/cgroup to show *some* v1
	// hierarchies attached, in addition to the v2 "unified" hierarchy, for the same cgroup. To
	// handle this, we should look for a cgroup v1 "cpu" controller, and if we can't find it, try
	// for the cgroup v2 unified entry.
	//
	// As far as I (@sharnoff) can tell, the only case where that might actually get messed up is if
	// the CPU controller isn't available for the cgroup we're running in, in which case there's
	// nothing we can do about it! (other than e.g. using a cgroup higher up the chain, which would
	// be really bad tbh).

	// ---
	// On to the show!

	procSelfCgroupContents, err := os.ReadFile("/proc/self/cgroup")
	if err != nil {
		return "", fmt.Errorf("failed to read /proc/self/cgroup: %w", err)
	}
	logger.Info("Read /proc/self/cgroup", zap.String("contents", string(procSelfCgroupContents)))

	// Collect all candidate paths from the lines of the file. If there isn't exactly one,
	// something's wrong and we should make an error.
	var v1Candidates []string
	var v2Candidates []string
	for lineno, line := range strings.Split(string(procSelfCgroupContents), "\n") {
		if line == "" {
			continue
		}

		// Split into the three ':'-delimited fields. Only the first two colons are separators:
		// the third field is a path and may itself contain ':', so everything after the second
		// colon is kept together rather than being rejected as "too many fields".
		fields := strings.SplitN(line, ":", 3)
		if len(fields) != 3 {
			return "", fmt.Errorf("line %d of /proc/self/cgroup did not have 3 colon-delimited fields", lineno+1)
		}
		id := fields[0]
		controllers := fields[1]
		path := fields[2]

		if id == "0" {
			v2Candidates = append(v2Candidates, path)
			continue
		}

		// It's not cgroup v2, otherwise id would have been 0. So, check if the comma-separated list
		// of controllers contains 'cpu' as an entry.
		for _, c := range strings.Split(controllers, ",") {
			if c == "cpu" {
				v1Candidates = append(v1Candidates, path)
				break // ... and then continue to the next loop iteration
			}
		}
	}

	var errMsg string
	// Check v1, then v2
	if len(v1Candidates) == 1 {
		return v1Candidates[0], nil
	} else if len(v1Candidates) != 0 {
		errMsg = "More than one applicable cgroup v1 entry in /proc/self/cgroup"
	} else if len(v2Candidates) == 1 {
		return v2Candidates[0], nil
	} else if len(v2Candidates) != 0 {
		errMsg = "More than one applicable cgroup v2 entry in /proc/self/cgroup"
	} else {
		errMsg = "Couldn't find applicable entry in /proc/self/cgroup"
	}
	return "", errors.New(errMsg)
}
func setCgroupLimit(logger *zap.Logger, r vmv1.MilliCPU, cgroupPath string) error {
r *= cpuLimitOvercommitFactor
isV2 := cgroups.Mode() == cgroups.Unified
period := cgroupPeriod
// quota may be greater than period if the cgroup is allowed
// to use more than 100% of a CPU.
quota := int64(float64(r) / float64(1000) * float64(cgroupPeriod))
logger.Info(fmt.Sprintf("setting cgroup CPU limit %v %v", quota, period))
if isV2 {
resources := cgroup2.Resources{
CPU: &cgroup2.CPU{
Max: cgroup2.NewCPUMax("a, &period),
},
}
_, err := cgroup2.NewManager(cgroupMountPoint, cgroupPath, &resources)
if err != nil {
return err
}
} else {
_, err := cgroup1.New(cgroup1.StaticPath(cgroupPath), &specs.LinuxResources{
CPU: &specs.LinuxCPU{
Quota: "a,
Period: &period,
},
})
if err != nil {
return err
}
}
return nil
}
// getCgroupQuota reads the CPU quota currently set on the cgroup at cgroupPath and converts
// it back into milli-CPUs, undoing the scaling by cpuLimitOvercommitFactor that
// setCgroupLimit applies.
//
// The quota is read from cpu.max (cgroup v2) or cpu.cfs_quota_us (cgroup v1); in both cases
// it is the first whitespace-separated field of the file.
//
// An error is returned if the file cannot be read, is empty, or its first field is not an
// unsigned integer.
// NOTE(review): an unlimited cgroup presumably reports "max" (v2) or "-1" (v1), which is
// rejected here as a parse error - confirm that this is the desired behavior.
func getCgroupQuota(cgroupPath string) (*vmv1.MilliCPU, error) {
	isV2 := cgroups.Mode() == cgroups.Unified
	var path string
	if isV2 {
		path = filepath.Join(cgroupMountPoint, cgroupPath, "cpu.max")
	} else {
		path = filepath.Join(cgroupMountPoint, "cpu", cgroupPath, "cpu.cfs_quota_us")
	}
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}

	// strings.Fields yields no fields at all for empty or whitespace-only content, so the
	// check below really does catch an empty file (strings.Split never returns an empty
	// slice, which made the equivalent check unreachable).
	fields := strings.Fields(string(data))
	if len(fields) == 0 {
		return nil, errors.New("unexpected cgroup data")
	}
	quota, err := strconv.ParseUint(fields[0], 10, 64)
	if err != nil {
		return nil, err
	}

	cpu := vmv1.MilliCPU(uint32(quota * 1000 / cgroupPeriod))
	cpu /= cpuLimitOvercommitFactor
	return &cpu, nil
}
// getNeonvmDaemonCPU asks neonvm-daemon inside the VM for the current CPU value.
//
// It sends a GET request to port 25183, path /cpu, on the VM's IP address (derived from
// defaultNetworkCIDR) with a one-second timeout, and parses the response body as a decimal
// integer number of milli-CPUs.
func getNeonvmDaemonCPU() (*vmv1.MilliCPU, error) {
	_, vmIP, _, err := calcIPs(defaultNetworkCIDR)
	if err != nil {
		return nil, fmt.Errorf("could not calculate VM IP address: %w", err)
	}

	reqCtx, cancel := context.WithTimeout(context.TODO(), time.Second)
	defer cancel()

	endpoint := fmt.Sprintf("http://%s:25183/cpu", vmIP)
	request, err := http.NewRequestWithContext(reqCtx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("could not build request: %w", err)
	}

	response, err := http.DefaultClient.Do(request)
	if err != nil {
		return nil, fmt.Errorf("could not send request: %w", err)
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("neonvm-daemon responded with status %d", response.StatusCode)
	}

	raw, err := io.ReadAll(response.Body)
	if err != nil {
		return nil, fmt.Errorf("could not read response body: %w", err)
	}

	parsed, err := strconv.Atoi(string(raw))
	if err != nil {
		return nil, fmt.Errorf("could not parse response body: %w", err)
	}

	result := vmv1.MilliCPU(parsed)
	return &result, nil
}
// setNeonvmDaemonCPU asks neonvm-daemon inside the VM to apply a new CPU value.
//
// It sends a PUT request to port 25183, path /cpu, on the VM's IP address (derived from
// defaultNetworkCIDR) with a one-second timeout. The request body is cpu formatted as a
// decimal integer. Any response status other than 200 is reported as an error.
func setNeonvmDaemonCPU(cpu vmv1.MilliCPU) error {
	_, vmIP, _, err := calcIPs(defaultNetworkCIDR)
	if err != nil {
		return fmt.Errorf("could not calculate VM IP address: %w", err)
	}

	reqCtx, cancel := context.WithTimeout(context.TODO(), time.Second)
	defer cancel()

	endpoint := fmt.Sprintf("http://%s:25183/cpu", vmIP)
	payload := fmt.Sprintf("%d", uint32(cpu))
	request, err := http.NewRequestWithContext(reqCtx, http.MethodPut, endpoint, bytes.NewReader([]byte(payload)))
	if err != nil {
		return fmt.Errorf("could not build request: %w", err)
	}

	response, err := http.DefaultClient.Do(request)
	if err != nil {
		return fmt.Errorf("could not send request: %w", err)
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		return fmt.Errorf("neonvm-daemon responded with status %d", response.StatusCode)
	}
	return nil
}
// terminateQemuOnSigterm waits for SIGTERM or an interrupt signal and then sends the
// system_powerdown command to QEMU over the QMP unix socket.
//
// If ctx is canceled first, it returns without contacting QEMU. Failures to talk to QEMU are
// logged, not returned. wg.Done() is called on return.
func terminateQemuOnSigterm(ctx context.Context, logger *zap.Logger, wg *sync.WaitGroup) {
	defer wg.Done()
	logger = logger.Named("terminate-qemu-on-sigterm")

	logger.Info("watching OS signals")
	// Capacity 1, so that delivering the signal never blocks on us.
	signals := make(chan os.Signal, 1)
	signal.Notify(signals, os.Interrupt, syscall.SIGTERM)

	select {
	case <-ctx.Done():
		logger.Info("context canceled, not going to powerdown QEMU because it's already finished")
		return
	case <-signals:
	}

	logger.Info("got signal, sending powerdown command to QEMU")

	monitor, err := qmp.NewSocketMonitor("unix", qmpUnixSocketForSigtermHandler, 2*time.Second)
	if err != nil {
		logger.Error("failed to connect to QEMU monitor", zap.Error(err))
		return
	}
	if err := monitor.Connect(); err != nil {
		logger.Error("failed to start monitor connection", zap.Error(err))
		return
	}
	defer monitor.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?

	if _, err := monitor.Run([]byte(`{"execute": "system_powerdown"}`)); err != nil {
		logger.Error("failed to execute system_powerdown command", zap.Error(err))
		return
	}
	logger.Info("system_powerdown command sent to QEMU")
}
// calcIPs derives two addresses from an IPv4 network given in CIDR notation: the network
// address plus one and the network address plus two (only the last octet is incremented).
// It returns those two addresses, in that order, followed by the network mask.
//
// An error is returned if cidr cannot be parsed or does not denote an IPv4 network.
func calcIPs(cidr string) (net.IP, net.IP, net.IPMask, error) {
	_, ipv4Net, err := net.ParseCIDR(cidr)
	if err != nil {
		return nil, nil, nil, err
	}

	// To4 returns nil for anything that is not an IPv4 address; indexing into the copies
	// below would panic in that case, so reject it up front.
	ip0 := ipv4Net.IP.To4()
	if ip0 == nil {
		return nil, nil, nil, fmt.Errorf("CIDR %q is not an IPv4 network", cidr)
	}
	mask := ipv4Net.Mask

	// Copy before incrementing, so that the addresses don't share a backing array.
	ip1 := append(net.IP{}, ip0...)
	ip1[3]++
	ip2 := append(net.IP{}, ip1...)
	ip2[3]++

	return ip1, ip2, mask, nil
}
// execBg starts the named program with the given arguments, wiring its stdout and stderr to
// ours, and returns without waiting for it to finish. The returned error is the one from
// starting the command, if any.
//
//lint:ignore U1000 the function is not in use right now, but it's good to have for the future
func execBg(name string, arg ...string) error {
	command := exec.Command(name, arg...)
	command.Stdout, command.Stderr = os.Stdout, os.Stderr
	return command.Start()
}
// execFg runs the named program with the given arguments, wiring its stdout and stderr to
// ours, and waits for it to finish. The returned error is the one from running the command,
// if any.
func execFg(name string, arg ...string) error {
	command := exec.Command(name, arg...)
	command.Stdout, command.Stderr = os.Stdout, os.Stderr
	return command.Run()
}
func defaultNetwork(logger *zap.Logger, cidr string, ports []vmv1.Port) (mac.MAC, error) {
// gerenare random MAC for default Guest interface
mac, err := mac.GenerateRandMAC()
if err != nil {
logger.Fatal("could not generate random MAC", zap.Error(err))
return nil, err
}
// create an configure linux bridge
logger.Info("setup bridge interface", zap.String("name", defaultNetworkBridgeName))
bridge := &netlink.Bridge{
LinkAttrs: netlink.LinkAttrs{
Name: defaultNetworkBridgeName,
},
}
if err := netlink.LinkAdd(bridge); err != nil {
logger.Fatal("could not create bridge interface", zap.Error(err))
return nil, err
}
ipPod, ipVm, mask, err := calcIPs(cidr)
if err != nil {
logger.Fatal("could not parse IP", zap.Error(err))
return nil, err
}
bridgeAddr := &netlink.Addr{
IPNet: &net.IPNet{
IP: ipPod,
Mask: mask,
},
}
if err := netlink.AddrAdd(bridge, bridgeAddr); err != nil {
logger.Fatal("could not parse IP", zap.Error(err))
return nil, err
}
if err := netlink.LinkSetUp(bridge); err != nil {
logger.Fatal("could not set up bridge", zap.Error(err))
return nil, err
}
// create an configure TAP interface
if !checkDevTun() {
logger.Info("create /dev/net/tun")
if err := execFg("mkdir", "-p", "/dev/net"); err != nil {
return nil, err
}
if err := execFg("mknod", "/dev/net/tun", "c", "10", "200"); err != nil {
return nil, err
}
if err := execFg("chown", "qemu:kvm", "/dev/net/tun"); err != nil {
return nil, err
}
}
logger.Info("setup tap interface", zap.String("name", defaultNetworkTapName))
tap := &netlink.Tuntap{
LinkAttrs: netlink.LinkAttrs{
Name: defaultNetworkTapName,
},
Mode: netlink.TUNTAP_MODE_TAP,
Flags: netlink.TUNTAP_DEFAULTS,
}
if err := netlink.LinkAdd(tap); err != nil {
logger.Error("could not add tap device", zap.Error(err))
return nil, err
}
if err := netlink.LinkSetMaster(tap, bridge); err != nil {
logger.Error("could not set up tap as master", zap.Error(err))
return nil, err
}
if err := netlink.LinkSetUp(tap); err != nil {
logger.Error("could not set up tap device", zap.Error(err))
return nil, err
}
// setup masquerading outgoing (from VM) traffic
logger.Info("setup masquerading for outgoing traffic")
if err := execFg("iptables", "-t", "nat", "-A", "POSTROUTING", "-o", "eth0", "-j", "MASQUERADE"); err != nil {
logger.Error("could not setup masquerading for outgoing traffic", zap.Error(err))
return nil, err
}
// pass incoming traffic to .Guest.Spec.Ports into VM
var iptablesArgs []string
for _, port := range ports {
logger.Info(fmt.Sprintf("setup DNAT rule for incoming traffic to port %d", port.Port))
iptablesArgs = []string{
"-t", "nat", "-A", "PREROUTING",
"-i", "eth0", "-p", fmt.Sprint(port.Protocol), "--dport", fmt.Sprint(port.Port),
"-j", "DNAT", "--to", fmt.Sprintf("%s:%d", ipVm.String(), port.Port),
}
if err := execFg("iptables", iptablesArgs...); err != nil {
logger.Error("could not set up DNAT rule for incoming traffic", zap.Error(err))
return nil, err
}
logger.Info(fmt.Sprintf("setup DNAT rule for traffic originating from localhost to port %d", port.Port))
iptablesArgs = []string{
"-t", "nat", "-A", "OUTPUT",
"-m", "addrtype", "--src-type", "LOCAL", "--dst-type", "LOCAL",
"-p", fmt.Sprint(port.Protocol), "--dport", fmt.Sprint(port.Port),
"-j", "DNAT", "--to-destination", fmt.Sprintf("%s:%d", ipVm.String(), port.Port),
}
if err := execFg("iptables", iptablesArgs...); err != nil {
logger.Error("could not set up DNAT rule for traffic from localhost", zap.Error(err))
return nil, err
}
logger.Info(fmt.Sprintf("setup ACCEPT rule for traffic originating from localhost to port %d", port.Port))
iptablesArgs = []string{
"-A", "OUTPUT",
"-s", "127.0.0.1", "-d", ipVm.String(),
"-p", fmt.Sprint(port.Protocol), "--dport", fmt.Sprint(port.Port),
"-j", "ACCEPT",
}
if err := execFg("iptables", iptablesArgs...); err != nil {
logger.Error("could not set up ACCEPT rule for traffic from localhost", zap.Error(err))
return nil, err
}
}
logger.Info("setup MASQUERADE rule for traffic originating from localhost")
iptablesArgs = []string{
"-t", "nat", "-A", "POSTROUTING",
"-m", "addrtype", "--src-type", "LOCAL", "--dst-type", "UNICAST",
"-j", "MASQUERADE",
}
if err := execFg("iptables", iptablesArgs...); err != nil {
logger.Error("could not set up MASQUERADE rule for traffic from localhost", zap.Error(err))
return nil, err
}
// get dns details from /etc/resolv.conf
resolvConf, err := getResolvConf()
if err != nil {
logger.Error("could not get DNS details", zap.Error(err))
return nil, err
}
dns := getNameservers(resolvConf.Content, types.IP)[0]
dnsSearch := strings.Join(getSearchDomains(resolvConf.Content), ",")
// prepare dnsmask command line (instead of config file)
logger.Info("run dnsmasq for interface", zap.String("name", defaultNetworkBridgeName))
dnsMaskCmd := []string{
// No DNS, DHCP only
"--port=0",
// Because we don't provide DNS, no need to load resolv.conf. This helps to
// avoid "dnsmasq: failed to create inotify: No file descriptors available"
// errors.
"--no-resolv",
"--bind-interfaces",
"--dhcp-authoritative",
fmt.Sprintf("--interface=%s", defaultNetworkBridgeName),
fmt.Sprintf("--dhcp-range=%s,static,%d.%d.%d.%d", ipVm.String(), mask[0], mask[1], mask[2], mask[3]),
fmt.Sprintf("--dhcp-host=%s,%s,infinite", mac.String(), ipVm.String()),
fmt.Sprintf("--dhcp-option=option:router,%s", ipPod.String()),
fmt.Sprintf("--dhcp-option=option:dns-server,%s", dns),
fmt.Sprintf("--dhcp-option=option:domain-search,%s", dnsSearch),
fmt.Sprintf("--shared-network=%s,%s", defaultNetworkBridgeName, ipVm.String()),
}
// run dnsmasq for default Guest interface
if err := execFg("dnsmasq", dnsMaskCmd...); err != nil {
logger.Error("could not run dnsmasq", zap.Error(err))
return nil, err
}
// Adding VM's IP address to the /etc/hosts, so we can access it easily from
// the pod. This is particularly useful for ssh into the VM from the runner
// pod.
f, err := os.OpenFile("/etc/hosts", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
return nil, err
}
defer f.Close()
record := fmt.Sprintf("%v guest-vm\n", ipVm)
if _, err := f.WriteString(record); err != nil {
return nil, err
}
return mac, nil
}
// overlayNetwork prepares the networking for the overlay Guest interface.
//
// It creates a bridge (with learning disabled) and a TAP device, attaches the
// TAP device to the bridge, removes every IPv4 address from the existing
// interface named by iface, and finally attaches that interface to the bridge
// as well. On success it returns a freshly generated random MAC address.
//
// Any failing step aborts the setup and returns the underlying error as-is;
// links created by earlier steps are left in place.
func overlayNetwork(iface string) (mac.MAC, error) {
	// random MAC for the overlay Guest interface
	guestMAC, err := mac.GenerateRandMAC()
	if err != nil {
		return nil, err
	}

	// linux bridge: create it, then bring it up
	bridgeAttrs := netlink.LinkAttrs{
		Name:     overlayNetworkBridgeName,
		Protinfo: &netlink.Protinfo{Learning: false},
	}
	bridge := &netlink.Bridge{LinkAttrs: bridgeAttrs}
	if err = netlink.LinkAdd(bridge); err != nil {
		return nil, err
	}
	if err = netlink.LinkSetUp(bridge); err != nil {
		return nil, err
	}

	// TAP interface: create it, attach it to the bridge, then bring it up
	tap := &netlink.Tuntap{
		LinkAttrs: netlink.LinkAttrs{Name: overlayNetworkTapName},
		Mode:      netlink.TUNTAP_MODE_TAP,
		Flags:     netlink.TUNTAP_DEFAULTS,
	}
	if err = netlink.LinkAdd(tap); err != nil {
		return nil, err
	}
	if err = netlink.LinkSetMaster(tap, bridge); err != nil {
		return nil, err
	}
	if err = netlink.LinkSetUp(tap); err != nil {
		return nil, err
	}

	// the overlay interface itself is attached to the bridge too
	overlayLink, err := netlink.LinkByName(iface)
	if err != nil {
		return nil, err
	}

	// first drop any IPv4 address(es) currently assigned to the overlay interface
	overlayAddrs, err := netlink.AddrList(overlayLink, netlink.FAMILY_V4)
	if err != nil {
		return nil, err
	}
	for i := range overlayAddrs {
		if overlayAddrs[i].IPNet == nil {
			continue
		}
		if err = netlink.AddrDel(overlayLink, &overlayAddrs[i]); err != nil {
			return nil, err
		}
	}

	// ... and only then attach the overlay link to the bridge
	if err = netlink.LinkSetMaster(overlayLink, bridge); err != nil {
		return nil, err
	}

	return guestMAC, nil
}
package main
import (
"archive/tar"
"bytes"
"context"
_ "embed"
"errors"
"flag"
"fmt"
"io"
"log"
"os"
"path/filepath"
"strings"
"text/template"
"github.com/alessio/shellescape"
"github.com/distribution/reference"
cliconfig "github.com/docker/cli/cli/config"
"github.com/docker/docker/api/types"
"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/registry"
"github.com/docker/docker/client"
"github.com/docker/docker/pkg/jsonmessage"
"golang.org/x/term"
"gopkg.in/yaml.v3"
)
// vm-builder --src alpine:3.19 --dst vm-alpine:dev --file vm-alpine.qcow2
// Files embedded into the vm-builder binary at compile time via go:embed.
//
// main() treats every one of these as a text/template: each is rendered with a
// TemplatesContext and added to the in-memory tar archive that is used as the
// docker build context.
var (
// Dockerfile for the image build; stored in the build context as "Dockerfile".
//go:embed files/Dockerfile.img
dockerfileVmBuilder string
//go:embed files/helper.move-bins.sh
scriptMoveBinsHelper string
//go:embed files/vmstart
scriptVmStart string
//go:embed files/inittab
scriptInitTab string
//go:embed files/vmacpi
scriptVmAcpi string
//go:embed files/vmshutdown
scriptVmShutdown string
//go:embed files/vminit
scriptVmInit string
//go:embed files/udev-init.sh
scriptUdevInit string
//go:embed files/cg-setup.sh
scriptCgSetup string
//go:embed files/cg-run.sh
scriptCgRun string
//go:embed files/resize-swap.sh
scriptResizeSwap string
//go:embed files/set-disk-quota.sh
scriptSetDiskQuota string
//go:embed files/vector.yaml
configVector string
//go:embed files/chrony.conf
configChrony string
//go:embed files/sshd_config
configSshd string
)
// Build-time values and command-line flags of vm-builder.
var (
// Version is what the -version flag prints.
// NOTE(review): presumably injected at build time via -ldflags, like NeonvmDaemonImage — confirm.
Version string
// NeonvmDaemonImage is the compiled-in default neonvm-daemon image; per the
// usage message in main it can be set with -ldflags '-X main.NeonvmDaemonImage=...'.
// The -daemon-image flag takes precedence over it when given.
NeonvmDaemonImage string
srcImage = flag.String("src", "", `Docker image used as source for virtual machine disk image: --src=alpine:3.19`)
// When -dst is empty, main falls back to "vm-<src>".
dstImage = flag.String("dst", "", `Docker image with resulting disk image: --dst=vm-alpine:3.19`)
// Passed to the docker build as the DISK_SIZE build argument.
size = flag.String("size", "1G", `Size for disk image: --size=1G`)
outFile = flag.String("file", "", `Save disk image as file: --file=vm-alpine.qcow2`)
specFile = flag.String("spec", "", `File containing additional customization: --spec=spec.yaml`)
quiet = flag.Bool("quiet", false, `Show less output from the docker build process`)
forcePull = flag.Bool("pull", false, `Pull src image even if already present locally`)
version = flag.Bool("version", false, `Print vm-builder version`)
daemonImageFlag = flag.String("daemon-image", "", `Specify the neonvm-daemon image: --daemon-image=neonvm-daemon:dev`)
)
// AddTemplatedFileToTar renders tmplString as a text/template with tmplArgs as
// its data and stores the rendered output in tw under the name filename.
//
// Parse and execution failures are returned wrapped with the filename; a
// failure while writing to the archive is returned as produced by addFileToTar.
func AddTemplatedFileToTar(tw *tar.Writer, tmplArgs any, filename string, tmplString string) error {
	parsed, parseErr := template.New(filename).Parse(tmplString)
	if parseErr != nil {
		return fmt.Errorf("failed to parse template for %q: %w", filename, parseErr)
	}

	rendered := new(bytes.Buffer)
	if execErr := parsed.Execute(rendered, tmplArgs); execErr != nil {
		return fmt.Errorf("failed to execute template for %q: %w", filename, execErr)
	}

	return addFileToTar(tw, filename, rendered.Bytes())
}
// addFileToTar appends a single regular file entry named filename, holding
// contents, to the archive behind tw. Every entry is written with mode 0755.
//
// Errors from writing the header or the body are returned wrapped with the
// filename.
func addFileToTar(tw *tar.Writer, filename string, contents []byte) error {
	hdr := tar.Header{
		Name: filename,
		Mode: 0755, // TODO: shouldn't just set this for everything.
		Size: int64(len(contents)),
	}

	err := tw.WriteHeader(&hdr)
	if err != nil {
		return fmt.Errorf("failed to write tar header for %q: %w", filename, err)
	}

	_, err = tw.Write(contents)
	if err != nil {
		return fmt.Errorf("failed to write file content for %q: %w", filename, err)
	}

	return nil
}
// TemplatesContext is the data passed to every embedded template when main
// renders the docker build context.
type TemplatesContext struct {
// User is the user from the source image config; main defaults it to "root"
// when the image does not set one.
User string
// Entrypoint and Cmd come from the source image config; main shell-escapes
// each element twice before storing them here.
Entrypoint []string
Cmd []string
// Env is the environment from the source image config.
Env []string
// RootDiskImage is the value of the -src flag.
RootDiskImage string
// NeonvmDaemonImage is the -daemon-image flag if given, otherwise the
// compiled-in NeonvmDaemonImage.
NeonvmDaemonImage string
// SpecBuild and SpecMerge are copied from the spec file's 'build' and 'merge'
// fields; both are empty when no spec is given.
SpecBuild string
SpecMerge string
// InittabCommands holds one entry per command in the spec file; nil when no
// spec is given.
InittabCommands []inittabCommand
// ShutdownHook is the spec file's shutdown hook with a tab inserted after
// every newline; empty when no spec is given.
ShutdownHook string
}
// inittabCommand is the template-facing form of one command from the spec file.
type inittabCommand struct {
// SysvInitAction is copied from the command's 'sysvInitAction' field.
SysvInitAction string
// CommandUser is copied from the command's 'user' field.
CommandUser string
// ShellEscapedCommand is the command's shell line, prefixed by main with
// "ulimit -c unlimited; " and then shell-escaped once.
ShellEscapedCommand string
}
// main builds a docker image that contains a VM disk image derived from the
// -src docker image and, when -file is given, extracts that disk image to a
// local file.
//
// The steps, in order:
//  1. parse and validate the flags, and load the optional -spec file;
//  2. load docker credentials and connect to the docker daemon;
//  3. pull the source image unless it is already present locally (or -pull
//     forces a pull);
//  4. assemble the build context as an in-memory tar archive: files listed in
//     the spec, followed by the embedded files rendered as templates;
//  5. run the docker build, tagging the result as -dst;
//  6. if -file is set, create a container from the new image, copy
//     /disk.qcow2 out of it into the requested file, and remove the container.
//
// Any failure terminates the process via log.Fatalln.
func main() {
	flag.Parse()
	var dstIm string

	if *version {
		fmt.Println(Version)
		os.Exit(0)
	}

	if len(*daemonImageFlag) == 0 && len(NeonvmDaemonImage) == 0 {
		log.Println("neonvm-daemon image not set, needs to be explicitly passed in, or compiled with -ldflags '-X main.NeonvmDaemonImage=...'")
		flag.PrintDefaults()
		os.Exit(1)
	}

	// the flag, when given, wins over the compiled-in default
	neonvmDaemonImage := NeonvmDaemonImage
	if len(*daemonImageFlag) != 0 {
		neonvmDaemonImage = *daemonImageFlag
	}

	if len(*srcImage) == 0 {
		log.Println("-src not set, see usage info:")
		flag.PrintDefaults()
		os.Exit(1)
	}
	if len(*dstImage) == 0 {
		dstIm = fmt.Sprintf("vm-%s", *srcImage)
		log.Printf("-dst not set, using %s\n", dstIm)
	} else {
		dstIm = *dstImage
	}

	var spec *imageSpec
	if *specFile != "" {
		var err error
		spec, err = readImageSpec(*specFile)
		if err != nil {
			log.Fatalln(err)
		}
	}

	log.Println("Load docker credentials")
	dockerConfig, err := cliconfig.Load("" /* auto-detect right directory */)
	if err != nil {
		log.Fatalln(err)
	}
	credentials, err := dockerConfig.GetAllCredentials()
	if err != nil {
		log.Fatalln(err)
	}

	authConfigs := make(map[string]registry.AuthConfig)
	for key, value := range credentials {
		log.Printf("Found docker credentials for %s", key)
		authConfigs[key] = registry.AuthConfig(value)
	}

	ctx := context.Background()

	log.Println("Setup docker connection")
	cli, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation())
	if err != nil {
		log.Fatalln(err)
	}
	defer cli.Close()

	// Unless a pull is forced, look for the source image among the local image tags.
	hostContainsSrcImage := false
	if !*forcePull {
		hostImages, err := cli.ImageList(ctx, types.ImageListOptions{})
		if err != nil {
			log.Fatalln(err) //nolint:gocritic // linter complains that Fatalln circumvents deferred cli.Close(). Too much work to fix in #721, leaving for later.
		}

		for _, img := range hostImages {
			for _, name := range img.RepoTags {
				if name == *srcImage {
					hostContainsSrcImage = true
					break
				}
			}
			if hostContainsSrcImage {
				break
			}
		}
	}

	if !hostContainsSrcImage {
		// pull source image
		// use a closure so deferred close is closer
		err := func() error {
			named, err := reference.ParseNormalizedNamed(*srcImage)
			if err != nil {
				return err
			}
			reg := reference.Domain(named)

			imagePullOptions := types.ImagePullOptions{}
			if authConfig, ok := authConfigs[reg]; ok {
				encoded, err := registry.EncodeAuthConfig(authConfig)
				if err != nil {
					return err
				}
				imagePullOptions.RegistryAuth = encoded
			} else {
				// Special case handling of docker.io weirdness.
				// ref https://github.com/moby/moby/blob/e7347f8a8c2fd3d2abd34b638d6fc8c18b0278d1/registry/config.go#L26-L49
				// (and other handling around index.docker.io in that file...)
				//
				// See also e.g. https://github.com/containrrr/watchtower/issues/1176
				legacyConfig, hasLegacyDockerConfig := authConfigs["https://index.docker.io/v1/"]
				if hasLegacyDockerConfig && (reg == "docker.io" || reg == "registry-1.docker.io") {
					encoded, err := registry.EncodeAuthConfig(legacyConfig)
					if err != nil {
						return err
					}
					imagePullOptions.RegistryAuth = encoded
				} else {
					log.Printf("No docker credentials found for %s", reg)
				}
			}

			log.Printf("Pull source docker image: %s", *srcImage)
			pull, err := cli.ImagePull(ctx, *srcImage, imagePullOptions)
			if err != nil {
				return err
			}
			defer pull.Close()
			// do quiet pull - discard output
			_, err = io.Copy(io.Discard, pull)
			return err
		}()
		if err != nil {
			log.Fatalln(err)
		}
	}

	log.Printf("Build docker image for virtual machine (disk size %s): %s\n", *size, dstIm)
	imageSpec, _, err := cli.ImageInspectWithRaw(ctx, *srcImage)
	if err != nil {
		log.Fatalln(err)
	}

	// Shell-escape all the command pieces, twice. We need to do it twice because we're generating
	// a shell script that appends these to a second shell script.
	for i := range imageSpec.Config.Entrypoint {
		imageSpec.Config.Entrypoint[i] = shellescape.Quote(shellescape.Quote(imageSpec.Config.Entrypoint[i]))
	}
	for i := range imageSpec.Config.Cmd {
		imageSpec.Config.Cmd[i] = shellescape.Quote(shellescape.Quote(imageSpec.Config.Cmd[i]))
	}

	tmplArgs := TemplatesContext{
		User:              "root", // overridden below, if imageSpec.Config.User != ""
		Entrypoint:        imageSpec.Config.Entrypoint,
		Cmd:               imageSpec.Config.Cmd,
		Env:               imageSpec.Config.Env,
		RootDiskImage:     *srcImage,
		NeonvmDaemonImage: neonvmDaemonImage,
		SpecBuild:         "",  // overridden below if spec != nil
		SpecMerge:         "",  // overridden below if spec != nil
		InittabCommands:   nil, // overridden below if spec != nil
		ShutdownHook:      "",  // overridden below if spec != nil
	}

	if len(imageSpec.Config.User) != 0 {
		tmplArgs.User = imageSpec.Config.User
	}

	// The docker build context is assembled in memory. The writer is closed explicitly once all
	// files have been added (see below), not deferred: the archive has to be complete *before* it
	// is handed to the docker daemon.
	tarBuffer := new(bytes.Buffer)
	tw := tar.NewWriter(tarBuffer)

	if spec != nil {
		tmplArgs.SpecBuild = spec.Build
		tmplArgs.SpecMerge = spec.Merge
		tmplArgs.ShutdownHook = strings.ReplaceAll(spec.ShutdownHook, "\n", "\n\t")

		for _, c := range spec.Commands {
			// Allow core dumps for all inittab targets
			c.Shell = fmt.Sprintf("ulimit -c unlimited; %s", c.Shell)

			tmplArgs.InittabCommands = append(tmplArgs.InittabCommands, inittabCommand{
				SysvInitAction:      c.SysvInitAction,
				CommandUser:         c.User,
				ShellEscapedCommand: shellescape.Quote(c.Shell),
			})
		}
		for _, f := range spec.Files {
			var contents []byte
			switch {
			case f.Content != nil:
				contents = []byte(*f.Content)
			case f.HostPath != nil:
				// the 'host path' is relative to the directory that the spec file is in
				path := filepath.Join(filepath.Dir(*specFile), *f.HostPath)

				var err error
				contents, err = os.ReadFile(path)
				if err != nil {
					err = fmt.Errorf("failed to read file %q: %w", path, err)
					log.Fatalln(err)
				}
			}

			if err := addFileToTar(tw, f.Filename, contents); err != nil {
				log.Fatalln(err)
			}
		}
	}

	files := []struct {
		filename string
		tmpl     string
	}{
		{"Dockerfile", dockerfileVmBuilder},
		{"helper.move-bins.sh", scriptMoveBinsHelper},
		{"vmstart", scriptVmStart},
		{"vmshutdown", scriptVmShutdown},
		{"inittab", scriptInitTab},
		{"vmacpi", scriptVmAcpi},
		{"vminit", scriptVmInit},
		{"vector.yaml", configVector},
		{"chrony.conf", configChrony},
		{"sshd_config", configSshd},
		{"udev-init.sh", scriptUdevInit},
		{"cg-setup.sh", scriptCgSetup},
		{"cg-run.sh", scriptCgRun},
		{"resize-swap.sh", scriptResizeSwap},
		{"set-disk-quota.sh", scriptSetDiskQuota},
	}

	for _, f := range files {
		if err := AddTemplatedFileToTar(tw, tmplArgs, f.filename, f.tmpl); err != nil {
			log.Fatalln(err)
		}
	}

	// Finalize the archive: Close flushes the padding of the last entry and writes the
	// end-of-archive footer. Without it the build context sent to docker is truncated.
	if err := tw.Close(); err != nil {
		log.Fatalln(err)
	}

	buildArgs := make(map[string]*string)
	buildArgs["DISK_SIZE"] = size
	opt := types.ImageBuildOptions{
		AuthConfigs:    authConfigs,
		Tags:           []string{dstIm},
		BuildArgs:      buildArgs,
		SuppressOutput: *quiet,
		NoCache:        false,
		Context:        tarBuffer,
		Dockerfile:     "Dockerfile",
		Remove:         true,
		ForceRemove:    true,
	}
	buildResp, err := cli.ImageBuild(ctx, tarBuffer, opt)
	if err != nil {
		log.Fatalln(err)
	}

	defer buildResp.Body.Close()

	out := io.Writer(os.Stdout)
	if *quiet {
		out = io.Discard
	}
	err = jsonmessage.DisplayJSONMessagesStream(buildResp.Body, out, os.Stdout.Fd(), term.IsTerminal(int(os.Stdout.Fd())), nil)
	if err != nil {
		log.Fatalln(err)
	}

	if len(*outFile) != 0 {
		log.Printf("Save disk image as %s", *outFile)
		// create container from docker image we just built
		containerResp, err := cli.ContainerCreate(ctx, &container.Config{
			Image:      dstIm,
			Tty:        false,
			Entrypoint: imageSpec.Config.Entrypoint,
			Cmd:        imageSpec.Config.Cmd,
		}, nil, nil, nil, "")
		if err != nil {
			log.Fatalln(err)
		}
		if len(containerResp.Warnings) > 0 {
			log.Println(containerResp.Warnings)
		}

		// copy file from container as tar archive
		fromContainer, _, err := cli.CopyFromContainer(ctx, containerResp.ID, "/disk.qcow2")
		if err != nil {
			log.Fatalln(err)
		}

		// untar file from tar archive
		tarReader := tar.NewReader(fromContainer)
		for {
			header, err := tarReader.Next()
			if errors.Is(err, io.EOF) {
				break
			} else if err != nil {
				log.Fatalln(err)
			}

			if header.Name != "disk.qcow2" {
				log.Printf("skip file %s", header.Name)
				continue
			}
			path := filepath.Join(*outFile) //nolint:gocritic // FIXME: this is probably incorrect, intended to join with header.Name ?
			info := header.FileInfo()

			// Open and write to the file inside a closure, so we can defer close
			err = func() error {
				file, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, info.Mode())
				if err != nil {
					return err
				}
				defer file.Close()
				_, err = io.Copy(file, tarReader)
				return err
			}()
			if err != nil {
				log.Fatalln(err)
			}
		}

		// The archive stream is no longer needed; release it before removing the container.
		// A failure to close is only logged, it does not affect the extracted file.
		if err := fromContainer.Close(); err != nil {
			log.Println(err)
		}

		// remove container
		if err = cli.ContainerRemove(ctx, containerResp.ID, types.ContainerRemoveOptions{}); err != nil {
			log.Println(err)
		}
	}
}
// imageSpec is the customization file passed via -spec, decoded from YAML by
// readImageSpec (which rejects unknown fields).
type imageSpec struct {
// Commands are turned into inittab entries by main; each is validated by
// command.validate.
Commands []command `yaml:"commands"`
// ShutdownHook is passed to the templates as TemplatesContext.ShutdownHook.
ShutdownHook string `yaml:"shutdownHook,omitempty"`
// Build and Merge are passed to the templates verbatim as SpecBuild and SpecMerge.
Build string `yaml:"build"`
Merge string `yaml:"merge"`
// Files are added to the docker build context by main; each is validated by
// file.validate.
Files []file `yaml:"files"`
}
// command is one entry of imageSpec.Commands. All four fields are required to
// be non-empty (see validate).
type command struct {
Name string `yaml:"name"`
User string `yaml:"user"`
SysvInitAction string `yaml:"sysvInitAction"`
// Shell is the shell line to run; main prefixes it with "ulimit -c unlimited; ".
Shell string `yaml:"shell"`
}
// file is one entry of imageSpec.Files: a file to add to the docker build
// context. Exactly one of HostPath and Content must be set (see validate).
type file struct {
// Filename is the name the file gets inside the build context.
Filename string `yaml:"filename"`
// HostPath is a path to read the contents from, relative to the directory
// containing the spec file.
HostPath *string `yaml:"hostPath,omitempty"`
// Content is the literal file contents.
Content *string `yaml:"content,omitempty"`
}
// readImageSpec opens the YAML file at path, decodes it into an imageSpec and
// validates every command and file entry.
//
// Decoding is strict: unknown fields are rejected. All validation failures are
// collected and returned together as a single joined error, each prefixed with
// the index of the offending entry. The spec is nil whenever the error is
// non-nil.
func readImageSpec(path string) (*imageSpec, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("failed to open file at %q: %w", path, err)
	}
	// The file is only read from, so an error from Close carries no information
	// worth reporting; it just must not be leaked.
	defer f.Close()

	var spec imageSpec

	dec := yaml.NewDecoder(f)
	dec.KnownFields(true) // disallow unknown fields

	if err := dec.Decode(&spec); err != nil {
		return nil, err
	}

	var errs []error

	for i, c := range spec.Commands {
		for _, e := range c.validate() {
			errs = append(errs, fmt.Errorf("error in commands[%d]: %w", i, e))
		}
	}
	for i, f := range spec.Files {
		for _, e := range f.validate() {
			errs = append(errs, fmt.Errorf("error in files[%d]: %w", i, e))
		}
	}

	// errors.Join returns nil when there is nothing to report
	if err := errors.Join(errs...); err != nil {
		return nil, fmt.Errorf("invalid image spec: %w", err)
	}

	return &spec, nil
}
// validate reports one error for every required field of the command that is
// empty, in the order name, user, sysvInitAction, shell. It returns nil when
// the command is valid.
func (c command) validate() []error {
	required := []struct {
		field string
		value string
	}{
		{"name", c.Name},
		{"user", c.User},
		{"sysvInitAction", c.SysvInitAction},
		{"shell", c.Shell},
	}

	var errs []error
	for _, r := range required {
		if r.value != "" {
			continue
		}
		errs = append(errs, fmt.Errorf("command must have non-empty field '%s'", r.field))
	}
	return errs
}
// validate checks that the file entry has a filename and exactly one source
// of contents ('hostPath' or 'content'). It returns one error per violated
// rule, or nil when the entry is valid.
func (f file) validate() []error {
	var errs []error

	if f.Filename == "" {
		errs = append(errs, errors.New("file must have non-empty field 'filename'"))
	}

	hasHostPath, hasContent := f.HostPath != nil, f.Content != nil
	switch {
	case !hasHostPath && !hasContent:
		errs = append(errs, errors.New("file missing either 'hostPath' or 'content'"))
	case hasHostPath && hasContent:
		errs = append(errs, errors.New("file must have only one of 'hostPath' or 'content'"))
	}

	return errs
}
//go:build linux
package main
import (
"context"
"flag"
"log"
"net"
"os"
"syscall"
"time"
"github.com/coreos/go-iptables/iptables"
"github.com/vishvananda/netlink"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
)
// Fixed names and addresses used by the VXLAN controller.
const (
// vxlan interface details
// VXLAN_IF_NAME is the name of the vxlan link created by createVxlanInterface.
VXLAN_IF_NAME = "neon-vxlan0"
// VXLAN_BRIDGE_NAME is the name of the bridge the vxlan link is attached to.
VXLAN_BRIDGE_NAME = "neon-br0"
// VXLAN_ID is the VXLAN network identifier given to the vxlan link.
VXLAN_ID = 100
// iptables settings details
// iptablesChainName is the chain in the nat table managed by
// upsertIptablesRules / deleteIptablesRules.
iptablesChainName = "NEON-EXTRANET"
// extraNetCidr is the network matched (as source or destination) by the
// ACCEPT rules in that chain.
extraNetCidr = "10.100.0.0/16"
)
var (
// deleteIfaces switches main into teardown mode: the vxlan link, the bridge
// and the iptables chain are removed and the process exits.
deleteIfaces = flag.Bool("delete", false, `delete VXLAN interfaces`)
)
// main runs the VXLAN controller for the node it is started on.
//
// With -delete it removes the vxlan link, the bridge and the iptables chain
// (logging, but otherwise ignoring, individual failures) and exits with
// status 0.
//
// Otherwise it creates the bridge and the vxlan link and then loops forever:
// every 30 seconds it lists the nodes' internal IPs, refreshes the FDB
// entries of the vxlan link and makes sure the iptables nat rules exist.
// Errors other than an iptables failure are fatal.
func main() {
	flag.Parse()

	// in-cluster Kubernetes configuration ...
	restConfig, err := rest.InClusterConfig()
	if err != nil {
		log.Fatal(err)
	}
	// ... and the clientset built from it
	clientset, err := kubernetes.NewForConfig(restConfig)
	if err != nil {
		log.Fatal(err)
	}

	// teardown mode (-delete): best effort, failures are only logged
	if *deleteIfaces {
		log.Printf("deleting vxlan interface %s", VXLAN_IF_NAME)
		if delErr := deleteLink(VXLAN_IF_NAME); delErr != nil {
			log.Print(delErr)
		}

		log.Printf("deleting bridge interface %s", VXLAN_BRIDGE_NAME)
		if delErr := deleteLink(VXLAN_BRIDGE_NAME); delErr != nil {
			log.Print(delErr)
		}

		log.Printf("deleting iptables nat rules")
		if delErr := deleteIptablesRules(); delErr != nil {
			log.Print(delErr)
		}

		os.Exit(0)
	}

	ownNodeIP := os.Getenv("MY_NODE_IP")
	log.Printf("own node IP: %s", ownNodeIP)

	// the bridge has to exist before the vxlan link can be attached to it
	log.Printf("creating linux bridge interface (name: %s)", VXLAN_BRIDGE_NAME)
	err = createBrigeInterface(VXLAN_BRIDGE_NAME)
	if err != nil {
		log.Fatal(err)
	}

	log.Printf("creating vxlan interface (name: %s, id: %d)", VXLAN_IF_NAME, VXLAN_ID)
	err = createVxlanInterface(VXLAN_IF_NAME, VXLAN_ID, ownNodeIP, VXLAN_BRIDGE_NAME)
	if err != nil {
		log.Fatal(err)
	}

	// reconcile loop
	for {
		log.Print("getting nodes IP addresses")
		peers, listErr := getNodesIPs(clientset)
		if listErr != nil {
			log.Fatal(listErr)
		}
		log.Printf("found %d ip addresses", len(peers))

		// refresh the forwarding database of the vxlan link
		log.Print("update FDB table")
		if fdbErr := updateFDB(VXLAN_IF_NAME, peers, ownNodeIP); fdbErr != nil {
			log.Fatal(fdbErr)
		}

		// iptables failures are not fatal; the next iteration retries
		log.Printf("upsert iptables nat rules")
		if iptErr := upsertIptablesRules(); iptErr != nil {
			log.Print(iptErr)
		}

		time.Sleep(30 * time.Second)
	}
}
// getNodesIPs lists all nodes known to the cluster and returns every address
// of type NodeInternalIP found on them. The returned slice is never nil, even
// when listing fails.
func getNodesIPs(clientset *kubernetes.Clientset) ([]string, error) {
	internalIPs := []string{}

	nodeList, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
	if err != nil {
		return internalIPs, err
	}

	for i := range nodeList.Items {
		for _, addr := range nodeList.Items[i].Status.Addresses {
			if addr.Type != corev1.NodeInternalIP {
				continue
			}
			internalIPs = append(internalIPs, addr.Address)
		}
	}

	return internalIPs, nil
}
// createBrigeInterface makes sure a link called name exists. If a link with
// that name is already present nothing is changed; otherwise a bridge is
// created and brought up. Lookup errors other than "link not found" are
// returned unchanged.
func createBrigeInterface(name string) error {
	_, lookupErr := netlink.LinkByName(name)
	switch lookupErr.(type) { //nolint:errorlint // errors.Is doesn't work, we actually just want to know the type.
	case nil:
		// the interface is already there
		log.Printf("link with name %s already found", name)
		return nil
	case netlink.LinkNotFoundError:
		// expected: the bridge is created below
	default:
		return lookupErr
	}

	// create the linux bridge and bring it up
	bridge := &netlink.Bridge{
		LinkAttrs: netlink.LinkAttrs{Name: name},
	}
	if err := netlink.LinkAdd(bridge); err != nil {
		return err
	}
	return netlink.LinkSetUp(bridge)
}
// createVxlanInterface makes sure a link called name exists. If a link with
// that name is already present nothing is changed. Otherwise a vxlan link with
// the given VXLAN id, source address ownIP and port 4789 is created, attached
// to the bridge called bridgeName and brought up. Lookup errors other than
// "link not found" are returned unchanged.
func createVxlanInterface(name string, vxlanID int, ownIP string, bridgeName string) error {
	_, lookupErr := netlink.LinkByName(name)
	switch lookupErr.(type) { //nolint:errorlint // errors.Is doesn't work, we actually just want to know the type.
	case nil:
		// the interface is already there
		log.Printf("link with name %s already found", name)
		return nil
	case netlink.LinkNotFoundError:
		// expected: the vxlan link is created below
	default:
		return lookupErr
	}

	// create the vxlan link
	vxlan := &netlink.Vxlan{
		LinkAttrs: netlink.LinkAttrs{Name: name},
		VxlanId:   vxlanID,
		SrcAddr:   net.ParseIP(ownIP),
		Port:      4789,
	}
	if err := netlink.LinkAdd(vxlan); err != nil {
		return err
	}

	// attach it to the bridge
	bridge, err := netlink.LinkByName(bridgeName)
	if err != nil {
		return err
	}
	if err := netlink.LinkSetMaster(vxlan, bridge); err != nil {
		return err
	}

	return netlink.LinkSetUp(vxlan)
}
// updateFDB appends, for every address in nodeIPs other than ownIP, a
// permanent bridge FDB entry with the all-zero MAC address to the link called
// vxlanName. It stops at the first entry that cannot be appended.
func updateFDB(vxlanName string, nodeIPs []string, ownIP string) error {
	zeroMAC, _ := net.ParseMAC("00:00:00:00:00:00")

	// the entries are keyed by the index of the vxlan link
	vxlan, err := netlink.LinkByName(vxlanName)
	if err != nil {
		return err
	}
	linkIndex := vxlan.Attrs().Index

	for _, peerIP := range nodeIPs {
		if peerIP == ownIP {
			continue
		}

		entry := netlink.Neigh{
			LinkIndex:    linkIndex,
			Family:       syscall.AF_BRIDGE,
			State:        netlink.NUD_PERMANENT,
			Flags:        netlink.NTF_SELF,
			IP:           net.ParseIP(peerIP),
			HardwareAddr: zeroMAC,
		}

		// appending an entry that already exists does not cause an error
		log.Printf("add/update FDB broadcast entry via %s", peerIP)
		if err := netlink.NeighAppend(&entry); err != nil {
			return err
		}
	}

	return nil
}
// deleteLink removes the link called name. A link that does not exist is not
// an error; any other lookup or deletion failure is returned.
func deleteLink(name string) error {
	link, err := netlink.LinkByName(name)
	if err != nil {
		if _, missing := err.(netlink.LinkNotFoundError); missing { //nolint:errorlint // errors.Is doesn't work, we actually just want to know the type.
			log.Printf("link with name %s not found", name)
			return nil
		}
		return err
	}

	if err := netlink.LinkDel(link); err != nil {
		return err
	}
	log.Printf("link with name %s was deleted", name)
	return nil
}
// upsertIptablesRules makes sure the iptablesChainName chain exists in the
// IPv4 nat table and that three rules are present: a jump into that chain
// from POSTROUTING for traffic destined to extraNetCidr, and ACCEPT rules in
// the chain for traffic from and to extraNetCidr. Rules that already exist
// are left alone.
func upsertIptablesRules() error {
	ipt, err := iptables.New(iptables.IPFamily(iptables.ProtocolIPv4), iptables.Timeout(5))
	if err != nil {
		return err
	}

	// create the chain on first use
	haveChain, err := ipt.ChainExists("nat", iptablesChainName)
	if err != nil {
		return err
	}
	if !haveChain {
		if err := ipt.NewChain("nat", iptablesChainName); err != nil {
			return err
		}
	}

	rules := []struct {
		chain string
		pos   int
		spec  []string
	}{
		{"POSTROUTING", 1, []string{"-d", extraNetCidr, "-j", iptablesChainName}},
		{iptablesChainName, 1, []string{"-s", extraNetCidr, "-j", "ACCEPT"}},
		{iptablesChainName, 2, []string{"-d", extraNetCidr, "-j", "ACCEPT"}},
	}
	for _, r := range rules {
		if err := insertRule(ipt, "nat", r.chain, r.pos, r.spec...); err != nil {
			return err
		}
	}

	return nil
}
// deleteIptablesRules clears and deletes the iptablesChainName chain in the
// IPv4 nat table.
func deleteIptablesRules() error {
	ipt, err := iptables.New(iptables.IPFamily(iptables.ProtocolIPv4), iptables.Timeout(5))
	if err != nil {
		return err
	}

	return ipt.ClearAndDeleteChain("nat", iptablesChainName)
}
// insertRule behaves like ipt.Insert, with one difference: when the rule is
// already present anywhere in the chain, nothing is inserted and nil is
// returned.
func insertRule(ipt *iptables.IPTables, table, chain string, pos int, rulespec ...string) error {
	present, err := ipt.Exists(table, chain, rulespec...)
	switch {
	case err != nil:
		return err
	case present:
		return nil
	}
	return ipt.Insert(table, chain, pos, rulespec...)
}
package agent
import (
"fmt"
"os"
)
// EnvArgs stores the static configuration data assigned to the autoscaler agent by its
// environment
//
// ArgsFromEnv fills it in; all three variables are required to be non-empty.
type EnvArgs struct {
// ConfigPath gives the path to read static configuration from. It is taken from the CONFIG_PATH
// environment variable.
ConfigPath string
// K8sNodeName is the Kubernetes node the autoscaler agent is running on. It is taken from the
// K8S_NODE_NAME environment variable, which is set equal to the pod's Spec.NodeName.
//
// The Kubernetes documentation doesn't say this, but the NodeName is always populated with the
// final node the pod was placed on by the time the environment variables are set.
K8sNodeName string
// K8sPodIP is the IP address of the Kubernetes pod that this autoscaler-agent is running in.
// It is taken from the K8S_POD_IP environment variable.
K8sPodIP string
}
// getEnvVar returns the value of the environment variable varName, recording
// a failure in *err when the value is empty and require_nonempty is set.
//
// *err accumulates the first failure across a series of calls: once it is
// non-nil, further calls return "" without looking at the environment, so the
// first error is never overwritten.
func getEnvVar(err *error, require_nonempty bool, varName string) string {
	if *err != nil {
		// an earlier lookup already failed
		return ""
	}

	value := os.Getenv(varName)
	if require_nonempty && value == "" {
		*err = fmt.Errorf("Missing %s in environment", varName)
	}
	return value
}
// ArgsFromEnv builds EnvArgs from the CONFIG_PATH, K8S_NODE_NAME and
// K8S_POD_IP environment variables. All three must be non-empty; if any is
// missing, the zero EnvArgs is returned together with an error naming the
// first missing variable.
func ArgsFromEnv() (EnvArgs, error) {
	var (
		firstErr error
		args     EnvArgs
	)

	args.ConfigPath = getEnvVar(&firstErr, true, "CONFIG_PATH")
	args.K8sNodeName = getEnvVar(&firstErr, true, "K8S_NODE_NAME")
	args.K8sPodIP = getEnvVar(&firstErr, true, "K8S_POD_IP")

	if firstErr != nil {
		return EnvArgs{}, firstErr
	}
	return args, nil
}
package billing
import (
"context"
"errors"
"fmt"
"math"
"net/http"
"time"
"go.uber.org/zap"
"k8s.io/apimachinery/pkg/types"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/billing"
"github.com/neondatabase/autoscaling/pkg/util"
)
// Config is the JSON-tagged configuration of the billing metrics collector.
type Config struct {
// Clients selects and configures the destinations billing events are sent to.
Clients ClientsConfig `json:"clients"`
// NOTE(review): CPUMetricName and ActiveTimeMetricName are presumably the metric
// names put on generated events — their use is not visible in this part of the file.
CPUMetricName string `json:"cpuMetricName"`
ActiveTimeMetricName string `json:"activeTimeMetricName"`
// CollectEverySeconds is the period of the ticker on which
// MetricsCollector.Run collects the current VM state.
CollectEverySeconds uint `json:"collectEverySeconds"`
// AccumulateEverySeconds is the period of the ticker on which
// MetricsCollector.Run drains the collected state into the event queues.
AccumulateEverySeconds uint `json:"accumulateEverySeconds"`
}
// ClientsConfig lists the supported billing event destinations. Each one is
// optional: NewMetricsCollector only creates a client for the fields that are
// non-nil.
type ClientsConfig struct {
AzureBlob *AzureBlobStorageConfig `json:"azureBlob"`
HTTP *HTTPClientConfig `json:"http"`
S3 *S3ClientConfig `json:"s3"`
}
// AzureBlobStorageConfig combines the settings common to all clients with the
// settings passed to billing.NewAzureBlobStorageClient.
type AzureBlobStorageConfig struct {
BaseClientConfig
billing.AzureBlobStorageClientConfig
}
// HTTPClientConfig combines the settings common to all clients with the URL
// passed to billing.NewHTTPClient.
type HTTPClientConfig struct {
BaseClientConfig
URL string `json:"url"`
}
// S3ClientConfig combines the settings common to all clients with the settings
// passed to billing.NewS3Client.
type S3ClientConfig struct {
BaseClientConfig
billing.S3ClientConfig
}
// BaseClientConfig holds the settings shared by every billing client; it is
// stored as clientInfo.config by NewMetricsCollector.
// NOTE(review): the fields are consumed by the sender, which is outside this
// part of the file — presumably push interval, per-request timeout and maximum
// number of events per push; confirm there.
type BaseClientConfig struct {
PushEverySeconds uint `json:"pushEverySeconds"`
PushRequestTimeoutSeconds uint `json:"pushRequestTimeoutSeconds"`
MaxBatchSize uint `json:"maxBatchSize"`
}
// metricsState is the working state of MetricsCollector.Run.
type metricsState struct {
// historical accumulates, per VM, the usage recorded by collect.
historical map[metricsKey]vmMetricsHistory
// present holds the instantaneous metrics seen by the most recent collect;
// it is replaced with a fresh map on every collect.
present map[metricsKey]vmMetricsInstant
// lastCollectTime is the time of the most recent collect, nil before the first.
lastCollectTime *time.Time
// pushWindowStart is initialized to the time Run builds the state.
// NOTE(review): presumably advanced by drainEnqueue — not visible here.
pushWindowStart time.Time
}
// metricsKey identifies a VM in the metrics maps: its Kubernetes UID together
// with the value of its billing endpoint ID annotation.
type metricsKey struct {
uid types.UID
endpointID string
}
// vmMetricsHistory is the per-VM usage accumulated across collections; time
// slices are added to it with appendSlice.
type vmMetricsHistory struct {
// lastSlice is nil for a history that has just been created by collect.
lastSlice *metricsTimeSlice
// total starts at zero for a new history.
total vmMetricsSeconds
}
// metricsTimeSlice is a span of time [startTime, endTime] together with the
// metrics attributed to the whole span.
type metricsTimeSlice struct {
metrics vmMetricsInstant
startTime time.Time
endTime time.Time
}
// Duration returns the length of the time slice, i.e. endTime minus startTime.
func (m *metricsTimeSlice) Duration() time.Duration {
	elapsed := m.endTime.Sub(m.startTime)
	return elapsed
}
// vmMetricsInstant is a snapshot of a VM's billable resources at one point in time.
type vmMetricsInstant struct {
// cpu stores the cpu allocation at a particular instant.
cpu vmapi.MilliCPU
}
// vmMetricsSeconds is like vmMetricsInstant, but the values cover the allocation over time
type vmMetricsSeconds struct {
// cpu stores the CPU seconds allocated to the VM, roughly equivalent to the integral of CPU
// usage over time.
cpu float64
// activeTime stores the total time that the VM was active
activeTime time.Duration
}
// MetricsCollector collects billing metrics for VMs and forwards the resulting
// events to the configured clients. Create it with NewMetricsCollector and
// start it with Run.
type MetricsCollector struct {
// conf is the configuration passed to NewMetricsCollector.
conf *Config
// clients holds one entry per client enabled in conf.Clients.
clients []clientInfo
}
// NewMetricsCollector creates a MetricsCollector with one client for every
// destination enabled in conf.Clients, in the order Azure Blob, HTTP, S3.
//
// ctx is only used while creating the S3 client. An error from creating the
// Azure Blob or S3 client aborts construction and is returned wrapped; in that
// case the returned collector is nil.
func NewMetricsCollector(
	ctx context.Context,
	parentLogger *zap.Logger,
	conf *Config,
) (*MetricsCollector, error) {
	logger := parentLogger.Named("billing")

	collector := &MetricsCollector{
		conf:    conf,
		clients: make([]clientInfo, 0),
	}

	if azureConf := conf.Clients.AzureBlob; azureConf != nil {
		azureClient, err := billing.NewAzureBlobStorageClient(azureConf.AzureBlobStorageClientConfig)
		if err != nil {
			return nil, fmt.Errorf("error creating AzureBlobStorageClient: %w", err)
		}
		info := clientInfo{
			client: azureClient,
			name:   "azureblob",
			config: azureConf.BaseClientConfig,
		}
		collector.clients = append(collector.clients, info)
	}

	if httpConf := conf.Clients.HTTP; httpConf != nil {
		info := clientInfo{
			client: billing.NewHTTPClient(httpConf.URL, http.DefaultClient),
			name:   "http",
			config: httpConf.BaseClientConfig,
		}
		collector.clients = append(collector.clients, info)
	}

	if s3Conf := conf.Clients.S3; s3Conf != nil {
		s3Client, err := billing.NewS3Client(ctx, s3Conf.S3ClientConfig)
		if err != nil {
			return nil, fmt.Errorf("failed to create S3 client: %w", err)
		}
		logger.Info("Created S3 client", s3Client.LogFields())
		info := clientInfo{
			client: s3Client,
			name:   "s3",
			config: s3Conf.BaseClientConfig,
		}
		collector.clients = append(collector.clients, info)
	}

	return collector, nil
}
// Run is the main loop of the collector. It starts one sender goroutine per
// configured client and then, until ctx is canceled:
//   - on every collect tick, records the current state of the VMs in store;
//   - on every accumulate tick, drains the recorded state into the per-client
//     event queues.
//
// Run returns nil once ctx is done. It panics (via logger.Panic) if the VM
// store has stopped while ctx is still live.
//
// NOTE(review): time.NewTicker panics for a non-positive duration, so both
// CollectEverySeconds and AccumulateEverySeconds must be non-zero.
func (mc *MetricsCollector) Run(
ctx context.Context,
logger *zap.Logger,
store VMStoreForNode,
metrics PromMetrics,
) error {
collectTicker := time.NewTicker(time.Second * time.Duration(mc.conf.CollectEverySeconds))
defer collectTicker.Stop()
// Offset by half a second, so it's a bit more deterministic.
// (The accumulate ticker is created after the sleep, so its ticks are shifted
// relative to the collect ticker. The sleep itself does not observe ctx.)
time.Sleep(500 * time.Millisecond)
accumulateTicker := time.NewTicker(time.Second * time.Duration(mc.conf.AccumulateEverySeconds))
defer accumulateTicker.Stop()
state := metricsState{
historical: make(map[metricsKey]vmMetricsHistory),
present: make(map[metricsKey]vmMetricsInstant),
lastCollectTime: nil,
pushWindowStart: time.Now(),
}
// One event queue per client: the writer halves are kept here for
// drainEnqueue, the reader half is handed to that client's sender.
var queueWriters []eventQueuePusher[*billing.IncrementalEvent]
for _, c := range mc.clients {
qw, queueReader := newEventQueue[*billing.IncrementalEvent](metrics.queueSizeCurrent.WithLabelValues(c.name))
queueWriters = append(queueWriters, qw)
// Start the sender
signalDone, thisThreadFinished := util.NewCondChannelPair()
// Runs when Run returns; the other half of the pair is given to the sender as
// collectorFinished.
defer signalDone.Send() //nolint:gocritic // this defer-in-loop is intentional.
sender := eventSender{
clientInfo: c,
metrics: metrics,
queue: queueReader,
collectorFinished: thisThreadFinished,
lastSendDuration: 0,
}
go sender.senderLoop(logger.Named(fmt.Sprintf("send-%s", c.name)))
}
// The rest of this function is to do with collection
logger = logger.Named("collect")
// Collect once immediately, without waiting for the first tick.
state.collect(logger, store, metrics)
for {
select {
case <-collectTicker.C:
logger.Info("Collecting billing state")
// A stopped store is only acceptable while shutting down.
if store.Stopped() && ctx.Err() == nil {
err := errors.New("VM store stopped but background context is still live")
logger.Panic("Validation check failed", zap.Error(err))
return err
}
state.collect(logger, store, metrics)
case <-accumulateTicker.C:
logger.Info("Creating billing batch")
state.drainEnqueue(logger, mc.conf, billing.GetHostname(), queueWriters)
case <-ctx.Done():
return nil
}
}
}
// collect takes a snapshot of the VMs currently in store and folds it into the metrics state.
//
// Every VM seen is counted in the per-batch Prometheus metrics. For each alive VM that has a
// billing endpoint ID annotation and a known CPU count, the current CPU value is recorded in
// s.present; if that same VM was also present at the previous collection, the interval between
// the two collections is appended to its entry in s.historical.
func (s *metricsState) collect(logger *zap.Logger, store VMStoreForNode, metrics PromMetrics) {
	collectedAt := time.Now()

	batch := metrics.forBatch()
	// Deferring isn't strictly necessary; it lives up here so that it can't be forgotten.
	defer batch.finish()

	previous := s.present
	s.present = make(map[metricsKey]vmMetricsInstant)

	var vms []*vmapi.VirtualMachine
	if !store.Failing() {
		vms = store.ListIndexed(func(i *VMNodeIndex) []*vmapi.VirtualMachine {
			return i.List()
		})
	} else {
		logger.Error("VM store is currently stopped. No events will be recorded")
	}

	for _, vm := range vms {
		endpointID, hasEndpoint := vm.Annotations[api.AnnotationBillingEndpointID]
		batch.inc(isEndpointFlag(hasEndpoint), autoscalingEnabledFlag(api.HasAutoscalingEnabled(vm)), vm.Status.Phase)

		// Metrics are only reported for VMs that have an endpoint ID, and only while the VM is
		// alive and its CPU count is known.
		if !hasEndpoint || !vm.Status.Phase.IsAlive() || vm.Status.CPUs == nil {
			continue
		}

		key := metricsKey{
			uid:        vm.UID,
			endpointID: endpointID,
		}
		current := vmMetricsInstant{
			cpu: *vm.Status.CPUs,
		}

		if prior, seenBefore := previous[key]; seenBefore {
			// The VM was present from s.lastCollectTime until now, so that span becomes a new
			// time slice in its history.
			slice := metricsTimeSlice{
				metrics: vmMetricsInstant{
					// strategically under-bill by assigning the minimum to the entire time slice.
					cpu: min(prior.cpu, current.cpu),
				},
				// note: s.lastCollectTime is non-nil here, because otherwise previous would
				// have been empty.
				startTime: *s.lastCollectTime,
				endTime:   collectedAt,
			}

			// A missing entry yields the zero-valued history (no last slice, zero totals).
			history := s.historical[key]
			// Appending merges with the previous slice if the resource usage was the same.
			history.appendSlice(slice)
			s.historical[key] = history
		}

		s.present[key] = current
	}

	s.lastCollectTime = &collectedAt
}
// appendSlice adds timeSlice to the history.
//
// If the slice can be merged into the current last slice, that slice is extended in place.
// Otherwise the current last slice (if any) is folded into h.total and timeSlice becomes the new
// last slice.
func (h *vmMetricsHistory) appendSlice(timeSlice metricsTimeSlice) {
	// First choice: extend the existing period of continuous usage.
	if last := h.lastSlice; last != nil && last.tryMerge(timeSlice) {
		return
	}

	// Couldn't merge, so something changed: account for the old slice and start a new one.
	h.finalizeCurrentTimeSlice()
	h.lastSlice = &timeSlice
}
// finalizeCurrentTimeSlice folds the current last slice into h.total and clears it.
//
// It does nothing if there is no current slice, and panics if the slice's duration is negative.
//
// Because this ends up rounding down the total time attributed to the slice, callers should put
// off calling it until it's actually needed.
func (h *vmMetricsHistory) finalizeCurrentTimeSlice() {
	last := h.lastSlice
	if last == nil {
		return
	}

	elapsed := last.Duration()
	if elapsed < 0 {
		panic("negative duration")
	}

	// TODO: This approach is imperfect. Floating-point math is probably *fine*, but really not
	// something we want to rely on. A "proper" solution is a lot of work, but long-term valuable.
	cpuSeconds := elapsed.Seconds() * last.metrics.cpu.AsFloat64()

	h.total.cpu += cpuSeconds
	h.total.activeTime += elapsed
	h.lastSlice = nil
}
// tryMerge attempts to merge s and next (assuming that next is after s), returning true only if
// that merging was successful. On success, s is extended so that it ends where next ends; on
// failure s is left untouched.
//
// Merging fails if s.endTime and next.startTime are not the same instant, or if
// s.metrics != next.metrics.
//
// The timestamps are compared with Equal rather than ==, so that two values describing the same
// instant still merge even if they differ in location or in whether they carry a monotonic clock
// reading.
func (s *metricsTimeSlice) tryMerge(next metricsTimeSlice) bool {
	if !s.endTime.Equal(next.startTime) || s.metrics != next.metrics {
		return false
	}
	s.endTime = next.endTime
	return true
}
// logAddedEvent logs the identifying fields of event at info level and returns event unchanged,
// so that it can be wrapped around an expression producing an event.
func logAddedEvent(logger *zap.Logger, event *billing.IncrementalEvent) *billing.IncrementalEvent {
	fields := []zap.Field{
		zap.String("IdempotencyKey", event.IdempotencyKey),
		zap.String("EndpointID", event.EndpointID),
		zap.String("MetricName", event.MetricName),
		zap.Int("Value", event.Value),
	}
	logger.Info("Adding event to batch", fields...)
	return event
}
// drainEnqueue converts the accumulated history into billing events, pushes each event onto
// every queue in queues, and then resets the history.
//
// Two events are produced per history entry: one for CPU usage (conf.CPUMetricName) and one for
// active time (conf.ActiveTimeMetricName). Both cover the window from s.pushWindowStart to now.
// Afterwards the push window restarts at now and s.historical is replaced by an empty map.
func (s *metricsState) drainEnqueue(logger *zap.Logger, conf *Config, hostname string, queues []eventQueuePusher[*billing.IncrementalEvent]) {
	windowEnd := time.Now()
	windowStart := s.pushWindowStart

	// Two events per entry; position counts the events emitted so far in this batch.
	batchSize := 2 * len(s.historical)
	position := 0

	// emit builds one event, logs it, and adds it to all queues.
	emit := func(metricName, endpointID string, value int) {
		position++
		event := logAddedEvent(logger, billing.Enrich(windowEnd, hostname, position, batchSize, &billing.IncrementalEvent{
			MetricName:     metricName,
			Type:           "", // set by billing.Enrich
			IdempotencyKey: "", // set by billing.Enrich
			EndpointID:     endpointID,
			// TODO: maybe we should store start/stop time in the vmMetricsHistory object itself?
			// That way we can be aligned to collection, rather than pushing.
			StartTime: windowStart,
			StopTime:  windowEnd,
			Value:     value,
		}))
		for _, q := range queues {
			q.enqueue(event)
		}
	}

	for key, history := range s.historical {
		// Make sure the still-open slice is included in the totals before reading them.
		history.finalizeCurrentTimeSlice()

		emit(conf.CPUMetricName, key.endpointID, int(math.Round(history.total.cpu)))
		emit(conf.ActiveTimeMetricName, key.endpointID, int(math.Round(history.total.activeTime.Seconds())))
	}

	s.pushWindowStart = windowEnd
	s.historical = make(map[metricsKey]vmMetricsHistory)
}
package billing
// Types and implementation relating to VMNodeIndex, which provides indexing for watch.Watch for
// efficient lookup of VMs on a particular node.
import (
"k8s.io/apimachinery/pkg/types"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/util/watch"
)
// VMStoreForNode is shorthand for a watch.IndexedStore of VirtualMachine objects that is indexed
// by a *VMNodeIndex.
type VMStoreForNode = watch.IndexedStore[vmapi.VirtualMachine, *VMNodeIndex]
// VMNodeIndex is a watch.Index that stores all of the VMs for a particular node
//
// We have to implement this ourselves because K8s does not (as of 2023-04-04) support field
// selectors on CRDs, so we can't have the API server filter out VMs for us.
//
// For more info, see: https://github.com/kubernetes/kubernetes/issues/53459
// This comment in particular was particularly instructive:
// https://github.com/kubernetes/kubernetes/issues/53459#issuecomment-1146200268
type VMNodeIndex struct {
// forNode holds the indexed VMs, keyed by UID. Add only inserts VMs whose Status.Node equals
// node.
forNode map[types.UID]*vmapi.VirtualMachine
// node is the name of the node that this index selects VMs for.
node string
}
// NewVMNodeIndex returns an empty VMNodeIndex that selects VMs for the given node.
func NewVMNodeIndex(node string) *VMNodeIndex {
	idx := new(VMNodeIndex)
	idx.node = node
	idx.forNode = make(map[types.UID]*vmapi.VirtualMachine)
	return idx
}
// Add stores vm in the index, keyed by its UID, but only if vm.Status.Node matches the node this
// index was created for. VMs on any other node are ignored.
func (i *VMNodeIndex) Add(vm *vmapi.VirtualMachine) {
	if vm.Status.Node != i.node {
		return
	}
	i.forNode[vm.UID] = vm
}
// Update replaces oldVM with newVM in the index.
//
// The old entry is always removed (by oldVM's UID); the new one is only stored if newVM is on
// this index's node.
func (i *VMNodeIndex) Update(oldVM, newVM *vmapi.VirtualMachine) {
	// Removing a key that isn't present is a no-op, so this is safe even if oldVM was never
	// indexed.
	delete(i.forNode, oldVM.UID)

	if newVM.Status.Node == i.node {
		i.forNode[newVM.UID] = newVM
	}
}
// Delete removes the entry with vm's UID from the index, if there is one.
func (i *VMNodeIndex) Delete(vm *vmapi.VirtualMachine) {
// note: delete is a no-op if the key isn't present.
delete(i.forNode, vm.UID)
}
// List returns a newly allocated slice containing every VM currently in the index. The order of
// the result follows map iteration and is therefore unspecified.
func (i *VMNodeIndex) List() []*vmapi.VirtualMachine {
	vms := make([]*vmapi.VirtualMachine, 0, len(i.forNode))
	for uid := range i.forNode {
		vms = append(vms, i.forNode[uid])
	}
	return vms
}
package billing
// Prometheus metrics for the agent's billing subsystem
import (
"strconv"
"github.com/prometheus/client_golang/prometheus"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
// PromMetrics groups the Prometheus collectors used by the billing subsystem. Values are created
// by NewPromMetrics and registered with MustRegister.
type PromMetrics struct {
// vmsProcessedTotal counts every VM processed, labeled by is_endpoint, autoscaling_enabled, phase.
vmsProcessedTotal *prometheus.CounterVec
// vmsCurrent is the number of VMs seen in the latest batch, with the same labels as above.
vmsCurrent *prometheus.GaugeVec
// queueSizeCurrent is the number of unsent events in each client's queue, labeled by client.
queueSizeCurrent *prometheus.GaugeVec
// lastSendDuration is the duration in seconds of the latest send, labeled by client.
lastSendDuration *prometheus.GaugeVec
// sendErrorsTotal counts failed attempts to send events, labeled by client and cause.
sendErrorsTotal *prometheus.CounterVec
}
// NewPromMetrics creates the collectors for the billing subsystem. The returned collectors are
// not registered anywhere yet; use MustRegister for that.
func NewPromMetrics() PromMetrics {
	// Per-VM metrics, labeled with some bits of VM metadata.
	vmsProcessed := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "autoscaling_agent_billing_vms_processed_total",
			Help: "Total number of times the autoscaler-agent's billing subsystem processes any VM",
		},
		[]string{"is_endpoint", "autoscaling_enabled", "phase"},
	)
	vmsCurrent := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "autoscaling_agent_billing_vms_current",
			Help: "Total current VMs visible to the autoscaler-agent's billing subsystem, labeled by some bits of metadata",
		},
		[]string{"is_endpoint", "autoscaling_enabled", "phase"},
	)

	// Per-client metrics about queueing and sending events.
	queueSize := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "autoscaling_agent_billing_queue_size",
			Help: "Size of the billing subsystem's queue of unsent events",
		},
		[]string{"client"},
	)
	sendDuration := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "autoscaling_agent_billing_last_send_duration_seconds",
			Help: "Duration, in seconds, that it took to send the latest set of billing events (or current time if ongoing)",
		},
		[]string{"client"},
	)
	sendErrors := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "autoscaling_agent_billing_send_errors_total",
			Help: "Total errors from attempting to send billing events",
		},
		[]string{"client", "cause"},
	)

	return PromMetrics{
		vmsProcessedTotal: vmsProcessed,
		vmsCurrent:        vmsCurrent,
		queueSizeCurrent:  queueSize,
		lastSendDuration:  sendDuration,
		sendErrorsTotal:   sendErrors,
	}
}
// MustRegister registers all of the billing collectors with reg, in declaration order. As with
// (*prometheus.Registry).MustRegister, a registration failure panics.
func (m PromMetrics) MustRegister(reg *prometheus.Registry) {
	reg.MustRegister(
		m.vmsProcessedTotal,
		m.vmsCurrent,
		m.queueSizeCurrent,
		m.lastSendDuration,
		m.sendErrorsTotal,
	)
}
// batchMetrics accumulates per-label VM counts for one batch; it is created by
// (PromMetrics).forBatch, updated with inc, and written out to the gauge by finish.
type batchMetrics struct {
// total is the number of VMs seen so far in this batch, per label combination.
total map[batchMetricsLabels]int
// vmsProcessedTotal is incremented directly by inc, for every VM.
vmsProcessedTotal *prometheus.CounterVec
// vmsCurrent is set from total when finish is called.
vmsCurrent *prometheus.GaugeVec
}
// batchMetricsLabels is the set of label values identifying one series of the per-VM metrics. It
// is used as the key of batchMetrics.total.
type batchMetricsLabels struct {
// isEndpoint is the formatted boolean ("true" or "false") for the is_endpoint label.
isEndpoint string
// autoscalingEnabled is the formatted boolean for the autoscaling_enabled label.
autoscalingEnabled string
// phase is the VM phase, converted to a string.
phase string
}
// forBatch starts a new batch of per-VM metrics.
//
// The vmsCurrent gauge is reset immediately, so label combinations that don't show up in the new
// batch are removed; the counts gathered in the batch are written back by (batchMetrics).finish.
func (m PromMetrics) forBatch() batchMetrics {
	m.vmsCurrent.Reset()

	var batch batchMetrics
	batch.total = make(map[batchMetricsLabels]int)
	batch.vmsProcessedTotal = m.vmsProcessedTotal
	batch.vmsCurrent = m.vmsCurrent
	return batch
}
// isEndpointFlag is a distinct boolean type for the isEndpoint argument of (batchMetrics).inc.
type isEndpointFlag bool
// autoscalingEnabledFlag is a distinct boolean type for the autoscalingEnabled argument of
// (batchMetrics).inc.
type autoscalingEnabledFlag bool
// inc records one processed VM with the given properties: the batch-local count for that label
// combination goes up by one, and so does the vmsProcessedTotal counter.
func (b batchMetrics) inc(isEndpoint isEndpointFlag, autoscalingEnabled autoscalingEnabledFlag, phase vmapi.VmPhase) {
	labels := batchMetricsLabels{
		isEndpoint:         strconv.FormatBool(bool(isEndpoint)),
		autoscalingEnabled: strconv.FormatBool(bool(autoscalingEnabled)),
		phase:              string(phase),
	}

	b.total[labels]++

	counter := b.vmsProcessedTotal.WithLabelValues(labels.isEndpoint, labels.autoscalingEnabled, labels.phase)
	counter.Inc()
}
// finish publishes the counts gathered in this batch by setting the vmsCurrent gauge for every
// label combination that was seen.
func (b batchMetrics) finish() {
	for labels, n := range b.total {
		gauge := b.vmsCurrent.WithLabelValues(labels.isEndpoint, labels.autoscalingEnabled, labels.phase)
		gauge.Set(float64(n))
	}
}
package billing
// Implementation of the event queue for mediating event generation and event sending.
//
// The "public" (ish - it's all one package) types are eventQueuePuller and eventQueuePusher, two
// halves of the same queue. Each half is only safe for use from a single thread, but *together*
// they can be used in separate threads.
import (
"sync"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/exp/slices"
)
// eventQueueInternals is the state shared between the two halves of an event queue.
//
// this is generic just so there's less typing - "billing.IncrementalEvent" is long!
type eventQueueInternals[E any] struct {
// mu is held by the pusher/puller methods while they access items.
mu sync.Mutex
// items holds the queued events; enqueue appends to the end, drop removes from the front.
items []E
// sizeGauge is set to len(items) by updateGauge.
sizeGauge prometheus.Gauge
}
// eventQueuePuller is the consuming half of an event queue, providing size, get and drop.
type eventQueuePuller[E any] struct {
// internals is shared with the eventQueuePusher returned by the same newEventQueue call.
internals *eventQueueInternals[E]
}
// eventQueuePusher is the producing half of an event queue, providing enqueue.
type eventQueuePusher[E any] struct {
// internals is shared with the eventQueuePuller returned by the same newEventQueue call.
internals *eventQueueInternals[E]
}
// newEventQueue creates an empty queue and returns its two halves, which share the same
// underlying state. sizeGauge is updated with the queue's length whenever events are added or
// dropped.
func newEventQueue[E any](sizeGauge prometheus.Gauge) (eventQueuePusher[E], eventQueuePuller[E]) {
	// The zero value already has an unlocked mutex and no items; only the gauge needs setting.
	shared := new(eventQueueInternals[E])
	shared.sizeGauge = sizeGauge

	pusher := eventQueuePusher[E]{internals: shared}
	puller := eventQueuePuller[E]{internals: shared}
	return pusher, puller
}
// updateGauge sets the size gauge to the current number of queued items.
//
// NB: must hold mu
func (qi *eventQueueInternals[E]) updateGauge() {
	size := len(qi.items)
	qi.sizeGauge.Set(float64(size))
}
// enqueue appends events to the end of the queue and refreshes the size gauge, all while holding
// the queue's lock.
func (q eventQueuePusher[E]) enqueue(events ...E) {
	qi := q.internals
	qi.mu.Lock()
	defer qi.mu.Unlock()

	qi.items = append(qi.items, events...)
	qi.updateGauge()
}
// size returns the number of events currently in the queue.
func (q eventQueuePuller[E]) size() int {
	qi := q.internals
	qi.mu.Lock()
	defer qi.mu.Unlock()

	return len(qi.items)
}
// get returns up to limit events from the front of the queue, without removing them. Use drop to
// remove events once they've been handled.
//
// The result is a view into the queue's own storage, not a copy.
func (q eventQueuePuller[E]) get(limit int) []E {
	qi := q.internals
	qi.mu.Lock()
	defer qi.mu.Unlock()

	n := len(qi.items)
	if limit < n {
		n = limit
	}

	// NOTE: this kind of access escaping the mutex is only sound because this access is only
	// granted to the puller, and there's only one puller, and it isn't sound to use the output of a
	// previous get() after calling drop().
	return qi.items[:n]
}
// drop removes the first count events from the queue and refreshes the size gauge.
func (q eventQueuePuller[E]) drop(count int) {
	qi := q.internals
	qi.mu.Lock()
	defer qi.mu.Unlock()

	// Replacing the range [0, count) with nothing removes those leading elements.
	remaining := slices.Replace(qi.items, 0, count)
	qi.items = remaining
	qi.updateGauge()
}
package billing
// Logic responsible for sending billing events by repeatedly pulling from the eventQueue
import (
"context"
"fmt"
"time"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/billing"
"github.com/neondatabase/autoscaling/pkg/util"
)
// clientInfo bundles a billing client with its name and its sending configuration.
type clientInfo struct {
// client is the destination that events are sent to.
client billing.Client
// name identifies the client; it is used as the "client" metric label and in logger names.
name string
// config holds the push interval, request timeout and maximum batch size for this client.
config BaseClientConfig
}
// eventSender holds the state for one client's sending loop (see senderLoop), which repeatedly
// pulls events from queue and sends them to the embedded client.
type eventSender struct {
clientInfo
// metrics provides the lastSendDuration and sendErrorsTotal collectors updated while sending.
metrics PromMetrics
// queue is the pulling half of this client's event queue.
queue eventQueuePuller[*billing.IncrementalEvent]
// collectorFinished signals that the collector is done; senderLoop then does one final send
// and exits.
collectorFinished util.CondChannelReceiver
// lastSendDuration tracks the "real" last full duration of (eventSender).sendAllCurrentEvents().
//
// It's separate from metrics.lastSendDuration because (a) we'd like to include the duration of
// ongoing calls to sendAllCurrentEvents, but (b) we don't want the bias towards lower durations
// that comes with that.
//
// Here's some more detail:
//
// To make sure that long-running sendAllCurrentEvents() loops show up in the metrics while
// they're still running, we want to periodically update metrics.lastSendDuration before the
// loop has finished. A side-effect of doing this naively is that the gauge will sometimes
// return durations that are much shorter than the *actual* previous send loop duration.
//
// In order to fix this, we store that *actual* previous duration in this field, but only
// update the metric when either (a) the loop is done, or (b) the duration so far is already
// longer than the previous one.
//
// This means that we remove the bias towards shorter durations, at the expense of sometimes
// returning higher durations for too long. IMO that's ok, and we'd rather have our metrics give
// a pessimistic but more accurate view.
lastSendDuration time.Duration
}
// senderLoop sends all queued events once every s.config.PushEverySeconds seconds.
//
// When the collector signals that it has finished, one last send is performed and the loop
// returns.
func (s eventSender) senderLoop(logger *zap.Logger) {
	ticker := time.NewTicker(time.Second * time.Duration(s.config.PushEverySeconds))
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			s.sendAllCurrentEvents(logger)
		case <-s.collectorFinished.Recv():
			logger.Info("Received notification that collector finished")
			// Flush whatever is still queued before exiting.
			s.sendAllCurrentEvents(logger)
			logger.Info("Ending events sender loop")
			return
		}
	}
}
// sendAllCurrentEvents pushes events from the queue to the client, in chunks of at most
// s.config.MaxBatchSize, until the queue is empty or a request fails.
//
// Events are only dropped from the queue after the chunk containing them was sent successfully,
// so after a failure the unsent events stay queued for the next call.
//
// The lastSendDuration gauge is set as follows:
//   - 1e-6 if the queue was empty to begin with,
//   - 0 if a request failed,
//   - the total duration once everything has been sent, and
//   - while still in progress, the duration so far - but only once that exceeds the duration
//     of the previous run (see the lastSendDuration field).
//
// The receiver is a pointer so that updates to s.lastSendDuration are still there on the next
// call; with a value receiver they would be made to a copy and discarded on return.
func (s *eventSender) sendAllCurrentEvents(logger *zap.Logger) {
	logger.Info("Pushing all available events")

	if s.queue.size() == 0 {
		logger.Info("No billing events to push")
		s.lastSendDuration = 0
		s.metrics.lastSendDuration.WithLabelValues(s.clientInfo.name).Set(1e-6) // small value, to indicate that nothing happened
		return
	}

	total := 0
	startTime := time.Now()

	// while there's still events in the queue, send them
	//
	// If events are being added to the queue faster than we can send them, this loop will not
	// terminate. For the most part, that's ok: worst-case, we miss the collectorFinished
	// notification, which isn't the end of the world. Any long-running call to this function will
	// be reported by s.metrics.lastSendDuration as we go (provided the request timeout isn't too
	// long).
	for {
		if size := s.queue.size(); size != 0 {
			logger.Info("Current queue size is non-zero", zap.Int("queueSize", size))
		}

		chunk := s.queue.get(int(s.config.MaxBatchSize))
		count := len(chunk)
		if count == 0 {
			// Nothing left: record the full duration of this run.
			totalTime := time.Since(startTime)
			s.lastSendDuration = totalTime
			s.metrics.lastSendDuration.WithLabelValues(s.clientInfo.name).Set(totalTime.Seconds())

			logger.Info(
				"All available events have been sent",
				zap.Int("total", total),
				zap.Duration("totalTime", totalTime),
			)
			return
		}

		traceID := billing.GenerateTraceID()

		logger.Info(
			"Pushing billing events",
			zap.Int("count", count),
			zap.String("traceID", string(traceID)),
			s.client.LogFields(),
		)

		reqStart := time.Now()
		// The closure scopes the deferred cancel to this one request.
		err := func() error {
			reqCtx, cancel := context.WithTimeout(context.TODO(), time.Second*time.Duration(s.config.PushRequestTimeoutSeconds))
			defer cancel()

			return billing.Send(reqCtx, s.client, traceID, chunk)
		}()
		reqDuration := time.Since(reqStart)

		if err != nil {
			// Something went wrong and we're going to abandon attempting to push any further
			// events.
			logger.Error(
				"Failed to push billing events",
				zap.Int("count", count),
				zap.Duration("after", reqDuration),
				zap.String("traceID", string(traceID)),
				s.client.LogFields(),
				zap.Int("total", total),
				zap.Duration("totalTime", time.Since(startTime)),
				zap.Error(err),
			)

			// Reduce the error to a short cause, used as a metric label.
			var rootErr string
			//nolint:errorlint // The type switch (instead of errors.As) is ok; billing.Send() guarantees the error types.
			switch e := err.(type) {
			case billing.JSONError:
				rootErr = "JSON marshaling"
			case billing.UnexpectedStatusCodeError:
				rootErr = fmt.Sprintf("HTTP code %d", e.StatusCode)
			case billing.S3Error:
				rootErr = "S3 error"
			case billing.AzureError:
				rootErr = "Azure Blob error"
			default:
				rootErr = util.RootError(err).Error()
			}
			s.metrics.sendErrorsTotal.WithLabelValues(s.clientInfo.name, rootErr).Inc()

			s.lastSendDuration = 0
			s.metrics.lastSendDuration.WithLabelValues(s.clientInfo.name).Set(0.0) // use 0 as a flag that something went wrong; there's no valid time here.
			return
		}

		s.queue.drop(count) // mark len(chunk) as successfully processed
		total += len(chunk)
		currentTotalTime := time.Since(startTime)

		logger.Info(
			"Successfully pushed some billing events",
			zap.Int("count", count),
			zap.Duration("after", reqDuration),
			zap.String("traceID", string(traceID)),
			s.client.LogFields(),
			zap.Int("total", total),
			zap.Duration("totalTime", currentTotalTime),
		)

		// Only report in-progress durations once they exceed the previously recorded one, so
		// that the gauge isn't biased towards short values while a long run is ongoing.
		if currentTotalTime > s.lastSendDuration {
			s.lastSendDuration = currentTotalTime
			s.metrics.lastSendDuration.WithLabelValues(s.clientInfo.name).Set(currentTotalTime.Seconds())
		}
	}
}
package agent
import (
"encoding/json"
"fmt"
"os"
"github.com/tychoish/fun/erc"
"github.com/neondatabase/autoscaling/pkg/agent/billing"
"github.com/neondatabase/autoscaling/pkg/api"
)
// Config is the top-level configuration, decoded from JSON by ReadConfig and checked by validate.
type Config struct {
// NOTE(review): the JSON key below is spelled "refereshStateIntervalSeconds" (sic). Correcting
// the spelling would change which key is accepted in config files.
RefreshStateIntervalSeconds uint `json:"refereshStateIntervalSeconds"`
Scaling ScalingConfig `json:"scaling"`
Metrics MetricsConfig `json:"metrics"`
Scheduler SchedulerConfig `json:"scheduler"`
Monitor MonitorConfig `json:"monitor"`
NeonVM NeonVMConfig `json:"neonvm"`
Billing billing.Config `json:"billing"`
// DumpState is optional; validate only checks its fields when it is non-nil.
DumpState *DumpStateConfig `json:"dumpState"`
}
// RateThresholdConfig describes a rate as a threshold count over an interval. It is the type of
// the MaxFailedRequestRate fields in the monitor, scheduler and NeonVM configs.
type RateThresholdConfig struct {
// IntervalSeconds is the length of the interval, in seconds. validate requires it to be non-zero.
IntervalSeconds uint `json:"intervalSeconds"`
// Threshold is the count within the interval. validate does not constrain it.
Threshold uint `json:"threshold"`
}
// MonitorConfig defines parameters for communicating with the vm-monitor. validate requires every
// duration and port in it to be non-zero.
type MonitorConfig struct {
// ResponseTimeoutSeconds is a timeout, in seconds.
// NOTE(review): presumably for responses from the vm-monitor - confirm against its usage.
ResponseTimeoutSeconds uint `json:"responseTimeoutSeconds"`
// ConnectionTimeoutSeconds gives how long we may take to connect to the
// monitor before cancelling.
ConnectionTimeoutSeconds uint `json:"connectionTimeoutSeconds"`
// ConnectionRetryMinWaitSeconds gives the minimum amount of time we must wait between attempts
// to connect to the vm-monitor, regardless of whether they're successful.
ConnectionRetryMinWaitSeconds uint `json:"connectionRetryMinWaitSeconds"`
// ServerPort is the port that the dispatcher serves from
ServerPort uint16 `json:"serverPort"`
// UnhealthyAfterSilenceDurationSeconds gives the duration, in seconds, after which failing to
// receive a successful request from the monitor indicates that it is probably unhealthy.
UnhealthyAfterSilenceDurationSeconds uint `json:"unhealthyAfterSilenceDurationSeconds"`
// UnhealthyStartupGracePeriodSeconds gives the duration, in seconds, after which we will no
// longer excuse total VM monitor failures - i.e. when unhealthyAfterSilenceDurationSeconds
// kicks in.
UnhealthyStartupGracePeriodSeconds uint `json:"unhealthyStartupGracePeriodSeconds"`
// MaxHealthCheckSequentialFailuresSeconds gives the duration, in seconds, after which we
// should restart the connection to the vm-monitor if health checks aren't succeeding.
MaxHealthCheckSequentialFailuresSeconds uint `json:"maxHealthCheckSequentialFailuresSeconds"`
// MaxFailedRequestRate defines the maximum rate of failed monitor requests, above which
// a VM is considered stuck.
MaxFailedRequestRate RateThresholdConfig `json:"maxFailedRequestRate"`
// RetryFailedRequestSeconds gives the duration, in seconds, that we must wait before retrying a
// request that previously failed.
RetryFailedRequestSeconds uint `json:"retryFailedRequestSeconds"`
// RetryDeniedDownscaleSeconds gives the duration, in seconds, that we must wait before retrying
// a downscale request that was previously denied
RetryDeniedDownscaleSeconds uint `json:"retryDeniedDownscaleSeconds"`
// RequestedUpscaleValidSeconds gives the duration, in seconds, that requested upscaling should
// be respected for, before allowing re-downscaling.
RequestedUpscaleValidSeconds uint `json:"requestedUpscaleValidSeconds"`
}
// DumpStateConfig configures the endpoint to dump all internal state
//
// It is optional in Config; when present, validate requires both fields to be non-zero.
type DumpStateConfig struct {
// Port is the port to serve on
Port uint16 `json:"port"`
// TimeoutSeconds gives the maximum duration, in seconds, that we allow for a request to dump
// internal state.
TimeoutSeconds uint `json:"timeoutSeconds"`
}
// ScalingConfig defines the scheduling we use for scaling up and down
type ScalingConfig struct {
// ComputeUnit is the desired ratio between CPU and memory that the autoscaler-agent should
// uphold when making changes to a VM
//
// validate requires both its VCPU and Mem to be non-zero.
ComputeUnit api.Resources `json:"computeUnit"`
// DefaultConfig gives the default scaling config, to be used if there is no configuration
// supplied with the "autoscaling.neon.tech/config" annotation.
//
// validate checks it with its ValidateDefaults method.
DefaultConfig api.ScalingConfig `json:"defaultConfig"`
}
// MetricsConfig defines a few parameters for metrics requests to the VM
//
// There is one MetricsSourceConfig per metrics source.
type MetricsConfig struct {
// System configures requests for system metrics.
System MetricsSourceConfig `json:"system"`
// LFC configures requests for LFC metrics.
LFC MetricsSourceConfig `json:"lfc"`
}
// MetricsSourceConfig defines how to request metrics from a single source in the VM. validate
// requires all three fields to be non-zero.
type MetricsSourceConfig struct {
// Port is the port that VMs are expected to provide the metrics on
//
// For system metrics, vm-builder installs vector (from vector.dev) to expose them on port 9100.
Port uint16 `json:"port"`
// RequestTimeoutSeconds gives the timeout duration, in seconds, for metrics requests
RequestTimeoutSeconds uint `json:"requestTimeoutSeconds"`
// SecondsBetweenRequests sets the number of seconds to wait between metrics requests
SecondsBetweenRequests uint `json:"secondsBetweenRequests"`
}
// SchedulerConfig defines a few parameters for scheduler requests
type SchedulerConfig struct {
// SchedulerName is the name of the scheduler we're expecting to communicate with.
//
// Any VMs that don't have a matching Spec.SchedulerName will not be autoscaled.
SchedulerName string `json:"schedulerName"`
// RequestTimeoutSeconds gives the timeout duration, in seconds, for requests to the scheduler
//
// If zero, requests will have no timeout.
//
// NOTE(review): validate currently rejects a zero value for this field, so the "no timeout"
// case can't be reached via ReadConfig - confirm which of the two is intended.
RequestTimeoutSeconds uint `json:"requestTimeoutSeconds"`
// RequestAtLeastEverySeconds gives the maximum duration we should go without attempting a
// request to the scheduler, even if nothing's changed.
RequestAtLeastEverySeconds uint `json:"requestAtLeastEverySeconds"`
// RetryFailedRequestSeconds gives the duration, in seconds, that we must wait after a previous
// failed request before making another one.
RetryFailedRequestSeconds uint `json:"retryFailedRequestSeconds"`
// RetryDeniedUpscaleSeconds gives the duration, in seconds, that we must wait before resending
// a request for resources that were not approved
RetryDeniedUpscaleSeconds uint `json:"retryDeniedUpscaleSeconds"`
// RequestPort defines the port to access the scheduler's ✨special✨ API with
RequestPort uint16 `json:"requestPort"`
// MaxFailedRequestRate defines the maximum rate of failed scheduler requests, above which
// a VM is considered stuck.
MaxFailedRequestRate RateThresholdConfig `json:"maxFailedRequestRate"`
}
// NeonVMConfig defines a few parameters for NeonVM requests
//
// It is decoded from the "neonvm" key of Config.
type NeonVMConfig struct {
// RequestTimeoutSeconds gives the timeout duration, in seconds, for VM patch requests
RequestTimeoutSeconds uint `json:"requestTimeoutSeconds"`
// RetryFailedRequestSeconds gives the duration, in seconds, that we must wait after a previous
// failed request before making another one.
RetryFailedRequestSeconds uint `json:"retryFailedRequestSeconds"`
// MaxFailedRequestRate defines the maximum rate of failed NeonVM requests, above which
// a VM is considered stuck.
MaxFailedRequestRate RateThresholdConfig `json:"maxFailedRequestRate"`
}
// ReadConfig loads the JSON configuration file at path, decodes it into a Config and validates
// the result.
//
// Decoding is strict: fields that are not known to Config are an error. A non-nil error is
// returned if the file cannot be opened, cannot be decoded, or fails validation.
func ReadConfig(path string) (*Config, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("Error opening config file %q: %w", path, err)
	}
	defer f.Close()

	dec := json.NewDecoder(f)
	dec.DisallowUnknownFields()

	cfg := new(Config)
	if err := dec.Decode(cfg); err != nil {
		return nil, fmt.Errorf("Error decoding JSON config in %q: %w", path, err)
	}
	if err := cfg.validate(); err != nil {
		return nil, fmt.Errorf("Invalid config: %w", err)
	}
	return cfg, nil
}
// validate checks that all required fields of the config are set, returning an error that
// combines every problem found (rather than just the first), or nil if there are none.
//
// Each problem is reported with the JSON path of the offending field, written with a leading dot
// (e.g. ".monitor.serverPort"), so that it can be located in the config file directly.
func (c *Config) validate() error {
	ec := &erc.Collector{}

	const (
		emptyTmpl = "field %q cannot be empty"
		zeroTmpl  = "field %q cannot be zero"
	)

	// validateBaseBillingConfig checks the settings shared by all billing clients; key is the
	// JSON path of the client being checked.
	validateBaseBillingConfig := func(cfg *billing.BaseClientConfig, key string) {
		erc.Whenf(ec, cfg.PushEverySeconds == 0, zeroTmpl, fmt.Sprintf("%s.pushEverySeconds", key))
		erc.Whenf(ec, cfg.PushRequestTimeoutSeconds == 0, zeroTmpl, fmt.Sprintf("%s.pushRequestTimeoutSeconds", key))
		erc.Whenf(ec, cfg.MaxBatchSize == 0, zeroTmpl, fmt.Sprintf("%s.maxBatchSize", key))
	}

	erc.Whenf(ec, c.Billing.ActiveTimeMetricName == "", emptyTmpl, ".billing.activeTimeMetricName")
	erc.Whenf(ec, c.Billing.CPUMetricName == "", emptyTmpl, ".billing.cpuMetricName")
	erc.Whenf(ec, c.Billing.CollectEverySeconds == 0, zeroTmpl, ".billing.collectEverySeconds")
	erc.Whenf(ec, c.Billing.AccumulateEverySeconds == 0, zeroTmpl, ".billing.accumulateEverySeconds")
	// Billing clients are optional; each one is only checked if it's configured.
	if c.Billing.Clients.AzureBlob != nil {
		validateBaseBillingConfig(&c.Billing.Clients.AzureBlob.BaseClientConfig, ".billing.clients.azure")
		erc.Whenf(ec, c.Billing.Clients.AzureBlob.Endpoint == "", emptyTmpl, ".billing.clients.azure.endpoint")
		erc.Whenf(ec, c.Billing.Clients.AzureBlob.Container == "", emptyTmpl, ".billing.clients.azure.container")
	}
	if c.Billing.Clients.HTTP != nil {
		validateBaseBillingConfig(&c.Billing.Clients.HTTP.BaseClientConfig, ".billing.clients.http")
		erc.Whenf(ec, c.Billing.Clients.HTTP.URL == "", emptyTmpl, ".billing.clients.http.url")
	}
	if c.Billing.Clients.S3 != nil {
		// The key carries a leading dot, like every other field path reported here.
		validateBaseBillingConfig(&c.Billing.Clients.S3.BaseClientConfig, ".billing.clients.s3")
		erc.Whenf(ec, c.Billing.Clients.S3.Bucket == "", emptyTmpl, ".billing.clients.s3.bucket")
		erc.Whenf(ec, c.Billing.Clients.S3.Region == "", emptyTmpl, ".billing.clients.s3.region")
		erc.Whenf(ec, c.Billing.Clients.S3.PrefixInBucket == "", emptyTmpl, ".billing.clients.s3.prefixInBucket")
	}
	erc.Whenf(ec, c.DumpState != nil && c.DumpState.Port == 0, zeroTmpl, ".dumpState.port")
	erc.Whenf(ec, c.DumpState != nil && c.DumpState.TimeoutSeconds == 0, zeroTmpl, ".dumpState.timeoutSeconds")

	// validateMetricsConfig checks one metrics source; key is its name under ".metrics".
	validateMetricsConfig := func(cfg MetricsSourceConfig, key string) {
		erc.Whenf(ec, cfg.Port == 0, zeroTmpl, fmt.Sprintf(".metrics.%s.port", key))
		erc.Whenf(ec, cfg.RequestTimeoutSeconds == 0, zeroTmpl, fmt.Sprintf(".metrics.%s.requestTimeoutSeconds", key))
		erc.Whenf(ec, cfg.SecondsBetweenRequests == 0, zeroTmpl, fmt.Sprintf(".metrics.%s.secondsBetweenRequests", key))
	}
	validateMetricsConfig(c.Metrics.System, "system")
	validateMetricsConfig(c.Metrics.LFC, "lfc")

	erc.Whenf(ec, c.Scaling.ComputeUnit.VCPU == 0, zeroTmpl, ".scaling.computeUnit.vCPUs")
	erc.Whenf(ec, c.Scaling.ComputeUnit.Mem == 0, zeroTmpl, ".scaling.computeUnit.mem")
	// These fields are decoded from the "neonvm" key, so that's the path to report.
	erc.Whenf(ec, c.NeonVM.RequestTimeoutSeconds == 0, zeroTmpl, ".neonvm.requestTimeoutSeconds")
	erc.Whenf(ec, c.NeonVM.RetryFailedRequestSeconds == 0, zeroTmpl, ".neonvm.retryFailedRequestSeconds")
	erc.Whenf(ec, c.NeonVM.MaxFailedRequestRate.IntervalSeconds == 0, zeroTmpl, ".neonvm.maxFailedRequestRate.intervalSeconds")
	erc.Whenf(ec, c.Monitor.ResponseTimeoutSeconds == 0, zeroTmpl, ".monitor.responseTimeoutSeconds")
	erc.Whenf(ec, c.Monitor.ConnectionTimeoutSeconds == 0, zeroTmpl, ".monitor.connectionTimeoutSeconds")
	erc.Whenf(ec, c.Monitor.ConnectionRetryMinWaitSeconds == 0, zeroTmpl, ".monitor.connectionRetryMinWaitSeconds")
	erc.Whenf(ec, c.Monitor.ServerPort == 0, zeroTmpl, ".monitor.serverPort")
	erc.Whenf(ec, c.Monitor.UnhealthyAfterSilenceDurationSeconds == 0, zeroTmpl, ".monitor.unhealthyAfterSilenceDurationSeconds")
	erc.Whenf(ec, c.Monitor.UnhealthyStartupGracePeriodSeconds == 0, zeroTmpl, ".monitor.unhealthyStartupGracePeriodSeconds")
	erc.Whenf(ec, c.Monitor.MaxHealthCheckSequentialFailuresSeconds == 0, zeroTmpl, ".monitor.maxHealthCheckSequentialFailuresSeconds")
	erc.Whenf(ec, c.Monitor.RetryFailedRequestSeconds == 0, zeroTmpl, ".monitor.retryFailedRequestSeconds")
	erc.Whenf(ec, c.Monitor.RetryDeniedDownscaleSeconds == 0, zeroTmpl, ".monitor.retryDeniedDownscaleSeconds")
	erc.Whenf(ec, c.Monitor.RequestedUpscaleValidSeconds == 0, zeroTmpl, ".monitor.requestedUpscaleValidSeconds")
	erc.Whenf(ec, c.Monitor.MaxFailedRequestRate.IntervalSeconds == 0, zeroTmpl, ".monitor.maxFailedRequestRate.intervalSeconds")
	// add all errors if there are any: https://github.com/neondatabase/autoscaling/pull/195#discussion_r1170893494
	ec.Add(c.Scaling.DefaultConfig.ValidateDefaults())
	erc.Whenf(ec, c.Scheduler.RequestPort == 0, zeroTmpl, ".scheduler.requestPort")
	erc.Whenf(ec, c.Scheduler.RequestTimeoutSeconds == 0, zeroTmpl, ".scheduler.requestTimeoutSeconds")
	erc.Whenf(ec, c.Scheduler.RequestAtLeastEverySeconds == 0, zeroTmpl, ".scheduler.requestAtLeastEverySeconds")
	erc.Whenf(ec, c.Scheduler.RetryFailedRequestSeconds == 0, zeroTmpl, ".scheduler.retryFailedRequestSeconds")
	erc.Whenf(ec, c.Scheduler.RetryDeniedUpscaleSeconds == 0, zeroTmpl, ".scheduler.retryDeniedUpscaleSeconds")
	erc.Whenf(ec, c.Scheduler.SchedulerName == "", emptyTmpl, ".scheduler.schedulerName")
	// This is the scheduler's rate config, so it's reported under ".scheduler".
	erc.Whenf(ec, c.Scheduler.MaxFailedRequestRate.IntervalSeconds == 0, zeroTmpl, ".scheduler.maxFailedRequestRate.intervalSeconds")

	return ec.Resolve()
}
package core
import (
"time"
"go.uber.org/zap/zapcore"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
)
// ActionSet is a collection of optional actions, one field per kind of action. A nil field means
// that kind of action is absent; nil fields are omitted from the JSON encoding.
type ActionSet struct {
Wait *ActionWait `json:"wait,omitempty"`
PluginRequest *ActionPluginRequest `json:"pluginRequest,omitempty"`
NeonVMRequest *ActionNeonVMRequest `json:"neonvmRequest,omitempty"`
MonitorDownscale *ActionMonitorDownscale `json:"monitorDownscale,omitempty"`
MonitorUpscale *ActionMonitorUpscale `json:"monitorUpscale,omitempty"`
}
// ActionWait is the "wait" action of an ActionSet.
type ActionWait struct {
// Duration is how long to wait.
Duration time.Duration `json:"duration"`
}
// ActionPluginRequest is the "pluginRequest" action of an ActionSet.
type ActionPluginRequest struct {
// NOTE(review): LastPermit is encoded under the JSON key "current", but logged under the key
// "lastPermit" by MarshalLogObject - confirm that the difference is intended.
LastPermit *api.Resources `json:"current"`
Target api.Resources `json:"target"`
Metrics *api.Metrics `json:"metrics"`
TargetRevision vmv1.RevisionWithTime `json:"targetRevision"`
}
// ActionNeonVMRequest is the "neonvmRequest" action of an ActionSet, carrying the current and
// target resources along with the target revision.
type ActionNeonVMRequest struct {
Current api.Resources `json:"current"`
Target api.Resources `json:"target"`
TargetRevision vmv1.RevisionWithTime `json:"targetRevision"`
}
// ActionMonitorDownscale is the "monitorDownscale" action of an ActionSet, carrying the current
// and target resources along with the target revision.
type ActionMonitorDownscale struct {
Current api.Resources `json:"current"`
Target api.Resources `json:"target"`
TargetRevision vmv1.RevisionWithTime `json:"targetRevision"`
}
// ActionMonitorUpscale is the "monitorUpscale" action of an ActionSet, carrying the current and
// target resources along with the target revision.
type ActionMonitorUpscale struct {
Current api.Resources `json:"current"`
Target api.Resources `json:"target"`
TargetRevision vmv1.RevisionWithTime `json:"targetRevision"`
}
// addObjectPtr adds the object that value points to under key, or - if value is nil - adds a
// nil value under key instead. It returns the encoder's error, if any.
func addObjectPtr[T zapcore.ObjectMarshaler](enc zapcore.ObjectEncoder, key string, value *T) error {
	if value == nil {
		// nil ObjectMarshaler is not sound, but nil reflected is, and it shortcuts reflection
		return enc.AddReflected(key, nil)
	}
	return enc.AddObject(key, *value)
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that ActionSet can be used with zap.Object
//
// Every action is added under its own key; actions that are nil are added as nil values. Errors
// from encoding individual fields are discarded, so this always returns nil.
func (s ActionSet) MarshalLogObject(enc zapcore.ObjectEncoder) error {
_ = addObjectPtr(enc, "wait", s.Wait)
_ = addObjectPtr(enc, "pluginRequest", s.PluginRequest)
_ = addObjectPtr(enc, "neonvmRequest", s.NeonVMRequest)
_ = addObjectPtr(enc, "monitorDownscale", s.MonitorDownscale)
_ = addObjectPtr(enc, "monitorUpscale", s.MonitorUpscale)
return nil
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that ActionWait can be used with zap.Object
//
// It adds the single field "duration" and always returns nil.
func (a ActionWait) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddDuration("duration", a.Duration)
return nil
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that ActionPluginRequest can be used with zap.Object
//
// It adds "lastPermit", "target" and "metrics"; TargetRevision is not included. Errors from
// encoding individual fields are discarded, so this always returns nil.
func (a ActionPluginRequest) MarshalLogObject(enc zapcore.ObjectEncoder) error {
_ = addObjectPtr(enc, "lastPermit", a.LastPermit)
_ = enc.AddObject("target", a.Target)
// Metrics is added via reflection rather than as an object.
_ = enc.AddReflected("metrics", a.Metrics)
return nil
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that ActionNeonVMRequest can be used
// with zap.Object.
//
// It writes "current" and "target"; TargetRevision is not logged. Encoder errors are discarded and
// the method always returns nil.
func (a ActionNeonVMRequest) MarshalLogObject(enc zapcore.ObjectEncoder) error {
	current, target := a.Current, a.Target

	_ = enc.AddObject("current", current)
	_ = enc.AddObject("target", target)
	return nil
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that ActionMonitorDownscale can be used
// with zap.Object.
//
// It writes "current" and "target"; TargetRevision is not logged. Encoder errors are discarded and
// the method always returns nil.
func (a ActionMonitorDownscale) MarshalLogObject(enc zapcore.ObjectEncoder) error {
	current, target := a.Current, a.Target

	_ = enc.AddObject("current", current)
	_ = enc.AddObject("target", target)
	return nil
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that ActionMonitorUpscale can be used
// with zap.Object.
//
// It writes "current" and "target"; TargetRevision is not logged. Encoder errors are discarded and
// the method always returns nil.
func (a ActionMonitorUpscale) MarshalLogObject(enc zapcore.ObjectEncoder) error {
	current, target := a.Current, a.Target

	_ = enc.AddObject("current", current)
	_ = enc.AddObject("target", target)
	return nil
}
package core
// Implementation of (*State).Dump()
import (
"encoding/json"
"time"
"github.com/neondatabase/autoscaling/pkg/api"
)
// shallowCopy returns a pointer to a fresh copy of *ptr, or nil if ptr is nil.
//
// Only the top-level value is copied: anything reachable through pointers, slices or maps inside
// T is still shared with the original.
func shallowCopy[T any](ptr *T) *T {
	if ptr == nil {
		return nil
	}
	dup := *ptr
	return &dup
}
// StateDump provides introspection into the current values of the fields of State
//
// It implements json.Marshaler: marshaling a StateDump marshals the wrapped state value.
type StateDump struct {
// internal is the copied state, as produced by (*State).Dump.
internal state
}
// MarshalJSON implements json.Marshaler by encoding the wrapped state directly, so the output has
// the state's fields at the top level rather than nested under a wrapper key.
func (d StateDump) MarshalJSON() ([]byte, error) {
	inner := d.internal
	return json.Marshal(inner)
}
// Dump produces a JSON-serializable copy of the State
//
// Value fields are copied as-is, and every pointer field directly on the state is duplicated one
// level deep (via shallowCopy / the per-component deepCopy methods), so the returned StateDump
// does not alias those pointers. This includes LastDesiredResources, which was previously copied
// by alias.
//
// The copy is not recursive: data behind pointers or slices inside the copied values (e.g. the
// bucket slice in LFCMetrics, or pointers inside VM) is still shared with the State.
func (s *State) Dump() StateDump {
	src := &s.internal
	return StateDump{
		internal: state{
			Debug:                src.Debug,
			Config:               src.Config,
			VM:                   src.VM,
			Plugin:               src.Plugin.deepCopy(),
			Monitor:              src.Monitor.deepCopy(),
			NeonVM:               src.NeonVM.deepCopy(),
			Metrics:              shallowCopy(src.Metrics),
			LFCMetrics:           shallowCopy(src.LFCMetrics),
			TargetRevision:       src.TargetRevision,
			LastDesiredResources: shallowCopy(src.LastDesiredResources),
		},
	}
}
// deepCopy returns a copy of the pluginState in which every pointer field points at its own
// duplicate of the pointed-to value; value fields are copied as-is.
func (s *pluginState) deepCopy() pluginState {
	return pluginState{
		OngoingRequest:  s.OngoingRequest,
		LastRequest:     shallowCopy(s.LastRequest),
		LastFailureAt:   shallowCopy(s.LastFailureAt),
		Permit:          shallowCopy(s.Permit),
		CurrentRevision: s.CurrentRevision,
	}
}
// deepCopy returns a copy of the monitorState in which every pointer field points at its own
// duplicate of the pointed-to value; value fields are copied as-is.
func (s *monitorState) deepCopy() monitorState {
	return monitorState{
		OngoingRequest:     shallowCopy(s.OngoingRequest),
		RequestedUpscale:   shallowCopy(s.RequestedUpscale),
		DeniedDownscale:    shallowCopy(s.DeniedDownscale),
		Approved:           shallowCopy(s.Approved),
		DownscaleFailureAt: shallowCopy(s.DownscaleFailureAt),
		UpscaleFailureAt:   shallowCopy(s.UpscaleFailureAt),
		CurrentRevision:    s.CurrentRevision,
	}
}
// deepCopy returns a copy of the neonvmState in which every pointer field points at its own
// duplicate of the pointed-to value; value fields are copied as-is.
func (s *neonvmState) deepCopy() neonvmState {
	return neonvmState{
		LastSuccess:      shallowCopy(s.LastSuccess),
		OngoingRequested: shallowCopy(s.OngoingRequested),
		RequestFailedAt:  shallowCopy(s.RequestFailedAt),
		TargetRevision:   s.TargetRevision,
		CurrentRevision:  s.CurrentRevision,
	}
}
package core
// extracted components of how "goal CU" is determined
import (
"math"
"github.com/samber/lo"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"github.com/neondatabase/autoscaling/pkg/api"
)
// scalingGoal is the result of calculateGoalCU.
type scalingGoal struct {
// hasAllMetrics is true iff system metrics were available and, when LFC metrics are enabled in
// the scaling config, LFC metrics were available as well.
hasAllMetrics bool
// goalCU is the largest of the per-signal goal compute units (CPU, memory, total memory, LFC).
// Signals whose metrics were missing contribute zero.
goalCU uint32
}
// calculateGoalCU combines the individual scaling signals into a single goal, in compute units.
//
// Each signal is only evaluated when its metrics are present; missing signals count as zero. The
// goal is the maximum over all signals. warn is called once if the required metrics are not all
// available, and may be called again by the LFC calculation.
//
// The returned fields carry extra logging detail about the LFC calculation, if it produced any.
func calculateGoalCU(
	warn func(string),
	cfg api.ScalingConfig,
	computeUnit api.Resources,
	systemMetrics *SystemMetrics,
	lfcMetrics *LFCMetrics,
) (scalingGoal, []zap.Field) {
	haveSystem := systemMetrics != nil
	haveLFC := lfcMetrics != nil

	hasAllMetrics := haveSystem && (!*cfg.EnableLFCMetrics || haveLFC)
	if !hasAllMetrics {
		warn("Making scaling decision without all required metrics available")
	}

	var (
		lfcGoalCU, cpuGoalCU, memGoalCU, memTotalGoalCU uint32

		logFields []zap.Field
		wss       *api.Bytes // estimated working set size
	)

	if haveLFC {
		goal, estimate, logFunc := calculateLFCGoalCU(warn, cfg, computeUnit, *lfcMetrics)
		lfcGoalCU, wss = goal, estimate
		if logFunc != nil {
			logFields = append(logFields, zap.Object("lfc", zapcore.ObjectMarshalerFunc(logFunc)))
		}
	}

	if haveSystem {
		cpuGoalCU = calculateCPUGoalCU(cfg, computeUnit, *systemMetrics)
		memGoalCU = calculateMemGoalCU(cfg, computeUnit, *systemMetrics)

		// The total-memory signal additionally needs the working set size estimate.
		if wss != nil {
			memTotalGoalCU = calculateMemTotalGoalCU(cfg, computeUnit, *systemMetrics, *wss)
		}
	}

	result := scalingGoal{
		hasAllMetrics: hasAllMetrics,
		goalCU:        max(cpuGoalCU, memGoalCU, memTotalGoalCU, lfcGoalCU),
	}
	return result, logFields
}
// calculateCPUGoalCU returns the goal compute units based on CPU load.
//
// The goal is the point where (CPUs) × (LoadAverageFractionTarget) == (load average): divide the
// 1-minute load average by the target fraction to get the desired CPU count, divide that by the
// CPUs per compute unit, and round to the nearest whole number.
func calculateCPUGoalCU(
	cfg api.ScalingConfig,
	computeUnit api.Resources,
	systemMetrics SystemMetrics,
) uint32 {
	wantCPUs := systemMetrics.LoadAverage1Min / *cfg.LoadAverageFractionTarget
	cpusPerCU := computeUnit.VCPU.AsFloat64()
	return uint32(math.Round(wantCPUs / cpusPerCU))
}
// calculateMemGoalCU returns the goal compute units based on memory usage.
//
// The goal is the point where (Mem) * (MemoryUsageFractionTarget) == (Mem Usage): divide the
// memory usage by the target fraction to get the desired allocation in bytes, then convert that
// to compute units, rounding up.
func calculateMemGoalCU(
	cfg api.ScalingConfig,
	computeUnit api.Resources,
	systemMetrics SystemMetrics,
) uint32 {
	// Desired memory size, looking only at allocated memory (not including page cache...)
	wantBytes := api.Bytes(math.Round(systemMetrics.MemoryUsageBytes / *cfg.MemoryUsageFractionTarget))

	// Integer ceiling division: ceil(X/M) == floor((X+M-1)/M)
	memPerCU := computeUnit.Mem
	return uint32((wantBytes + memPerCU - 1) / memPerCU)
}
// calculateMemTotalGoalCU returns the goal compute units based on total memory: allocated memory
// plus min(page cache usage, LFC working set size), divided by MemoryTotalFractionTarget and then
// converted to compute units, rounding up.
func calculateMemTotalGoalCU(
	cfg api.ScalingConfig,
	computeUnit api.Resources,
	systemMetrics SystemMetrics,
	wss api.Bytes,
) uint32 {
	cached := min(float64(wss), systemMetrics.MemoryCachedBytes)
	wantBytes := api.Bytes((cached + systemMetrics.MemoryUsageBytes) / *cfg.MemoryTotalFractionTarget)

	// Integer ceiling division by the memory per compute unit.
	memPerCU := computeUnit.Mem
	return uint32((wantBytes + memPerCU - 1) / memPerCU)
}
// calculateLFCGoalCU returns the goal compute units needed to fit the LFC working set.
//
// Results, in order:
//   - the goal, in compute units;
//   - the projected working set size in bytes (nil if there were too few values);
//   - a function that adds details of the calculation to a log object (nil if there were too few
//     values).
//
// If lfcMetrics has fewer working set size values than the configured offset plus window size,
// warn is called and (0, nil, nil) is returned.
func calculateLFCGoalCU(
	warn func(string),
	cfg api.ScalingConfig,
	computeUnit api.Resources,
	lfcMetrics LFCMetrics,
) (uint32, *api.Bytes, func(zapcore.ObjectEncoder) error) {
	wssValues := lfcMetrics.ApproximateworkingSetSizeBuckets

	// At this point, we can assume that the values are equally spaced at 1 minute apart,
	// starting at 1 minute.
	offsetIndex := *cfg.LFCMinWaitBeforeDownscaleMinutes - 1 // -1 because values start at 1m
	windowSize := *cfg.LFCWindowSizeMinutes

	// Handle invalid metrics:
	if len(wssValues) < offsetIndex+windowSize {
		warn("not enough working set size values to make scaling determination")
		return 0, nil, nil
	}

	estimateWss := EstimateTrueWorkingSetSize(wssValues, WssEstimatorConfig{
		MaxAllowedIncreaseFactor: 3.0, // hard-code this for now.
		InitialOffset:            offsetIndex,
		WindowSize:               windowSize,
	})

	// Find the end of the prefix to project from: advance past every value that is within the
	// estimate. Start at offsetIndex to avoid panics if not monotonically non-decreasing.
	projectSliceEnd := offsetIndex
	for projectSliceEnd < len(wssValues) && wssValues[projectSliceEnd] <= estimateWss {
		projectSliceEnd++
	}

	projectLen := 0.5 // hard-code this for now.
	predictedHighestNextMinute := ProjectNextHighest(wssValues[:projectSliceEnd], projectLen)

	// predictedHighestNextMinute is still in units of 8KiB pages. Convert that into bytes...
	estimateWssMem := predictedHighestNextMinute * 8192
	// ... then invert the discount from only some of the memory going towards LFC...
	requiredMem := estimateWssMem / *cfg.LFCToMemoryRatio
	// ... and finally convert that into the actual CU required to fit the working set:
	requiredCU := requiredMem / computeUnit.Mem.AsFloat64()
	lfcGoalCU := uint32(math.Ceil(requiredCU))

	logDetails := func(obj zapcore.ObjectEncoder) error {
		obj.AddFloat64("estimateWssPages", estimateWss)
		obj.AddFloat64("predictedNextWssPages", predictedHighestNextMinute)
		obj.AddFloat64("requiredCU", requiredCU)
		return nil
	}

	return lfcGoalCU, lo.ToPtr(api.Bytes(estimateWssMem)), logDetails
}
package core
// Definition of the Metrics type, plus reading it from vector.dev's prometheus format host metrics
import (
"cmp"
"fmt"
"io"
"slices"
"strconv"
"time"
promtypes "github.com/prometheus/client_model/go"
promfmt "github.com/prometheus/common/expfmt"
"github.com/tychoish/fun/erc"
"github.com/neondatabase/autoscaling/pkg/api"
)
// SystemMetrics holds the host-level metrics used for scaling, as read by fromPrometheus.
type SystemMetrics struct {
// LoadAverage1Min is read from the "host_load1" gauge.
LoadAverage1Min float64
// MemoryUsageBytes is "host_memory_total_bytes" minus "host_memory_available_bytes", plus an
// extra 100 MiB to account for kernel memory usage.
MemoryUsageBytes float64
// MemoryCachedBytes is read from the "host_memory_cached_bytes" gauge.
MemoryCachedBytes float64
}
// ToAPI converts the metrics into their api.Metrics form.
//
// Only the 1-minute load average is carried over (narrowed to float32); LoadAverage5Min and
// MemoryUsageBytes are explicitly left nil.
func (m SystemMetrics) ToAPI() api.Metrics {
	load1 := float32(m.LoadAverage1Min)
	converted := api.Metrics{
		LoadAverage1Min:  load1,
		LoadAverage5Min:  nil,
		MemoryUsageBytes: nil,
	}
	return converted
}
// LFCMetrics holds the LFC-related metrics used for scaling, as read by fromPrometheus.
type LFCMetrics struct {
// CacheHitsTotal is read from the "lfc_hits" gauge.
CacheHitsTotal float64
// CacheMissesTotal is read from the "lfc_misses" gauge.
CacheMissesTotal float64
// CacheWritesTotal is read from the "lfc_writes" gauge.
CacheWritesTotal float64
// lfc_approximate_working_set_size_windows, currently requires that values are exactly every
// minute
//
// The values are ordered by increasing window duration, starting at 1 minute.
ApproximateworkingSetSizeBuckets []float64
}
// FromPrometheus represents metric types that can be parsed from prometheus output.
//
// Because the method is unexported, only types in this package can implement it.
type FromPrometheus interface {
// fromPrometheus populates the receiver from the parsed metric families, keyed by metric name.
fromPrometheus(map[string]*promtypes.MetricFamily) error
}
// ParseMetrics reads the prometheus text-format content, parses it, and hands the resulting metric
// families to metrics' implementation of FromPrometheus so that it can populate itself.
//
// Errors from either step are returned wrapped, with a message saying which step failed.
func ParseMetrics(content io.Reader, metrics FromPrometheus) error {
	parser := new(promfmt.TextParser)

	families, parseErr := parser.TextToMetricFamilies(content)
	if parseErr != nil {
		return fmt.Errorf("failed to parse content as prometheus text format: %w", parseErr)
	}

	extractErr := metrics.fromPrometheus(families)
	if extractErr != nil {
		return fmt.Errorf("failed to extract metrics: %w", extractErr)
	}

	return nil
}
// extractFloatGauge returns the value of a metric family that must be a gauge with exactly one
// metric. Any other type or metric count is reported as an error.
func extractFloatGauge(mf *promtypes.MetricFamily) (float64, error) {
	if kind := mf.GetType(); kind != promtypes.MetricType_GAUGE {
		return 0, fmt.Errorf("wrong metric type: expected %s but got %s", promtypes.MetricType_GAUGE, kind)
	}
	if count := len(mf.Metric); count != 1 {
		return 0, fmt.Errorf("expected 1 metric, found %d", count)
	}

	only := mf.Metric[0]
	return only.GetGauge().GetValue(), nil
}
// missingMetric builds the error reported when the metric called name is absent from the input.
func missingMetric(name string) error {
	const format = "missing expected metric %s"
	return fmt.Errorf(format, name)
}
// fromPrometheus implements FromPrometheus, so SystemMetrics can be used with ParseMetrics.
//
// All four host gauges are looked up before failing, so the returned error covers every missing
// or malformed metric at once. The receiver is only overwritten if there were no errors.
func (m *SystemMetrics) fromPrometheus(mfs map[string]*promtypes.MetricFamily) error {
	ec := &erc.Collector{}

	// gauge reads a single-valued gauge by name, recording any problem in ec and yielding 0.
	gauge := func(name string) float64 {
		mf := mfs[name]
		if mf == nil {
			ec.Add(missingMetric(name))
			return 0
		}
		value, err := extractFloatGauge(mf)
		ec.Add(err) // does nothing if err == nil
		return value
	}

	var (
		load1        = gauge("host_load1")
		memTotal     = gauge("host_memory_total_bytes")
		memAvailable = gauge("host_memory_available_bytes")
		memCached    = gauge("host_memory_cached_bytes")
	)

	if err := ec.Resolve(); err != nil {
		return err
	}

	*m = SystemMetrics{
		LoadAverage1Min: load1,
		// Add an extra 100 MiB to account for kernel memory usage
		MemoryUsageBytes:  memTotal - memAvailable + 100*(1<<20),
		MemoryCachedBytes: memCached,
	}
	return nil
}
// fromPrometheus implements FromPrometheus, so LFCMetrics can be used with ParseMetrics.
//
// The working set size windows and all three LFC gauges are looked up before failing, so the
// returned error covers every problem at once. The receiver is only overwritten if there were no
// errors.
func (m *LFCMetrics) fromPrometheus(mfs map[string]*promtypes.MetricFamily) error {
	ec := &erc.Collector{}

	// gauge reads a single-valued gauge by name, recording any problem in ec and yielding 0.
	gauge := func(name string) float64 {
		mf := mfs[name]
		if mf == nil {
			ec.Add(missingMetric(name))
			return 0
		}
		value, err := extractFloatGauge(mf)
		ec.Add(err) // does nothing if err == nil
		return value
	}

	buckets, bucketsErr := extractWorkingSetSizeWindows(mfs)
	ec.Add(bucketsErr)

	hits := gauge("lfc_hits")
	misses := gauge("lfc_misses")
	writes := gauge("lfc_writes")

	if err := ec.Resolve(); err != nil {
		return err
	}

	*m = LFCMetrics{
		CacheHitsTotal:                   hits,
		CacheMissesTotal:                 misses,
		CacheWritesTotal:                 writes,
		ApproximateworkingSetSizeBuckets: buckets,
	}
	return nil
}
// extractWorkingSetSizeWindows reads the "lfc_approximate_working_set_size_windows" gauge family
// and returns its values ordered by increasing window duration.
//
// Each metric in the family must carry a "duration_seconds" label holding an integer number of
// seconds. After sorting, the durations must be exactly 1m, 2m, 3m, ... with no gaps; anything
// else is an error.
//
// Errors are returned if the family is missing, is not a gauge, has no metrics, has a metric with
// a missing or non-integer duration label, or has unexpected durations.
func extractWorkingSetSizeWindows(mfs map[string]*promtypes.MetricFamily) ([]float64, error) {
	const (
		metricName    = "lfc_approximate_working_set_size_windows"
		durationLabel = "duration_seconds"
	)

	mf := mfs[metricName]
	if mf == nil {
		return nil, missingMetric(metricName)
	}

	if mf.GetType() != promtypes.MetricType_GAUGE {
		return nil, fmt.Errorf("wrong metric type: expected %s, but got %s", promtypes.MetricType_GAUGE, mf.GetType())
	} else if len(mf.Metric) < 1 {
		return nil, fmt.Errorf("expected >= 1 metric, found %d", len(mf.Metric))
	}

	type pair struct {
		duration time.Duration
		value    float64
	}

	pairs := make([]pair, 0, len(mf.Metric))
	for _, m := range mf.Metric {
		// Find the duration label
		durationIndex := slices.IndexFunc(m.Label, func(l *promtypes.LabelPair) bool {
			return l.GetName() == durationLabel
		})
		if durationIndex == -1 {
			return nil, fmt.Errorf("metric missing label %q", durationLabel)
		}

		durationSeconds, err := strconv.Atoi(m.Label[durationIndex].GetValue())
		if err != nil {
			return nil, fmt.Errorf("couldn't parse metric's %q label as int: %w", durationLabel, err)
		}

		pairs = append(pairs, pair{
			duration: time.Second * time.Duration(durationSeconds),
			value:    m.GetGauge().GetValue(),
		})
	}

	slices.SortFunc(pairs, func(x, y pair) int {
		return cmp.Compare(x.duration, y.duration)
	})

	// Check that the values are as expected: they should all be 1 minute apart, starting at
	// 1 minute.
	// NOTE: this assumption is relied on elsewhere for scaling on ApproximateworkingSetSizeBuckets.
	// Please search for usages before changing this behavior.
	//
	// pairs is non-empty here because mf.Metric was checked to be non-empty above and every
	// metric either appended a pair or caused an early return.
	if pairs[0].duration != time.Minute {
		return nil, fmt.Errorf("expected smallest duration to be %v, got %v", time.Minute, pairs[0].duration)
	}
	for i := range pairs {
		expected := time.Minute * time.Duration(i+1)
		if pairs[i].duration != expected {
			return nil, fmt.Errorf(
				"expected duration values to be exactly 1m apart, got unexpected value %v instead of %v",
				pairs[i].duration,
				expected,
			)
		}
	}

	values := make([]float64, 0, len(pairs))
	for _, p := range pairs {
		values = append(values, p.value)
	}
	return values, nil
}
package revsource
import (
"errors"
"time"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
// Flag bits that can be attached to a revision.
const (
// Upscale is the first flag bit (value 1).
Upscale vmv1.Flag = 1 << iota
// Downscale is the second flag bit (value 2).
Downscale
)
// MaxRevisions is the maximum number of revisions that can be stored in the RevisionSource.
// This is to prevent memory leaks.
// Upon reaching it, the oldest revisions are discarded: when Next pushes the number of stored
// measurements above this limit, it drops the oldest one.
const MaxRevisions = 100
// RevisionSource can generate and observe revisions.
// Each Revision is a value and a set of flags (for meta-information).
// Once RevisionSource observes a previously generated Revision after some time,
// it reports the time elapsed since that Revision was generated to its ObserveCallback.
type RevisionSource struct {
// cb, if not nil, is called by Observe with the elapsed time and the revision's flags.
cb ObserveCallback
// The in-flight revisions are stored in-order.
// After the revision is observed, it is removed from the measurements, and the offset is increased.
//
// measurements[i] is the generation time of the revision with value offset+i.
measurements []time.Time
// offset is the revision value corresponding to measurements[0].
offset int64
}
// NewRevisionSource creates a RevisionSource with no in-flight revisions. The first revision it
// generates has the value initialRevision + 1. cb may be nil, in which case observations are not
// reported anywhere.
func NewRevisionSource(initialRevision int64, cb ObserveCallback) *RevisionSource {
	firstValue := initialRevision + 1 // Will start from the next one
	src := &RevisionSource{
		cb:           cb,
		measurements: nil,
		offset:       firstValue,
	}
	return src
}
// nextValue returns the value the next generated revision will have: one past the value of the
// newest in-flight revision.
func (c *RevisionSource) nextValue() int64 {
	inFlight := int64(len(c.measurements))
	return c.offset + inFlight
}
// Next generates a new revision carrying flags and records now as the time it was generated.
//
// If this brings the number of in-flight revisions above MaxRevisions, the oldest one is
// forgotten.
func (c *RevisionSource) Next(now time.Time, flags vmv1.Flag) vmv1.Revision {
	// Must be read before the append below, which changes what nextValue returns.
	value := c.nextValue()

	c.measurements = append(c.measurements, now)
	if len(c.measurements) > MaxRevisions {
		c.offset++
		c.measurements = c.measurements[1:]
	}

	return vmv1.Revision{
		Value: value,
		Flags: flags,
	}
}
// Observe records that rev was seen at moment.
//
// If rev is still in flight, the callback (if any) is invoked with the time elapsed between
// generating rev and moment, together with rev's flags. rev and every older in-flight revision
// are then forgotten.
//
// Revisions older than the oldest in-flight one are treated as already observed: nothing happens
// and nil is returned. Revisions that have not been generated yet produce an error.
func (c *RevisionSource) Observe(moment time.Time, rev vmv1.Revision) error {
	if rev.Value < c.offset {
		// Already observed
		return nil
	}

	idx := rev.Value - c.offset
	// Valid indexes are [0, len): idx == len is the value Next would hand out next, which has not
	// been generated yet. Using ">" here would let that case through and panic on the index below.
	if idx >= int64(len(c.measurements)) {
		return errors.New("revision is in the future")
	}

	diff := moment.Sub(c.measurements[idx])
	if c.cb != nil {
		c.cb(diff, rev.Flags)
	}

	// Forget the measurement, and all the measurements before it.
	c.offset = rev.Value + 1
	c.measurements = c.measurements[idx+1:]

	return nil
}
// ObserveCallback receives a measured duration together with the flags of the revision it was
// measured for.
type ObserveCallback func(dur time.Duration, flags vmv1.Flag)
// Propagate sets the target revision to be current, optionally measuring the time it took
// for propagation.
//
// Nothing happens if currentSlot is nil, or if it already holds a revision whose value is at
// least target's. Otherwise cb (if not nil) is called with the time from target.UpdatedAt to now
// and target's flags, and *currentSlot is overwritten with target's revision.
func Propagate(
	now time.Time,
	target vmv1.RevisionWithTime,
	currentSlot *vmv1.Revision,
	cb ObserveCallback,
) {
	if currentSlot == nil || currentSlot.Value >= target.Value {
		return
	}

	if cb != nil {
		elapsed := now.Sub(target.UpdatedAt.Time)
		cb(elapsed, target.Flags)
	}

	*currentSlot = target.Revision
}
package core
// The core scaling logic at the heart of the autoscaler-agent. This file implements everything with
// mostly pure-ish functions, so that all the making & receiving requests can be done elsewhere.
//
// Broadly our strategy is to mimic the kind of eventual consistency that is itself used in
// Kubernetes. The scaling logic wasn't always implemented like this, but because the
// autoscaler-agent *fundamentally* exists in an eventual consistency world, we have to either:
// (a) make assumptions that we know are false; or
// (b) design our system so it assumes less.
// We used to solve this by (a). We ran into¹ issues² going that way, because sometimes those false
// assumptions come back to haunt you.
//
// That said, there's still some tricky semantics we want to maintain. Internally, the
// autoscaler-agent must be designed around eventual consistency, but the API we expose to the
// vm-monitor is strictly synchronous. As such, there's some subtle logic to make sure that we're
// not violating our own guarantees unless required to.
//
// ---
// ¹ https://github.com/neondatabase/autoscaling/issues/23
// ² https://github.com/neondatabase/autoscaling/issues/350
import (
"errors"
"fmt"
"strings"
"time"
"github.com/samber/lo"
"go.uber.org/zap"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/agent/core/revsource"
"github.com/neondatabase/autoscaling/pkg/api"
)
// ObservabilityCallbacks groups the latency callbacks, one per component the agent talks to.
//
// NOTE(review): presumably each is invoked when the corresponding component's revision is
// propagated — confirm against the call sites.
type ObservabilityCallbacks struct {
// PluginLatency receives latency observations for the scheduler plugin.
PluginLatency revsource.ObserveCallback
// MonitorLatency receives latency observations for the vm-monitor.
MonitorLatency revsource.ObserveCallback
// NeonVMLatency receives latency observations for NeonVM.
NeonVMLatency revsource.ObserveCallback
}
// RevisionSource is the interface through which State obtains and observes revisions.
//
// It has the same method set as (*revsource.RevisionSource).
type RevisionSource interface {
// Next generates a new revision with the given flags, created at time ts.
Next(ts time.Time, flags vmv1.Flag) vmv1.Revision
// Observe reports that rev was seen at moment.
Observe(moment time.Time, rev vmv1.Revision) error
}
// Config represents some of the static configuration underlying the decision-making of State
//
// The Log, RevisionSource and ObservabilityCallbacks fields are excluded from JSON output.
type Config struct {
// ComputeUnit is the desired ratio between CPU and memory, copied from the global
// autoscaler-agent config.
ComputeUnit api.Resources
// DefaultScalingConfig is just copied from the global autoscaler-agent config.
// If the VM's ScalingConfig is nil, we use this field instead.
DefaultScalingConfig api.ScalingConfig
// NeonVMRetryWait gives the amount of time to wait to retry after a failed request
NeonVMRetryWait time.Duration
// PluginRequestTick gives the period at which we should be making requests to the scheduler
// plugin, even if nothing's changed.
PluginRequestTick time.Duration
// PluginRetryWait gives the amount of time to wait to retry after a failed request
PluginRetryWait time.Duration
// PluginDeniedRetryWait gives the amount of time we must wait before re-requesting resources
// that were not fully granted.
PluginDeniedRetryWait time.Duration
// MonitorDeniedDownscaleCooldown gives the time we must wait between making duplicate
// downscale requests to the vm-monitor where the previous failed.
MonitorDeniedDownscaleCooldown time.Duration
// MonitorRequestedUpscaleValidPeriod gives the duration for which requested upscaling from the
// vm-monitor must be respected.
MonitorRequestedUpscaleValidPeriod time.Duration
// MonitorRetryWait gives the amount of time to wait to retry after a *failed* request.
MonitorRetryWait time.Duration
// Log provides an outlet for (*State).NextActions() to give informative messages or warnings
// about conditions that are impeding its ability to execute.
Log LogConfig `json:"-"`
// RevisionSource is the source of revisions to track the progress during scaling.
RevisionSource RevisionSource `json:"-"`
// ObservabilityCallbacks are the callbacks to submit datapoints for observability.
ObservabilityCallbacks ObservabilityCallbacks `json:"-"`
}
// LogConfig holds the optional logging callbacks used by State. Either may be nil, in which case
// the corresponding messages are dropped.
type LogConfig struct {
// Info, if not nil, will be called to provide information during normal functioning.
// For example, we log the calculated desired resources on every call to NextActions.
Info func(string, ...zap.Field)
// Warn, if not nil, will be called to log conditions that are impeding the ability to move the
// current resources to what's desired.
// A typical warning may be something like "wanted to do X but couldn't because of Y".
Warn func(string, ...zap.Field)
}
// State holds all of the necessary internal state for a VM in order to make scaling
// decisions
//
// It is a thin wrapper: all data lives in the unexported state value, and methods such as
// NextActions and Dump operate on it.
type State struct {
internal state
}
// one level of indirection below State so that the fields can be public, and JSON-serializable
type state struct {
// Config is the static configuration given to NewState.
Config Config
// unused. Exists to make it easier to add print debugging (via .Config.Log.Warn) for a single
// call to NextActions.
Debug bool
// VM gives the current state of the VM - or at least, the state of the fields we care about.
//
// NB: any contents behind pointers in VM are immutable. Any time the field is updated, we
// replace it with a fresh object.
VM api.VmInfo
// Plugin records all state relevant to communications with the scheduler plugin
Plugin pluginState
// Monitor records all state relevant to communications with the vm-monitor
Monitor monitorState
// NeonVM records all state relevant to the NeonVM k8s API
NeonVM neonvmState
// Metrics holds the system metrics, if any; it is nil in a freshly created State.
Metrics *SystemMetrics
// LFCMetrics holds the LFC metrics, if any; it is nil in a freshly created State.
LFCMetrics *LFCMetrics
// TargetRevision is the revision agent works towards.
TargetRevision vmv1.Revision
// LastDesiredResources is the last target agent wanted to scale to.
LastDesiredResources *api.Resources
}
// pluginState is the part of state that tracks communication with the scheduler plugin.
type pluginState struct {
// OngoingRequest is true iff there is currently an ongoing request to *this* scheduler plugin.
OngoingRequest bool
// LastRequest, if not nil, gives information about the most recently started request to the
// plugin (maybe unfinished!)
LastRequest *pluginRequested
// LastFailureAt, if not nil, gives the time of the most recent request failure
LastFailureAt *time.Time
// Permit, if not nil, stores the Permit in the most recent PluginResponse. This field will be
// nil if we have not been able to contact *any* scheduler.
Permit *api.Resources
// CurrentRevision is the most recent revision the plugin has acknowledged.
CurrentRevision vmv1.Revision
}
// pluginRequested describes a request that was started to the scheduler plugin.
type pluginRequested struct {
// At is the request's timestamp; request ticks and denied-request backoff are measured from it.
At time.Time
// Resources is what was asked for in the request.
Resources api.Resources
}
// monitorState is the part of state that tracks communication with the vm-monitor.
type monitorState struct {
// OngoingRequest, if not nil, describes the request currently in flight to the vm-monitor.
OngoingRequest *ongoingMonitorRequest
// RequestedUpscale, if not nil, stores the most recent *unresolved* upscaling requested by the
// vm-monitor, along with the time at which it occurred.
RequestedUpscale *requestedUpscale
// DeniedDownscale, if not nil, stores the result of the latest denied /downscale request.
DeniedDownscale *deniedDownscale
// Approved stores the most recent Resources associated with either (a) an accepted downscale
// request, or (b) a successful upscale notification.
//
// A nil value means the monitor is not considered active (see active()).
Approved *api.Resources
// DownscaleFailureAt, if not nil, stores the time at which a downscale request most recently
// failed (where "failed" means that some unexpected error occurred, not that it was merely
// denied).
DownscaleFailureAt *time.Time
// UpscaleFailureAt, if not nil, stores the time at which an upscale request most recently
// failed
UpscaleFailureAt *time.Time
// CurrentRevision is the most recent revision the monitor has acknowledged.
CurrentRevision vmv1.Revision
}
// active reports whether the monitor state has an Approved value recorded.
func (ms *monitorState) active() bool {
	approved := ms.Approved
	return approved != nil
}
// ongoingMonitorRequest describes a request to the vm-monitor that has not finished yet.
type ongoingMonitorRequest struct {
// Kind says whether this is a downscale or an upscale request.
Kind monitorRequestKind
// Requested is the resources the request is for.
Requested api.Resources
}
// monitorRequestKind names the kind of a request to the vm-monitor; see the
// monitorRequestKind* constants for the defined values.
type monitorRequestKind string
// The defined kinds of vm-monitor request.
const (
// monitorRequestKindDownscale marks a downscale request.
monitorRequestKindDownscale monitorRequestKind = "downscale"
// monitorRequestKindUpscale marks an upscale request.
monitorRequestKindUpscale monitorRequestKind = "upscale"
)
// requestedUpscale records upscaling that was requested by the vm-monitor.
type requestedUpscale struct {
// At is the time at which the upscale was requested.
At time.Time
// Base is presumably the resources in effect when the request was made — TODO confirm.
Base api.Resources
// Requested says which resources more was asked for.
Requested api.MoreResources
}
// deniedDownscale records a downscale request that the vm-monitor denied.
type deniedDownscale struct {
// At is presumably the time of the denial — TODO confirm where it is set.
At time.Time
// Current is presumably the resources in effect at that time — TODO confirm.
Current api.Resources
// Requested is the resources that the denied request asked to downscale to.
Requested api.Resources
}
// neonvmState is the part of state that tracks requests to the NeonVM API.
type neonvmState struct {
// LastSuccess, if not nil, is presumably the resources of the last successful request —
// TODO confirm where it is set.
LastSuccess *api.Resources
// OngoingRequested, if not nil, gives the resources requested
OngoingRequested *api.Resources
// RequestFailedAt, if not nil, is the time of a failed request; retries are held back until
// Config.NeonVMRetryWait has passed since then.
RequestFailedAt *time.Time
// TargetRevision is the revision agent works towards. Contrary to monitor/plugin, we
// store it not only in action, but also here. This is needed, because for NeonVM propagation
// happens after the changes are actually applied, when the action object is long gone.
TargetRevision vmv1.RevisionWithTime
// CurrentRevision is the current revision recorded for NeonVM.
CurrentRevision vmv1.Revision
}
// ongoingRequest reports whether there is a NeonVM request in flight, i.e. whether
// OngoingRequested is set.
func (ns *neonvmState) ongoingRequest() bool {
	requested := ns.OngoingRequested
	return requested != nil
}
// NewState creates the State for a VM, using the given VM information and configuration.
//
// Everything else starts out empty: no ongoing or previous requests, no permits or approvals, no
// metrics, and all revisions set to the zero revision.
func NewState(vm api.VmInfo, config Config) *State {
	plugin := pluginState{
		OngoingRequest:  false,
		LastRequest:     nil,
		LastFailureAt:   nil,
		Permit:          nil,
		CurrentRevision: vmv1.ZeroRevision,
	}

	monitor := monitorState{
		OngoingRequest:     nil,
		RequestedUpscale:   nil,
		DeniedDownscale:    nil,
		Approved:           nil,
		DownscaleFailureAt: nil,
		UpscaleFailureAt:   nil,
		CurrentRevision:    vmv1.ZeroRevision,
	}

	neonvm := neonvmState{
		LastSuccess:      nil,
		OngoingRequested: nil,
		RequestFailedAt:  nil,
		TargetRevision:   vmv1.ZeroRevision.WithTime(time.Time{}),
		CurrentRevision:  vmv1.ZeroRevision,
	}

	return &State{
		internal: state{
			Config:               config,
			Debug:                false,
			VM:                   vm,
			Plugin:               plugin,
			Monitor:              monitor,
			NeonVM:               neonvm,
			Metrics:              nil,
			LFCMetrics:           nil,
			LastDesiredResources: nil,
			TargetRevision:       vmv1.ZeroRevision,
		},
	}
}
// info forwards msg and fields to the configured Info callback; it does nothing if none is set.
func (s *state) info(msg string, fields ...zap.Field) {
	logInfo := s.Config.Log.Info
	if logInfo == nil {
		return
	}
	logInfo(msg, fields...)
}
// warn forwards msg to the configured Warn callback; it does nothing if none is set.
//
// Structured fields are currently not passed through (the parameter is commented out).
func (s *state) warn(msg string /* , fields ...zap.Field */) {
	logWarn := s.Config.Log.Warn
	if logWarn == nil {
		return
	}
	logWarn(msg /* , fields... */)
}
// warnf is like warn, but builds the message with fmt.Sprintf from the format msg and args.
func (s *state) warnf(msg string, args ...any) {
	formatted := fmt.Sprintf(msg, args...)
	s.warn(formatted)
}
// NextActions is used to implement the state machine. It performs no requests itself and *just*
// indicates what the executor should do.
//
// It is not entirely side-effect free: when a NeonVM request is planned, the target revision for
// that request is recorded in the NeonVM state, and warnings may be emitted through the
// configured log callbacks.
func (s *State) NextActions(now time.Time) ActionSet {
	actions := s.internal.nextActions(now)
	return actions
}
// nextActions computes the full set of actions to take at time now.
//
// The desired resources are calculated first, and then each kind of request is considered in a
// fixed order: scheduler plugin, NeonVM, vm-monitor upscale, vm-monitor downscale. Each of those
// may also ask for a wait; if any does, the shortest one becomes the Wait action.
func (s *state) nextActions(now time.Time) ActionSet {
	var actions ActionSet

	desiredResources, calcDesiredResourcesWait := s.desiredResourcesFromMetricsOrRequestedUpscaling(now)
	if calcDesiredResourcesWait == nil {
		// our handling later on is easier if we can assume it's non-nil
		calcDesiredResourcesWait = func(ActionSet) *time.Duration { return nil }
	}

	// ----
	// Requests to the scheduler plugin:
	pluginAction, pluginRequiredWait := s.calculatePluginAction(now, desiredResources)
	actions.PluginRequest = pluginAction

	// ----
	// Requests to NeonVM:
	var pluginRequested *api.Resources
	pluginRequestedPhase := "<this string should not appear>"
	switch {
	case s.Plugin.OngoingRequest:
		pluginRequested = &s.Plugin.LastRequest.Resources
		pluginRequestedPhase = "ongoing"
	case actions.PluginRequest != nil:
		pluginRequested = &actions.PluginRequest.Target
		pluginRequestedPhase = "planned"
	}

	neonvmAction, neonvmRequiredWait := s.calculateNeonVMAction(now, desiredResources, pluginRequested, pluginRequestedPhase)
	actions.NeonVMRequest = neonvmAction

	// ----
	// Requests to vm-monitor (upscaling)
	//
	// NB: upscaling takes priority over downscaling requests, because otherwise we'd potentially
	// forego notifying the vm-monitor of increased resources because we were busy asking if it
	// could downscale.
	upscaleAction, monitorUpscaleRequiredWait := s.calculateMonitorUpscaleAction(now, desiredResources)
	actions.MonitorUpscale = upscaleAction

	// ----
	// Requests to vm-monitor (downscaling)
	plannedUpscale := actions.MonitorUpscale != nil
	downscaleAction, monitorDownscaleRequiredWait := s.calculateMonitorDownscaleAction(now, desiredResources, plannedUpscale)
	actions.MonitorDownscale = downscaleAction

	// --- and that's all the request types! ---

	// If there's anything waiting, we should also note how long we should wait for.
	// The largest representable duration serves as the "nothing to wait for" marker.
	const maximumDuration = time.Duration(1<<63 - 1)

	requiredWait := maximumDuration
	candidates := []*time.Duration{
		calcDesiredResourcesWait(actions),
		pluginRequiredWait,
		neonvmRequiredWait,
		monitorUpscaleRequiredWait,
		monitorDownscaleRequiredWait,
	}
	for _, wait := range candidates {
		if wait != nil && *wait < requiredWait {
			requiredWait = *wait
		}
	}

	// If we're waiting on anything, add it as an action
	if requiredWait != maximumDuration {
		actions.Wait = &ActionWait{Duration: requiredWait}
	}

	return actions
}
// calculatePluginAction decides whether a request to the scheduler plugin should be made at time
// now, given the resources we would like to have.
//
// It returns at most one of:
//   - the request to make; or
//   - a duration after which the decision should be re-evaluated, because a request is currently
//     held back by the regular request tick or by a backoff.
//
// Both results are nil while a request to the plugin is already ongoing.
func (s *state) calculatePluginAction(
now time.Time,
desiredResources api.Resources,
) (*ActionPluginRequest, *time.Duration) {
logFailureReason := func(reason string) {
s.warnf("Wanted to make a request to the scheduler plugin, but %s", reason)
}
// additional resources we want to request OR previous downscaling we need to inform the plugin of
// NOTE: only valid if s.Plugin.Permit != nil AND there's no ongoing NeonVM request.
requestResources := s.clampResources(
s.VM.Using(),
desiredResources,
ptr(s.VM.Using()), // don't decrease below VM using (decrease happens *before* telling the plugin)
nil, // but any increase is ok
)
// resources if we're just informing the plugin of current resource usage.
currentResources := s.VM.Using()
if s.NeonVM.OngoingRequested != nil {
// include any ongoing NeonVM request, because we're already using that.
currentResources = currentResources.Max(*s.NeonVM.OngoingRequested)
}
// We want to make a request to the scheduler plugin if:
// 1. it's been long enough since the previous request (so we're obligated by PluginRequestTick); or
// 2.a. we want to request resources / inform it of downscale;
// b. there isn't any ongoing, conflicting request; and
// c. we haven't recently been denied these resources
//
// With no previous request, timeUntilNextRequestTick stays zero, so a request is due immediately.
var timeUntilNextRequestTick time.Duration
if s.Plugin.LastRequest != nil {
timeUntilNextRequestTick = s.Config.PluginRequestTick - now.Sub(s.Plugin.LastRequest.At)
}
timeForRequest := timeUntilNextRequestTick <= 0
var timeUntilRetryBackoffExpires time.Duration
// The previous request counts as denied if it finished and asked for more of some resource
// than the permit we hold.
requestPreviouslyDenied := !s.Plugin.OngoingRequest &&
s.Plugin.LastRequest != nil &&
s.Plugin.Permit != nil &&
s.Plugin.LastRequest.Resources.HasFieldGreaterThan(*s.Plugin.Permit)
if requestPreviouslyDenied {
timeUntilRetryBackoffExpires = s.Plugin.LastRequest.At.Add(s.Config.PluginDeniedRetryWait).Sub(now)
}
waitingOnRetryBackoff := timeUntilRetryBackoffExpires > 0
// changing the resources we're requesting from the plugin
wantToRequestNewResources := s.Plugin.LastRequest != nil && s.Plugin.Permit != nil &&
requestResources != *s.Plugin.Permit
// ... and this isn't a duplicate (or, at least it's been long enough)
shouldRequestNewResources := wantToRequestNewResources && !waitingOnRetryBackoff
// If we shouldn't ask for anything new, fall back to just reporting current usage.
permittedRequestResources := requestResources
if !shouldRequestNewResources {
permittedRequestResources = currentResources
}
// Can't make a duplicate request
if s.Plugin.OngoingRequest {
// ... but if the desired request is different from what we would be making,
// then it's worth logging
if s.Plugin.LastRequest.Resources != permittedRequestResources {
logFailureReason("there's already an ongoing request for different resources")
}
return nil, nil
}
// Can't make a request if we failed too recently
if s.Plugin.LastFailureAt != nil {
timeUntilFailureBackoffExpires := s.Plugin.LastFailureAt.Add(s.Config.PluginRetryWait).Sub(now)
if timeUntilFailureBackoffExpires > 0 {
logFailureReason("previous request failed too recently")
return nil, &timeUntilFailureBackoffExpires
}
}
// At this point, all that's left is either making the request, or saying to wait.
// The rest of the complication is just around accurate logging.
if timeForRequest || shouldRequestNewResources {
return &ActionPluginRequest{
LastPermit: s.Plugin.Permit,
Target: permittedRequestResources,
// convert maybe-nil '*SystemMetrics' to maybe-nil '*api.Metrics'
Metrics: func() *api.Metrics {
if s.Metrics != nil {
return lo.ToPtr(s.Metrics.ToAPI())
} else {
return nil
}
}(),
TargetRevision: s.TargetRevision.WithTime(now),
}, nil
} else {
if wantToRequestNewResources && waitingOnRetryBackoff {
logFailureReason("previous request for more resources was denied too recently")
}
// Wait until the next regular tick, or until the denied-request backoff ends if that is sooner.
waitTime := timeUntilNextRequestTick
if waitingOnRetryBackoff {
waitTime = min(waitTime, timeUntilRetryBackoffExpires)
}
return nil, &waitTime
}
}
// ptr returns a pointer to a fresh copy of t. It allows taking the address of an arbitrary
// expression (such as a function's return value) inline.
func ptr[T any](t T) *T {
	boxed := new(T)
	*boxed = t
	return boxed
}
// calculateNeonVMAction decides whether a scaling request should be sent to the NeonVM API.
//
// desiredResources is the overall scaling target; it is clamped below to what the vm-monitor and
// the scheduler plugin have already approved. pluginRequested, if non-nil, is compared against the
// clamped target to detect a conflicting plugin request; pluginRequestedPhase is only used in the
// warning text.
//
// Returns:
//   - (request, nil) if a request should be made now. In this case s.NeonVM.TargetRevision is
//     updated as a side effect.
//   - (nil, wait) if the previous NeonVM request failed too recently; wait is the remaining
//     backoff.
//   - (nil, nil) if the VM already uses the clamped target, or if another request blocks us.
func (s *state) calculateNeonVMAction(
now time.Time,
desiredResources api.Resources,
pluginRequested *api.Resources,
pluginRequestedPhase string,
) (*ActionNeonVMRequest, *time.Duration) {
// The revision attached to the request is the target revision, lowered to the revision of
// whichever component (monitor and/or plugin) had to approve the change.
targetRevision := s.TargetRevision
if desiredResources.HasFieldLessThan(s.VM.Using()) && s.Monitor.CurrentRevision.Value > 0 {
// We are downscaling, so we needed a permit from the monitor
targetRevision = targetRevision.Min(s.Monitor.CurrentRevision)
}
if desiredResources.HasFieldGreaterThan(s.VM.Using()) && s.Plugin.CurrentRevision.Value > 0 {
// We are upscaling, so we needed a permit from the plugin
targetRevision = targetRevision.Min(s.Plugin.CurrentRevision)
}
// clamp desiredResources to what we're allowed to make a request for
desiredResources = s.clampResources(
s.VM.Using(), // current: what we're using already
desiredResources, // target: desired resources
ptr(s.monitorApprovedLowerBound()), // lower bound: downscaling that the monitor has approved
ptr(s.pluginApprovedUpperBound()), // upper bound: upscaling that the plugin has approved
)
// If we're already using the desired resources, then no need to make a request
if s.VM.Using() == desiredResources {
return nil, nil
}
// A plugin request for less than what we'd ask NeonVM for counts as conflicting.
conflictingPluginRequest := pluginRequested != nil && pluginRequested.HasFieldLessThan(desiredResources)
if !s.NeonVM.ongoingRequest() && !conflictingPluginRequest {
// We *should* be all clear to make a request; not allowed to make one if we failed too
// recently
if s.NeonVM.RequestFailedAt != nil {
timeUntilFailureBackoffExpires := s.NeonVM.RequestFailedAt.Add(s.Config.NeonVMRetryWait).Sub(now)
if timeUntilFailureBackoffExpires > 0 {
s.warn("Wanted to make a request to NeonVM API, but recent request failed too recently")
return nil, &timeUntilFailureBackoffExpires
}
}
s.NeonVM.TargetRevision = targetRevision.WithTime(now)
return &ActionNeonVMRequest{
Current: s.VM.Using(),
Target: desiredResources,
TargetRevision: s.NeonVM.TargetRevision,
}, nil
} else {
// Blocked by some other request: collect the reasons so that a single warning is logged.
var reqs []string
if s.Plugin.OngoingRequest {
reqs = append(reqs, fmt.Sprintf("plugin request %s", pluginRequestedPhase))
}
if s.NeonVM.ongoingRequest() && *s.NeonVM.OngoingRequested != desiredResources {
reqs = append(reqs, "NeonVM request (for different resources) ongoing")
}
if len(reqs) != 0 {
s.warnf("Wanted to make a request to NeonVM API, but there's already %s", strings.Join(reqs, " and "))
}
return nil, nil
}
}
// calculateMonitorUpscaleAction decides whether to send an upscale request to the vm-monitor.
//
// Returns:
//   - (action, nil) if an upscale request should be sent now.
//   - (nil, wait) if the previous upscale request failed too recently; wait is the remaining
//     backoff.
//   - (nil, nil) if the monitor is not active, there is nothing to request, or another monitor
//     request is still ongoing.
//
// Panics if the computed request would have any field below what the monitor last approved.
func (s *state) calculateMonitorUpscaleAction(
now time.Time,
desiredResources api.Resources,
) (*ActionMonitorUpscale, *time.Duration) {
// can't do anything if we don't have an active connection to the vm-monitor
// NOTE(review): the dereferences of s.Monitor.Approved below presumably rely on active()
// implying Approved != nil - confirm against its definition.
if !s.Monitor.active() {
return nil, nil
}
requestResources := s.clampResources(
*s.Monitor.Approved, // current: last resources we got the OK from the monitor on
s.VM.Using(), // target: what the VM is currently using
ptr(*s.Monitor.Approved), // don't decrease below what the monitor is currently set to (this is an *upscale* request)
ptr(desiredResources.Max(*s.Monitor.Approved)), // don't increase above desired resources
)
// Clamp the request resources so we're not increasing by more than 1 CU:
// NOTE(review): the upper bound is computed from requestResources itself rather than from
// *s.Monitor.Approved (compare the downscale counterpart, which uses Approved). Assuming Add is
// field-wise addition, this clamp cannot lower requestResources - confirm whether that is
// intended.
requestResources = s.clampResources(
*s.Monitor.Approved,
requestResources,
nil, // no lower bound
ptr(requestResources.Add(s.Config.ComputeUnit)), // upper bound: must not increase by >1 CU
)
// Check validity of the request that we would send, before sending it
if requestResources.HasFieldLessThan(*s.Monitor.Approved) {
panic(fmt.Errorf(
"resources for vm-monitor upscaling are less than what was last approved: %+v has field less than %+v",
requestResources,
*s.Monitor.Approved,
))
}
wantToDoRequest := requestResources != *s.Monitor.Approved
if !wantToDoRequest {
return nil, nil
}
// Can't make another request if there's already one ongoing
if s.Monitor.OngoingRequest != nil {
// Only warn when the ongoing request differs from what we'd send; an identical ongoing
// upscale request leaves requestDescription empty and is silently waited on.
var requestDescription string
if s.Monitor.OngoingRequest.Kind == monitorRequestKindUpscale && s.Monitor.OngoingRequest.Requested != requestResources {
requestDescription = "upscale request (for different resources)"
} else if s.Monitor.OngoingRequest.Kind == monitorRequestKindDownscale {
requestDescription = "downscale request"
}
if requestDescription != "" {
s.warnf("Wanted to send vm-monitor upscale request, but waiting on ongoing %s", requestDescription)
}
return nil, nil
}
// Can't make another request if we failed too recently:
if s.Monitor.UpscaleFailureAt != nil {
timeUntilFailureBackoffExpires := s.Monitor.UpscaleFailureAt.Add(s.Config.MonitorRetryWait).Sub(now)
if timeUntilFailureBackoffExpires > 0 {
s.warn("Wanted to send vm-monitor upscale request, but failed too recently")
return nil, &timeUntilFailureBackoffExpires
}
}
// Otherwise, we can make the request:
return &ActionMonitorUpscale{
Current: *s.Monitor.Approved,
Target: requestResources,
TargetRevision: s.TargetRevision.WithTime(now),
}, nil
}
// calculateMonitorDownscaleAction decides whether to send a downscale request to the vm-monitor.
//
// plannedUpscaleRequest indicates that an upscale request to the monitor is already planned, in
// which case no downscale request is produced.
//
// Returns:
//   - (action, nil) if a downscale request should be sent now.
//   - (nil, wait) if the previous downscale request failed too recently; wait is the remaining
//     backoff.
//   - (nil, nil) if the monitor is not active, there is nothing to request, or another monitor
//     request is planned or ongoing.
//
// Panics if the computed request would exceed what the monitor last approved, or if it would
// repeat a downscale that was denied too recently.
func (s *state) calculateMonitorDownscaleAction(
now time.Time,
desiredResources api.Resources,
plannedUpscaleRequest bool,
) (*ActionMonitorDownscale, *time.Duration) {
// can't do anything if we don't have an active connection to the vm-monitor
if !s.Monitor.active() {
if desiredResources.HasFieldLessThan(s.VM.Using()) {
s.warn("Wanted to send vm-monitor downscale request, but there's no active connection")
}
return nil, nil
}
requestResources := s.clampResources(
*s.Monitor.Approved, // current: what the monitor is already aware of
desiredResources, // target: what we'd like the VM to be using
nil, // lower bound: any decrease is fine
ptr(*s.Monitor.Approved), // upper bound: don't increase (this is only downscaling!)
)
// Clamp the request resources so we're not decreasing by more than 1 CU:
requestResources = s.clampResources(
*s.Monitor.Approved,
requestResources,
ptr(s.Monitor.Approved.SaturatingSub(s.Config.ComputeUnit)), // Must not decrease by >1 CU
nil, // no upper bound
)
// Check validity of the request that we would send, before sending it
if requestResources.HasFieldGreaterThan(*s.Monitor.Approved) {
panic(fmt.Errorf(
"resources for vm-monitor downscaling are greater than what was last approved: %+v has field greater than %+v",
requestResources,
*s.Monitor.Approved,
))
}
wantToDoRequest := requestResources != *s.Monitor.Approved
if !wantToDoRequest {
return nil, nil
}
// Can't make another request if there's already one ongoing (or if an upscaling request is
// planned)
if plannedUpscaleRequest {
s.warn("Wanted to send vm-monitor downscale request, but waiting on other planned upscale request")
return nil, nil
} else if s.Monitor.OngoingRequest != nil {
// Only warn when the ongoing request differs from what we'd send; an identical ongoing
// downscale request leaves requestDescription empty and is silently waited on.
var requestDescription string
if s.Monitor.OngoingRequest.Kind == monitorRequestKindDownscale && s.Monitor.OngoingRequest.Requested != requestResources {
requestDescription = "downscale request (for different resources)"
} else if s.Monitor.OngoingRequest.Kind == monitorRequestKindUpscale {
requestDescription = "upscale request"
}
if requestDescription != "" {
s.warnf("Wanted to send vm-monitor downscale request, but waiting on other ongoing %s", requestDescription)
}
return nil, nil
}
// Can't make another request if we failed too recently:
if s.Monitor.DownscaleFailureAt != nil {
timeUntilFailureBackoffExpires := s.Monitor.DownscaleFailureAt.Add(s.Config.MonitorRetryWait).Sub(now)
if timeUntilFailureBackoffExpires > 0 {
s.warn("Wanted to send vm-monitor downscale request but failed too recently")
return nil, &timeUntilFailureBackoffExpires
}
}
// Can't make another request if a recent request for resources less than or equal to the
// proposed request was denied. In general though, this should be handled by
// DesiredResourcesFromMetricsOrRequestedUpscaling, so we're better off panicking here.
//
// (timeUntilDeniedDownscaleExpired returns 0 when there is no denied downscale, so the
// dereference of s.Monitor.DeniedDownscale is only reached when it is non-nil.)
if s.timeUntilDeniedDownscaleExpired(now) > 0 && !s.Monitor.DeniedDownscale.Requested.HasFieldLessThan(requestResources) {
panic(errors.New(
"Wanted to send vm-monitor downscale request, but too soon after previously denied downscaling that should have been handled earlier",
))
}
// Nothing else to check, we're good to make the request
return &ActionMonitorDownscale{
Current: *s.Monitor.Approved,
Target: requestResources,
TargetRevision: s.TargetRevision.WithTime(now),
}, nil
}
// scalingConfig returns the scaling configuration in effect for this VM: the configured defaults
// with the VM's own (possibly nil) overrides applied on top.
func (s *state) scalingConfig() api.ScalingConfig {
	// nb: a nil override is fine; WithOverrides treats it as "no overrides".
	vmOverrides := s.VM.Config.ScalingConfig
	return s.Config.DefaultScalingConfig.WithOverrides(vmOverrides)
}
// DesiredResourcesFromMetricsOrRequestedUpscaling is the exported counterpart of the internal
// method of the same name, provided for testing. It forwards to the internal state unchanged.
func (s *State) DesiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) (api.Resources, func(ActionSet) *time.Duration) {
	desired, calculateWaitTime := s.internal.desiredResourcesFromMetricsOrRequestedUpscaling(now)
	return desired, calculateWaitTime
}
// desiredResourcesFromMetricsOrRequestedUpscaling computes the resources the VM should ideally be
// using right now, based on metrics, any upscaling requested by the vm-monitor, and any recently
// denied downscaling.
//
// It returns the desired resources together with a function that, given the set of actions that
// will be taken, returns how long to wait before the result may change because a requested
// upscale or denied downscale expires (or nil if neither affected the result).
//
// Side effects: updates s.TargetRevision (via updateTargetRevision), stores the result in
// s.LastDesiredResources, and logs the outcome.
//
// Panics if the result is below the VM's minimum, or above its maximum without that being caused
// by a denied downscale.
func (s *state) desiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) (api.Resources, func(ActionSet) *time.Duration) {
// There's some annoying edge cases that this function has to be able to handle properly. For
// the sake of completeness, they are:
//
// 1. s.VM.Using() is not a multiple of s.Config.ComputeUnit
// 2. s.VM.Max() is less than s.Config.ComputeUnit (or: has at least one resource that is)
// 3. s.VM.Using() is a fractional multiple of s.Config.ComputeUnit, but !allowDecrease and
// rounding up is greater than s.VM.Max()
// 4. s.VM.Using() is much larger than s.VM.Min() and not a multiple of s.Config.ComputeUnit,
// but load is low so we should just decrease *anyways*.
//
// ---
//
// Broadly, the implementation works like this:
// For CPU:
// Based on load average, calculate the "goal" number of CPUs (and therefore compute units)
//
// For Memory:
// Based on memory usage, calculate the VM's desired memory allocation and extrapolate a
// goal number of CUs from that.
//
// 1. Take the maximum of these two goal CUs to create a unified goal CU
// 2. Cap the goal CU by min/max, etc
// 3. that's it!
sg, goalCULogFields := calculateGoalCU(
s.warn,
s.scalingConfig(),
s.Config.ComputeUnit,
s.Metrics,
s.LFCMetrics,
)
goalCU := sg.goalCU
// If we don't have all the metrics we need, we'll later prevent downscaling to avoid flushing
// the VM's cache on autoscaler-agent restart if we have SystemMetrics but not LFCMetrics.
hasAllMetrics := sg.hasAllMetrics
// Copy the initial value of the goal CU so that we can accurately track whether either
// requested upscaling or denied downscaling affected the outcome.
// Otherwise as written, it'd be possible to update goalCU from requested upscaling and
// incorrectly miss that denied downscaling could have had the same effect.
initialGoalCU := goalCU
var requestedUpscalingAffectedResult bool
// Update goalCU based on any explicitly requested upscaling
timeUntilRequestedUpscalingExpired := s.timeUntilRequestedUpscalingExpired(now)
requestedUpscalingInEffect := timeUntilRequestedUpscalingExpired > 0
if requestedUpscalingInEffect {
reqCU := s.requiredCUForRequestedUpscaling(s.Config.ComputeUnit, *s.Monitor.RequestedUpscale)
if reqCU > initialGoalCU {
// FIXME: this isn't quite correct, because if initialGoalCU is already equal to the
// maximum goal CU we *could* have, this won't actually have an effect.
requestedUpscalingAffectedResult = true
goalCU = max(goalCU, reqCU)
}
}
var deniedDownscaleAffectedResult bool
// Update goalCU based on any previously denied downscaling
timeUntilDeniedDownscaleExpired := s.timeUntilDeniedDownscaleExpired(now)
deniedDownscaleInEffect := timeUntilDeniedDownscaleExpired > 0
if deniedDownscaleInEffect {
reqCU := s.requiredCUForDeniedDownscale(s.Config.ComputeUnit, s.Monitor.DeniedDownscale.Requested)
if reqCU > initialGoalCU {
deniedDownscaleAffectedResult = true
goalCU = max(goalCU, reqCU)
}
}
// resources for the desired "goal" compute units
// NOTE(review): goalCU is converted to uint16 here, which would truncate values above 65535 -
// presumably unreachable in practice; confirm.
goalResources := s.Config.ComputeUnit.Mul(uint16(goalCU))
// If we don't have all the metrics we need to make a proper decision, make sure that we aren't
// going to scale down below the current resources.
// Otherwise, we can make an under-informed decision that has undesirable impacts (e.g., scaling
// down because we don't have LFC metrics and flushing the cache because of it).
if !hasAllMetrics {
goalResources = goalResources.Max(s.VM.Using())
}
// bound goalResources by the minimum and maximum resource amounts for the VM
result := goalResources.Min(s.VM.Max()).Max(s.VM.Min())
// ... but if we aren't allowed to downscale, then we *must* make sure that the VM's usage value
// won't decrease to the previously denied amount, even if it's greater than the maximum.
//
// We can run into situations like this when VM scale-down on bounds change fails, so we end up
// with a usage value greater than the maximum.
//
// It's not a great situation to be in, but it's easier to make the policy "give the users a
// little extra if we mess up" than "oops we OOM-killed your DB, hope you weren't doing anything".
if deniedDownscaleInEffect {
// roughly equivalent to "result >= s.Monitor.DeniedDownscale.Requested"
if !result.HasFieldGreaterThan(s.Monitor.DeniedDownscale.Requested) {
// This can only happen if s.VM.Max() is less than goalResources, because otherwise this
// would have been factored into goalCU, affecting goalResources. Hence, the warning.
s.warn("Can't decrease desired resources to within VM maximum because of vm-monitor previously denied downscale request")
}
preMaxResult := result
result = result.Max(s.minRequiredResourcesForDeniedDownscale(s.Config.ComputeUnit, *s.Monitor.DeniedDownscale))
if result != preMaxResult {
deniedDownscaleAffectedResult = true
}
}
// Check that the result is sound.
//
// With the current (naive) implementation, this is trivially ok. In future versions, it might
// not be so simple, so it's good to have this integrity check here.
if !deniedDownscaleAffectedResult && result.HasFieldGreaterThan(s.VM.Max()) {
panic(fmt.Errorf(
"produced invalid desired state: result has field greater than max. this = %+v", *s,
))
} else if result.HasFieldLessThan(s.VM.Min()) {
panic(fmt.Errorf(
"produced invalid desired state: result has field less than min. this = %+v", *s,
))
}
// calculateWaitTime is returned to the caller; it reports the earliest time at which a
// time-limited influence on the result (denied downscale / requested upscale) expires.
calculateWaitTime := func(actions ActionSet) *time.Duration {
var waiting bool
waitTime := time.Duration(int64(1<<63 - 1)) // time.Duration is an int64. As an "unset" value, use the maximum.
// Only wait on the denied downscale if no monitor downscale is planned and no monitor
// request is ongoing.
if deniedDownscaleAffectedResult && actions.MonitorDownscale == nil && s.Monitor.OngoingRequest == nil {
waitTime = min(waitTime, timeUntilDeniedDownscaleExpired)
waiting = true
}
if requestedUpscalingAffectedResult {
waitTime = min(waitTime, timeUntilRequestedUpscalingExpired)
waiting = true
}
if waiting {
return &waitTime
} else {
return nil
}
}
s.updateTargetRevision(now, result, s.VM.Using())
// TODO: we are both saving the result into LastDesiredResources and returning it. This is
// redundant, and we should remove one of the two.
s.LastDesiredResources = &result
logFields := []zap.Field{
zap.Object("current", s.VM.Using()),
zap.Object("target", result),
zap.Object("targetRevision", &s.TargetRevision),
}
logFields = append(logFields, goalCULogFields...)
s.info("Calculated desired resources", logFields...)
return result, calculateWaitTime
}
func (s *state) updateTargetRevision(now time.Time, desired api.Resources, current api.Resources) {
if s.LastDesiredResources == nil {
s.LastDesiredResources = ¤t
}
if *s.LastDesiredResources == desired {
// Nothing changed, so no need to update the target revision
return
}
var flags vmv1.Flag
if desired.HasFieldGreaterThan(*s.LastDesiredResources) {
flags.Set(revsource.Upscale)
}
if desired.HasFieldLessThan(*s.LastDesiredResources) {
flags.Set(revsource.Downscale)
}
s.TargetRevision = s.Config.RevisionSource.Next(now, flags)
}
// updateNeonVMCurrentRevision records a revision reported by NeonVM: it propagates the revision
// into s.NeonVM.CurrentRevision (reporting via the NeonVM latency callback), feeds it to the
// revision source, and clears s.LastDesiredResources.
func (s *state) updateNeonVMCurrentRevision(currentRevision vmv1.RevisionWithTime) {
	updatedAt := currentRevision.UpdatedAt.Time

	revsource.Propagate(
		updatedAt,
		s.NeonVM.TargetRevision,
		&s.NeonVM.CurrentRevision,
		s.Config.ObservabilityCallbacks.NeonVMLatency,
	)

	if err := s.Config.RevisionSource.Observe(updatedAt, currentRevision.Revision); err != nil {
		s.warnf("Failed to observe clock source: %v", err)
	}

	// We are now starting from a new set of current resources, so the previously recorded
	// desired resources no longer apply.
	s.LastDesiredResources = nil
}
// timeUntilRequestedUpscalingExpired returns how long the vm-monitor's most recent upscale request
// remains valid, measured from now. The result is zero when there is no such request, and may be
// negative when the request has already expired.
func (s *state) timeUntilRequestedUpscalingExpired(now time.Time) time.Duration {
	requested := s.Monitor.RequestedUpscale
	if requested == nil {
		return 0
	}
	expiresAt := requested.At.Add(s.Config.MonitorRequestedUpscaleValidPeriod)
	return expiresAt.Sub(now)
}
// requiredCUForRequestedUpscaling returns the number of compute units needed to satisfy a
// vm-monitor upscale request: for each resource that was requested, one more compute unit than
// fits into the base amount of that resource. Returns zero if no resource was requested.
//
// NB: the compute unit and the requested upscale are taken as arguments (rather than read from
// s, where they are sometimes nil) so that it's clear that the caller is responsible for ensuring
// that the values are present.
func (s *state) requiredCUForRequestedUpscaling(computeUnit api.Resources, requestedUpscale requestedUpscale) uint32 {
	var required uint32

	// note: 1 + floor(x / M) gives the minimum integer value greater than x / M.
	if requestedUpscale.Requested.Cpu {
		fromCPU := 1 + uint32(requestedUpscale.Base.VCPU/computeUnit.VCPU)
		if fromCPU > required {
			required = fromCPU
		}
	}
	if requestedUpscale.Requested.Memory {
		fromMem := 1 + uint32(requestedUpscale.Base.Mem/computeUnit.Mem)
		if fromMem > required {
			required = fromMem
		}
	}

	return required
}
// timeUntilDeniedDownscaleExpired returns how much of the cooldown following the most recently
// denied downscale remains, measured from now. The result is zero when no downscale has been
// denied, and may be negative when the cooldown has already passed.
func (s *state) timeUntilDeniedDownscaleExpired(now time.Time) time.Duration {
	denied := s.Monitor.DeniedDownscale
	if denied == nil {
		return 0
	}
	cooldownEnds := denied.At.Add(s.Config.MonitorDeniedDownscaleCooldown)
	return cooldownEnds.Sub(now)
}
// requiredCUForDeniedDownscale returns the smallest number of compute units that is strictly
// greater than the denied resources, for both CPU and memory.
//
// NB: like requiredCUForRequestedUpscaling, the values are provided by the caller so that it's
// clear that the caller is responsible for ensuring that they are present.
func (s *state) requiredCUForDeniedDownscale(computeUnit, deniedResources api.Resources) uint32 {
	// note: floor(x / M) + 1 gives the minimum integer value greater than x / M.
	cpuCU := 1 + uint32(deniedResources.VCPU/computeUnit.VCPU)
	memCU := 1 + uint32(deniedResources.Mem/computeUnit.Mem)

	if memCU > cpuCU {
		return memCU
	}
	return cpuCU
}
// minRequiredResourcesForDeniedDownscale returns the minimum resources that must be kept after a
// denied downscale: for each resource, the next compute-unit multiple above the denied request,
// capped at the amount we had when the downscale was attempted.
func (s *state) minRequiredResourcesForDeniedDownscale(computeUnit api.Resources, denied deniedDownscale) api.Resources {
	// Number of whole compute units contained in the denied request, per resource.
	deniedCPUUnits := denied.Requested.VCPU / computeUnit.VCPU
	deniedMemUnits := denied.Requested.Mem / computeUnit.Mem

	// Go one compute unit above the denied amount, but never above denied.Current. Phrasing it
	// like this cleanly handles the subtle cases where denied.Current isn't a multiple of the
	// compute unit.
	nextCPU := computeUnit.VCPU * (1 + deniedCPUUnits)
	nextMem := computeUnit.Mem * (1 + deniedMemUnits)

	return api.Resources{
		VCPU: min(denied.Current.VCPU, nextCPU),
		Mem:  min(denied.Current.Mem, nextMem),
	}
}
// clampResources limits desired, per resource, according to the direction of change relative to
// current: a resource that would decrease is raised to at least lowerBound, and a resource that
// would increase is lowered to at most upperBound. A nil bound means "no limit" in that direction,
// and a resource equal to its current value is passed through untouched.
//
// Panics if lowerBound has a field greater than current, or upperBound has a field less than
// current.
func (s *state) clampResources(
	current api.Resources,
	desired api.Resources,
	lowerBound *api.Resources,
	upperBound *api.Resources,
) api.Resources {
	// Internal validity checks:
	switch {
	case lowerBound != nil && lowerBound.HasFieldGreaterThan(current):
		panic(fmt.Errorf(
			"clampResources called with invalid arguments: lowerBound=%+v has field greater than current=%+v",
			lowerBound,
			current,
		))
	case upperBound != nil && upperBound.HasFieldLessThan(current):
		panic(fmt.Errorf(
			"clampResources called with invalid arguments: upperBound=%+v has field less than current=%+v",
			upperBound,
			current,
		))
	}

	clampedCPU := desired.VCPU
	switch {
	case desired.VCPU < current.VCPU && lowerBound != nil:
		clampedCPU = max(desired.VCPU, lowerBound.VCPU)
	case desired.VCPU > current.VCPU && upperBound != nil:
		clampedCPU = min(desired.VCPU, upperBound.VCPU)
	}

	clampedMem := desired.Mem
	switch {
	case desired.Mem < current.Mem && lowerBound != nil:
		clampedMem = max(desired.Mem, lowerBound.Mem)
	case desired.Mem > current.Mem && upperBound != nil:
		clampedMem = min(desired.Mem, upperBound.Mem)
	}

	return api.Resources{VCPU: clampedCPU, Mem: clampedMem}
}
// monitorApprovedLowerBound returns the resources last approved by the vm-monitor, or - when there
// is no approved value - what the VM is currently using.
func (s *state) monitorApprovedLowerBound() api.Resources {
	if approved := s.Monitor.Approved; approved != nil {
		return *approved
	}
	return s.VM.Using()
}
// pluginApprovedUpperBound returns the resources last permitted by the scheduler plugin, or - when
// there is no permit - what the VM is currently using.
func (s *state) pluginApprovedUpperBound() api.Resources {
	if permit := s.Plugin.Permit; permit != nil {
		return *permit
	}
	return s.VM.Using()
}
//////////////////////////////////////////
// PUBLIC FUNCTIONS TO UPDATE THE STATE //
//////////////////////////////////////////
// Debug sets the internal state's Debug flag to enabled. This method is exclusively meant to be
// used in tests, to make it easier to enable print debugging only for a single call to
// NextActions, via s.warn() or otherwise.
func (s *State) Debug(enabled bool) {
s.internal.Debug = enabled
}
// UpdatedVM replaces the stored VM information with vm, except that the "using" resources of the
// previously stored VM are carried over (see the FIXME below).
//
// If vm carries a current revision, it is recorded via updateNeonVMCurrentRevision. If LFC metrics
// are disabled in the resulting scaling config, any stored LFC metrics are dropped.
func (s *State) UpdatedVM(vm api.VmInfo) {
// FIXME: overriding this is required right now because we trust that a successful request to
// NeonVM means the VM was already updated, which... isn't true, and otherwise we could run into
// sync issues.
// A first-pass solution is possible by reading the values of VirtualMachine.Spec, but the
// "proper" solution would read from VirtualMachine.Status, which (at time of writing) isn't
// sound. For more, see:
// - https://github.com/neondatabase/autoscaling/pull/371#issuecomment-1752110131
// - https://github.com/neondatabase/autoscaling/issues/462
vm.SetUsing(s.internal.VM.Using())
s.internal.VM = vm
if vm.CurrentRevision != nil {
s.internal.updateNeonVMCurrentRevision(*vm.CurrentRevision)
}
// Make sure that if LFC metrics are disabled & later enabled, we don't make decisions based on
// stale data.
// NOTE(review): EnableLFCMetrics is dereferenced unconditionally - presumably the default
// scaling config always sets it; confirm.
if !*s.internal.scalingConfig().EnableLFCMetrics {
s.internal.LFCMetrics = nil
}
}
// UpdateSystemMetrics stores metrics as the most recent system metrics, replacing any previous
// value.
func (s *State) UpdateSystemMetrics(metrics SystemMetrics) {
s.internal.Metrics = &metrics
}
// UpdateLFCMetrics stores metrics as the most recent LFC metrics, replacing any previous value.
func (s *State) UpdateLFCMetrics(metrics LFCMetrics) {
s.internal.LFCMetrics = &metrics
}
// PluginHandle provides write access to the scheduler plugin pieces of a State. Obtain one with
// (*State).Plugin().
type PluginHandle struct {
// s is the internal state that this handle's methods modify.
s *state
}
// Plugin returns a handle for updating the scheduler plugin portion of the state.
func (s *State) Plugin() PluginHandle {
return PluginHandle{&s.internal}
}
// StartingRequest records that a request to the scheduler plugin for the given resources is
// starting at time now: it becomes the last request, and a request is marked as ongoing.
func (h PluginHandle) StartingRequest(now time.Time, resources api.Resources) {
	request := pluginRequested{
		At:        now,
		Resources: resources,
	}

	h.s.Plugin.OngoingRequest = true
	h.s.Plugin.LastRequest = &request
}
// RequestFailed records that the ongoing scheduler plugin request failed at time now. The
// recorded failure time is what the retry backoff is measured from.
func (h PluginHandle) RequestFailed(now time.Time) {
h.s.Plugin.OngoingRequest = false
h.s.Plugin.LastFailureAt = &now
}
// RequestSuccessful records that the ongoing scheduler plugin request completed at time now with
// response resp, and validates that response.
//
// The response is rejected (a non-nil error is returned) if the permit fails ValidateNonZero, if
// it has any field greater than the last requested resources, or if it has any field less than
// what the VM is currently using. A rejected response is treated like a failed request: the
// failure time is set to now.
//
// On success, the permit is stored and targetRevision is propagated into the plugin's current
// revision.
//
// NOTE(review): h.s.Plugin.LastRequest is dereferenced without a nil check - presumably
// StartingRequest is always called first; confirm against callers.
func (h PluginHandle) RequestSuccessful(
now time.Time,
targetRevision vmv1.RevisionWithTime,
resp api.PluginResponse,
) (_err error) {
h.s.Plugin.OngoingRequest = false
// The named result lets this deferred function see the error from any return path below.
defer func() {
if _err != nil {
h.s.Plugin.LastFailureAt = &now
}
}()
if err := resp.Permit.ValidateNonZero(); err != nil {
return fmt.Errorf("Invalid permit: %w", err)
}
// Errors from resp in connection with the prior request
if resp.Permit.HasFieldGreaterThan(h.s.Plugin.LastRequest.Resources) {
return fmt.Errorf(
"Permit has resources greater than request (%+v vs. %+v)",
resp.Permit, h.s.Plugin.LastRequest.Resources,
)
}
// Errors from resp in connection with the prior request AND the VM state
if vmUsing := h.s.VM.Using(); resp.Permit.HasFieldLessThan(vmUsing) {
return fmt.Errorf("Permit has resources less than VM (%+v vs %+v)", resp.Permit, vmUsing)
}
// All good - set everything.
// NOTE: We don't set the compute unit, even though the plugin response contains it. We're in
// the process of moving the source of truth for ComputeUnit from the scheduler plugin to the
// autoscaler-agent.
h.s.Plugin.Permit = &resp.Permit
revsource.Propagate(now,
targetRevision,
&h.s.Plugin.CurrentRevision,
h.s.Config.ObservabilityCallbacks.PluginLatency,
)
return nil
}
// MonitorHandle provides write access to the vm-monitor pieces of a State. Obtain one with
// (*State).Monitor().
type MonitorHandle struct {
// s is the internal state that this handle's methods modify.
s *state
}
// Monitor returns a handle for updating the vm-monitor portion of the state.
func (s *State) Monitor() MonitorHandle {
return MonitorHandle{&s.internal}
}
// Reset discards everything known about the vm-monitor: ongoing request, requested upscale,
// denied downscale, approved resources and failure times are cleared, and the current revision is
// set back to the zero revision.
func (h MonitorHandle) Reset() {
h.s.Monitor = monitorState{
OngoingRequest: nil,
RequestedUpscale: nil,
DeniedDownscale: nil,
Approved: nil,
DownscaleFailureAt: nil,
UpscaleFailureAt: nil,
CurrentRevision: vmv1.ZeroRevision,
}
}
// Active marks the vm-monitor as active or inactive. Becoming active sets the monitor's approved
// resources to what the VM is currently using; becoming inactive clears them.
func (h MonitorHandle) Active(active bool) {
	if !active {
		h.s.Monitor.Approved = nil
		return
	}

	using := h.s.VM.Using()
	h.s.Monitor.Approved = &using // TODO: this is racy
}
// UpscaleRequested records that the vm-monitor asked for more of the given resources at time now.
// The currently approved resources are stored as the base that the request is relative to.
//
// NOTE(review): h.s.Monitor.Approved is dereferenced without a nil check - presumably this is only
// called while the monitor is active; confirm against callers.
func (h MonitorHandle) UpscaleRequested(now time.Time, resources api.MoreResources) {
h.s.Monitor.RequestedUpscale = &requestedUpscale{
At: now,
Base: *h.s.Monitor.Approved,
Requested: resources,
}
}
// StartingUpscaleRequest records that an upscale request to the vm-monitor for the given resources
// is starting, and clears any previously recorded upscale failure. now is currently unused.
func (h MonitorHandle) StartingUpscaleRequest(now time.Time, resources api.Resources) {
h.s.Monitor.OngoingRequest = &ongoingMonitorRequest{
Kind: monitorRequestKindUpscale,
Requested: resources,
}
h.s.Monitor.UpscaleFailureAt = nil
}
// UpscaleRequestSuccessful records that the ongoing upscale request succeeded: the requested
// resources become the approved resources, and the ongoing request is cleared. now is currently
// unused.
//
// NOTE(review): dereferences h.s.Monitor.OngoingRequest without a nil check - presumably always
// preceded by StartingUpscaleRequest; confirm against callers.
func (h MonitorHandle) UpscaleRequestSuccessful(now time.Time) {
h.s.Monitor.Approved = &h.s.Monitor.OngoingRequest.Requested
h.s.Monitor.OngoingRequest = nil
}
// UpscaleRequestFailed records that the ongoing upscale request failed at time now. The recorded
// failure time is what the upscale retry backoff is measured from.
func (h MonitorHandle) UpscaleRequestFailed(now time.Time) {
h.s.Monitor.OngoingRequest = nil
h.s.Monitor.UpscaleFailureAt = &now
}
// StartingDownscaleRequest records that a downscale request to the vm-monitor for the given
// resources is starting, and clears any previously recorded downscale failure. now is currently
// unused.
func (h MonitorHandle) StartingDownscaleRequest(now time.Time, resources api.Resources) {
h.s.Monitor.OngoingRequest = &ongoingMonitorRequest{
Kind: monitorRequestKindDownscale,
Requested: resources,
}
h.s.Monitor.DownscaleFailureAt = nil
}
// DownscaleRequestAllowed records that the vm-monitor approved the ongoing downscale request: the
// requested resources become the approved resources, the ongoing request is cleared, and rev is
// propagated into the monitor's current revision.
//
// NOTE(review): dereferences h.s.Monitor.OngoingRequest without a nil check - presumably always
// preceded by StartingDownscaleRequest; confirm against callers.
func (h MonitorHandle) DownscaleRequestAllowed(now time.Time, rev vmv1.RevisionWithTime) {
h.s.Monitor.Approved = &h.s.Monitor.OngoingRequest.Requested
h.s.Monitor.OngoingRequest = nil
revsource.Propagate(now,
rev,
&h.s.Monitor.CurrentRevision,
h.s.Config.ObservabilityCallbacks.MonitorLatency,
)
}
// DownscaleRequestDenied records that the downscale request itself went through, but the
// vm-monitor denied the downscale. The denial (time, approved resources at that point, and the
// resources that were requested) is stored, the ongoing request is cleared, and targetRevision is
// propagated into the monitor's current revision.
//
// NOTE(review): dereferences h.s.Monitor.Approved and h.s.Monitor.OngoingRequest without nil
// checks - presumably always preceded by StartingDownscaleRequest on an active monitor; confirm
// against callers.
func (h MonitorHandle) DownscaleRequestDenied(now time.Time, targetRevision vmv1.RevisionWithTime) {
h.s.Monitor.DeniedDownscale = &deniedDownscale{
At: now,
Current: *h.s.Monitor.Approved,
Requested: h.s.Monitor.OngoingRequest.Requested,
}
h.s.Monitor.OngoingRequest = nil
revsource.Propagate(now,
targetRevision,
&h.s.Monitor.CurrentRevision,
h.s.Config.ObservabilityCallbacks.MonitorLatency,
)
}
// DownscaleRequestFailed records that the ongoing downscale request failed at time now. The
// recorded failure time is what the downscale retry backoff is measured from.
func (h MonitorHandle) DownscaleRequestFailed(now time.Time) {
h.s.Monitor.OngoingRequest = nil
h.s.Monitor.DownscaleFailureAt = &now
}
// NeonVMHandle provides write access to the NeonVM pieces of a State. Obtain one with
// (*State).NeonVM().
type NeonVMHandle struct {
// s is the internal state that this handle's methods modify.
s *state
}
// NeonVM returns a handle for updating the NeonVM portion of the state.
func (s *State) NeonVM() NeonVMHandle {
return NeonVMHandle{&s.internal}
}
// StartingRequest records that a request to the NeonVM API for the given resources is starting.
// now is currently unused (see the FIXME below).
func (h NeonVMHandle) StartingRequest(now time.Time, resources api.Resources) {
// FIXME: add time to ongoing request info (or maybe only in RequestFailed?)
h.s.NeonVM.OngoingRequested = &resources
}
// RequestSuccessful records that the ongoing NeonVM request completed successfully: the requested
// resources are stored as what the VM is using, and the ongoing request is cleared. now is
// currently unused.
//
// Panics if there is no ongoing request.
func (h NeonVMHandle) RequestSuccessful(now time.Time) {
	requested := h.s.NeonVM.OngoingRequested
	if requested == nil {
		panic("received NeonVM().RequestSuccessful() update without ongoing request")
	}

	// FIXME: This is actually incorrect; we shouldn't trust that the VM has already been updated
	// just because the request completed. It takes longer for the reconcile cycle(s) to make the
	// necessary changes.
	// See the comments in (*State).UpdatedVM() for more info.
	h.s.VM.SetUsing(*requested)
	h.s.NeonVM.OngoingRequested = nil
}
// RequestFailed records that the ongoing NeonVM request failed at time now. The recorded failure
// time is what the NeonVM retry backoff is measured from.
func (h NeonVMHandle) RequestFailed(now time.Time) {
h.s.NeonVM.OngoingRequested = nil
h.s.NeonVM.RequestFailedAt = &now
}
package testhelpers
import (
"errors"
"fmt"
"reflect"
"testing"
"github.com/samber/lo"
"github.com/stretchr/testify/assert"
)
// Assert is a test helper that calls functions and checks both their return values and the
// warnings that were emitted while they ran.
//
// Assert is passed by value; storedWarnings and waitingOnPreparedCall are pointers so that all
// copies derived from the same NewAssert share them.
type Assert struct {
t *testing.T
// storedWarnings collects the warnings emitted since the last check (see StoredWarnings).
storedWarnings *[]string
// waitingOnPreparedCall is true between Call() and the execution of the call it prepared.
waitingOnPreparedCall *bool
// tinfo holds the expectations for this particular copy of the Assert.
tinfo transactionInfo
}
// transactionInfo holds the per-operation expectations of an Assert.
type transactionInfo struct {
// expectedWarnings is the exact list of warnings that the operation is expected to emit.
expectedWarnings []string
}
// NewAssert creates a new Assert object wrapping the provided *testing.T. It starts with no stored
// warnings, no expected warnings, and no pending prepared call.
func NewAssert(t *testing.T) Assert {
	// Deliberately empty-but-non-nil, to match the value stored warnings are reset to after each
	// check.
	stored := []string{}
	expectations := transactionInfo{
		expectedWarnings: []string{},
	}

	return Assert{
		t:                     t,
		storedWarnings:        &stored,
		waitingOnPreparedCall: lo.ToPtr(false),
		tinfo:                 expectations,
	}
}
// StoredWarnings returns a reference to the warnings that will be checked, intended to be used with
// the InitialStateOpt constructor WithStoredWarnings. The returned pointer is shared with this
// Assert, so warnings appended through it are the ones that get checked.
func (a Assert) StoredWarnings() *[]string {
return a.storedWarnings
}
// WithWarnings returns an Assert that expects the given warnings to be emitted on each operation.
// The receiver is a copy, so the Assert it was called on keeps its own expectations.
func (a Assert) WithWarnings(warnings ...string) Assert {
a.tinfo.expectedWarnings = warnings
return a
}
// Do calls the function with the provided arguments, checking that the emitted warnings are
// exactly the expected ones (none, unless WithWarnings was used).
//
// This is only valid for functions that return nothing.
func (a Assert) Do(f any, args ...any) {
a.Call(f, args...).Equals( /* empty args list means no returns */ )
}
// NoError calls the function with the provided arguments, checking that its single return value
// (the error) is nil, and that the emitted warnings are exactly the expected ones.
func (a Assert) NoError(f any, args ...any) {
a.Call(f, args...).Equals(nil)
}
// Call sets up a prepared function call. Nothing is executed until one of the methods on the
// returned PreparedFunctionCall is called, which then performs all the relevant checks.
//
// Panics if a previously prepared call has not been executed yet, if f is not a function, or if f
// is variadic (variadic functions are not supported).
func (a Assert) Call(f any, args ...any) PreparedFunctionCall {
	if *a.waitingOnPreparedCall {
		panic(errors.New("previous Call() constructed but not executed (must use `Do()`, `NoError()`, or `Call().Equals()`)"))
	}

	fn := reflect.ValueOf(f)
	fnType := fn.Type()

	switch {
	case fnType.Kind() != reflect.Func:
		panic(errors.New("f must be a function"))
	case fnType.IsVariadic():
		panic(errors.New("f is variadic"))
	}

	var callArgs []reflect.Value
	for i := range args {
		callArgs = append(callArgs, reflect.ValueOf(args[i]))
	}

	*a.waitingOnPreparedCall = true

	return PreparedFunctionCall{a: a, f: fn, args: callArgs}
}
// PreparedFunctionCall is a function call that has been set up by (Assert).Call() but not executed
type PreparedFunctionCall struct {
// a is the Assert that created this call; it supplies the test handle and expectations.
a Assert
// f is the function to call.
f reflect.Value
// args are the arguments that f will be called with.
args []reflect.Value
}
// Equals calls the prepared function, checking that all the return values are equal to what's
// expected, and that the emitted warnings are exactly the expected ones.
//
// expected must have one entry per return value of the function, and each non-nil entry must have
// exactly the function's corresponding return type; otherwise Equals panics before calling the
// function. A nil entry is only accepted for return values of interface type.
//
// If any check fails, the test is stopped immediately. Otherwise the stored warnings are reset so
// that the next operation starts fresh.
func (f PreparedFunctionCall) Equals(expected ...any) {
// The prepared call is being executed now, so Assert.Call() may be used again.
*f.a.waitingOnPreparedCall = false
fTy := f.f.Type()
numOut := fTy.NumOut()
if len(expected) != numOut {
panic(fmt.Errorf(
"Mismatched number of out parameters from function: func has %d but expected len is %d",
numOut,
len(expected),
))
}
// unknownInterface is a placeholder type, used below so that a nil expected value never matches
// a non-interface return type.
type unknownInterface any
var actualReturnTypes []reflect.Type
var expectedReturnTypes []reflect.Type
for i := 0; i < numOut; i += 1 {
actual := fTy.Out(i)
actualReturnTypes = append(actualReturnTypes, actual)
// Can't call reflect.Value.Type on nil, so if we're given a nil value, we have to be a
// little more permissive.
var expectedTy reflect.Type
if expected[i] != nil {
expectedTy = reflect.TypeOf(expected[i])
} else if actual.Kind() == reflect.Interface {
// well, the actual value can be a nil interface too, so it's probably fine
expectedTy = actual
} else {
// but... if the actual value isn't an interface, there's a problem
expectedTy = reflect.TypeOf((*unknownInterface)(nil)).Elem()
}
expectedReturnTypes = append(expectedReturnTypes, expectedTy)
}
if !reflect.DeepEqual(expectedReturnTypes, actualReturnTypes) {
panic(fmt.Errorf(
"provided return types not equal to the function's: function has %v, but expected has %v",
actualReturnTypes,
expectedReturnTypes,
))
}
returnValues := f.f.Call(f.args)
for i := range returnValues {
assert.Equal(f.a.t, expected[i], returnValues[i].Interface())
}
assert.Equal(f.a.t, f.a.tinfo.expectedWarnings, *f.a.storedWarnings)
// Stop at the first failing operation, rather than continuing from a state that's already wrong.
if f.a.t.Failed() {
f.a.t.FailNow()
}
*f.a.storedWarnings = []string{}
}
package testhelpers
import (
"fmt"
"testing"
"time"
"github.com/stretchr/testify/require"
)
// FakeClock is a small facility that makes it easy to operate on duration since start with
// relative times, rather than absolute times.
type FakeClock struct {
t *testing.T
// base is the time the clock started at.
base time.Time
// now is the clock's current time; it only moves forward, via Inc.
now time.Time
}
// NewFakeClock creates a new fake clock, with the initial time set to an unspecified, round number.
func NewFakeClock(t *testing.T) *FakeClock {
	// a nice round number, to make things easier: 2000-01-01T00:00:00Z
	start := time.Date(2000, time.January, 1, 0, 0, 0, 0, time.UTC)

	return &FakeClock{t: t, base: start, now: start}
}
// Now returns the current time of the clock, i.e. the start time plus everything added via Inc.
func (c *FakeClock) Now() time.Time {
return c.now
}
// Elapsed returns the total time added (via Inc) since the clock was started, wrapped so that it
// can be asserted on directly.
func (c *FakeClock) Elapsed() Elapsed {
	sinceStart := c.now.Sub(c.base)
	return Elapsed{t: c.t, Duration: sinceStart}
}
// Inc adds duration to the current time of the clock and returns the total time elapsed since the
// clock was started.
//
// Panics if duration is negative: the clock can only move forward.
func (c *FakeClock) Inc(duration time.Duration) Elapsed {
if duration < 0 {
panic(fmt.Errorf("(*FakeClock).Inc() called with negative duration %s", duration))
}
c.now = c.now.Add(duration)
return c.Elapsed()
}
// Elapsed is a duration since the start of a FakeClock, bundled with the test handle so that it
// can be asserted on directly. The embedded time.Duration makes it usable as a plain duration.
type Elapsed struct {
t *testing.T
time.Duration
}
// AssertEquals requires that the elapsed duration equals expected, using the stored test handle.
func (e Elapsed) AssertEquals(expected time.Duration) {
	actual := e.Duration
	require.Equal(e.t, expected, actual)
}
package testhelpers
import (
"fmt"
"testing"
"go.uber.org/zap"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/agent/core"
"github.com/neondatabase/autoscaling/pkg/api"
)
// InitialVmInfoConfig describes the VM that CreateVmInfo builds for a test.
type InitialVmInfoConfig struct {
// ComputeUnit is the amount of resources in a single compute unit; the VM's CPU and memory
// bounds are computed as multiples of it.
ComputeUnit api.Resources
// MemorySlotSize is the size of one memory slot. CreateVmInfo panics unless it evenly divides
// ComputeUnit.Mem.
MemorySlotSize api.Bytes
// MinCU and MaxCU are the VM's minimum and maximum size, in compute units. The VM's initial
// usage is set from MinCU.
MinCU uint16
MaxCU uint16
}
// InitialStateConfig is the input to CreateInitialState: the description of the VM, plus the
// core.Config that is handed to core.NewState.
type InitialStateConfig struct {
VM InitialVmInfoConfig
Core core.Config
}
// InitialStateOpt is an option accepted by CreateInitialState. Its method is unexported, so
// values are obtained from this package's constructors (e.g. WithConfigSetting).
type InitialStateOpt interface {
// modifyStateConfig adjusts the core.Config before it is handed to core.NewState.
modifyStateConfig(*core.Config)
}
// VmInfoOpt is an option accepted by CreateVmInfo. Every VmInfoOpt is also an InitialStateOpt, so
// it can be passed to CreateInitialState as well.
type VmInfoOpt interface {
InitialStateOpt
// modifyVmInfoConfig adjusts the config before the api.VmInfo is built from it.
modifyVmInfoConfig(*InitialVmInfoConfig)
// modifyVmInfoWithConfig adjusts the api.VmInfo after it has been built, given the final config.
modifyVmInfoWithConfig(InitialVmInfoConfig, *api.VmInfo)
}
// CreateInitialState builds a core.State for tests.
//
// Options that are also VmInfoOpts are forwarded to CreateVmInfo to build the VM; afterwards,
// every option (in the order given) gets to modify the core config before core.NewState is called.
func CreateInitialState(config InitialStateConfig, opts ...InitialStateOpt) *core.State {
	vmOnly := make([]VmInfoOpt, 0, len(opts))
	for _, opt := range opts {
		vmOpt, isVmOpt := opt.(VmInfoOpt)
		if !isVmOpt {
			continue
		}
		vmOnly = append(vmOnly, vmOpt)
	}

	vmInfo := CreateVmInfo(config.VM, vmOnly...)

	for i := range opts {
		opts[i].modifyStateConfig(&config.Core)
	}

	return core.NewState(vmInfo, config.Core)
}
// CreateVmInfo builds an api.VmInfo for tests from config.
//
// The options first get to modify the config; then the VM is built with its CPU and memory
// bounds at MinCU/MaxCU compute units and its current usage at MinCU; finally the options get to
// modify the resulting VM.
//
// It panics if the compute unit's memory is not a multiple of the memory slot size.
func CreateVmInfo(config InitialVmInfoConfig, opts ...VmInfoOpt) api.VmInfo {
	for _, opt := range opts {
		opt.modifyVmInfoConfig(&config)
	}

	unit := config.ComputeUnit
	slotSize := config.MemorySlotSize
	if unit.Mem%slotSize != 0 {
		panic(fmt.Errorf(
			"compute unit is not divisible by memory slot size: %v is not divisible by %v",
			unit.Mem,
			slotSize,
		))
	}

	minCPU := vmapi.MilliCPU(config.MinCU) * unit.VCPU
	maxCPU := vmapi.MilliCPU(config.MaxCU) * unit.VCPU

	// number of memory slots that make up one compute unit
	slotsPerUnit := uint16(unit.Mem / slotSize)
	minSlots := config.MinCU * slotsPerUnit
	maxSlots := config.MaxCU * slotsPerUnit

	info := api.VmInfo{
		Name:      "test",
		Namespace: "test",
		Cpu: api.VmCpuInfo{
			Min: minCPU,
			Use: minCPU,
			Max: maxCPU,
		},
		Mem: api.VmMemInfo{
			SlotSize: slotSize,
			Min:      minSlots,
			Use:      minSlots,
			Max:      maxSlots,
		},
		Config: api.VmConfig{
			AutoMigrationEnabled: false,
			AlwaysMigrate:        false,
			ScalingConfig:        nil,
			ScalingEnabled:       true,
		},
		CurrentRevision: nil,
	}

	for _, opt := range opts {
		opt.modifyVmInfoWithConfig(config, &info)
	}

	return info
}
// Function types backing the options in this file. Each one wraps a plain function so that it can
// carry the methods required by InitialStateOpt / VmInfoOpt.

// coreConfigModifier edits the core.Config used by CreateInitialState.
type coreConfigModifier func(*core.Config)
// vmInfoConfigModifier edits the InitialVmInfoConfig before the VM is built.
type vmInfoConfigModifier func(*InitialVmInfoConfig)
// vmInfoModifier edits the api.VmInfo after it is built, with access to the final config.
type vmInfoModifier func(InitialVmInfoConfig, *api.VmInfo)
// Compile-time checks that both VM modifier types implement VmInfoOpt.
var (
_ VmInfoOpt = vmInfoConfigModifier(nil)
_ VmInfoOpt = vmInfoModifier(nil)
)
// Each modifier type acts only in the one hook it wraps a function for; its remaining interface
// methods are deliberate no-ops.

// modifyStateConfig runs the wrapped function on the core config.
func (m coreConfigModifier) modifyStateConfig(c *core.Config) { m(c) }

// modifyStateConfig is a no-op: VM config modifiers don't touch the core config.
func (vmInfoConfigModifier) modifyStateConfig(*core.Config) {}

// modifyStateConfig is a no-op: VM modifiers don't touch the core config.
func (vmInfoModifier) modifyStateConfig(*core.Config) {}

// modifyVmInfoConfig is a no-op: VM modifiers only act once the VM has been built.
func (vmInfoModifier) modifyVmInfoConfig(*InitialVmInfoConfig) {}

// modifyVmInfoConfig runs the wrapped function on the VM config.
func (m vmInfoConfigModifier) modifyVmInfoConfig(c *InitialVmInfoConfig) {
	m(c)
}

// modifyVmInfoWithConfig is a no-op: VM config modifiers only act before the VM is built.
func (vmInfoConfigModifier) modifyVmInfoWithConfig(InitialVmInfoConfig, *api.VmInfo) {}

// modifyVmInfoWithConfig runs the wrapped function on the built VM.
func (m vmInfoModifier) modifyVmInfoWithConfig(c InitialVmInfoConfig, vm *api.VmInfo) {
	m(c, vm)
}
// WithConfigSetting returns an option that applies f to the core config used by
// CreateInitialState.
func WithConfigSetting(f func(*core.Config)) InitialStateOpt {
	var opt coreConfigModifier = f
	return opt
}
// WithStoredWarnings returns an option that wraps the config's warning logger so that every
// warning message is appended to *warnings. Any previously configured warning function is still
// called afterwards.
func WithStoredWarnings(warnings *[]string) InitialStateOpt {
	install := func(c *core.Config) {
		previous := c.Log.Warn
		c.Log.Warn = func(msg string, fields ...zap.Field) {
			*warnings = append(*warnings, msg)
			if previous == nil {
				return
			}
			previous(msg, fields...)
		}
	}
	return WithConfigSetting(install)
}
// WithTestingLogfWarnings returns an option that wraps the config's warning logger so that every
// warning message is also written to the test log via t.Log. Any previously configured warning
// function is still called afterwards.
func WithTestingLogfWarnings(t *testing.T) InitialStateOpt {
	install := func(c *core.Config) {
		previous := c.Log.Warn
		c.Log.Warn = func(msg string, fields ...zap.Field) {
			t.Log(msg)
			if previous == nil {
				return
			}
			previous(msg, fields...)
		}
	}
	return WithConfigSetting(install)
}
// WithMinMaxCU returns an option that overrides the VM's minimum and maximum compute units
// before the VM is built.
func WithMinMaxCU(minCU, maxCU uint16) VmInfoOpt {
	setBounds := func(cfg *InitialVmInfoConfig) {
		cfg.MinCU, cfg.MaxCU = minCU, maxCU
	}
	return vmInfoConfigModifier(setBounds)
}
// WithCurrentCU returns an option that sets the built VM's current usage to cu compute units.
func WithCurrentCU(cu uint16) VmInfoOpt {
	setUsage := func(cfg InitialVmInfoConfig, vm *api.VmInfo) {
		using := cfg.ComputeUnit.Mul(cu)
		vm.SetUsing(using)
	}
	return vmInfoModifier(setUsage)
}
// WithCurrentRevision returns an option that sets the built VM's CurrentRevision to point at rev.
func WithCurrentRevision(rev vmapi.RevisionWithTime) VmInfoOpt {
	setRevision := func(_ InitialVmInfoConfig, vm *api.VmInfo) {
		vm.CurrentRevision = &rev
	}
	return vmInfoModifier(setRevision)
}
package testhelpers
import (
"time"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
// ExpectedRevision pairs a revision with a time source, so that the revision can be turned into
// a RevisionWithTime on demand (see WithTime).
type ExpectedRevision struct {
vmv1.Revision
// Now is called by WithTime to obtain the timestamp to attach to the revision.
Now func() time.Time
}
// NewExpectedRevision returns an ExpectedRevision that starts at vmv1.ZeroRevision and uses now
// as its time source.
func NewExpectedRevision(now func() time.Time) *ExpectedRevision {
	expected := new(ExpectedRevision)
	expected.Revision = vmv1.ZeroRevision
	expected.Now = now
	return expected
}
// WithTime returns the current revision stamped with the time reported by e.Now.
func (e *ExpectedRevision) WithTime() vmv1.RevisionWithTime {
	at := e.Now()
	return e.Revision.WithTime(at)
}
// NilRevisionSource is a stand-in revision source: Next always returns a revision with zero Value
// and Flags, and Observe does nothing.
type NilRevisionSource struct{}
// Next ignores its arguments and returns a revision whose Value and Flags are both zero.
func (c *NilRevisionSource) Next(_ time.Time, _ vmv1.Flag) vmv1.Revision {
	var rev vmv1.Revision
	rev.Value = 0
	rev.Flags = 0
	return rev
}
// Observe ignores its arguments and always succeeds.
func (c *NilRevisionSource) Observe(_ time.Time, _ vmv1.Revision) error {
	return nil
}
package core
// Working set size estimation
// For more, see: https://www.notion.so/neondatabase/874ef1cc942a4e6592434dbe9e609350
import (
"fmt"
)
// WssEstimatorConfig holds the tuning parameters for EstimateTrueWorkingSetSize.
type WssEstimatorConfig struct {
	// MaxAllowedIncreaseFactor is the maximum tolerable increase in slope between windows.
	// If the slope increases by more than this factor, we will cut off the working set size as the
	// border between the two windows.
	MaxAllowedIncreaseFactor float64
	// InitialOffset is the index of the minimum working set size we must consider.
	//
	// In practice, this is taken from the scaling config's LFCMinWaitBeforeDownscaleMinutes, with
	// the expectation that datapoints are all one minute apart, starting at 1m. So a value of 15m
	// translates to an InitialOffset of 14 (-1 because indexes start at zero, but the first
	// datapoint is 1m).
	InitialOffset int
	// WindowSize sets the offset for datapoints used in the calculation of the slope before & after
	// a point. For window size W, we calculate the slope at point P as value[P]-value[P-(W-1)].
	// This value must be >= 2.
	//
	// In practice, this value is taken from the scaling config's LFCWindowSizeMinutes, with the
	// expectation that datapoints are all one minute apart. So, a value of 5 minutes translates to
	// a WindowSize of 5.
	WindowSize int
}

// EstimateTrueWorkingSetSize returns an estimate of the "true" current working set size, given a
// series of datapoints for the observed working set size over increasing time intervals.
//
// In practice, the 'series' is e.g., values of 'neon.lfc_approximate_working_set_size_seconds(d)'
// for equidistant values of 'd' from 1 minute to 60 minutes.
//
// Starting at cfg.InitialOffset, the first point at which the increase over the following window
// exceeds cfg.MaxAllowedIncreaseFactor times the increase over the preceding window is returned.
// If there is no such point, the last value of the series is returned.
//
// This function panics if:
//   - cfg.WindowSize < 2
//   - cfg.InitialOffset < cfg.WindowSize - 1
//   - series is empty (there is no last value to fall back to)
func EstimateTrueWorkingSetSize(
	series []float64,
	cfg WssEstimatorConfig,
) float64 {
	switch {
	case cfg.WindowSize < 2:
		panic(fmt.Errorf("cfg.WindowSize must be >= 2 (got %v)", cfg.WindowSize))
	case cfg.InitialOffset < cfg.WindowSize-1:
		panic(fmt.Errorf("cfg.InitialOffset must be >= cfg.WindowSize - 1 (got %v < %v - 1)", cfg.InitialOffset, cfg.WindowSize))
	}

	// A window of e.g. 5 points that ends at series[idx] starts at series[idx-4], because
	// series[idx] itself is one of the 5 (and likewise for the window that starts at series[idx]).
	// 'span' is that distance between the two ends of a window.
	span := cfg.WindowSize - 1
	end := len(series) - span

	for idx := cfg.InitialOffset; idx < end; idx++ {
		here := series[idx]

		// In theory the HLL estimator guarantees that - at any instant - a longer duration never
		// yields a smaller working set. In practice the individual values aren't calculated at the
		// same time, so either difference may come out negative; clamp those to zero.
		before := max(0.0, here-series[idx-span])
		after := max(0.0, series[idx+span]-here)

		if after > before*cfg.MaxAllowedIncreaseFactor {
			return here
		}
	}

	return series[len(series)-1]
}
// ProjectNextHighest looks at the rate of change between consecutive points in 'series' and
// returns the highest value that would be reached if any one of those increases were to continue
// for 'projectLen' additional datapoints. Decreases between points are treated as no change, and
// the result is never lower than series[0].
//
// For example, given the series '0, 1, 3, 4, 5' and a projectLen of 3, ProjectNextHighest will
// return 9, because 1 → 3 would reach 9 if it continued for another 3 datapoints (→ 5 → 7 → 9).
//
// Internally, ProjectNextHighest is used to allow preemptive scale-up when we can see that the
// observed working set size is increasing, but we don't know how big it'll get.
// In short, this function helps answer: "How much should we scale-up to accommodate expected
// increases in demand?".
//
// This function panics if series has fewer than 2 elements.
func ProjectNextHighest(series []float64, projectLen float64) float64 {
	if len(series) < 2 {
		panic(fmt.Errorf("Cannot ProjectNextHighest with series of length %d (must be >= 2)", len(series)))
	}

	best := series[0]
	prev := series[0]
	for _, cur := range series[1:] {
		// ignore decreases
		reached := max(prev, cur)
		projected := reached + (reached-prev)*projectLen
		best = max(best, projected)
		prev = cur
	}

	return best
}
package agent
// The Dispatcher is our interface with the monitor. We interact via a websocket
// connection through a simple RPC-style protocol.
import (
"context"
"encoding/json"
"errors"
"fmt"
"sync"
"sync/atomic"
"time"
"go.uber.org/zap"
"nhooyr.io/websocket"
"nhooyr.io/websocket/wsjson"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
// MinMonitorProtocolVersion and MaxMonitorProtocolVersion bound the range of vm-monitor protocol
// versions that is offered to the monitor during the connection handshake (see
// connectToMonitor).
const (
MinMonitorProtocolVersion api.MonitorProtoVersion = api.MonitorProtoV1_0
MaxMonitorProtocolVersion api.MonitorProtoVersion = api.MonitorProtoV1_0
)
// MonitorResult represents the result of a Dispatcher.Call. Because the SignalSender
// passed in can only be generic over one type, this struct acts as a makeshift enum. Only
// one field should ever be non-nil, and it should always be clear which field
// is readable. For example, the caller of Dispatcher.Call(HealthCheck { .. })
// should only read the HealthCheck field.
type MonitorResult struct {
Result *api.DownscaleResult
Confirmation *api.UpscaleConfirmation
HealthCheck *api.HealthCheck
}
// The Dispatcher is the main object managing the websocket connection to the
// monitor. For more information on the protocol, see pkg/api/types.go
type Dispatcher struct {
// The underlying connection we are managing
conn *websocket.Conn
// When someone sends a message, the dispatcher will attach a transaction id
// to it so that it knows when a response is back. When it receives a message
// with the same transaction id, it knows that that is the response to the original
// message and will send it down the SignalSender so the original sender can use it.
waiters map[uint64]util.SignalSender[waiterResult]
// lock guards mutating the waiters and exitError fields, and closing exitSignal.
// conn and lastTransactionID are both thread safe.
// runner, exit, and protoVersion are never modified.
lock sync.Mutex
// The runner that this dispatcher is part of
runner *Runner
// exit shuts the dispatcher down: see NewDispatcher, where it is defined. If transformErr is
// non-nil, it is applied to err to produce the reason text used when closing the websocket.
exit func(status websocket.StatusCode, err error, transformErr func(error) error)
// exitError is the error passed to exit, if any. Read it via ExitError.
exitError error
// exitSignal is closed when the dispatcher exits. See ExitSignal and Exited.
exitSignal chan struct{}
// lastTransactionID is the last transaction id. When we need a new one
// we simply bump it and take the new number.
//
// In order to prevent collisions between the IDs generated here vs by
// the monitor, we only generate even IDs, and the monitor only generates
// odd ones. So generating a new value is done by adding 2.
lastTransactionID atomic.Uint64
// protoVersion is the protocol version returned by the monitor during the handshake.
protoVersion api.MonitorProtoVersion
}
// waiterResult is what gets delivered to a pending Call through its SignalSender: either an
// error to report, or the monitor's response.
type waiterResult struct {
err error
res *MonitorResult
}
// NewDispatcher creates a new Dispatcher, establishing a connection with the vm-monitor and
// setting up all the background goroutines that manage the connection.
//
// The connection attempt is bounded by the configured Monitor.ConnectionTimeoutSeconds. If it
// fails, the error from connectToMonitor is returned and nothing is left running.
//
// On success, three things run in the background until the dispatcher exits:
//   - a goroutine that calls disp.exit once the dispatcher's context is done;
//   - the message handler (disp.run), which is handed sendUpscaleRequested;
//   - a health checker that calls the monitor every 5 seconds.
func NewDispatcher(
ctx context.Context,
logger *zap.Logger,
addr string,
runner *Runner,
sendUpscaleRequested func(request api.MoreResources, withLock func()),
) (_finalDispatcher *Dispatcher, _ error) {
// Create a new root-level context for this Dispatcher so that we can cancel if need be
ctx, cancelRootContext := context.WithCancel(ctx)
defer func() {
// cancel on failure or panic
if _finalDispatcher == nil {
cancelRootContext()
}
}()
connectTimeout := time.Second * time.Duration(runner.global.config.Monitor.ConnectionTimeoutSeconds)
conn, protoVersion, err := connectToMonitor(ctx, logger, addr, connectTimeout)
if err != nil {
return nil, err
}
disp := &Dispatcher{
conn: conn,
waiters: make(map[uint64]util.SignalSender[waiterResult]),
runner: runner,
lock: sync.Mutex{},
exit: nil, // set below
exitError: nil,
exitSignal: make(chan struct{}),
lastTransactionID: atomic.Uint64{}, // Note: initialized to 0, so it's even, as required.
protoVersion: *protoVersion,
}
// exit is safe to call more than once: only the first call (while holding the lock) closes
// exitSignal, records the error, cancels the root context and closes the connection.
disp.exit = func(status websocket.StatusCode, err error, transformErr func(error) error) {
disp.lock.Lock()
defer disp.lock.Unlock()
if disp.Exited() {
return
}
close(disp.exitSignal)
disp.exitError = err
cancelRootContext()
// The close reason is the error text (after transformErr, if given), or "normal exit" when
// there was no error.
var closeReason string
if err != nil {
if transformErr != nil {
closeReason = transformErr(err).Error()
} else {
closeReason = err.Error()
}
} else {
closeReason = "normal exit"
}
// Run the actual websocket closing in a separate goroutine so we don't block while holding
// the lock. It can take up to 10s to close:
//
// > [Close] will write a WebSocket close frame with a timeout of 5s and then wait 5s for
// > the peer to send a close frame.
//
// This *potentially* runs us into race issues, but those are probably less bad to deal
// with, tbh.
go disp.conn.Close(status, closeReason)
}
// Make sure the dispatcher exits when its context is done (whether because the caller's
// context ended, or because exit itself cancelled it - in which case this call is a no-op).
go func() {
<-ctx.Done()
disp.exit(websocket.StatusNormalClosure, nil, nil)
}()
msgHandlerLogger := logger.Named("message-handler")
runner.spawnBackgroundWorker(ctx, msgHandlerLogger, "vm-monitor message handler", func(c context.Context, l *zap.Logger) {
disp.run(c, l, sendUpscaleRequested)
})
runner.spawnBackgroundWorker(ctx, logger.Named("health-checks"), "vm-monitor health checks", func(ctx context.Context, logger *zap.Logger) {
timeout := time.Second * time.Duration(runner.global.config.Monitor.ResponseTimeoutSeconds)
// FIXME: make this duration configurable
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
// if we've had sequential failures for more than continuedFailureAbortTimeout, we give up on
// the connection and exit the dispatcher. firstSequentialFailure is the time of the first
// failure in the current run of failures, or nil if the last health check succeeded.
var firstSequentialFailure *time.Time
continuedFailureAbortTimeout := time.Second * time.Duration(runner.global.config.Monitor.MaxHealthCheckSequentialFailuresSeconds)
// if we don't have any errors, we will log only every 10th successful health check
const logEveryNth = 10
var okSequence int
var failSequence int
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
}
startTime := time.Now()
_, err := disp.Call(ctx, logger, timeout, "HealthCheck", api.HealthCheck{})
endTime := time.Now()
// Note: the sequence counts are added to the log fields before being updated below, so they
// describe the checks before this one.
logFields := []zap.Field{
zap.Duration("duration", endTime.Sub(startTime)),
}
if okSequence != 0 {
logFields = append(logFields, zap.Int("okSequence", okSequence))
}
if failSequence != 0 {
logFields = append(logFields, zap.Int("failSequence", failSequence))
}
if err != nil {
// health check failed, reset the ok sequence count
okSequence = 0
failSequence++
logger.Error("vm-monitor health check failed", append(logFields, zap.Error(err))...)
if firstSequentialFailure == nil {
now := time.Now()
firstSequentialFailure = &now
} else if since := time.Since(*firstSequentialFailure); since > continuedFailureAbortTimeout {
err := fmt.Errorf("vm-monitor has been failing health checks for at least %s", continuedFailureAbortTimeout)
logger.Error(fmt.Sprintf("%s, triggering connection restart", err.Error()))
// exit cancels the root context, so the loop stops at the next select.
disp.exit(websocket.StatusInternalError, err, nil)
}
} else {
// health check was successful, so reset the sequential failures count
failSequence = 0
okSequence++
firstSequentialFailure = nil
if okSequence%logEveryNth == 0 {
logger.Info("vm-monitor health check successful", logFields...)
}
// record the time of the last successful communication with the monitor
runner.status.update(runner.global, func(s podStatus) podStatus {
now := time.Now()
s.lastSuccessfulMonitorComm = &now
return s
})
}
}
})
return disp, nil
}
// connectToMonitor dials the vm-monitor's websocket at addr and performs the protocol version
// handshake: it sends the supported version range and reads back the monitor's response.
//
// The whole procedure - dialing, sending, and reading - is bounded by timeout.
//
// On success it returns the open connection and the version from the monitor's response. If
// anything fails after the connection was established, the connection is closed before
// returning the error.
func connectToMonitor(
	ctx context.Context,
	logger *zap.Logger,
	addr string,
	timeout time.Duration,
) (_ *websocket.Conn, _ *api.MonitorProtoVersion, finalErr error) {
	handshakeCtx, cancelHandshake := context.WithTimeout(ctx, timeout)
	defer cancelHandshake()

	logger.Info("Connecting to vm-monitor via websocket", zap.String("addr", addr))

	// We do not need to close the response body according to docs.
	// Doing so causes memory bugs.
	conn, _, dialErr := websocket.Dial(handshakeCtx, addr, nil) //nolint:bodyclose // see comment above
	if dialErr != nil {
		return nil, nil, fmt.Errorf("error establishing websocket connection to %s: %w", addr, dialErr)
	}

	// If we return early, make sure we close the websocket. closeStatus is the status code to
	// close it with: an internal error, unless a failure below says otherwise.
	closeStatus := websocket.StatusInternalError
	defer func() {
		if finalErr == nil {
			return
		}
		conn.Close(closeStatus, finalErr.Error())
	}()

	supported := api.VersionRange[api.MonitorProtoVersion]{
		Min: MinMonitorProtocolVersion,
		Max: MaxMonitorProtocolVersion,
	}
	logger.Info("Sending protocol version range", zap.Any("range", supported))

	// Figure out protocol version
	if writeErr := wsjson.Write(handshakeCtx, conn, supported); writeErr != nil {
		return nil, nil, fmt.Errorf("error sending protocol range to monitor: %w", writeErr)
	}

	logger.Info("Reading monitor version response")
	var reply api.MonitorProtocolResponse
	if readErr := wsjson.Read(handshakeCtx, conn, &reply); readErr != nil {
		logger.Error("Failed to read monitor response", zap.Error(readErr))
		closeStatus = websocket.StatusProtocolError
		return nil, nil, fmt.Errorf("Error reading vm-monitor response during protocol handshake: %w", readErr)
	}
	logger.Info("Got monitor version response", zap.Any("response", reply))

	if monitorErr := reply.Error; monitorErr != nil {
		logger.Error("Got error response from vm-monitor", zap.Any("response", reply), zap.String("error", *monitorErr))
		closeStatus = websocket.StatusProtocolError
		return nil, nil, fmt.Errorf("Monitor returned error during protocol handshake: %q", *monitorErr)
	}

	logger.Info("negotiated protocol version with monitor", zap.Any("response", reply), zap.String("version", reply.Version.String()))
	return conn, &reply.Version, nil
}
// ExitSignal returns a receive-only channel that gets closed once the Dispatcher has stopped
// running.
func (disp *Dispatcher) ExitSignal() <-chan struct{} {
	var done <-chan struct{} = disp.exitSignal
	return done
}
// Exited reports whether the Dispatcher has stopped running.
//
// It returns true iff the channel returned by ExitSignal is closed. It never blocks.
func (disp *Dispatcher) Exited() bool {
	exited := false
	select {
	case <-disp.exitSignal:
		exited = true
	default:
		// not closed yet
	}
	return exited
}
// ExitError returns the error that caused the dispatcher to exit, or nil if there was none (or it
// hasn't exited). The value is read while holding the dispatcher's lock.
func (disp *Dispatcher) ExitError() error {
	disp.lock.Lock()
	exitErr := disp.exitError
	disp.lock.Unlock()

	return exitErr
}
// lenWaiters returns the number of currently registered waiters.
//
// temporary method to hopefully help with https://github.com/neondatabase/autoscaling/issues/503
func (disp *Dispatcher) lenWaiters() int {
	disp.lock.Lock()
	count := len(disp.waiters)
	disp.lock.Unlock()

	return count
}
// send serializes message with the given id and writes it to the connection. Only call this
// method with types that SerializeMonitorMessage can handle.
//
// It returns an error if serialization or the websocket write fails.
func (disp *Dispatcher) send(ctx context.Context, logger *zap.Logger, id uint64, message any) error {
	payload, serializeErr := api.SerializeMonitorMessage(message, id)
	if serializeErr != nil {
		return fmt.Errorf("error serializing message: %w", serializeErr)
	}

	// wsjson.Write JSON-encodes whatever it is given, and Go encodes []byte as a base64 string.
	// The payload from SerializeMonitorMessage is already JSON, so wrap it in RawMessage to have
	// it written as-is rather than being base64-encoded on top.
	rawPayload := json.RawMessage(payload)
	logger.Debug("sending message to monitor", zap.ByteString("message", rawPayload))

	return wsjson.Write(ctx, disp.conn, &rawPayload)
}
// registerWaiter stores sender under id, so that it gets notified when a message with that id
// arrives. Any sender previously stored under the same id is replaced.
func (disp *Dispatcher) registerWaiter(id uint64, sender util.SignalSender[waiterResult]) {
	disp.lock.Lock()
	defer disp.lock.Unlock()

	pending := disp.waiters
	pending[id] = sender
}
// unregisterWaiter removes the waiter stored under id, if there is one, without sending anything
// to it.
func (disp *Dispatcher) unregisterWaiter(id uint64) {
	disp.lock.Lock()
	defer disp.lock.Unlock()

	pending := disp.waiters
	delete(pending, id)
}
// Call makes a request to the monitor and waits up to timeout for the response. The value passed
// as message must be a valid value to send to the monitor. See the docs for
// SerializeMonitorMessage for more.
//
// messageType is only used to label the outbound-requests metric, which is incremented exactly
// once per call with the final status.
//
// An error is returned if sending fails, if the response carries an error, or if no response
// arrives within timeout.
//
// This function must NOT be called while holding disp.runner.lock.
func (disp *Dispatcher) Call(
	ctx context.Context,
	logger *zap.Logger,
	timeout time.Duration,
	messageType string,
	message any,
) (*MonitorResult, error) {
	txID := disp.lastTransactionID.Add(2)
	replySender, replyReceiver := util.NewSingleSignalPair[waiterResult]()

	// outcome is overwritten on every return path below; the initial value only survives if we
	// leave some other way (e.g. a panic).
	outcome := "internal error"
	defer func() {
		disp.runner.global.metrics.monitorRequestsOutbound.WithLabelValues(messageType, outcome).Inc()
	}()

	// register the waiter *before* sending, so that we avoid a potential race where we'd get a
	// reply to the message before being ready to receive it.
	disp.registerWaiter(txID, replySender)

	if sendErr := disp.send(ctx, logger, txID, message); sendErr != nil {
		logger.Error("failed to send message", zap.Any("message", message), zap.Error(sendErr))
		disp.unregisterWaiter(txID)
		outcome = "[error: failed to send]"
		return nil, sendErr
	}

	deadline := time.NewTimer(timeout)
	defer deadline.Stop()

	select {
	case <-deadline.C:
		disp.unregisterWaiter(txID)
		outcome = "[error: timed out waiting for response]"
		return nil, fmt.Errorf("timed out waiting %v for monitor response", timeout)
	case reply := <-replyReceiver.Recv():
		if reply.err == nil {
			outcome = "ok"
			return reply.res, nil
		}
		outcome = fmt.Sprintf("[error: %s]", reply.err)
		return nil, errors.New("monitor experienced an internal error")
	}
}
// extractField looks up key in data and returns a pointer to a copy of its value as a T.
//
// It returns an error if the key is absent, or if the value stored under it is not of type T.
func extractField[T any](data map[string]interface{}, key string) (*T, error) {
	raw, present := data[key]
	if !present {
		return nil, fmt.Errorf("data had no key %q", key)
	}

	if typed, matches := raw.(T); matches {
		return &typed, nil
	}

	var zero T // only used to name the expected type in the error
	return nil, fmt.Errorf("data[%q] was not of type %T", key, zero)
}
// messageHandlerFuncs holds the callbacks that HandleMessage dispatches to, one per message type
// that needs handling. All except handleUpscaleRequest also receive the id of the message, and
// may return an error.
type messageHandlerFuncs struct {
handleUpscaleRequest func(api.UpscaleRequest)
handleUpscaleConfirmation func(api.UpscaleConfirmation, uint64) error
handleDownscaleResult func(api.DownscaleResult, uint64) error
handleMonitorError func(api.InternalError, uint64) error
handleHealthCheck func(api.HealthCheck, uint64) error
}
// HandleMessage reads a single message from the monitor and dispatches it to the matching
// handler. Make sure that all message types the monitor can send are included in the switch
// statement at the end.
//
// Every message must carry a string "type" and a numeric "id". An error is returned if the
// message cannot be read, cannot be decoded, lacks either field, or if the handler fails.
//
// Once the id is known, any waiter that is still registered under it when this function returns
// (because of an error, a panic, or a handler that didn't consume it) is sent an error result and
// removed, so that the corresponding Call doesn't have to wait for its timeout.
func (disp *Dispatcher) HandleMessage(
	ctx context.Context,
	logger *zap.Logger,
	handlers messageHandlerFuncs,
) error {
	// Deserialization has several steps:
	// 1. Deserialize into an unstructured map[string]interface{}
	// 2. Read the `type` field to know the type of the message
	// 3. Then try to deserialize again, but into that specific type
	// 4. All messages also come with an integer id under the key `id`

	// wsjson.Read tries to deserialize the message. If we were to read to a
	// []byte, it would base64 decode it as part of deserialization. json.RawMessage
	// avoids this, and we manually deserialize later
	var message json.RawMessage
	if err := wsjson.Read(ctx, disp.conn, &message); err != nil {
		return fmt.Errorf("Error receiving message: %w", err)
	}

	logger.Debug("(pre-decoding): received a message", zap.ByteString("message", message))

	var unstructured map[string]interface{}
	if err := json.Unmarshal(message, &unstructured); err != nil {
		// Keep the decoding error in the chain, so callers can see *why* the message was rejected.
		return fmt.Errorf("Error deserializing message: %q: %w", string(message), err)
	}

	typeStr, err := extractField[string](unstructured, "type")
	if err != nil {
		return fmt.Errorf("Error extracting 'type' field: %w", err)
	}

	// go thinks all json numbers are float64 so we first deserialize to that to
	// avoid the type error, then cast to uint64
	f, err := extractField[float64](unstructured, "id")
	if err != nil {
		return fmt.Errorf("Error extracting 'id' field: %w", err)
	}
	id := uint64(*f)

	var rootErr error

	// now that we have the waiter's ID, make sure that if there's some failure past this point, we
	// propagate that along to the waiter and remove it
	defer func() {
		// speculatively determine the root error, to send that along to the instance of Call
		// waiting for it.
		var err error

		panicPayload := recover()
		if panicPayload != nil {
			err = errors.New("panicked")
		} else if rootErr != nil {
			err = rootErr
		} else {
			// if HandleMessage bailed without panicking or setting rootErr, but *also* without
			// sending a message to the waiter, we should make sure that *something* gets sent, so
			// the message doesn't just time out. But we don't have more information, so the error
			// is still just "unknown".
			err = errors.New("unknown")
		}

		disp.lock.Lock()
		defer disp.lock.Unlock()
		if sender, ok := disp.waiters[id]; ok {
			sender.Send(waiterResult{err: err, res: nil})
			delete(disp.waiters, id)
		} else if rootErr != nil {
			// we had some error while handling the message with this ID, and there wasn't a
			// corresponding waiter. We should make note of this in the metrics.
			//
			// Note: WithLabelValues only looks up the counter; it must be incremented explicitly.
			status := fmt.Sprintf("[error: %s]", rootErr)
			disp.runner.global.metrics.monitorRequestsInbound.WithLabelValues(*typeStr, status).Inc()
		}

		// resume panicking if we were before
		if panicPayload != nil {
			panic(panicPayload)
		}
	}()

	// Helper function to handle common unmarshalling logic: decodes the message into value and,
	// on failure, records the root error and tells the monitor its message was invalid.
	unmarshal := func(value any) error {
		if err := json.Unmarshal(message, value); err != nil {
			rootErr = errors.New("Failed unmarshaling JSON")
			err := fmt.Errorf("Error unmarshaling %s: %w", *typeStr, err)
			logger.Error(rootErr.Error(), zap.Error(err))
			// we're already on the error path anyways
			_ = disp.send(ctx, logger, id, api.InvalidMessage{Error: err.Error()})
			return err
		}

		return nil
	}

	switch *typeStr {
	case "UpscaleRequest":
		var req api.UpscaleRequest
		if err := unmarshal(&req); err != nil {
			return err
		}
		handlers.handleUpscaleRequest(req)
		return nil
	case "UpscaleConfirmation":
		var confirmation api.UpscaleConfirmation
		if err := unmarshal(&confirmation); err != nil {
			return err
		}
		return handlers.handleUpscaleConfirmation(confirmation, id)
	case "DownscaleResult":
		var res api.DownscaleResult
		if err := unmarshal(&res); err != nil {
			return err
		}
		return handlers.handleDownscaleResult(res, id)
	case "InternalError":
		var monitorErr api.InternalError
		if err := unmarshal(&monitorErr); err != nil {
			return err
		}
		return handlers.handleMonitorError(monitorErr, id)
	case "HealthCheck":
		var healthCheck api.HealthCheck
		if err := unmarshal(&healthCheck); err != nil {
			return err
		}
		return handlers.handleHealthCheck(healthCheck, id)
	case "InvalidMessage":
		var warning api.InvalidMessage
		if err := unmarshal(&warning); err != nil {
			return err
		}
		logger.Warn("Received notification we sent an invalid message", zap.Any("warning", warning))
		return nil
	default:
		// Tell the monitor we didn't understand; the result of that send is what we return.
		rootErr = errors.New("Received unknown message type")
		return disp.send(
			ctx,
			logger,
			id,
			api.InvalidMessage{Error: fmt.Sprintf("Received message of unknown type: %q", *typeStr)},
		)
	}
}
// run is the long-running function that orchestrates all requests/responses: it builds the
// per-message-type handlers and then calls HandleMessage in a loop.
//
// It returns after the first error from HandleMessage. If the context is not already done at that
// point, it also shuts the dispatcher down via disp.exit.
//
// upscaleRequester is called for every UpscaleRequest received from the monitor.
func (disp *Dispatcher) run(ctx context.Context, logger *zap.Logger, upscaleRequester func(_ api.MoreResources, withLock func())) {
logger.Info("Starting message handler")
// Utility for logging + returning an error when we get a message with an
// id we're unaware of. Note: unknownMessage is not a message type.
//
// It notifies the monitor with an InvalidMessage, and returns the result of sending that.
// NOTE(review): the handlers below call this while holding disp.lock, so the write to the
// connection happens under the lock.
handleUnkownMessage := func(messageType string, id uint64) error {
fmtString := "Received %s with id %d but id is unknown or already timed out waiting for a reply"
msg := fmt.Sprintf(fmtString, messageType, id)
logger.Warn(msg, zap.Uint64("id", id))
return disp.send(ctx, logger, id, api.InvalidMessage{Error: msg})
}
// Does not take a message id because we don't know when the agent will
// upscale. The monitor will get the result back as a NotifyUpscale message
// from us, with a new id.
handleUpscaleRequest := func(req api.UpscaleRequest) {
// TODO: it shouldn't be this function's responsibility to update metrics.
defer func() {
disp.runner.global.metrics.monitorRequestsInbound.WithLabelValues("UpscaleRequest", "ok").Inc()
}()
// The contents of req are not used: the request is always for more memory, and not for CPU.
resourceReq := api.MoreResources{
Cpu: false,
Memory: true,
}
upscaleRequester(resourceReq, func() {
logger.Info("Updating requested upscale", zap.Any("requested", resourceReq))
})
}
// The remaining handlers all follow the same pattern: while holding disp.lock, look up the
// waiter registered under the message's id. If there is one, send it the result and remove
// it; otherwise, treat the message as unknown.
handleUpscaleConfirmation := func(_ api.UpscaleConfirmation, id uint64) error {
disp.lock.Lock()
defer disp.lock.Unlock()
sender, ok := disp.waiters[id]
if ok {
logger.Info("vm-monitor confirmed upscale", zap.Uint64("id", id))
sender.Send(waiterResult{
err: nil,
res: &MonitorResult{
Confirmation: &api.UpscaleConfirmation{},
Result: nil,
HealthCheck: nil,
},
})
// Don't forget to delete the waiter
delete(disp.waiters, id)
return nil
} else {
return handleUnkownMessage("UpscaleConfirmation", id)
}
}
handleDownscaleResult := func(res api.DownscaleResult, id uint64) error {
disp.lock.Lock()
defer disp.lock.Unlock()
sender, ok := disp.waiters[id]
if ok {
logger.Info("vm-monitor returned downscale result", zap.Uint64("id", id), zap.Any("result", res))
sender.Send(waiterResult{
err: nil,
res: &MonitorResult{
Result: &res,
Confirmation: nil,
HealthCheck: nil,
},
})
// Don't forget to delete the waiter
delete(disp.waiters, id)
return nil
} else {
return handleUnkownMessage("DownscaleResult", id)
}
}
handleMonitorError := func(err api.InternalError, id uint64) error {
disp.lock.Lock()
defer disp.lock.Unlock()
sender, ok := disp.waiters[id]
if ok {
logger.Warn(
"vm-monitor experienced an internal error",
zap.Uint64("id", id),
zap.String("error", err.Error),
)
// Indicate to the receiver that an error occurred
sender.Send(waiterResult{
err: errors.New("vm-monitor internal error"),
res: nil,
})
// Don't forget to delete the waiter
delete(disp.waiters, id)
return nil
} else {
return handleUnkownMessage("MonitorError", id)
}
}
handleHealthCheck := func(confirmation api.HealthCheck, id uint64) error {
disp.lock.Lock()
defer disp.lock.Unlock()
sender, ok := disp.waiters[id]
if ok {
logger.Debug("vm-monitor responded to health check", zap.Uint64("id", id))
// Indicate to the receiver that the health check succeeded
sender.Send(waiterResult{
err: nil,
res: &MonitorResult{
HealthCheck: &api.HealthCheck{},
Result: nil,
Confirmation: nil,
},
})
// Don't forget to delete the waiter
delete(disp.waiters, id)
return nil
} else {
return handleUnkownMessage("HealthCheck", id)
}
}
handlers := messageHandlerFuncs{
handleUpscaleRequest: handleUpscaleRequest,
handleUpscaleConfirmation: handleUpscaleConfirmation,
handleDownscaleResult: handleDownscaleResult,
handleMonitorError: handleMonitorError,
handleHealthCheck: handleHealthCheck,
}
// Handle messages one at a time, until the first error.
for {
err := disp.HandleMessage(ctx, logger, handlers)
if err != nil {
if ctx.Err() != nil {
// The context is already cancelled, so this error is most likely
// expected. For example, if the context is cancelled because the
// runner exited, we should expect to fail to read off the connection,
// which is closed by the server exit.
logger.Warn("Error handling message", zap.Error(err))
} else {
logger.Error("Error handling message, shutting down connection", zap.Error(err))
err = fmt.Errorf("Error handling message: %w", err)
// note: in theory we *could* be more descriptive with these statuses, but the only
// consumer of this API is the vm-monitor, and it doesn't check those.
//
// Also note: there's a limit on the size of the close frame we're allowed to send,
// so the actual error we use to exit with must be somewhat reduced in size. These
// "Error handling message" errors can get quite long, so we'll only use the root
// cause of the error for the message.
disp.exit(websocket.StatusInternalError, err, util.RootError)
}
return
}
}
}
package agent
// Utilities for dumping internal state
import (
"context"
"errors"
"fmt"
"net"
"net/http"
"runtime"
"sync"
"time"
"go.uber.org/zap"
"golang.org/x/exp/slices"
"github.com/neondatabase/autoscaling/pkg/util"
)
// StateDump is the JSON-serializable snapshot of the agent's internal state produced by
// DumpState.
type StateDump struct {
// Stopped is the 'stopped' argument that was passed to DumpState.
Stopped bool `json:"stopped"`
BuildInfo util.BuildInfo `json:"buildInfo"`
// Pods has one entry per pod that was tracked when the snapshot was taken.
Pods []podStateDump `json:"pods"`
}
// StartDumpStateServer starts an HTTP server on config.Port (on all IPv4 interfaces) that
// responds to GET requests on "/" with a dump of the agent's internal state.
//
// The listener is bound synchronously, so a failure to bind is returned as an error; the server
// itself then runs in a background goroutine. The server is never shut down, so the state can
// still be fetched after shutdownCtx is done - at which point dumps are marked as stopped.
//
// Each request gets config.TimeoutSeconds to collect the state. Timing out produces a 500
// response; any other failure produces a 400.
func (s *agentState) StartDumpStateServer(shutdownCtx context.Context, logger *zap.Logger, config *DumpStateConfig) error {
	// Manually start the TCP listener so we can minimize errors in the background thread.
	addr := net.TCPAddr{IP: net.IPv4zero, Port: int(config.Port)}
	listener, err := net.ListenTCP("tcp", &addr)
	if err != nil {
		// Use addr.String() for the message: (*TCPAddr).String has a pointer receiver, so
		// formatting the struct value directly would print its raw fields instead of "ip:port".
		// Wrap err so that the reason for the failure isn't lost.
		return fmt.Errorf("Error binding to %v: %w", addr.String(), err)
	}

	go func() {
		mux := http.NewServeMux()
		util.AddHandler(logger, mux, "/", http.MethodGet, "<empty>", func(ctx context.Context, logger *zap.Logger, body *struct{}) (*StateDump, int, error) {
			timeout := time.Duration(config.TimeoutSeconds) * time.Second

			startTime := time.Now()
			ctx, cancel := context.WithTimeout(ctx, timeout)
			defer cancel()

			state, err := s.DumpState(ctx, shutdownCtx.Err() != nil)
			if err != nil {
				if ctx.Err() != nil && errors.Is(ctx.Err(), context.DeadlineExceeded) {
					totalDuration := time.Since(startTime)
					return nil, 500, fmt.Errorf("timed out after %s while getting state", totalDuration)
				} else {
					// some other type of cancel; 400 is a little weird, but there isn't a great
					// option here.
					return nil, 400, fmt.Errorf("error while getting state: %w", err)
				}
			}

			return state, 200, nil
		})
		// note: we don't shut down this server. It should be possible to continue fetching the
		// internal state after shutdown has started.
		server := &http.Server{Handler: mux}
		if err := server.Serve(listener); err != nil {
			logger.Error("dump-state server exited", zap.Error(err))
		}
	}()

	return nil
}
// DumpState collects a snapshot of every tracked pod's state.
//
// The list of pods is copied while holding s.lock (acquired with ctx, so lock acquisition can be
// abandoned when ctx is done, in which case the error is returned). Each pod is then dumped
// concurrently, with at most runtime.NumCPU() dumps in flight. The result is sorted by namespace
// and then by name so that the output ordering is deterministic.
//
// 'stopped' is copied verbatim into the returned StateDump.
func (s *agentState) DumpState(ctx context.Context, stopped bool) (*StateDump, error) {
	// Copy the high-level state, then process it
	podList, err := func() ([]*podState, error) {
		if err := s.lock.TryLock(ctx); err != nil {
			return nil, err
		}
		defer s.lock.Unlock()

		list := make([]*podState, 0, len(s.pods))
		for _, pod := range s.pods {
			list = append(list, pod)
		}
		return list, nil
	}()
	if err != nil {
		return nil, err
	}

	state := StateDump{
		Stopped:   stopped,
		BuildInfo: util.GetBuildInfo(),
		Pods:      make([]podStateDump, len(podList)),
	}

	wg := sync.WaitGroup{}
	wg.Add(len(podList))
	concurrencyLimit := runtime.NumCPU()
	sema := make(chan struct{}, concurrencyLimit) // semaphore

	for i, pod := range podList {
		sema <- struct{}{} // enforce only 'concurrencyLimit' threads running at a time
		// Pass the loop variables explicitly so that each goroutine works on its own pod and
		// slot, regardless of the loop-variable semantics of the Go version in use.
		go func(i int, pod *podState) {
			defer func() {
				<-sema
				wg.Done()
			}()

			state.Pods[i] = pod.dump(ctx)
		}(i, pod)
	}

	// note: pod.Dump() respects the context, even with locking. When the context expires before we
	// acquire a lock, there's still valuable information to return - it's worthwhile to wait for
	// that to make it back to state.Pods when the context expires, instead of proactively aborting
	// in *this* thread.
	wg.Wait()

	// Sort the pods by name, so that we produce a deterministic ordering.
	//
	// The comparison must be lexicographic on (Namespace, Name): requiring *both* fields to be
	// smaller is not a valid ordering and leaves the result dependent on the input order.
	slices.SortFunc(state.Pods, func(a, b podStateDump) (less bool) {
		if a.PodName.Namespace != b.PodName.Namespace {
			return a.PodName.Namespace < b.PodName.Namespace
		}
		return a.PodName.Name < b.PodName.Name
	})

	return &state, nil
}
package agent
import (
"context"
"fmt"
"github.com/tychoish/fun/pubsub"
"go.uber.org/zap"
"k8s.io/client-go/kubernetes"
vmclient "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
"github.com/neondatabase/autoscaling/pkg/agent/billing"
"github.com/neondatabase/autoscaling/pkg/agent/schedwatch"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/taskgroup"
"github.com/neondatabase/autoscaling/pkg/util/watch"
)
// MainRunner bundles the inputs needed by Run to start the autoscaler agent.
type MainRunner struct {
// EnvArgs supplies the K8sNodeName and K8sPodIP values that Run reads.
EnvArgs EnvArgs
// Config is the agent configuration; Run reads its Scheduler, DumpState and Billing sections.
Config *Config
// KubeClient is passed to the scheduler watcher and stored in the agent state.
KubeClient *kubernetes.Clientset
// VMClient is passed to the VM watcher and stored in the agent state.
VMClient *vmclient.Clientset
}
// Run starts the agent's long-running components and blocks until its task group finishes.
//
// In order, it starts: the VM watcher (which feeds vmEventQueue), the scheduler watcher, the
// global (port 9100) and per-VM (port 9101) prometheus metrics servers, the dump-state server (only
// if r.Config.DumpState is set), and the billing metrics collector. It then runs the "billing" and
// "main-loop" tasks in a task group and returns the result of tg.Wait().
//
// Any failure during setup is returned as a wrapped error. The watchers and the event queue are
// stopped/closed via defers when Run returns.
func (r MainRunner) Run(logger *zap.Logger, ctx context.Context) error {
vmEventQueue := pubsub.NewUnlimitedQueue[vmEvent]()
defer vmEventQueue.Close()
// pushToQueue is handed to the VM watcher; failures to enqueue are logged, not propagated.
pushToQueue := func(ev vmEvent) {
if err := vmEventQueue.Add(ev); err != nil {
logger.Warn("Failed to add vmEvent to queue", zap.Object("event", ev), zap.Error(err))
}
}
watchMetrics := watch.NewMetrics("autoscaling_agent_watchers")
perVMMetrics, vmPromReg := makePerVMMetrics()
logger.Info("Starting VM watcher")
vmWatchStore, err := startVMWatcher(ctx, logger, r.Config, r.VMClient, watchMetrics, perVMMetrics, r.EnvArgs.K8sNodeName, pushToQueue)
if err != nil {
return fmt.Errorf("Error starting VM watcher: %w", err)
}
defer vmWatchStore.Stop()
logger.Info("VM watcher started")
schedTracker, err := schedwatch.StartSchedulerWatcher(ctx, logger, r.KubeClient, watchMetrics, r.Config.Scheduler.SchedulerName)
if err != nil {
return fmt.Errorf("Starting scheduler watch server: %w", err)
}
defer schedTracker.Stop()
globalState, globalPromReg := r.newAgentState(logger, r.EnvArgs.K8sPodIP, schedTracker)
// The watcher metrics and the billing metrics are both registered in the global registry.
watchMetrics.MustRegister(globalPromReg)
logger.Info("Starting billing metrics collector")
storeForNode := watch.NewIndexedStore(vmWatchStore, billing.NewVMNodeIndex(r.EnvArgs.K8sNodeName))
metrics := billing.NewPromMetrics()
metrics.MustRegister(globalPromReg)
promLogger := logger.Named("prometheus")
if err := util.StartPrometheusMetricsServer(ctx, promLogger.Named("global"), 9100, globalPromReg); err != nil {
return fmt.Errorf("Error starting prometheus metrics server: %w", err)
}
if err := util.StartPrometheusMetricsServer(ctx, promLogger.Named("per-vm"), 9101, vmPromReg); err != nil {
return fmt.Errorf("Error starting prometheus metrics server: %w", err)
}
if r.Config.DumpState != nil {
logger.Info("Starting 'dump state' server")
if err := globalState.StartDumpStateServer(ctx, logger.Named("dump-state"), r.Config.DumpState); err != nil {
return fmt.Errorf("Error starting dump state server: %w", err)
}
}
mc, err := billing.NewMetricsCollector(ctx, logger, &r.Config.Billing)
if err != nil {
return fmt.Errorf("error creating billing metrics collector: %w", err)
}
tg := taskgroup.NewGroup(logger, taskgroup.WithParentContext(ctx))
tg.Go("billing", func(logger *zap.Logger) error {
return mc.Run(tg.Ctx(), logger, storeForNode, metrics)
})
tg.Go("main-loop", func(logger *zap.Logger) error {
logger.Info("Entering main loop")
for {
// NOTE(review): this waits on (and checks) the parent ctx, while handleEvent below is
// given tg.Ctx() - confirm that mixing the two contexts is intended.
event, err := vmEventQueue.Wait(ctx)
if err != nil {
if ctx.Err() != nil {
// treat context canceled as a "normal" exit (because it is)
return nil
}
logger.Error("vmEventQueue returned error", zap.Error(err))
return err
}
globalState.handleEvent(tg.Ctx(), logger, event)
}
})
return tg.Wait()
}
package agent
// Implementations of the interfaces used by & defined in pkg/agent/executor
//
// This file is essentially the bridge between 'runner.go' and 'executor/',
// connecting the latter to the actual implementations in the former.
import (
"context"
"fmt"
"go.uber.org/zap"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/agent/executor"
"github.com/neondatabase/autoscaling/pkg/api"
)
// Compile-time assertions that the bridge types in this file implement the corresponding
// interfaces from pkg/agent/executor.
var (
_ executor.PluginInterface = (*execPluginInterface)(nil)
_ executor.NeonVMInterface = (*execNeonVMInterface)(nil)
_ executor.MonitorInterface = (*execMonitorInterface)(nil)
)
/////////////////////////////////////////////////////////////
// Scheduler Plugin -related interfaces and implementation //
/////////////////////////////////////////////////////////////
// execPluginInterface implements executor.PluginInterface on top of a Runner.
type execPluginInterface struct {
// runner is the Runner that requests are forwarded to, and whose metrics/status are updated.
runner *Runner
}
// makePluginInterface builds the executor.PluginInterface implementation backed by r.
func makePluginInterface(r *Runner) *execPluginInterface {
	iface := new(execPluginInterface)
	iface.runner = r
	return iface
}
// scalingResponseType indicates type of scaling response from the scheduler plugin.
//
// It is computed in (*execPluginInterface).Request to decide which metrics to update.
type scalingResponseType string
// Possible scalingResponseType values, as classified by (*execPluginInterface).Request.
//
// NOTE(review): these are untyped string constants rather than being declared with type
// scalingResponseType - confirm whether that is intentional before giving them an explicit type.
const (
// The request succeeded, but was neither fully nor partially approved.
scalingResponseTypeDenied = "denied"
// The returned permit equals the requested target.
scalingResponseTypeApproved = "approved"
// There was a previous permit, and the returned permit differs from it without matching the target.
scalingResponseTypePartiallyApproved = "partiallyApproved"
// The request itself returned an error.
scalingResponseTypeFailed = "failed"
)
// Request implements executor.PluginInterface
//
// The request is forwarded to the runner's DoSchedulerRequest. When there is a previous permit,
// the requested change (and, on success, the approved change) relative to that permit is recorded.
// Afterwards the response is classified and the scaling metrics and pod status counters are
// updated to match. The response and error from DoSchedulerRequest are returned unchanged.
func (iface *execPluginInterface) Request(
	ctx context.Context,
	logger *zap.Logger,
	lastPermit *api.Resources,
	target api.Resources,
	metrics *api.Metrics,
) (*api.PluginResponse, error) {
	runner := iface.runner
	hasPermit := lastPermit != nil

	if hasPermit {
		runner.recordResourceChange(*lastPermit, target, runner.global.metrics.schedulerRequestedChange)
	}

	resp, err := runner.DoSchedulerRequest(ctx, logger, target, lastPermit, metrics)

	if hasPermit && err == nil {
		runner.recordResourceChange(*lastPermit, resp.Permit, runner.global.metrics.schedulerApprovedChange)
	}

	// Classify the outcome; the first matching case wins.
	var responseType scalingResponseType
	switch {
	case err != nil: // request is failed
		responseType = scalingResponseTypeFailed
	case resp.Permit == target: // request is fully approved by the scheduler
		responseType = scalingResponseTypeApproved
	case hasPermit && *lastPermit != resp.Permit: // request is partially approved by the scheduler
		responseType = scalingResponseTypePartiallyApproved
	default: // scheduler denied the request
		responseType = scalingResponseTypeDenied
	}

	// update VM metrics
	if responseType == scalingResponseTypePartiallyApproved {
		runner.global.metrics.scalingPartialApprovalsTotal.WithLabelValues(directionValueInc).Inc()
	} else if responseType == scalingResponseTypeDenied {
		runner.global.metrics.scalingFullDeniesTotal.WithLabelValues(directionValueInc).Inc()
	}

	// update podStatus metrics on failures
	unsuccessful := responseType == scalingResponseTypeDenied || responseType == scalingResponseTypeFailed
	runner.status.update(runner.global, func(ps podStatus) podStatus {
		if unsuccessful {
			ps.failedSchedulerRequestCounter.Inc()
		}
		return ps
	})

	return resp, err
}
/////////////////////////////////////////////////
// NeonVM-related interface and implementation //
/////////////////////////////////////////////////
// execNeonVMInterface implements executor.NeonVMInterface on top of a Runner.
type execNeonVMInterface struct {
// runner is the Runner that requests are forwarded to, and whose metrics/status are updated.
runner *Runner
}
// makeNeonVMInterface builds the executor.NeonVMInterface implementation backed by r.
func makeNeonVMInterface(r *Runner) *execNeonVMInterface {
	iface := new(execNeonVMInterface)
	iface.runner = r
	return iface
}
// Request implements executor.NeonVMInterface
//
// The requested change from current to target is recorded, and then the runner's doNeonVMRequest
// is called with target and targetRevision. On failure the pod status' failed-request counter is
// incremented and the error is returned wrapped; on success nil is returned.
func (iface *execNeonVMInterface) Request(
	ctx context.Context,
	logger *zap.Logger,
	current, target api.Resources,
	targetRevision vmv1.RevisionWithTime,
) error {
	runner := iface.runner
	runner.recordResourceChange(current, target, runner.global.metrics.neonvmRequestedChange)

	err := runner.doNeonVMRequest(ctx, target, targetRevision)
	if err == nil {
		return nil
	}

	countFailure := func(ps podStatus) podStatus {
		ps.failedNeonVMRequestCounter.Inc()
		return ps
	}
	runner.status.update(runner.global, countFailure)

	return fmt.Errorf("Error making VM patch request: %w", err)
}
////////////////////////////////////////////////////
// Monitor-related interface and implementation //
////////////////////////////////////////////////////
// execMonitorInterface implements executor.MonitorInterface on top of a Runner.
type execMonitorInterface struct {
// runner is the Runner whose current monitor is exposed through GetHandle.
runner *Runner
// core is stored by makeMonitorInterface; it is not read by the methods defined in this file.
core *executor.ExecutorCore
// generation is the stored generation number returned by CurrentGeneration.
generation *executor.StoredGenerationNumber
}
// makeMonitorInterface builds the executor.MonitorInterface implementation backed by r, keeping
// references to the given executor core and stored generation number.
func makeMonitorInterface(
	r *Runner,
	core *executor.ExecutorCore,
	generation *executor.StoredGenerationNumber,
) *execMonitorInterface {
	iface := &execMonitorInterface{
		runner:     r,
		core:       core,
		generation: generation,
	}
	return iface
}
// CurrentGeneration implements executor.MonitorInterface, returning the value currently held by the
// stored generation number.
func (iface *execMonitorInterface) CurrentGeneration() executor.GenerationNumber {
	current := iface.generation.Get()
	return current
}
// GetHandle implements executor.MonitorInterface, and MUST only be called while holding the
// executor's lock.
//
// The locking requirement is why we're able to get away with an "unsynchronized" read of the value
// in the runner. For more, see the documentation on Runner.monitor.
//
// It returns nil if the runner currently has no monitor.
func (iface *execMonitorInterface) GetHandle() executor.MonitorHandle {
	// NB: we can't additionally check monitor.dispatcher.Exited() here, because otherwise we might
	// return nil when the executor is told to make a request, because Exited() is not synchronized
	// with changes to the executor state.
	if current := iface.runner.monitor; current != nil {
		return &execMonitorHandle{
			runner:  iface.runner,
			monitor: current,
		}
	}

	return nil
}
// execMonitorHandle is the executor.MonitorHandle returned by (*execMonitorInterface).GetHandle.
// It pins the monitor that was current at the time GetHandle was called.
type execMonitorHandle struct {
// runner is used for recording metrics and updating the pod status.
runner *Runner
// monitor provides the dispatcher that requests are sent through, and the handle's generation.
monitor *monitorInfo
}
// Generation implements executor.MonitorHandle, returning the generation number of the monitor
// this handle refers to.
func (h *execMonitorHandle) Generation() executor.GenerationNumber {
	gen := h.monitor.generation
	return gen
}
// Downscale implements executor.MonitorHandle, asking the vm-monitor (via this handle's
// dispatcher) to downscale to target.
//
// It panics if any field of target is greater than the same field of current. The requested change
// is always recorded; the approved change is recorded only if the request succeeded and the result
// is Ok. If the request returned an error, failure counters are incremented instead.
func (h *execMonitorHandle) Downscale(
	ctx context.Context,
	logger *zap.Logger,
	current api.Resources,
	target api.Resources,
) (*api.DownscaleResult, error) {
	// Check validity of the message we're sending
	if target.HasFieldGreaterThan(current) {
		innerMsg := fmt.Errorf("%+v has field greater than %+v", target, current)
		panic(fmt.Errorf("(*execMonitorHandle).Downscale() called with target greater than current: %w", innerMsg))
	}

	runner := h.runner
	runner.recordResourceChange(current, target, runner.global.metrics.monitorRequestedChange)

	result, err := doMonitorDownscale(ctx, logger, h.monitor.dispatcher, target)
	if err != nil {
		runner.status.update(runner.global, func(ps podStatus) podStatus {
			ps.failedMonitorRequestCounter.Inc()
			runner.global.metrics.scalingFullDeniesTotal.WithLabelValues(directionValueDec).Inc()
			return ps
		})
		return result, err
	}

	if result.Ok {
		runner.recordResourceChange(current, target, runner.global.metrics.monitorApprovedChange)
	}

	return result, err
}
// Upscale implements executor.MonitorHandle, notifying the vm-monitor (via this handle's
// dispatcher) of an upscale to target.
//
// It panics if any field of target is less than the same field of current. The requested change is
// always recorded; the approved change is recorded on success, and the pod status'
// failed-request counter is incremented on error.
func (h *execMonitorHandle) Upscale(ctx context.Context, logger *zap.Logger, current, target api.Resources) error {
	// Check validity of the message we're sending
	if target.HasFieldLessThan(current) {
		innerMsg := fmt.Errorf("%+v has field less than %+v", target, current)
		panic(fmt.Errorf("(*execMonitorHandle).Upscale() called with target less than current: %w", innerMsg))
	}

	runner := h.runner
	runner.recordResourceChange(current, target, runner.global.metrics.monitorRequestedChange)

	err := doMonitorUpscale(ctx, logger, h.monitor.dispatcher, target)
	if err != nil {
		runner.status.update(runner.global, func(ps podStatus) podStatus {
			ps.failedMonitorRequestCounter.Inc()
			return ps
		})
		return err
	}

	runner.recordResourceChange(current, target, runner.global.metrics.monitorApprovedChange)
	return err
}
package executor
// Consumers of pkg/agent/core, implementing the "executors" for each type of action. These are
// wrapped up into a single ExecutorCore type, which exposes some methods for the various executors.
//
// The executors use various abstract interfaces for the scheduler plugin / NeonVM / vm-monitor, and
// are defined in exec_*.go. The implementations of those interfaces are defined in execbridge.go.
//
// Each of the methods to modify ExecutorCore take 'withLock' as a callback that runs while the lock
// is held. In general, this is used for logging, so that the log output strictly matches the
// ordering of the changes to the underlying core.State, which should help with debugging.
//
// For more, see pkg/agent/ARCHITECTURE.md.
import (
"sync"
"time"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/agent/core"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
// Config is the configuration used by NewExecutorCore to build an ExecutorCore.
type Config struct {
// OnNextActions is called each time the ExecutorCore calls (*core.State).NextActions() on the
// inner state object.
//
// In practice, this value is set to a callback that increments a metric.
//
// NOTE(review): it is called unconditionally by (*ExecutorCore).getActions(), so it looks like
// it must not be nil - confirm against callers.
OnNextActions func()
// Core is the configuration passed through to core.NewState().
Core core.Config
}
// ExecutorCore wraps a core.State with a mutex, a cache of the most recently calculated actions,
// and a broadcaster that is triggered on every update to the state.
type ExecutorCore struct {
// mu is held by getActions, update, updateIfActionsUnchanged and StateDump while they access
// the fields below.
mu sync.Mutex
// stateLogger is used by getActions to log the state and the newly calculated actions.
stateLogger *zap.Logger
// core is the inner state that updates are applied to and actions are calculated from.
core *core.State
// actions caches the most recently calculated actions. It is reset to nil on every update, and
// recalculated by getActions when nil.
actions *timedActions
// lastActionsID is the ID given to the most recently calculated actions.
lastActionsID timedActionsID
// onNextActions is called by getActions each time the actions are recalculated.
onNextActions func()
// updates is broadcast to on every update; the executors wait on receivers created from it.
updates *util.Broadcaster
}
// ClientSet groups the client interfaces used by the executors on ExecutorCoreWithClients.
type ClientSet struct {
// Plugin is used by DoPluginRequests.
Plugin PluginInterface
// NeonVM is used by DoNeonVMRequests.
NeonVM NeonVMInterface
// Monitor is used by DoMonitorDownscales and DoMonitorUpscales.
Monitor MonitorInterface
}
// NewExecutorCore creates an ExecutorCore for the given VM, with a fresh core.State built from
// config.Core and no cached actions.
func NewExecutorCore(stateLogger *zap.Logger, vm api.VmInfo, config Config) *ExecutorCore {
	state := core.NewState(vm, config.Core)
	broadcaster := util.NewBroadcaster()

	executor := &ExecutorCore{
		mu:            sync.Mutex{},
		stateLogger:   stateLogger,
		core:          state,
		actions:       nil, // (*ExecutorCore).getActions() checks if this is nil
		lastActionsID: -1,
		onNextActions: config.OnNextActions,
		updates:       broadcaster,
	}
	return executor
}
// ExecutorCoreWithClients wraps ExecutorCore with the various clients (scheduler plugin, NeonVM,
// vm-monitor) that the executors need in order to make requests.
//
// It is created with (*ExecutorCore).WithClients().
type ExecutorCoreWithClients struct {
*ExecutorCore
// clients holds the interfaces used to make the actual requests.
clients ClientSet
}
// WithClients pairs c with the given clients, returning a value that shares (not copies) c.
func (c *ExecutorCore) WithClients(clients ClientSet) ExecutorCoreWithClients {
	var wrapped ExecutorCoreWithClients
	wrapped.ExecutorCore = c
	wrapped.clients = clients
	return wrapped
}
// timedActions stores the core.ActionSet in ExecutorCore alongside a unique ID
type timedActions struct {
// id stores a unique ID associated with the cached actions, so that we can use optimistic
// locking to make sure we're never taking an action that is not the *current* recommendation,
// because otherwise guaranteeing correctness of core.State is really difficult.
//
// id is assigned by (*ExecutorCore).getActions() and is otherwise exclusively used by
// (*ExecutorCore).updateIfActionsUnchanged().
id timedActionsID
// actions is the result of (*core.State).NextActions() that this ID was assigned to.
actions core.ActionSet
}
// timedActionsID identifies one calculated timedActions. (*ExecutorCore).getActions() assigns each
// newly calculated set of actions the previous ID plus one.
type timedActionsID int64
// getActions returns the currently cached actions, recalculating them first if they have been
// invalidated (i.e. reset to nil by an update).
//
// Recalculating calls c.onNextActions(), assigns the new actions the next ID, and logs both the
// state used and the resulting actions at debug level.
func (c *ExecutorCore) getActions() timedActions {
	c.mu.Lock()
	defer c.mu.Unlock()

	if c.actions != nil {
		return *c.actions
	}

	nextID := c.lastActionsID + 1
	c.onNextActions()

	// NOTE: Even though we cache the actions generated using time.Now(), it's *generally* ok.
	now := time.Now()
	c.stateLogger.Debug("Recalculating ActionSet", zap.Time("now", now), zap.Any("state", c.core.Dump()))

	fresh := timedActions{id: nextID, actions: c.core.NextActions(now)}
	c.actions = &fresh
	c.lastActionsID = nextID

	c.stateLogger.Debug("New ActionSet", zap.Time("now", now), zap.Any("actions", c.actions.actions))
	return *c.actions
}
// update runs 'with' on the inner core.State while holding the lock.
//
// Before 'with' is called, the update is broadcast to all receivers and the cached actions are
// invalidated, so that the next call to getActions() recalculates them.
func (c *ExecutorCore) update(with func(*core.State)) {
c.mu.Lock()
defer c.mu.Unlock()
c.updates.Broadcast()
c.actions = nil
with(c.core)
}
// updateIfActionsUnchanged behaves like update, except that it first checks - under the lock -
// that 'actions' is still the most recently calculated set of actions (by comparing IDs).
//
// If the IDs differ, 'with' is not called, nothing is broadcast or invalidated, and false is
// returned. Otherwise this is equivalent to c.update(with), and true is returned.
func (c *ExecutorCore) updateIfActionsUnchanged(actions timedActions, with func(*core.State)) (updated bool) {
	c.mu.Lock()
	defer c.mu.Unlock()

	if actions.id == c.lastActionsID {
		c.updates.Broadcast()
		c.actions = nil
		with(c.core)
		updated = true
	}

	return updated
}
// StateDump is the type returned by (*ExecutorCore).StateDump(). It is currently an alias for
// core.StateDump, but that may change in the future.
type StateDump = core.StateDump
// StateDump returns a dump of the current state inside the executor, taken while holding the
// executor's lock.
func (c *ExecutorCore) StateDump() StateDump {
	c.mu.Lock()
	defer c.mu.Unlock()

	snapshot := c.core.Dump()
	return snapshot
}
// Updater returns a handle on c through which external changes to the ExecutorCore can be made,
// beyond what's provided by the various client (ish) interfaces.
func (c *ExecutorCore) Updater() ExecutorCoreUpdater {
	updater := ExecutorCoreUpdater{core: c}
	return updater
}
// ExecutorCoreUpdater provides a common interface for external changes to the ExecutorCore.
//
// Each of its methods applies one change to the inner core.State via (*ExecutorCore).update() and
// then runs the provided 'withLock' callback before the lock is released.
type ExecutorCoreUpdater struct {
// core is the ExecutorCore that changes are applied to.
core *ExecutorCore
}
// UpdateSystemMetrics passes metrics to (*core.State).UpdateSystemMetrics() on the inner
// core.State, and then runs withLock, all while holding the executor's lock.
func (c ExecutorCoreUpdater) UpdateSystemMetrics(metrics core.SystemMetrics, withLock func()) {
	apply := func(state *core.State) {
		state.UpdateSystemMetrics(metrics)
		withLock()
	}
	c.core.update(apply)
}
// UpdateLFCMetrics passes metrics to (*core.State).UpdateLFCMetrics() on the inner core.State, and
// then runs withLock, all while holding the executor's lock.
func (c ExecutorCoreUpdater) UpdateLFCMetrics(metrics core.LFCMetrics, withLock func()) {
	apply := func(state *core.State) {
		state.UpdateLFCMetrics(metrics)
		withLock()
	}
	c.core.update(apply)
}
// UpdatedVM passes vm to (*core.State).UpdatedVM() on the inner core.State, and then runs
// withLock, all while holding the executor's lock.
func (c ExecutorCoreUpdater) UpdatedVM(vm api.VmInfo, withLock func()) {
	apply := func(state *core.State) {
		state.UpdatedVM(vm)
		withLock()
	}
	c.core.update(apply)
}
// ResetMonitor invokes (*core.State).Monitor().Reset() on the inner core.State, and then runs
// withLock, all while holding the executor's lock.
func (c ExecutorCoreUpdater) ResetMonitor(withLock func()) {
	apply := func(state *core.State) {
		state.Monitor().Reset()
		withLock()
	}
	c.core.update(apply)
}
// UpscaleRequested invokes (*core.State).Monitor().UpscaleRequested(...) on the inner core.State
// with the current time and the given resources, and then runs withLock, all while holding the
// executor's lock.
func (c ExecutorCoreUpdater) UpscaleRequested(resources api.MoreResources, withLock func()) {
	apply := func(state *core.State) {
		// The timestamp is taken here, inside the callback, i.e. once the lock is held.
		now := time.Now()
		state.Monitor().UpscaleRequested(now, resources)
		withLock()
	}
	c.core.update(apply)
}
// MonitorActive passes active to (*core.State).Monitor().Active(...) on the inner core.State, and
// then runs withLock, all while holding the executor's lock.
func (c ExecutorCoreUpdater) MonitorActive(active bool, withLock func()) {
	apply := func(state *core.State) {
		state.Monitor().Active(active)
		withLock()
	}
	c.core.update(apply)
}
package executor
import (
"context"
"errors"
"time"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/agent/core"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
// MonitorInterface is the abstract vm-monitor client used by DoMonitorDownscales and
// DoMonitorUpscales.
type MonitorInterface interface {
// CurrentGeneration returns the current generation number. The executors compare it with a
// handle's Generation() to detect that the monitor changed while a request was in flight.
CurrentGeneration() GenerationNumber
// GetHandle fetches a stable handle for the current monitor, or nil if there is not one.
// This method MUST NOT be called unless holding the executor's lock.
GetHandle() MonitorHandle
}
// MonitorHandle is a handle on one particular monitor, as returned by MonitorInterface.GetHandle.
type MonitorHandle interface {
// Generation returns the generation number associated with this handle.
Generation() GenerationNumber
// Downscale makes a downscale request from current to target, returning the monitor's result.
Downscale(_ context.Context, _ *zap.Logger, current, target api.Resources) (*api.DownscaleResult, error)
// Upscale makes an upscale request from current to target.
Upscale(_ context.Context, _ *zap.Logger, current, target api.Resources) error
}
// DoMonitorDownscales runs the executor loop for vm-monitor downscale requests, returning only
// when ctx is done.
//
// On each state update, it fetches the current actions and - if a MonitorDownscale action is
// present and still current - marks the request as started, makes the request via the monitor
// handle, and then records the outcome (failed / denied / allowed) in the core.State. The outcome
// is only recorded if the monitor's generation is unchanged since the handle was fetched;
// otherwise a warning is logged and the state is left as-is.
//
// It panics if the state asks for a downscale while Monitor.GetHandle() returns nil.
func (c *ExecutorCoreWithClients) DoMonitorDownscales(ctx context.Context, logger *zap.Logger) {
var (
updates util.BroadcastReceiver = c.updates.NewReceiver()
ifaceLogger *zap.Logger = logger.Named("client")
)
// must be called while holding c's lock
generationUnchanged := func(since MonitorHandle) bool {
return since.Generation() == c.clients.Monitor.CurrentGeneration()
}
for {
// Wait until the state's changed, or we're done.
select {
case <-ctx.Done():
return
case <-updates.Wait():
updates.Awake()
}
last := c.getActions()
if last.actions.MonitorDownscale == nil {
continue // nothing to do; wait until the state changes.
}
var startTime time.Time
var monitorIface MonitorHandle
action := *last.actions.MonitorDownscale
// Only start the request if 'last' is still the current set of actions. The callback runs
// with the lock held, which is what makes the call to GetHandle() permissible.
if updated := c.updateIfActionsUnchanged(last, func(state *core.State) {
logger.Info("Starting vm-monitor downscale request", zap.Object("action", action))
startTime = time.Now()
monitorIface = c.clients.Monitor.GetHandle()
state.Monitor().StartingDownscaleRequest(startTime, action.Target)
if monitorIface == nil {
panic(errors.New(
"core.State asked for vm-monitor downscale request, but Monitor.GetHandle() is nil, so it should be disabled",
))
}
}); !updated {
continue // state has changed, retry.
}
// The request itself is made without holding the lock.
result, err := monitorIface.Downscale(ctx, ifaceLogger, action.Current, action.Target)
endTime := time.Now()
c.update(func(state *core.State) {
unchanged := generationUnchanged(monitorIface)
logFields := []zap.Field{
zap.Object("action", action),
zap.Duration("duration", endTime.Sub(startTime)),
zap.Bool("unchanged", unchanged),
}
warnSkipBecauseChanged := func() {
logger.Warn("Skipping state update after vm-monitor downscale request because MonitorHandle changed")
}
if err != nil {
logger.Error("vm-monitor downscale request failed", append(logFields, zap.Error(err))...)
if unchanged {
state.Monitor().DownscaleRequestFailed(endTime)
} else {
warnSkipBecauseChanged()
}
return
}
logFields = append(logFields, zap.Any("response", result))
if !result.Ok {
logger.Warn("vm-monitor denied downscale", logFields...)
if unchanged {
state.Monitor().DownscaleRequestDenied(endTime, action.TargetRevision)
} else {
warnSkipBecauseChanged()
}
} else {
logger.Info("vm-monitor approved downscale", logFields...)
if unchanged {
state.Monitor().DownscaleRequestAllowed(endTime, action.TargetRevision)
} else {
warnSkipBecauseChanged()
}
}
})
}
}
// DoMonitorUpscales runs the executor loop for vm-monitor upscale requests, returning only when
// ctx is done.
//
// On each state update, it fetches the current actions and - if a MonitorUpscale action is present
// and still current - marks the request as started, makes the request via the monitor handle, and
// then records the outcome (failed / successful) in the core.State. The outcome is only recorded
// if the monitor's generation is unchanged since the handle was fetched; otherwise a warning is
// logged and the state is left as-is.
//
// It panics if the state asks for an upscale while Monitor.GetHandle() returns nil.
func (c *ExecutorCoreWithClients) DoMonitorUpscales(ctx context.Context, logger *zap.Logger) {
var (
updates util.BroadcastReceiver = c.updates.NewReceiver()
ifaceLogger *zap.Logger = logger.Named("client")
)
// must be called while holding c's lock
generationUnchanged := func(since MonitorHandle) bool {
return since.Generation() == c.clients.Monitor.CurrentGeneration()
}
for {
// Wait until the state's changed, or we're done.
select {
case <-ctx.Done():
return
case <-updates.Wait():
updates.Awake()
}
last := c.getActions()
if last.actions.MonitorUpscale == nil {
continue // nothing to do; wait until the state changes.
}
var startTime time.Time
var monitorIface MonitorHandle
action := *last.actions.MonitorUpscale
// Only start the request if 'last' is still the current set of actions. The callback runs
// with the lock held, which is what makes the call to GetHandle() permissible.
if updated := c.updateIfActionsUnchanged(last, func(state *core.State) {
logger.Info("Starting vm-monitor upscale request", zap.Object("action", action))
startTime = time.Now()
monitorIface = c.clients.Monitor.GetHandle()
state.Monitor().StartingUpscaleRequest(startTime, action.Target)
if monitorIface == nil {
panic(errors.New(
"core.State asked for vm-monitor upscale request, but Monitor.GetHandle() is nil, so it should be disabled",
))
}
}); !updated {
continue // state has changed, retry.
}
// The request itself is made without holding the lock.
err := monitorIface.Upscale(ctx, ifaceLogger, action.Current, action.Target)
endTime := time.Now()
c.update(func(state *core.State) {
unchanged := generationUnchanged(monitorIface)
logFields := []zap.Field{
zap.Object("action", action),
zap.Duration("duration", endTime.Sub(startTime)),
zap.Bool("unchanged", unchanged),
}
warnSkipBecauseChanged := func() {
logger.Warn("Skipping state update after vm-monitor upscale request because MonitorHandle changed")
}
if err != nil {
logger.Error("vm-monitor upscale request failed", append(logFields, zap.Error(err))...)
if unchanged {
state.Monitor().UpscaleRequestFailed(endTime)
} else {
warnSkipBecauseChanged()
}
return
}
logger.Info("vm-monitor upscale request successful", logFields...)
if unchanged {
state.Monitor().UpscaleRequestSuccessful(endTime)
} else {
warnSkipBecauseChanged()
}
})
}
}
package executor
import (
"context"
"time"
"go.uber.org/zap"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/agent/core"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
// NeonVMInterface is the abstract NeonVM client used by DoNeonVMRequests.
type NeonVMInterface interface {
// Request asks for the VM's resources to be changed from current to target, tagged with
// targetRevision. A non-nil error means the request failed.
Request(
_ context.Context,
_ *zap.Logger,
current, target api.Resources,
targetRevision vmv1.RevisionWithTime,
) error
}
// DoNeonVMRequests runs the executor loop for NeonVM requests, returning only when ctx is done.
//
// On each state update, it fetches the current actions and - if a NeonVMRequest action is present
// and still current - marks the request as started, makes the request via the NeonVM client, and
// then records the outcome (failed / successful) in the core.State.
//
// The target revision is stamped with the time immediately before the request is sent, while the
// end time used for the logged duration and for the state update is taken after the request has
// returned - matching how the plugin and vm-monitor executors measure their requests.
func (c *ExecutorCoreWithClients) DoNeonVMRequests(ctx context.Context, logger *zap.Logger) {
	var (
		updates     util.BroadcastReceiver = c.updates.NewReceiver()
		ifaceLogger *zap.Logger            = logger.Named("client")
	)

	for {
		// Wait until the state's changed, or we're done.
		select {
		case <-ctx.Done():
			return
		case <-updates.Wait():
			updates.Awake()
		}

		last := c.getActions()
		if last.actions.NeonVMRequest == nil {
			continue // nothing to do; wait until the state changes.
		}

		var startTime time.Time
		action := *last.actions.NeonVMRequest

		if updated := c.updateIfActionsUnchanged(last, func(state *core.State) {
			logger.Info("Starting NeonVM request", zap.Object("action", action))
			startTime = time.Now()
			state.NeonVM().StartingRequest(startTime, action.Target)
		}); !updated {
			continue // state has changed, retry.
		}

		// The revision carries the time at which the request is issued.
		requestTime := time.Now()
		targetRevision := action.TargetRevision.WithTime(requestTime)

		err := c.clients.NeonVM.Request(ctx, ifaceLogger, action.Current, action.Target, targetRevision)
		// Only take the end time once the request has actually finished; taking it beforehand
		// would exclude the request itself from the duration and from the recorded finish time.
		endTime := time.Now()
		logFields := []zap.Field{zap.Object("action", action), zap.Duration("duration", endTime.Sub(startTime))}

		c.update(func(state *core.State) {
			if err != nil {
				logger.Error("NeonVM request failed", append(logFields, zap.Error(err))...)
				state.NeonVM().RequestFailed(endTime)
			} else /* err == nil */ {
				logger.Info("NeonVM request successful", logFields...)
				state.NeonVM().RequestSuccessful(endTime)
			}
		})
	}
}
package executor
import (
"context"
"time"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/agent/core"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
// PluginInterface is the abstract scheduler plugin client used by DoPluginRequests.
type PluginInterface interface {
// Request makes a request to the scheduler plugin for target, given the last permit (if any) and
// the latest metrics (if any), returning the plugin's response.
Request(_ context.Context, _ *zap.Logger, lastPermit *api.Resources, target api.Resources, _ *api.Metrics) (*api.PluginResponse, error)
}
// DoPluginRequests runs the executor loop for scheduler plugin requests, returning only when ctx
// is done.
//
// On each state update, it fetches the current actions and - if a PluginRequest action is present
// and still current - marks the request as started, makes the request via the plugin client, and
// then records the outcome in the core.State. A successful response that fails validation by the
// state is logged as an error.
func (c *ExecutorCoreWithClients) DoPluginRequests(ctx context.Context, logger *zap.Logger) {
	var updates util.BroadcastReceiver = c.updates.NewReceiver()
	var ifaceLogger *zap.Logger = logger.Named("client")

	for {
		// Block until there's been a state change (or we've been told to stop).
		select {
		case <-ctx.Done():
			return
		case <-updates.Wait():
			updates.Awake()
		}

		cached := c.getActions()
		pending := cached.actions.PluginRequest
		if pending == nil {
			// No request wanted right now; go back to waiting for the next change.
			continue
		}
		action := *pending

		var startTime time.Time
		markStarted := func(state *core.State) {
			logger.Info("Starting plugin request", zap.Object("action", action))
			startTime = time.Now()
			state.Plugin().StartingRequest(startTime, action.Target)
		}
		if !c.updateIfActionsUnchanged(cached, markStarted) {
			// The actions were recalculated in the meantime; start over.
			continue
		}

		// The request itself is made without holding the lock.
		resp, err := c.clients.Plugin.Request(ctx, ifaceLogger, action.LastPermit, action.Target, action.Metrics)
		endTime := time.Now()

		c.update(func(state *core.State) {
			logFields := []zap.Field{
				zap.Object("action", action),
				zap.Duration("duration", endTime.Sub(startTime)),
			}

			if err != nil {
				logger.Error("Plugin request failed", append(logFields, zap.Error(err))...)
				state.Plugin().RequestFailed(endTime)
				return
			}

			logFields = append(logFields, zap.Any("response", resp))
			logger.Info("Plugin request successful", logFields...)

			validationErr := state.Plugin().RequestSuccessful(endTime, action.TargetRevision, *resp)
			if validationErr != nil {
				logger.Error("Plugin response validation failed", append(logFields, zap.Error(validationErr))...)
			}
		})
	}
}
package executor
import (
"context"
"time"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/agent/core"
)
// DoSleeper runs the executor loop for "wait" actions, returning only when ctx is done.
//
// On each state update, it fetches the current actions and - if a Wait action is present - sleeps
// for the requested duration. If the state changes before the timer fires, the sleep is abandoned
// and the loop starts over. If the timer fires first, an empty update is applied, which invalidates
// the cached actions and notifies all receivers so that the actions are recalculated.
func (c *ExecutorCore) DoSleeper(ctx context.Context, logger *zap.Logger) {
updates := c.updates.NewReceiver()
// preallocate the timer. We clear it at the top of the loop; the 0 duration is just because we
// need *some* value, so it might as well be zero.
timer := time.NewTimer(0)
defer timer.Stop()
for {
// Ensure the timer is cleared at the top of the loop
if !timer.Stop() {
// Clear timer.C only if we haven't already read from it
select {
case <-timer.C:
default:
}
}
// Wait until the state's changed or we're done
select {
case <-ctx.Done():
return
case <-updates.Wait():
updates.Awake()
}
last := c.getActions()
if last.actions.Wait == nil {
continue // nothing to do; wait until the state changes
}
// NB: It's possible for the cached actions (and so the wait duration) to be somewhat out of
// date. It's *probably* fine, because we'll be given a notification any time the state has
// changed, so we should wake from a select soon enough to get here
timer.Reset(last.actions.Wait.Duration)
select {
case <-ctx.Done():
return
case <-updates.Wait():
// Don't consume the event here. Rely on the event to remain at the top of the loop
continue
case <-timer.C:
select {
// If there's also an update, then let that take preference:
case <-updates.Wait():
// Same thing as above - don't consume the event here.
continue
// Otherwise, trigger cache invalidation because we've waited for the requested
// amount of time:
default:
c.update(func(*core.State) {})
updates.Awake()
// NOTE(review): 'last' is re-declared at the top of each iteration, so the value
// assigned here is never read; the call still recalculates the actions that the
// update above invalidated. Confirm whether the assignment is still needed.
last = c.getActions()
}
}
}
}
package executor
// Generation numbers, for use by implementers of the various interfaces (i.e. pkg/agent/execbridge.go)
import (
"sync/atomic"
)
// StoredGenerationNumber is a generation counter that can be incremented and read atomically.
type StoredGenerationNumber struct {
	value atomic.Int64
}

// GenerationNumber is a single value read from (or produced by) a StoredGenerationNumber. Values
// can be compared with == to check whether the generation has changed.
type GenerationNumber struct {
	value int64
}

// NewStoredGenerationNumber returns a new StoredGenerationNumber, starting at zero.
func NewStoredGenerationNumber() *StoredGenerationNumber {
	return new(StoredGenerationNumber)
}

// Inc atomically adds one to the stored GenerationNumber and returns the resulting (new) value.
func (n *StoredGenerationNumber) Inc() GenerationNumber {
	next := n.value.Add(1)
	return GenerationNumber{value: next}
}

// Get atomically reads the stored GenerationNumber, without changing it.
func (n *StoredGenerationNumber) Get() GenerationNumber {
	current := n.value.Load()
	return GenerationNumber{value: current}
}
package agent
import (
"context"
"errors"
"fmt"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/zap"
"k8s.io/client-go/kubernetes"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
vmclient "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
"github.com/neondatabase/autoscaling/pkg/agent/schedwatch"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
// agentState is the global state for the autoscaler agent
//
// All fields are immutable, except pods.
type agentState struct {
// lock guards access to pods
lock util.ChanMutex
// pods maps each tracked pod's namespaced name to its state.
pods map[util.NamespacedName]*podState
// A base logger to pass around, so we can recreate the logger for a Runner on restart, without
// running the risk of leaking keys.
baseLogger *zap.Logger
// podIP is the 'podIP' argument that was given to newAgentState.
podIP string
// config, kubeClient and vmClient are copied from the MainRunner that created this state.
config *Config
kubeClient *kubernetes.Clientset
vmClient *vmclient.Clientset
// schedTracker is the scheduler tracker that was given to newAgentState.
schedTracker *schedwatch.SchedulerTracker
// metrics is the set of global metrics created alongside this state by makeGlobalMetrics().
metrics GlobalMetrics
}
// newAgentState assembles the agent's global state from the MainRunner's config and clients plus
// the supplied logger, pod IP and scheduler tracker.
//
// The global metrics are created here; the Prometheus registry they were registered with is
// returned alongside the state.
func (r MainRunner) newAgentState(
	baseLogger *zap.Logger,
	podIP string,
	schedTracker *schedwatch.SchedulerTracker,
) (*agentState, *prometheus.Registry) {
	globalMetrics, registry := makeGlobalMetrics()

	return &agentState{
		lock:         util.NewChanMutex(),
		pods:         make(map[util.NamespacedName]*podState),
		baseLogger:   baseLogger,
		podIP:        podIP,
		config:       r.Config,
		kubeClient:   r.KubeClient,
		vmClient:     r.VMClient,
		schedTracker: schedTracker,
		metrics:      globalMetrics,
	}, registry
}
// vmIsOurResponsibility reports whether this agent should manage vm.
//
// All of the following must hold: the VM is on nodeName, its phase is alive and not migrating, it
// has a pod IP, autoscaling is enabled for it, and its scheduler name matches the configured one.
// The checks run in that order and stop at the first one that fails.
func vmIsOurResponsibility(vm *vmapi.VirtualMachine, config *Config, nodeName string) bool {
	switch {
	case vm.Status.Node != nodeName:
		return false
	case !vm.Status.Phase.IsAlive() || vm.Status.Phase == vmapi.VmMigrating:
		return false
	case vm.Status.PodIP == "":
		return false
	case !api.HasAutoscalingEnabled(vm):
		return false
	default:
		return vm.Spec.SchedulerName == config.Scheduler.SchedulerName
	}
}
// Stop calls the stop function of every pod currently in the pod map, while holding s.lock.
func (s *agentState) Stop() {
	s.lock.Lock()
	defer s.lock.Unlock()

	for name := range s.pods {
		cancelRunner := s.pods[name].stop
		cancelRunner()
	}
}
// handleEvent applies a single VM event to the global pod map.
//
// It acquires s.lock (giving up with a warning if ctx is canceled first), validates that the event
// is consistent with the current contents of the map, and then:
//
//   - vmEventDeleted: stops the pod's runner, marks its status as deleted and removes it from the map
//   - vmEventUpdated: stores the new VM info and endpoint ID, and signals the runner
//   - vmEventAdded:   creates and starts a new runner via handleVMEventAdded
//
// Any other event kind is a programming error and panics.
func (s *agentState) handleEvent(ctx context.Context, logger *zap.Logger, event vmEvent) {
	podName := util.NamespacedName{Namespace: event.vmInfo.Namespace, Name: event.podName}

	logger = logger.With(
		zap.Object("event", event),
		zap.Object("virtualmachine", event.vmInfo.NamespacedName()),
		zap.Object("pod", podName),
	)
	logger.Debug("Handling event for VM")

	if err := s.lock.TryLock(ctx); err != nil {
		logger.Warn("Context canceled while starting to handle event", zap.Error(err))
		return
	}
	defer s.lock.Unlock()

	state, hasPod := s.pods[podName]

	// The "pod" key is already attached to the logger above. It is deliberately not repeated on
	// the individual messages, because that would emit the same key twice in one log line.
	if event.kind != vmEventAdded && !hasPod {
		logger.Error("Received event for pod that isn't present")
		return
	} else if event.kind == vmEventAdded && hasPod {
		logger.Error("Received add event for pod that's already present")
		return
	}

	switch event.kind {
	case vmEventDeleted:
		state.stop()
		// mark the status as deleted, so that it gets removed from metrics.
		state.status.update(s, func(stat podStatus) podStatus {
			stat.deleted = true
			delete(s.pods, podName) // Do the removal while synchronized, because we can :)
			return stat
		})
	case vmEventUpdated:
		state.status.update(s, func(stat podStatus) podStatus {
			// Only record a new assignment time when the endpoint actually changes. Resetting it
			// on every update would keep restarting the startup grace period that is measured
			// from endpointAssignedAt.
			if stat.endpointAssignedAt == nil || stat.endpointID != event.endpointID {
				now := time.Now()
				stat.endpointAssignedAt = &now
			}
			stat.vmInfo = event.vmInfo
			stat.endpointID = event.endpointID
			state.vmInfoUpdated.Send()

			return stat
		})
	case vmEventAdded:
		s.handleVMEventAdded(ctx, event, podName)
	default:
		panic(errors.New("bad event: unexpected event kind"))
	}
}
// handleVMEventAdded registers a new pod in s.pods and starts a Runner for it.
//
// The Runner gets its own cancelable context derived from ctx; the cancel function is kept as the
// pod's stop function. It is called from handleEvent, which holds s.lock at that point.
func (s *agentState) handleVMEventAdded(
	ctx context.Context,
	event vmEvent,
	podName util.NamespacedName,
) {
	runnerCtx, stopRunner := context.WithCancel(ctx)

	startedAt := time.Now()
	initial := podStatus{
		startTime:                 startedAt,
		deleted:                   false,
		endState:                  nil,
		previousEndStates:         nil,
		lastSuccessfulMonitorComm: nil,

		failedMonitorRequestCounter:   util.NewRecentCounter(time.Duration(s.config.Monitor.MaxFailedRequestRate.IntervalSeconds) * time.Second),
		failedNeonVMRequestCounter:    util.NewRecentCounter(time.Duration(s.config.NeonVM.MaxFailedRequestRate.IntervalSeconds) * time.Second),
		failedSchedulerRequestCounter: util.NewRecentCounter(time.Duration(s.config.Scheduler.MaxFailedRequestRate.IntervalSeconds) * time.Second),

		vmInfo:             event.vmInfo,
		endpointID:         event.endpointID,
		endpointAssignedAt: &startedAt,

		// The state starts out empty so that the first update below does not decrement a
		// metric that was never incremented.
		state:          "",
		stateUpdatedAt: startedAt,
	}
	status := &lockedPodStatus{
		mu:        sync.Mutex{},
		podStatus: initial,
	}

	// A no-op update, purely to get the initial state computed and reflected in the metrics.
	status.update(s, func(unchanged podStatus) podStatus { return unchanged })

	runner := s.newRunner(event.vmInfo, podName, event.podIP)
	runner.status = status

	vmUpdateTx, vmUpdateRx := util.NewCondChannelPair()

	pod := &podState{
		podName:       podName,
		stop:          stopRunner,
		runner:        runner,
		status:        status,
		vmInfoUpdated: vmUpdateTx,
	}
	s.pods[podName] = pod
	s.metrics.runnerStarts.Inc()

	// A freshly added runner has not been restarted yet, hence a restart count of zero.
	runnerLogger := s.loggerForRunner(0, event.vmInfo.NamespacedName(), podName)
	runner.Spawn(runnerCtx, runnerLogger, vmUpdateRx)
}
// Bounds, in seconds, on the delay before a failed Runner is restarted.
//
// TriggerRestartIfNecessary picks a random delay within this range, unless the Runner had been
// running for longer than RunnerRestartMaxWaitSeconds, in which case it restarts immediately.
//
// FIXME: make these timings configurable.
const (
RunnerRestartMinWaitSeconds = 5
RunnerRestartMaxWaitSeconds = 10
)
// TriggerRestartIfNecessary restarts the Runner for podName, after a delay if necessary.
//
// NB: runnerCtx is the context *passed to the new Runner*. It is only used here to end our restart
// process early if it's already been canceled. logger is not passed, and so can be handled a bit
// more freely.
//
// The function returns without blocking on the delay: the wait and the restart itself happen in a
// separate goroutine. If the pod is no longer in the pod map, or the Runner exited because its
// context was canceled, nothing is restarted. It panics (via logger.Panic) if the pod's endState
// is nil, because it must only be called after the Runner has finished.
func (s *agentState) TriggerRestartIfNecessary(runnerCtx context.Context, logger *zap.Logger, podName util.NamespacedName, podIP string) {
// Three steps:
// 1. Check if the Runner needs to restart. If no, we're done.
// 2. Wait for a random amount of time (between RunnerRestartMinWaitSeconds and RunnerRestartMaxWaitSeconds)
// 3. Restart the Runner (if it still should be restarted)
status, ok := func() (*lockedPodStatus, bool) {
s.lock.Lock()
defer s.lock.Unlock()
// note: pod.status has a separate lock, so we're ok to release s.lock
if pod, ok := s.pods[podName]; ok {
return pod.status, true
} else {
return nil, false
}
}()
if !ok {
return
}
// status.mu is held until this function returns. The goroutine started below goes through
// pod.status.update, which takes the status mutex itself, so for the same pod it cannot make
// progress on the status before we have returned.
status.mu.Lock()
defer status.mu.Unlock()
if status.endState == nil {
logger.Panic("TriggerRestartIfNecessary called with nil endState (should only be called after the pod is finished, when endState != nil)")
}
endTime := status.endState.Time
if endTime.IsZero() {
// If we don't check this, we run the risk of spinning on failures.
logger.Error("TriggerRestartIfNecessary called with zero'd Time for pod")
// Continue on, but with the time overridden, so we guarantee our minimum wait.
endTime = time.Now()
}
// keep this for later.
exitKind := status.endState.ExitKind
switch exitKind {
case podStatusExitCanceled:
logger.Info("Runner's context was canceled; no need to restart")
return // successful exit, no need to restart.
case podStatusExitPanicked:
// Should restart; continue.
logger.Info("Runner had abnormal exit kind; it will restart", zap.String("exitKind", string(exitKind)))
default:
logger.Error("TriggerRestartIfNecessary called with unexpected ExitKind", zap.String("exitKind", string(exitKind)))
// continue on; false positives (restarting when we shouldn't) are much better than the
// alternative here (not restarting when we should)
}
// Begin steps (2) and (3) -- wait, then restart.
var waitDuration time.Duration
totalRuntime := endTime.Sub(status.startTime)
// If the runner was running for a while, restart immediately.
//
// NOTE: this will have incorrect behavior when the system clock is behaving weirdly, but that's
// mostly ok. It's ok to e.g. restart an extra time at the switchover to daylight saving time.
if totalRuntime > time.Second*time.Duration(RunnerRestartMaxWaitSeconds) {
logger.Info("Runner was running for a long time, restarting immediately", zap.Duration("totalRuntime", totalRuntime))
waitDuration = 0
} else /* Otherwise, randomly pick within RunnerRestartMinWait..RunnerRestartMaxWait */ {
r := util.NewTimeRange(time.Second, RunnerRestartMinWaitSeconds, RunnerRestartMaxWaitSeconds)
waitDuration = r.Random()
logger.Info(
"Runner was not running for long, restarting after delay",
zap.Duration("totalRuntime", totalRuntime),
zap.Duration("delay", waitDuration),
)
}
// Run the waiting (if necessary) and restarting in another goroutine, so we're not blocking the
// caller of this function.
go func() {
// logCancel reports, at the severity chosen by the caller, that the restart was abandoned.
logCancel := func(logFunc func(string, ...zap.Field), err error) {
logFunc(
"Canceling restart of Runner",
zap.Duration("delay", waitDuration),
zap.Duration("waitTime", time.Since(endTime)),
zap.Error(err),
)
}
if waitDuration != 0 {
select {
case <-time.After(waitDuration):
case <-runnerCtx.Done():
logCancel(logger.Info, runnerCtx.Err())
return
}
}
s.lock.Lock()
defer s.lock.Unlock()
// Need to update pod itself; can't release s.lock. Also, pod *theoretically* may have been
// deleted + restarted since we started, so it's incorrect to hold on to the original
// podStatus.
pod, ok := s.pods[podName]
if !ok {
logCancel(logger.Warn, errors.New("no longer present in pod map"))
return
}
pod.status.update(s, func(status podStatus) podStatus {
// Runner was already restarted
if status.endState == nil {
addedInfo := "this generally shouldn't happen, but could if there's a new pod with the same name"
logCancel(logger.Warn, fmt.Errorf("Runner was already restarted (%s)", addedInfo))
return status
}
logger.Info("Restarting runner", zap.String("exitKind", string(exitKind)), zap.Duration("delay", time.Since(endTime)))
s.metrics.runnerRestarts.Inc()
// The restart count is the number of earlier runs recorded so far, plus the run that just ended.
restartCount := len(status.previousEndStates) + 1
runner := s.newRunner(status.vmInfo, podName, podIP)
runner.status = pod.status
txVMUpdate, rxVMUpdate := util.NewCondChannelPair()
// note: pod is *podState, so we don't need to re-assign to the map.
pod.vmInfoUpdated = txVMUpdate
pod.runner = runner
// Archive the end state of the run that just finished, then clear it to mark the pod as
// running again.
status.previousEndStates = append(status.previousEndStates, *status.endState)
status.endState = nil
status.startTime = time.Now()
runnerLogger := s.loggerForRunner(restartCount, status.vmInfo.NamespacedName(), podName)
runner.Spawn(runnerCtx, runnerLogger, rxVMUpdate)
return status
})
}()
}
// loggerForRunner derives a Runner's logger from the base logger: it is named "runner" and carries
// the restart count, the VM name and the pod name as fields.
func (s *agentState) loggerForRunner(restartCount int, vmName, podName util.NamespacedName) *zap.Logger {
	fields := []zap.Field{
		zap.Int("restarts", restartCount),
		zap.Object("virtualmachine", vmName),
		zap.Object("pod", podName),
	}
	return s.baseLogger.Named("runner").With(fields...)
}
// newRunner builds a Runner for the given VM and pod. Nothing is started here.
//
// NB: the caller is responsible for assigning Runner.status afterwards.
func (s *agentState) newRunner(vmInfo api.VmInfo, podName util.NamespacedName, podIP string) *Runner {
	runner := &Runner{
		// Identity of the VM/pod this Runner is responsible for:
		global:      s,
		vmName:      vmInfo.NamespacedName(),
		podName:     podName,
		podIP:       podIP,
		memSlotSize: vmInfo.Mem.SlotSize,

		// Synchronization and background worker bookkeeping:
		lock:                  util.NewChanMutex(),
		backgroundWorkerCount: atomic.Int64{},
		backgroundPanic:       make(chan error),

		// Fields that are filled in later:
		status:            nil, // set by caller
		shutdown:          nil, // set by (*Runner).Run
		executorStateDump: nil, // set by (*Runner).Run
		monitor:           nil,
	}
	return runner
}
// podState is the entry stored in agentState.pods for each tracked pod.
type podState struct {
podName util.NamespacedName
// stop cancels the context that the pod's Runner was spawned with.
stop context.CancelFunc
// runner is the current Runner; it is replaced when the Runner is restarted.
runner *Runner
// status is shared with the Runner and has its own mutex, separate from agentState.lock.
status *lockedPodStatus
// vmInfoUpdated is signalled (via Send) when an update event for the VM is handled. It is
// replaced together with runner on restart.
vmInfoUpdated util.CondChannelSender
}
// podStateDump is the JSON-serializable snapshot of a podState, produced by (*podState).dump.
type podStateDump struct {
PodName util.NamespacedName `json:"podName"`
Status podStatusDump `json:"status"`
Runner *RunnerState `json:"runner,omitempty"`
// CollectionError is set when reading the Runner's state failed.
// NOTE(review): encoding/json encodes an error by its concrete type, not its message, so this
// may not render as readable text — confirm how this value is serialized.
CollectionError error `json:"collectionError,omitempty"`
}
// dump snapshots the pod's status and the Runner's state.
//
// A failure to read the Runner's state does not abort the dump: the error is wrapped and stored in
// CollectionError, next to whatever state the Runner returned.
func (p *podState) dump(ctx context.Context) podStateDump {
	snapshot := podStateDump{
		PodName:         p.podName,
		Status:          p.status.dump(),
		Runner:          nil,
		CollectionError: nil,
	}

	runnerState, err := p.runner.State(ctx)
	snapshot.Runner = runnerState
	if err != nil {
		snapshot.CollectionError = fmt.Errorf("error reading runner state: %w", err)
	}

	return snapshot
}
// lockedPodStatus is a podStatus guarded by a mutex.
//
// The update and dump methods acquire mu themselves; code that accesses the embedded fields
// directly must hold mu while doing so.
type lockedPodStatus struct {
mu sync.Mutex
podStatus
}
// podStatus is the high-level status of a pod's Runner. It is the input to the "stuck" check and
// to the runner-count metrics.
type podStatus struct {
// startTime is when the current run of the Runner began; it is reset on every restart.
startTime time.Time
// if true, the corresponding podState is no longer included in the global pod map
deleted bool
// if non-nil, the runner is finished
endState *podStatusEndState
// previousEndStates holds the end states of earlier runs; one is appended on each restart.
previousEndStates []podStatusEndState
// lastSuccessfulMonitorComm, if non-nil, is used as the starting point of the silence period
// after which the Runner counts as stuck (see monitorStuckAt).
lastSuccessfulMonitorComm *time.Time
// Counters of recently failed requests. Each is compared against the matching
// MaxFailedRequestRate.Threshold from the config when checking whether the Runner is stuck.
failedMonitorRequestCounter *util.RecentCounter
failedNeonVMRequestCounter *util.RecentCounter
failedSchedulerRequestCounter *util.RecentCounter
// vmInfo stores the latest information about the VM, as given by the global VM watcher.
//
// There is also a similar field inside the Runner itself, but it's better to store this out
// here, where we don't have to rely on the Runner being well-behaved w.r.t. locking.
vmInfo api.VmInfo
// endpointID, if non-empty, stores the ID of the endpoint associated with the VM
endpointID string
// NB: this value, once non-nil, is never changed.
endpointAssignedAt *time.Time
// state is the value most recently reported in the runner-count metric. The empty string means
// that the pod is not yet represented in the metrics.
state runnerMetricState
stateUpdatedAt time.Time
}
// podStatusDump is the JSON-serializable copy of a podStatus, produced by (*lockedPodStatus).dump.
//
// The three Failed*RequestCounter fields hold the counters' values at the time of the dump,
// rather than the counters themselves.
type podStatusDump struct {
StartTime time.Time `json:"startTime"`
EndState *podStatusEndState `json:"endState"`
PreviousEndStates []podStatusEndState `json:"previousEndStates"`
LastSuccessfulMonitorComm *time.Time `json:"lastSuccessfulMonitorComm"`
FailedMonitorRequestCounter uint `json:"failedMonitorRequestCounter"`
FailedNeonVMRequestCounter uint `json:"failedNeonVMRequestCounter"`
FailedSchedulerRequestCounter uint `json:"failedSchedulerRequestCounter"`
VMInfo api.VmInfo `json:"vmInfo"`
EndpointID string `json:"endpointID"`
EndpointAssignedAt *time.Time `json:"endpointAssignedAt"`
State runnerMetricState `json:"state"`
StateUpdatedAt time.Time `json:"stateUpdatedAt"`
}
// podStatusEndState records how and when a run of a Runner ended.
type podStatusEndState struct {
// The reason the Runner exited.
ExitKind podStatusExitKind `json:"exitKind"`
// The error that ended the Runner, if there was one.
// NOTE(review): this comment previously also mentioned an "errored" exit kind, but only
// "panicked" and "canceled" are declared — confirm that "errored" is no longer used.
Error error `json:"error"`
// Time is when the Runner ended. TriggerRestartIfNecessary treats a zero Time as an error and
// substitutes the current time.
Time time.Time `json:"time"`
}
// podStatusExitKind describes why a Runner exited. It decides whether the Runner is restarted:
// a canceled Runner is not restarted, while a panicked one is.
type podStatusExitKind string
const (
podStatusExitPanicked podStatusExitKind = "panicked"
podStatusExitCanceled podStatusExitKind = "canceled" // top-down signal that the Runner should stop.
)
// update replaces the stored podStatus with the result of calling with on the current one, then
// recomputes the runner's metric state and adjusts the runners-count gauge to match.
//
// with is called while s.mu is held, so it must not call back into update or dump on the same
// status.
//
// The metric state is derived from the NEW status:
//   - a status that was already deleted is left untouched
//   - an ended runner is "panicked" if it panicked, and keeps its previous state if it was canceled
//   - a running runner is "stuck" or "ok", depending on isStuck
//
// The gauge is decremented for the old (is_endpoint, state) pair and incremented for the new one.
// Deleted statuses, and statuses with an empty state, are not counted.
func (s *lockedPodStatus) update(global *agentState, with func(podStatus) podStatus) {
	s.mu.Lock()
	defer s.mu.Unlock()

	newStatus := with(s.podStatus)
	now := time.Now()

	// Calculate the new state:
	var newState runnerMetricState
	if s.deleted {
		// If deleted, don't change anything.
	} else if newStatus.endState != nil {
		// Use the end state of the *new* status. Reading the previous one makes the state lag one
		// update behind: a panic would not be reflected when it is recorded, and a runner that
		// was just restarted would be counted as "panicked".
		switch newStatus.endState.ExitKind {
		case podStatusExitCanceled:
			// If canceled, don't change the state.
			newState = s.state
		case podStatusExitPanicked:
			newState = runnerMetricStatePanicked
		}
	} else if isStuck, _ := newStatus.isStuck(global, now); isStuck {
		newState = runnerMetricStateStuck
	} else {
		newState = runnerMetricStateOk
	}

	if !newStatus.deleted {
		newStatus.state = newState
		newStatus.stateUpdatedAt = now
	}

	// Update the metrics:
	// Note: s.state is initialized to the empty string to signify that it's not yet represented in
	// the metrics.
	if !s.deleted && s.state != "" {
		oldIsEndpoint := strconv.FormatBool(s.endpointID != "")
		global.metrics.runnersCount.WithLabelValues(oldIsEndpoint, string(s.state)).Dec()
	}

	if !newStatus.deleted && newStatus.state != "" {
		newIsEndpoint := strconv.FormatBool(newStatus.endpointID != "")
		global.metrics.runnersCount.WithLabelValues(newIsEndpoint, string(newStatus.state)).Inc()
	}

	s.podStatus = newStatus
}
// isStuck reports whether the Runner counts as stuck at time now, along with the reasons why.
//
// A Runner is stuck if the monitor health deadline (see monitorStuckAt) lies before now, or if any
// of the failed-request counters exceeds its configured threshold. All conditions are evaluated,
// so the returned slice lists every reason that applies; it is nil if there are none.
func (s podStatus) isStuck(global *agentState, now time.Time) (bool, []string) {
	var reasons []string
	record := func(failed bool, reason string) {
		if failed {
			reasons = append(reasons, reason)
		}
	}

	cfg := global.config
	record(s.monitorStuckAt(cfg).Before(now), "monitor health check failed")
	record(s.failedMonitorRequestCounter.Get() > cfg.Monitor.MaxFailedRequestRate.Threshold, "monitor requests failed")
	record(s.failedSchedulerRequestCounter.Get() > cfg.Scheduler.MaxFailedRequestRate.Threshold, "scheduler requests failed")
	record(s.failedNeonVMRequestCounter.Get() > cfg.NeonVM.MaxFailedRequestRate.Threshold, "neonvm requests failed")

	return len(reasons) != 0, reasons
}
// monitorStuckAt returns the time at which the Runner will be marked "stuck" because of missing
// communication with the monitor.
//
// Once there has been a successful communication, that is the time of the last one plus the
// configured silence duration. Before that, it is the end of the startup grace period.
func (s podStatus) monitorStuckAt(config *Config) time.Time {
	if lastComm := s.lastSuccessfulMonitorComm; lastComm != nil {
		silence := time.Second * time.Duration(config.Monitor.UnhealthyAfterSilenceDurationSeconds)
		return lastComm.Add(silence)
	}

	// For endpoints, the grace period is measured from when the VM was *assigned* the endpoint,
	// rather than from when the Runner started.
	graceStart := s.startTime
	if s.endpointID != "" {
		graceStart = *s.endpointAssignedAt
	}

	grace := time.Second * time.Duration(config.Monitor.UnhealthyStartupGracePeriodSeconds)
	return graceStart.Add(grace)
}
// periodicallyRefreshState re-evaluates the status every RefreshStateIntervalSeconds, until ctx is
// canceled.
//
// Each tick goes through s.update with a callback that leaves the status unchanged, which makes
// update recompute the metric state. The callback also logs a warning if the Runner is currently
// stuck, unless its state is "panicked".
func (s *lockedPodStatus) periodicallyRefreshState(ctx context.Context, logger *zap.Logger, global *agentState) {
	interval := time.Second * time.Duration(global.config.RefreshStateIntervalSeconds)
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	warnIfStuck := func(stat podStatus) podStatus {
		stuck, reasons := stat.isStuck(global, time.Now())
		if !stuck || stat.state == runnerMetricStatePanicked {
			return stat
		}

		joinedReasons := strings.Join(reasons, ",")
		if stat.endpointID != "" {
			logger.Warn("Runner with endpoint is currently stuck",
				zap.String("endpointID", stat.endpointID), zap.String("reasons", joinedReasons))
		} else {
			logger.Warn("Runner without endpoint is currently stuck",
				zap.String("reasons", joinedReasons))
		}
		return stat
	}

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			s.update(global, warnIfStuck)
		}
	}
}
// dump returns a snapshot of the status, taken while holding s.mu.
//
// The end state and the list of previous end states are copied, so the snapshot does not alias
// them. The failed-request counters are reduced to their current values.
func (s *lockedPodStatus) dump() podStatusDump {
	s.mu.Lock()
	defer s.mu.Unlock()

	var endStateCopy *podStatusEndState
	if current := s.endState; current != nil {
		endStateCopy = new(podStatusEndState)
		*endStateCopy = *current
	}

	// Always allocate, even if there are no previous end states, so that the result is an empty
	// slice rather than nil.
	history := make([]podStatusEndState, len(s.previousEndStates))
	copy(history, s.previousEndStates)

	return podStatusDump{
		StartTime:                     s.startTime,
		EndState:                      endStateCopy,
		PreviousEndStates:             history,
		LastSuccessfulMonitorComm:     s.lastSuccessfulMonitorComm,
		FailedMonitorRequestCounter:   s.failedMonitorRequestCounter.Get(),
		FailedNeonVMRequestCounter:    s.failedNeonVMRequestCounter.Get(),
		FailedSchedulerRequestCounter: s.failedSchedulerRequestCounter.Get(),

		// FIXME: api.VmInfo contains a resource.Quantity - is that safe to copy by value?
		VMInfo:             s.vmInfo,
		EndpointID:         s.endpointID,
		EndpointAssignedAt: s.endpointAssignedAt, // ok to share the pointer, because it's not updated

		State:          s.state,
		StateUpdatedAt: s.stateUpdatedAt,
	}
}
package agent
import (
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/agent/core/revsource"
"github.com/neondatabase/autoscaling/pkg/util"
)
// GlobalMetrics holds the agent-wide Prometheus collectors. All of them are created and registered
// by makeGlobalMetrics.
type GlobalMetrics struct {
// Requests to the scheduler plugin, and the resource changes requested / approved through it.
schedulerRequests *prometheus.CounterVec
schedulerRequestedChange resourceChangePair
schedulerApprovedChange resourceChangePair
// Counts of scaling requests that were denied in full or approved only partially.
scalingFullDeniesTotal *prometheus.CounterVec
scalingPartialApprovalsTotal *prometheus.CounterVec
// Requests to and from vm-monitors, and the resource changes requested / approved through them.
monitorRequestsOutbound *prometheus.CounterVec
monitorRequestsInbound *prometheus.CounterVec
monitorRequestedChange resourceChangePair
monitorApprovedChange resourceChangePair
// Patch requests to NeonVM objects, and the resource changes requested through them.
neonvmRequestsOutbound *prometheus.CounterVec
neonvmRequestedChange resourceChangePair
// Runner lifecycle. runnersCount is labelled by whether the VM has an endpoint and by the
// runner's state.
runnersCount *prometheus.GaugeVec
runnerThreadPanics prometheus.Counter
runnerStarts prometheus.Counter
runnerRestarts prometheus.Counter
runnerNextActions prometheus.Counter
// Latency histograms, labelled by scaling direction. These are stored by value; the accessor
// methods return pointers into this struct.
scalingLatency prometheus.HistogramVec
pluginLatency prometheus.HistogramVec
monitorLatency prometheus.HistogramVec
neonvmLatency prometheus.HistogramVec
}
// PluginLatency returns a pointer to the histogram of plugin request latencies stored in m.
func (m *GlobalMetrics) PluginLatency() *prometheus.HistogramVec {
return &m.pluginLatency
}
// MonitorLatency returns a pointer to the histogram of monitor request latencies stored in m.
func (m *GlobalMetrics) MonitorLatency() *prometheus.HistogramVec {
return &m.monitorLatency
}
// NeonVMLatency returns a pointer to the histogram of NeonVM request latencies stored in m.
func (m *GlobalMetrics) NeonVMLatency() *prometheus.HistogramVec {
return &m.neonvmLatency
}
// resourceChangePair groups the CPU and memory counters that track one kind of resource change.
// In makeGlobalMetrics, both counters are labelled by direction.
type resourceChangePair struct {
cpu *prometheus.CounterVec
mem *prometheus.CounterVec
}
// directionLabel is the name of the metric label that carries the scaling direction; the
// directionValue* constants are the values that label can take.
const (
directionLabel = "direction"
directionValueInc = "inc"
directionValueDec = "dec"
directionValueBoth = "both"
directionValueNone = "none"
)
// runnerMetricState is the value of the "state" label on the runners-count metric.
type runnerMetricState string
const (
runnerMetricStateOk runnerMetricState = "ok"
runnerMetricStateStuck runnerMetricState = "stuck"
runnerMetricStatePanicked runnerMetricState = "panicked"
)
// buckets are the histogram bucket boundaries shared by all latency histograms in
// makeGlobalMetrics. The histograms observe durations in seconds, so the boundaries range from
// 5ms to 60s.
//
// Copied bucket values from controller runtime latency metric. We can
// adjust them in the future if needed.
var buckets = []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60}
// makeGlobalMetrics creates a fresh Prometheus registry, registers the stock Go and process
// collectors plus all agent-wide metrics with it, and returns both.
//
// Every collector is registered individually (rather than in one variadic call), so that a panic
// on registration can be traced back to the metric that caused it.
//
// The direction-labelled change counters and the runners-count gauge are pre-populated with zero
// values, to make "nothing has happened" distinguishable from "no signal".
func makeGlobalMetrics() (GlobalMetrics, *prometheus.Registry) {
	reg := prometheus.NewRegistry()

	// Stock collectors:
	reg.MustRegister(collectors.NewGoCollector())
	reg.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))

	// Small constructors for the metric shapes used below. Each one creates the collector,
	// registers it via util.RegisterMetric, and returns it.
	counterVec := func(name, help string, labels ...string) *prometheus.CounterVec {
		return util.RegisterMetric(reg, prometheus.NewCounterVec(
			prometheus.CounterOpts{Name: name, Help: help},
			labels,
		))
	}
	changePair := func(cpuName, cpuHelp, memName, memHelp string) resourceChangePair {
		return resourceChangePair{
			cpu: counterVec(cpuName, cpuHelp, directionLabel),
			mem: counterVec(memName, memHelp, directionLabel),
		}
	}
	counter := func(name, help string) prometheus.Counter {
		return util.RegisterMetric(reg, prometheus.NewCounter(
			prometheus.CounterOpts{Name: name, Help: help},
		))
	}
	latencyHistogram := func(name, help string) prometheus.HistogramVec {
		return *util.RegisterMetric(reg, prometheus.NewHistogramVec(
			prometheus.HistogramOpts{Name: name, Help: help, Buckets: buckets},
			[]string{directionLabel},
		))
	}

	metrics := GlobalMetrics{
		// ---- SCHEDULER ----
		schedulerRequests: counterVec(
			"autoscaling_agent_scheduler_plugin_requests_total",
			"Number of attempted HTTP requests to the scheduler plugin by autoscaler-agents",
			"code",
		),
		schedulerRequestedChange: changePair(
			"autoscaling_agent_scheduler_plugin_requested_cpu_change_total",
			"Total change in CPU requested from the scheduler",
			"autoscaling_agent_scheduler_plugin_requested_mem_change_total",
			"Total change in memory (in MiB) requested from the scheduler",
		),
		schedulerApprovedChange: changePair(
			"autoscaling_agent_scheduler_plugin_accepted_cpu_change_total",
			"Total change in CPU approved by the scheduler",
			"autoscaling_agent_scheduler_plugin_accepted_mem_change_total",
			"Total change in memory (in MiB) approved by the scheduler",
		),

		// ---- scaling denies related metrics ----
		scalingFullDeniesTotal: counterVec(
			"autoscaling_agent_scaling_full_denials_total",
			"Number of the scheduler or vmmon full denials responses",
			directionLabel,
		),
		scalingPartialApprovalsTotal: counterVec(
			"autoscaling_agent_scaling_partial_approvals_total",
			"Number of the scheduler partially approved responses",
			directionLabel,
		),

		// ---- MONITOR ----
		monitorRequestsOutbound: counterVec(
			"autoscaling_agent_monitor_outbound_requests_total",
			"Number of attempted HTTP requests to vm-monitors by autoscaler-agents",
			"endpoint", "code",
		),
		monitorRequestsInbound: counterVec(
			"autoscaling_agent_monitor_inbound_requests_total",
			"Number of HTTP requests from vm-monitors received by autoscaler-agents",
			"endpoint", "code",
		),
		monitorRequestedChange: changePair(
			"autoscaling_agent_monitor_requested_cpu_change_total",
			"Total change in CPU requested from the vm-monitor(s)",
			"autoscaling_agent_monitor_requested_mem_change_total",
			"Total change in memory (in MiB) requested from the vm-monitor(s)",
		),
		monitorApprovedChange: changePair(
			"autoscaling_agent_monitor_approved_cpu_change_total",
			"Total change in CPU approved by the vm-monitor(s)",
			"autoscaling_agent_monitor_approved_mem_change_total",
			"Total change in memory (in MiB) approved by the vm-monitor(s)",
		),

		// ---- NEONVM ----
		// NOTE: "result" is either "ok" or "[error: $CAUSE]", with $CAUSE as the root cause of
		// the request error.
		neonvmRequestsOutbound: counterVec(
			"autoscaling_agent_neonvm_outbound_requests_total",
			"Number of k8s patch requests to NeonVM objects",
			"result",
		),
		neonvmRequestedChange: changePair(
			"autoscaling_agent_neonvm_requested_cpu_change_total",
			"Total change in CPU requested for VMs",
			"autoscaling_agent_neonvm_requested_mem_changed_total",
			"Total change in memory (in MiB) requested for VMs",
		),

		// ---- RUNNER LIFECYCLE ----
		// NB: is_endpoint ∈ ("true", "false"), state ∈ runnerMetricState = ("ok", "stuck", "panicked")
		runnersCount: util.RegisterMetric(reg, prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Name: "autoscaling_agent_runners_current",
				Help: "Number of per-VM runners, with associated metadata",
			},
			[]string{"is_endpoint", "state"},
		)),
		runnerThreadPanics: counter(
			"autoscaling_agent_runner_thread_panics_total",
			"Number of panics from autoscaler-agent per-VM runner threads",
		),
		runnerStarts: counter(
			"autoscaling_agent_runner_starts",
			"Number of new per-VM Runners started",
		),
		runnerRestarts: counter(
			"autoscaling_agent_runner_restarts",
			"Number of existing per-VM Runners restarted due to failure",
		),
		runnerNextActions: counter(
			"autoscaling_agent_runner_next_actions_total",
			"Number of times (*core.State).NextActions() has been called",
		),

		// ---- LATENCY ----
		scalingLatency: latencyHistogram(
			"autoscaling_agent_scaling_latency_seconds",
			"End-to-end scaling latency",
		),
		pluginLatency: latencyHistogram(
			"autoscaling_agent_plugin_latency_seconds",
			"Plugin request latency",
		),
		monitorLatency: latencyHistogram(
			"autoscaling_agent_monitor_latency_seconds",
			"Monitor request latency",
		),
		neonvmLatency: latencyHistogram(
			"autoscaling_agent_neonvm_latency_seconds",
			"NeonVM request latency",
		),
	}

	// Some of the metrics should have default keys set to zero. Otherwise, these won't be filled
	// until the value is non-zero (because something's happened), which makes it harder to
	// distinguish between "valid signal of nothing" vs "no signal".
	directionPairs := []resourceChangePair{
		// scheduler:
		metrics.schedulerRequestedChange,
		metrics.schedulerApprovedChange,
		// monitor:
		metrics.monitorRequestedChange,
		metrics.monitorApprovedChange,
		// neonvm:
		metrics.neonvmRequestedChange,
	}
	for _, pair := range directionPairs {
		for _, vec := range []*prometheus.CounterVec{pair.cpu, pair.mem} {
			for _, direction := range []string{directionValueInc, directionValueDec} {
				vec.WithLabelValues(direction).Add(0.0)
			}
		}
	}

	allStates := []runnerMetricState{
		runnerMetricStateOk,
		runnerMetricStateStuck,
		runnerMetricStatePanicked,
	}
	for _, state := range allStates {
		for _, isEndpoint := range []string{"true", "false"} {
			metrics.runnersCount.WithLabelValues(isEndpoint, string(state)).Set(0.0)
		}
	}

	return metrics, reg
}
// flagsToDirection maps the Upscale / Downscale flags onto the value of the "direction" label:
// "both" if both are set, "inc" for Upscale only, "dec" for Downscale only, and "none" otherwise.
func flagsToDirection(flags vmv1.Flag) string {
	upscale := flags.Has(revsource.Upscale)
	downscale := flags.Has(revsource.Downscale)

	switch {
	case upscale && downscale:
		return directionValueBoth
	case upscale:
		return directionValueInc
	case downscale:
		return directionValueDec
	default:
		return directionValueNone
	}
}
// WrapHistogramVec adapts hist to a revsource.ObserveCallback: each call records the duration, in
// seconds, under the direction label derived from flags.
func WrapHistogramVec(hist *prometheus.HistogramVec) revsource.ObserveCallback {
	observe := func(dur time.Duration, flags vmv1.Flag) {
		direction := flagsToDirection(flags)
		hist.WithLabelValues(direction).Observe(dur.Seconds())
	}
	return observe
}
// PerVMMetrics holds the per-VM gauges created by makePerVMMetrics. All three are labelled by VM
// namespace, VM name, endpoint ID and project ID; cpu and memory additionally carry a "value"
// label.
type PerVMMetrics struct {
cpu *prometheus.GaugeVec
memory *prometheus.GaugeVec
restartCount *prometheus.GaugeVec
}
// vmResourceValueType is the value of the "value" label on the per-VM cpu and memory gauges
// (see makePerVMMetricsLabels). It identifies which of a VM's resource figures a sample refers to.
type vmResourceValueType string
const (
vmResourceValueSpecMin vmResourceValueType = "spec_min"
vmResourceValueAutoscalingMin vmResourceValueType = "autoscaling_min"
vmResourceValueSpecUse vmResourceValueType = "spec_use"
vmResourceValueStatusUse vmResourceValueType = "status_use"
vmResourceValueSpecMax vmResourceValueType = "spec_max"
vmResourceValueAutoscalingMax vmResourceValueType = "autoscaling_max"
)
// makePerVMMetrics creates a fresh Prometheus registry, registers the per-VM gauges with it, and
// returns both.
func makePerVMMetrics() (PerVMMetrics, *prometheus.Registry) {
	reg := prometheus.NewRegistry()

	// vmLabels returns a new slice with the labels that identify a VM, followed by extra.
	vmLabels := func(extra ...string) []string {
		identity := []string{
			"vm_namespace", // .metadata.namespace
			"vm_name",      // .metadata.name
			"endpoint_id",  // .metadata.labels["neon/endpoint-id"]
			"project_id",   // .metadata.labels["neon/project-id"]
		}
		return append(identity, extra...)
	}
	gaugeVec := func(name, help string, labels []string) *prometheus.GaugeVec {
		return util.RegisterMetric(reg, prometheus.NewGaugeVec(
			prometheus.GaugeOpts{Name: name, Help: help},
			labels,
		))
	}

	metrics := PerVMMetrics{
		// The "value" label holds a vmResourceValueType.
		cpu: gaugeVec(
			"autoscaling_vm_cpu_cores",
			"Number of CPUs for a VM: min, max, spec using, or status using",
			vmLabels("value"),
		),
		memory: gaugeVec(
			"autoscaling_vm_memory_bytes",
			"Amount of memory in bytes for a VM: min, max, spec using, or status using",
			vmLabels("value"),
		),
		restartCount: gaugeVec(
			"autoscaling_vm_restart_count",
			"Number of times that the VM has restarted",
			vmLabels(),
		),
	}

	return metrics, reg
}
// makePerVMMetricsLabels builds the label set for a per-VM metric sample.
//
// The "value" label is only included if valueType is non-empty, so an empty valueType produces
// the four-label set used by metrics without a "value" label.
func makePerVMMetricsLabels(namespace string, vmName string, endpointID string, projectID string, valueType vmResourceValueType) prometheus.Labels {
	labels := make(prometheus.Labels, 5)
	labels["vm_namespace"] = namespace
	labels["vm_name"] = vmName
	labels["endpoint_id"] = endpointID
	labels["project_id"] = projectID

	if valueType != "" {
		labels["value"] = string(valueType)
	}
	return labels
}
// vmMetric is a data object that represents a single metric
// (either CPU or memory) for a VM.
type vmMetric struct {
// labels is the full label set identifying the sample.
labels prometheus.Labels
// value is the sample's value.
value float64
}
package agent
// Core glue and logic for a single VM
//
// The primary object in this file is the Runner. We create a new Runner for each VM, and the Runner
// spawns a handful of long-running tasks that share state via the Runner object itself.
//
// Each of these tasks is created by (*Runner).spawnBackgroundWorker(), which gracefully handles
// panics so that it terminates (and restarts) the Runner itself, instead of e.g. taking down the
// entire autoscaler-agent.
//
// The main entrypoint is (*Runner).Spawn(), which in turn calls (*Runner).Run(), etc.
//
// For more information, refer to ARCHITECTURE.md.
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"runtime/debug"
"strconv"
"strings"
"sync/atomic"
"time"
"go.uber.org/zap"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
ktypes "k8s.io/apimachinery/pkg/types"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/agent/core"
"github.com/neondatabase/autoscaling/pkg/agent/core/revsource"
"github.com/neondatabase/autoscaling/pkg/agent/executor"
"github.com/neondatabase/autoscaling/pkg/agent/schedwatch"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/patch"
)
// PluginProtocolVersion is the version of the agent<->scheduler plugin protocol in use by this
// autoscaler-agent.
//
// Currently, each autoscaler-agent supports only one version at a time. In the future, this may
// change.
const PluginProtocolVersion api.PluginProtoVersion = api.PluginProtoV5_0
// Runner is the per-VM-pod "god object" responsible for handling everything for a single VM.
//
// It primarily operates as a source of shared data for a number of long-running tasks. For
// additional general information, refer to the comment at the top of this file.
type Runner struct {
// global is the agent-wide state; the Runner reads configuration, metrics, the VM client and the
// scheduler tracker through it.
global *agentState
// status provides the high-level status of the Runner. Reading or updating the status requires
// holding status.mu. Updates are typically handled by the status.update method.
status *lockedPodStatus
// shutdown provides a clean way to trigger all background Runner threads to shut down. shutdown
// is set exactly once, by (*Runner).Run
shutdown context.CancelFunc
// vmName is the namespaced name of the VirtualMachine object that doNeonVMRequest patches.
vmName util.NamespacedName
// podName is sent as the Pod field of requests to the scheduler.
podName util.NamespacedName
// podIP is the address used to reach the VM's metrics endpoints and the vm-monitor.
podIP string
// memSlotSize is the divisor used to convert a target memory amount into a number of memory
// slots when patching the VM.
memSlotSize api.Bytes
// lock guards the values of all mutable fields - namely, monitor (which may be read without the
// lock under the conditions described on that field).
//
// NOTE(review): an earlier version of this comment also named a "scheduler" field; no such
// field exists on this struct.
lock util.ChanMutex
// executorStateDump is set by (*Runner).Run and provides a way to get the state of the
// "executor". It is nil until Run has constructed the executor.
executorStateDump func() executor.StateDump
// monitor, if non nil, stores the current Dispatcher in use for communicating with the
// vm-monitor, alongside a generation number.
//
// Additionally, this field MAY ONLY be updated while holding both lock AND the executor's lock,
// which means that it may be read when EITHER holding lock OR the executor's lock.
monitor *monitorInfo
// backgroundWorkerCount tracks the current number of background workers. It is exclusively
// updated by r.spawnBackgroundWorker
backgroundWorkerCount atomic.Int64
// backgroundPanic carries the error describing a panic in a background worker back to
// (*Runner).Run, which re-panics with it. Workers send on it without blocking.
backgroundPanic chan error
}
// RunnerState is the serializable state of the Runner, extracted by its State method
type RunnerState struct {
// PodIP is a copy of the Runner's podIP.
PodIP string `json:"podIP"`
// ExecutorState is the result of the Runner's executorStateDump callback.
ExecutorState executor.StateDump `json:"executorState"`
// Monitor is nil when the Runner has no current vm-monitor dispatcher.
Monitor *MonitorState `json:"monitor"`
// BackgroundWorkerCount is the number of background workers at the time of the snapshot.
BackgroundWorkerCount int64 `json:"backgroundWorkerCount"`
}
// SchedulerState is the state of a Scheduler, constructed as part of a Runner's State Method
//
// NOTE(review): RunnerState does not currently have a field of this type, and nothing in this
// part of the file constructs it - confirm whether it is still used.
type SchedulerState struct {
// Info identifies the scheduler pod.
Info schedwatch.SchedulerInfo `json:"info"`
}
// MonitorState is the part of RunnerState describing the current vm-monitor connection.
//
// Temporary type, to hopefully help with debugging https://github.com/neondatabase/autoscaling/issues/503
type MonitorState struct {
// WaitersSize is the value of the dispatcher's lenWaiters() at the time of the snapshot.
WaitersSize int `json:"waitersSize"`
}
// State returns a snapshot of the Runner's current state.
//
// It acquires r.lock via TryLock(ctx) for the duration of the call; if that fails, the error is
// returned and no state is produced.
//
// In the returned RunnerState:
//   - Monitor is nil if there is currently no vm-monitor dispatcher.
//   - ExecutorState is the zero value if r.Run() has not yet set r.executorStateDump.
func (r *Runner) State(ctx context.Context) (*RunnerState, error) {
	if err := r.lock.TryLock(ctx); err != nil {
		return nil, err
	}
	defer r.lock.Unlock()

	var monitorState *MonitorState
	if r.monitor != nil {
		monitorState = &MonitorState{
			WaitersSize: r.monitor.dispatcher.lenWaiters(),
		}
	}

	// r.executorStateDump may be nil if r.Run() hasn't fully started yet. Keep the state as a
	// value (rather than a possibly-nil pointer that gets dereferenced below), so that calling
	// State() on a not-yet-started Runner yields an empty dump instead of a nil-pointer panic.
	var executorState executor.StateDump
	if r.executorStateDump != nil {
		executorState = r.executorStateDump()
	}

	return &RunnerState{
		PodIP:                 r.podIP,
		ExecutorState:         executorState,
		Monitor:               monitorState,
		BackgroundWorkerCount: r.backgroundWorkerCount.Load(),
	}, nil
}
// Spawn starts the Runner in a new goroutine and returns immediately.
//
// The goroutine calls (*Runner).Run. When Run returns normally, the Runner's status is given a
// "canceled" end state; if Run panics, the panic is recovered and a "panicked" end state is
// recorded instead. In either case, r.global.TriggerRestartIfNecessary is called afterwards.
func (r *Runner) Spawn(ctx context.Context, logger *zap.Logger, vmInfoUpdated util.CondChannelReceiver) {
	runToCompletion := func() {
		// Gracefully handle panics, plus trigger restart.
		//
		// recover() must be called directly from this deferred function in order to work.
		defer func() {
			if panicValue := recover(); panicValue != nil {
				panickedAt := time.Now()
				r.status.update(r.global, func(stat podStatus) podStatus {
					stat.endState = &podStatusEndState{
						ExitKind: podStatusExitPanicked,
						Error:    fmt.Errorf("Runner %v panicked: %v", stat.vmInfo.NamespacedName(), panicValue),
						Time:     panickedAt,
					}
					return stat
				})
			}

			r.global.TriggerRestartIfNecessary(ctx, logger, r.podName, r.podIP)
		}()

		r.Run(ctx, logger, vmInfoUpdated)
		finishedAt := time.Now()

		// Reaching this point is the normal exit, which only happens by the context being
		// canceled.
		r.status.update(r.global, func(stat podStatus) podStatus {
			stat.endState = &podStatusEndState{
				ExitKind: podStatusExitCanceled,
				Error:    nil,
				Time:     finishedAt,
			}
			return stat
		})

		logger.Info("Ended without error")
	}

	go runToCompletion()
}
// Run is the main entrypoint to the long-running per-VM pod tasks
//
// It derives a cancellable context (storing the cancel function in r.shutdown), constructs the
// executor core and its plugin / NeonVM / vm-monitor client interfaces, and then starts every
// background worker via r.spawnBackgroundWorker.
//
// Run blocks until the context is canceled, in which case it returns, or until a background
// worker reports a panic on r.backgroundPanic, in which case Run itself panics with that error.
func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util.CondChannelReceiver) {
ctx, r.shutdown = context.WithCancel(ctx)
defer r.shutdown()
// getVmInfo returns the VmInfo currently stored in r.status, read while holding r.status.mu.
getVmInfo := func() api.VmInfo {
r.status.mu.Lock()
defer r.status.mu.Unlock()
return r.status.vmInfo
}
execLogger := logger.Named("exec")
// Subtract a small random amount from core.Config.PluginRequestTick so that periodic requests
// tend to become distributed randomly over time.
pluginRequestJitter := util.NewTimeRange(time.Millisecond, 0, 100).Random()
coreExecLogger := execLogger.Named("core")
vmInfo := getVmInfo()
// Start from the VM's current revision if it has one; otherwise from zero.
var initialRevision int64
if vmInfo.CurrentRevision != nil {
initialRevision = vmInfo.CurrentRevision.Value
}
revisionSource := revsource.NewRevisionSource(initialRevision, WrapHistogramVec(&r.global.metrics.scalingLatency))
// The agent's config stores durations as whole numbers of seconds; convert them here.
executorCore := executor.NewExecutorCore(coreExecLogger, vmInfo, executor.Config{
OnNextActions: r.global.metrics.runnerNextActions.Inc,
Core: core.Config{
ComputeUnit: r.global.config.Scaling.ComputeUnit,
DefaultScalingConfig: r.global.config.Scaling.DefaultConfig,
NeonVMRetryWait: time.Second * time.Duration(r.global.config.NeonVM.RetryFailedRequestSeconds),
PluginRequestTick: time.Second*time.Duration(r.global.config.Scheduler.RequestAtLeastEverySeconds) - pluginRequestJitter,
PluginRetryWait: time.Second * time.Duration(r.global.config.Scheduler.RetryFailedRequestSeconds),
PluginDeniedRetryWait: time.Second * time.Duration(r.global.config.Scheduler.RetryDeniedUpscaleSeconds),
MonitorDeniedDownscaleCooldown: time.Second * time.Duration(r.global.config.Monitor.RetryDeniedDownscaleSeconds),
MonitorRequestedUpscaleValidPeriod: time.Second * time.Duration(r.global.config.Monitor.RequestedUpscaleValidSeconds),
MonitorRetryWait: time.Second * time.Duration(r.global.config.Monitor.RetryFailedRequestSeconds),
Log: core.LogConfig{
Info: coreExecLogger.Info,
Warn: coreExecLogger.Warn,
},
RevisionSource: revisionSource,
ObservabilityCallbacks: core.ObservabilityCallbacks{
PluginLatency: WrapHistogramVec(&r.global.metrics.pluginLatency),
MonitorLatency: WrapHistogramVec(&r.global.metrics.monitorLatency),
NeonVMLatency: WrapHistogramVec(&r.global.metrics.neonvmLatency),
},
},
})
// From here on, (*Runner).State can include the executor's state.
//
// NOTE(review): this assignment is made without holding r.lock, while State reads the field
// with r.lock held - confirm that this is safe.
r.executorStateDump = executorCore.StateDump
monitorGeneration := executor.NewStoredGenerationNumber()
pluginIface := makePluginInterface(r)
neonvmIface := makeNeonVMInterface(r)
monitorIface := makeMonitorInterface(r, executorCore, monitorGeneration)
// "ecwc" stands for "ExecutorCoreWithClients"
ecwc := executorCore.WithClients(executor.ClientSet{
Plugin: pluginIface,
NeonVM: neonvmIface,
Monitor: monitorIface,
})
logger.Info("Starting background workers")
// FIXME: make this timeout/delay a separately defined constant, or configurable
mainDeadlockChecker := r.lock.DeadlockChecker(250*time.Millisecond, time.Second)
r.spawnBackgroundWorker(ctx, logger, "deadlock checker", ignoreLogger(mainDeadlockChecker))
r.spawnBackgroundWorker(ctx, logger, "podStatus updater", func(ctx2 context.Context, logger2 *zap.Logger) {
r.status.periodicallyRefreshState(ctx2, logger2, r.global)
})
// Forward each VmInfo update notification to the executor, using the latest stored VmInfo.
r.spawnBackgroundWorker(ctx, logger, "VmInfo updater", func(ctx2 context.Context, logger2 *zap.Logger) {
for {
select {
case <-ctx2.Done():
return
case <-vmInfoUpdated.Recv():
vm := getVmInfo()
ecwc.Updater().UpdatedVM(vm, func() {
logger2.Info("VmInfo updated", zap.Any("vmInfo", vm))
})
}
}
})
// System metrics are always collected (isActive is unconditionally true).
r.spawnBackgroundWorker(ctx, logger, "get system metrics", func(ctx2 context.Context, logger2 *zap.Logger) {
getMetricsLoop(
r,
ctx2,
logger2,
r.global.config.Metrics.System,
metricsMgr[*core.SystemMetrics]{
kind: "system",
emptyMetrics: func() *core.SystemMetrics { return new(core.SystemMetrics) },
isActive: func() bool { return true },
updateMetrics: func(metrics *core.SystemMetrics, withLock func()) {
ecwc.Updater().UpdateSystemMetrics(*metrics, withLock)
},
},
)
})
// LFC metrics are only collected while the VM's effective scaling config enables them.
r.spawnBackgroundWorker(ctx, logger, "get LFC metrics", func(ctx2 context.Context, logger2 *zap.Logger) {
getMetricsLoop(
r,
ctx2,
logger2,
r.global.config.Metrics.LFC,
metricsMgr[*core.LFCMetrics]{
kind: "LFC",
emptyMetrics: func() *core.LFCMetrics { return new(core.LFCMetrics) },
isActive: func() bool {
scalingConfig := r.global.config.Scaling.DefaultConfig.WithOverrides(getVmInfo().Config.ScalingConfig)
return *scalingConfig.EnableLFCMetrics // guaranteed non-nil as a required field.
},
updateMetrics: func(metrics *core.LFCMetrics, withLock func()) {
ecwc.Updater().UpdateLFCMetrics(*metrics, withLock)
},
},
)
})
// The vm-monitor connection loop reports connection state changes and upscale requests to the
// executor through these callbacks.
r.spawnBackgroundWorker(ctx, logger.Named("vm-monitor"), "vm-monitor reconnection loop", func(ctx2 context.Context, logger2 *zap.Logger) {
r.connectToMonitorLoop(ctx2, logger2, monitorGeneration, monitorStateCallbacks{
reset: func(withLock func()) {
ecwc.Updater().ResetMonitor(withLock)
},
upscaleRequested: func(request api.MoreResources, withLock func()) {
ecwc.Updater().UpscaleRequested(request, withLock)
},
setActive: func(active bool, withLock func()) {
ecwc.Updater().MonitorActive(active, withLock)
},
})
})
// The executor's own workers, each with its own named logger.
r.spawnBackgroundWorker(ctx, execLogger.Named("sleeper"), "executor: sleeper", ecwc.DoSleeper)
r.spawnBackgroundWorker(ctx, execLogger.Named("plugin"), "executor: plugin", ecwc.DoPluginRequests)
r.spawnBackgroundWorker(ctx, execLogger.Named("neonvm"), "executor: neonvm", ecwc.DoNeonVMRequests)
r.spawnBackgroundWorker(ctx, execLogger.Named("vm-monitor-downscale"), "executor: vm-monitor downscale", ecwc.DoMonitorDownscales)
r.spawnBackgroundWorker(ctx, execLogger.Named("vm-monitor-upscale"), "executor: vm-monitor upscale", ecwc.DoMonitorUpscales)
// Note: Run doesn't terminate unless the parent context is cancelled - either because the VM
// pod was deleted, or the autoscaler-agent is exiting.
select {
case <-ctx.Done():
return
case err := <-r.backgroundPanic:
// Propagate a background worker's panic from this goroutine; (*Runner).Spawn recovers it.
panic(err)
}
}
//////////////////////
// Background tasks //
//////////////////////
// ignoreLogger adapts a function that only takes a context into the (context, logger) form
// expected by spawnBackgroundWorker. The logger passed to the returned function is discarded.
func ignoreLogger(f func(context.Context)) func(context.Context, *zap.Logger) {
	adapted := func(ctx context.Context, _ *zap.Logger) { f(ctx) }
	return adapted
}
// spawnBackgroundWorker runs f in a new goroutine, with the bookkeeping required for workers
// started by (*Runner).Run.
//
// It is essentially 'go f(ctx, logger)', plus:
//   - r.backgroundWorkerCount is incremented before the goroutine starts and decremented when it
//     ends;
//   - the worker's start and end are logged, with the task name attached to the logger;
//   - a panic in f is recovered, counted, logged with a stack trace, and sent on
//     r.backgroundPanic (if that can be done without blocking).
func (r *Runner) spawnBackgroundWorker(ctx context.Context, logger *zap.Logger, name string, f func(context.Context, *zap.Logger)) {
	r.backgroundWorkerCount.Add(1)

	logger = logger.With(zap.String("taskName", name))

	go func() {
		defer func() {
			r.backgroundWorkerCount.Add(-1)

			// recover() only works when called directly by the deferred function.
			panicValue := recover()
			if panicValue == nil {
				logger.Info("background worker ended normally")
				return
			}

			r.global.metrics.runnerThreadPanics.Inc()
			panicErr := fmt.Errorf("background worker %q panicked: %v", name, panicValue)

			// note: In Go, the stack doesn't "unwind" on panic. Instead, a panic will traverse up
			// the callstack, and each deferred function, when called, will be *added* to the stack
			// as if the original panic() is calling them. So the output of runtime/debug.Stack()
			// has a couple frames to do with debug.Stack() and this deferred function, and then
			// the rest of the callstack starts from where the panic occurred.
			//
			// FIXME: we should handle the stack ourselves to remove the stack frames from
			// debug.Stack() and co. -- it's ok to have nice things!
			logger.Error(
				"background worker panicked",
				zap.String("error", fmt.Sprint(panicValue)),
				zap.String("stack", string(debug.Stack())),
			)

			// Hand the error to r.backgroundPanic if we can; otherwise, don't worry about it.
			select {
			case r.backgroundPanic <- panicErr:
			default:
			}
		}()

		logger.Info("background worker started")
		f(ctx, logger)
	}()
}
// metricsMgr bundles the pieces of getMetricsLoop that differ between the kinds of metrics that
// are fetched from a VM.
type metricsMgr[M core.FromPrometheus] struct {
// kind is the human-readable name representing this type of metrics.
// It's either "system" or "LFC".
kind string
// emptyMetrics returns a new M
//
// Typically this is required because M is itself a pointer, so if we just initialized it with a
// zero value, we'd end up with nil pointer derefs. There *are* ways around this with generics,
// but at the time we decided this is the least convoluted way.
emptyMetrics func() M
// isActive returns whether these metrics should currently be collected for the VM.
//
// For example, with LFC metrics, we return false if they are not enabled for the VM.
isActive func() bool
// updateMetrics is a callback to update the internal state with new values for these metrics.
//
// getMetricsLoop passes a withLock function that logs the updated metrics.
updateMetrics func(metrics M, withLock func())
}
// getMetricsLoop periodically fetches one kind of metrics from the VM until ctx is canceled.
//
// Before the first request, it sleeps for a random duration of up to config.SecondsBetweenRequests
// seconds. After that, once every config.SecondsBetweenRequests seconds, it checks mgr.isActive()
// and - if active - makes a request with doMetricsRequest. Every time metrics are successfully
// fetched, the value is recorded with mgr.updateMetrics(); failed requests are logged and retried
// on the next iteration.
func getMetricsLoop[M core.FromPrometheus](
	r *Runner,
	ctx context.Context,
	logger *zap.Logger,
	config MetricsSourceConfig,
	mgr metricsMgr[M],
) {
	interval := time.Second * time.Duration(config.SecondsBetweenRequests)
	initialDelay := util.NewTimeRange(time.Second, 0, int(config.SecondsBetweenRequests)).Random()

	wasActive := mgr.isActive()

	// Stay quiet if we're not making this type of metrics request currently.
	//
	// isActive() can/should be used for gradual rollout of new metrics, and we don't want to log
	// every time we *don't* do the new thing.
	if wasActive {
		logger.Info(
			fmt.Sprintf("Sleeping for random delay before making first %s metrics request", mgr.kind),
			zap.Duration("delay", initialDelay),
		)
	}

	select {
	case <-ctx.Done():
		return
	case <-time.After(initialDelay):
	}

	for {
		active := mgr.isActive()

		// Only log when the active state flips.
		switch {
		case wasActive && !active:
			logger.Info(fmt.Sprintf("VM is no longer active for %s metrics requests", mgr.kind))
		case !wasActive && active:
			logger.Info(fmt.Sprintf("VM is now active for %s metrics requests", mgr.kind))
		}
		wasActive = active

		if active {
			metrics := mgr.emptyMetrics()
			if err := doMetricsRequest(r, ctx, logger, metrics, config); err != nil {
				logger.Error("Error making metrics request", zap.Error(err))
			} else {
				mgr.updateMetrics(metrics, func() {
					logger.Info("Updated metrics", zap.Any("metrics", metrics))
				})
			}
		}

		select {
		case <-ctx.Done():
			return
		case <-time.After(interval):
		}
	}
}
// monitorInfo describes the Runner's current connection to the vm-monitor.
type monitorInfo struct {
// generation is the generation number obtained (by incrementing the stored generation) when
// this dispatcher was installed.
generation executor.GenerationNumber
// dispatcher is the Dispatcher handling the connection.
dispatcher *Dispatcher
}
// monitorStateCallbacks is the set of callbacks through which connectToMonitorLoop (and the
// Dispatcher it creates) report vm-monitor events.
//
// Each callback takes a withLock function. In (*Runner).Run, these are forwarded to the
// executor's updater methods - presumably withLock is run while the executor's lock is held;
// confirm against the executor package.
type monitorStateCallbacks struct {
// reset is called before each reconnection attempt, to discard the previous connection.
reset func(withLock func())
// upscaleRequested is passed to NewDispatcher.
upscaleRequested func(request api.MoreResources, withLock func())
// setActive is called with active = true once a new Dispatcher has connected.
setActive func(active bool, withLock func())
}
// connectToMonitorLoop does lifecycle management of the (re)connection to the vm-monitor
//
// Each iteration of the loop:
//  1. (except on the first iteration) clears the previous connection from r.monitor and
//     increments the generation number, via callbacks.reset;
//  2. returns if ctx has been canceled;
//  3. (except on the first iteration) waits, if needed, so that at least
//     Monitor.ConnectionRetryMinWaitSeconds passes between the starts of consecutive attempts;
//  4. creates a new Dispatcher - on failure, logs and starts the next iteration;
//  5. stores the Dispatcher in r.monitor via callbacks.setActive, then blocks until the
//     Dispatcher exits.
//
// The function only returns once ctx is canceled.
func (r *Runner) connectToMonitorLoop(
ctx context.Context,
logger *zap.Logger,
generation *executor.StoredGenerationNumber,
callbacks monitorStateCallbacks,
) {
addr := fmt.Sprintf("ws://%s:%d/monitor", r.podIP, r.global.config.Monitor.ServerPort)
minWait := time.Second * time.Duration(r.global.config.Monitor.ConnectionRetryMinWaitSeconds)
// lastStart is the time at which the previous connection attempt began.
var lastStart time.Time
for i := 0; ; i += 1 {
// Remove any prior Dispatcher from the Runner
if i != 0 {
func() {
// r.monitor may only be updated while holding r.lock (see the Runner struct).
r.lock.Lock()
defer r.lock.Unlock()
callbacks.reset(func() {
generation.Inc()
r.monitor = nil
logger.Info("Reset previous vm-monitor connection")
})
}()
}
// If the context was canceled, don't restart
if err := ctx.Err(); err != nil {
action := "attempt"
if i != 0 {
// NOTE(review): the trailing space here results in a double space in the message below.
action = "retry "
}
logger.Info(
fmt.Sprintf("Aborting vm-monitor connection %s because context is already canceled", action),
zap.Error(err),
)
return
}
// Delayed restart management, long because of friendly logging:
if i != 0 {
endTime := time.Now()
// runtime is how long it has been since the previous attempt started.
runtime := endTime.Sub(lastStart)
if runtime > minWait {
logger.Info(
"Immediately retrying connection to vm-monitor",
zap.String("addr", addr),
zap.Duration("totalRuntime", runtime),
)
} else {
// Wait out the remainder of minWait, unless the context is canceled first.
delay := minWait - runtime
logger.Info(
"Connection to vm-monitor was not live for long, retrying after delay",
zap.Duration("delay", delay),
zap.Duration("totalRuntime", runtime),
)
select {
case <-time.After(delay):
logger.Info(
"Retrying connection to vm-monitor",
zap.Duration("delay", delay),
zap.Duration("waitTime", time.Since(endTime)),
zap.String("addr", addr),
)
case <-ctx.Done():
logger.Info(
"Canceling retrying connection to vm-monitor",
zap.Duration("delay", delay),
zap.Duration("waitTime", time.Since(endTime)),
zap.Error(ctx.Err()),
)
return
}
}
} else {
logger.Info("Connecting to vm-monitor", zap.String("addr", addr))
}
lastStart = time.Now()
dispatcher, err := NewDispatcher(ctx, logger, addr, r, callbacks.upscaleRequested)
if err != nil {
logger.Error("Failed to connect to vm-monitor", zap.String("addr", addr), zap.Error(err))
// The next iteration applies the retry delay relative to lastStart.
continue
}
// Update runner to the new dispatcher
func() {
r.lock.Lock()
defer r.lock.Unlock()
callbacks.setActive(true, func() {
r.monitor = &monitorInfo{
generation: generation.Inc(),
dispatcher: dispatcher,
}
logger.Info("Connected to vm-monitor")
})
}()
// Wait until the dispatcher is no longer running, either due to error or because the
// root-level Runner context was canceled.
<-dispatcher.ExitSignal()
if err := dispatcher.ExitError(); err != nil {
logger.Error("Dispatcher for vm-monitor connection exited due to error", zap.Error(err))
}
}
}
//////////////////////////////////////////
// Lower-level implementation functions //
//////////////////////////////////////////
// doMetricsRequest makes a single metrics request to the VM, writing the result into 'metrics'
//
// The request is a GET to http://<podIP>:<config.Port>/metrics, limited to
// config.RequestTimeoutSeconds seconds.
//
// It returns:
//   - ctx.Err(), if ctx was canceled by the time the request finished (whether or not the
//     request itself succeeded);
//   - a wrapped error if the request failed, the response status was not 200, or the response
//     body could not be parsed by core.ParseMetrics;
//   - nil otherwise.
//
// It panics if the request cannot be constructed.
func doMetricsRequest(
	r *Runner,
	ctx context.Context,
	logger *zap.Logger,
	metrics core.FromPrometheus,
	config MetricsSourceConfig,
) error {
	url := fmt.Sprintf("http://%s:%d/metrics", r.podIP, config.Port)

	timeout := time.Second * time.Duration(config.RequestTimeoutSeconds)
	reqCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	req, err := http.NewRequestWithContext(reqCtx, http.MethodGet, url, bytes.NewReader(nil))
	if err != nil {
		panic(fmt.Errorf("Error constructing metrics request to %q: %w", url, err))
	}

	logger.Debug("Making metrics request to VM", zap.String("url", url))

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		// Prefer reporting cancellation of the parent context over the request error it caused.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return ctxErr
		}
		return fmt.Errorf("Error making request to %q: %w", url, err)
	}
	// The request succeeded, so the body must be closed on every path from here on - including
	// the cancellation check immediately below.
	defer resp.Body.Close()

	if ctxErr := ctx.Err(); ctxErr != nil {
		return ctxErr
	}

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("Unsuccessful response status %d", resp.StatusCode)
	}

	if err := core.ParseMetrics(resp.Body, metrics); err != nil {
		return fmt.Errorf("Error parsing metrics from prometheus output: %w", err)
	}

	return nil
}
// doNeonVMRequest patches the VirtualMachine object so that it uses the target resources.
//
// A single JSON patch replaces the VM's CPU usage, its number of memory slots in use (target.Mem
// divided by r.memSlotSize), and its target revision. The request is limited to
// NeonVM.RequestTimeoutSeconds seconds.
//
// The outcome is recorded in the neonvmRequestsOutbound metric: "ok" on success, or the root
// error's message (with the VM's name replaced by "<name>") on failure. On failure, the error
// from the client is returned unmodified.
//
// It panics if the patch cannot be marshalled to JSON.
func (r *Runner) doNeonVMRequest(
	ctx context.Context,
	target api.Resources,
	targetRevision vmv1.RevisionWithTime,
) error {
	ops := []patch.Operation{
		{
			Op:    patch.OpReplace,
			Path:  "/spec/guest/cpus/use",
			Value: target.VCPU.ToResourceQuantity(),
		},
		{
			Op:    patch.OpReplace,
			Path:  "/spec/guest/memorySlots/use",
			Value: uint32(target.Mem / r.memSlotSize),
		},
		{
			Op:    patch.OpReplace,
			Path:  "/spec/targetRevision",
			Value: targetRevision,
		},
	}

	payload, err := json.Marshal(ops)
	if err != nil {
		panic(fmt.Errorf("Error marshalling JSON patch: %w", err))
	}

	requestTimeout := time.Second * time.Duration(r.global.config.NeonVM.RequestTimeoutSeconds)
	patchCtx, cancelPatch := context.WithTimeout(ctx, requestTimeout)
	defer cancelPatch()

	requestsMetric := r.global.metrics.neonvmRequestsOutbound

	// FIXME: We should check the returned VM object here, in case the values are different.
	//
	// Also relevant: <https://github.com/neondatabase/autoscaling/issues/23>
	vms := r.global.vmClient.NeonvmV1().VirtualMachines(r.vmName.Namespace)
	if _, err := vms.Patch(patchCtx, r.vmName.Name, ktypes.JSONPatchType, payload, metav1.PatchOptions{}); err != nil {
		// Some error messages contain the object name. We could try to filter them all out, but
		// it's probably more maintainable to just keep them as-is and remove the name.
		sanitized := strings.ReplaceAll(util.RootError(err).Error(), r.vmName.Name, "<name>")
		requestsMetric.WithLabelValues(fmt.Sprintf("[error: %s]", sanitized)).Inc()
		return err
	}

	requestsMetric.WithLabelValues("ok").Inc()
	return nil
}
// recordResourceChange adds the absolute difference between current and target to the given
// metrics, separately for CPU and memory.
//
// Each amount is recorded under the "increase" direction label if the target is greater than the
// current value, and the "decrease" label otherwise. Nothing is recorded for a resource that is
// unchanged. Memory is recorded in MiB.
func (r *Runner) recordResourceChange(current, target api.Resources, metrics resourceChangePair) {
	directionLabel := func(increased bool) string {
		if increased {
			return directionValueInc
		}
		return directionValueDec
	}

	delta := current.AbsDiff(target)

	// CPU
	if delta.VCPU != 0 {
		label := directionLabel(target.VCPU > current.VCPU)
		metrics.cpu.WithLabelValues(label).Add(delta.VCPU.AsFloat64())
	}

	// Memory
	if delta.Mem != 0 {
		label := directionLabel(target.Mem > current.Mem)

		// Avoid floating-point inaccuracy: convert the whole MiBs and the leftover bytes
		// separately, rather than converting the total number of bytes.
		bytesPerMiB := api.Bytes(1 << 20)
		wholeMiB := float64(delta.Mem / bytesPerMiB)
		partialMiB := float64(delta.Mem%bytesPerMiB) / float64(bytesPerMiB)

		metrics.mem.WithLabelValues(label).Add(wholeMiB + partialMiB)
	}
}
// doMonitorDownscale sends a "DownscaleRequest" for the target resources over the dispatcher, and
// returns the Result of the response.
//
// The call is limited to Monitor.ResponseTimeoutSeconds seconds. Any error from the dispatcher is
// returned as-is.
func doMonitorDownscale(
	ctx context.Context,
	logger *zap.Logger,
	dispatcher *Dispatcher,
	target api.Resources,
) (*api.DownscaleResult, error) {
	monitorConfig := dispatcher.runner.global.config.Monitor
	allocation := target.ConvertToAllocation()
	responseTimeout := time.Second * time.Duration(monitorConfig.ResponseTimeoutSeconds)

	request := api.DownscaleRequest{
		Target: allocation,
	}
	response, err := dispatcher.Call(ctx, logger, responseTimeout, "DownscaleRequest", request)
	if err != nil {
		return nil, err
	}

	return response.Result, nil
}
// doMonitorUpscale sends an "UpscaleNotification" over the dispatcher, informing the vm-monitor of
// the resources that were granted.
//
// The call is limited to Monitor.ResponseTimeoutSeconds seconds. The response is discarded; only
// the error from the dispatcher (if any) is returned.
func doMonitorUpscale(
	ctx context.Context,
	logger *zap.Logger,
	dispatcher *Dispatcher,
	target api.Resources,
) error {
	monitorConfig := dispatcher.runner.global.config.Monitor
	allocation := target.ConvertToAllocation()
	responseTimeout := time.Second * time.Duration(monitorConfig.ResponseTimeoutSeconds)

	notification := api.UpscaleNotification{
		Granted: allocation,
	}
	_, err := dispatcher.Call(ctx, logger, responseTimeout, "UpscaleNotification", notification)
	return err
}
// DoSchedulerRequest sends a request to the scheduler and does not validate the response.
//
// The request is an api.AgentRequest, POSTed as JSON to the scheduler currently selected by
// r.global.schedTracker, and limited to Scheduler.RequestTimeoutSeconds seconds.
//
// An error is returned if there is no known ready scheduler, if the request could not be built or
// sent, if the response could not be read, if the response status is not 200, or if the response
// body is not valid JSON for an api.PluginResponse. Any returned error is also logged here.
//
// The outcome of each request that is attempted is recorded in the schedulerRequests metric:
// either the response's status code, or a description of the error.
func (r *Runner) DoSchedulerRequest(
	ctx context.Context,
	logger *zap.Logger,
	resources api.Resources,
	lastPermit *api.Resources,
	metrics *api.Metrics,
) (_ *api.PluginResponse, err error) {
	reqData := &api.AgentRequest{
		ProtoVersion: PluginProtocolVersion,
		Pod:          r.podName,
		ComputeUnit:  r.global.config.Scaling.ComputeUnit,
		Resources:    resources,
		LastPermit:   lastPermit,
		Metrics:      metrics,
	}

	// make sure we log any error we're returning:
	defer func() {
		if err != nil {
			logger.Error("Scheduler request failed", zap.Error(err))
		}
	}()

	sched := r.global.schedTracker.Get()
	if sched == nil {
		err := errors.New("no known ready scheduler to send request to")
		description := fmt.Sprintf("[error doing request: %s]", err)
		r.global.metrics.schedulerRequests.WithLabelValues(description).Inc()
		return nil, err
	}

	reqBody, err := json.Marshal(reqData)
	if err != nil {
		return nil, fmt.Errorf("Error encoding request JSON: %w", err)
	}

	// Use the scheduler's own request timeout. (This previously used the NeonVM request timeout,
	// which is the setting for VM patch requests, not for requests to the scheduler.)
	timeout := time.Second * time.Duration(r.global.config.Scheduler.RequestTimeoutSeconds)
	reqCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	url := fmt.Sprintf("http://%s:%d/", sched.IP, r.global.config.Scheduler.RequestPort)

	request, err := http.NewRequestWithContext(reqCtx, http.MethodPost, url, bytes.NewReader(reqBody))
	if err != nil {
		return nil, fmt.Errorf("Error building request to %q: %w", url, err)
	}
	request.Header.Set("content-type", "application/json")

	logger.Debug("Sending request to scheduler", zap.Any("request", reqData))

	response, err := http.DefaultClient.Do(request)
	if err != nil {
		description := fmt.Sprintf("[error doing request: %s]", util.RootError(err))
		r.global.metrics.schedulerRequests.WithLabelValues(description).Inc()
		return nil, fmt.Errorf("Error doing request: %w", err)
	}
	defer response.Body.Close()

	r.global.metrics.schedulerRequests.WithLabelValues(strconv.Itoa(response.StatusCode)).Inc()

	respBody, err := io.ReadAll(response.Body)
	if err != nil {
		return nil, fmt.Errorf("Error reading body for response: %w", err)
	}

	if response.StatusCode != http.StatusOK {
		// Fatal because 4XX implies our state doesn't match theirs, 5XX means we can't assume
		// current contents of the state, and anything other than 200, 4XX, or 5XX shouldn't happen
		return nil, fmt.Errorf("Received response status %d body %q", response.StatusCode, string(respBody))
	}

	var respData api.PluginResponse
	if err := json.Unmarshal(respBody, &respData); err != nil {
		// Fatal because invalid JSON might also be semantically invalid
		return nil, fmt.Errorf("Bad JSON response: %w", err)
	}

	// Being permitted less than what we asked for is worth a warning; anything else is routine.
	level := zap.DebugLevel
	if respData.Permit.HasFieldLessThan(resources) {
		level = zap.WarnLevel
	}
	logger.Log(level, "Received response from scheduler", zap.Any("response", respData), zap.Any("requested", resources))

	return &respData, nil
}
package schedwatch
import (
"time"
"go.uber.org/zap/zapcore"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"github.com/neondatabase/autoscaling/pkg/util"
)
// SchedulerInfo is the information recorded about a scheduler pod. It is built from a Pod by
// newSchedulerInfo.
type SchedulerInfo struct {
// PodName is the name and namespace of the pod.
PodName util.NamespacedName
// UID is the pod's UID.
UID types.UID
// IP is the pod's IP, taken from its status.
IP string
// CreationTimestamp is the pod's creation timestamp.
CreationTimestamp time.Time
}
// MarshalLogObject implements zapcore.ObjectMarshaler
//
// The fields "pod", "uid", "ip" and "creationTimestamp" are added, in that order. An error is
// only returned if the pod name cannot be encoded, in which case no other fields are added.
func (s SchedulerInfo) MarshalLogObject(enc zapcore.ObjectEncoder) error {
	err := enc.AddObject("pod", s.PodName)
	if err != nil {
		return err
	}

	enc.AddString("uid", string(s.UID))
	enc.AddString("ip", s.IP)
	enc.AddTime("creationTimestamp", s.CreationTimestamp)
	return nil
}
// newSchedulerInfo extracts the SchedulerInfo for a scheduler pod: its namespaced name, UID, IP
// (from the pod's status) and creation timestamp.
func newSchedulerInfo(pod *corev1.Pod) SchedulerInfo {
	var info SchedulerInfo
	info.PodName = util.NamespacedName{Name: pod.Name, Namespace: pod.Namespace}
	info.UID = pod.UID
	info.IP = pod.Status.PodIP
	info.CreationTimestamp = pod.CreationTimestamp.Time
	return info
}
package schedwatch
import (
"context"
"fmt"
"sync"
"time"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/watch"
)
// isActivePod returns whether the pod can be used as a scheduler: it must have been assigned an
// IP and be ready, according to util.PodReady.
func isActivePod(pod *corev1.Pod) bool {
	if pod.Status.PodIP == "" {
		return false
	}
	return util.PodReady(pod)
}
// SchedulerTracker provides access to the scheduler pod currently selected by the watcher started
// with StartSchedulerWatcher.
type SchedulerTracker struct {
// sp is the set of known scheduler pods, shared with the watcher's event handlers.
sp *schedPods
// Stop is the Stop method of the underlying watch store.
Stop func()
}
// Get returns the currently selected scheduler, or nil if there is none.
//
// The value is read while holding the read lock on the set of scheduler pods.
func (s SchedulerTracker) Get() *SchedulerInfo {
	pods := s.sp

	pods.mu.RLock()
	selected := pods.current
	pods.mu.RUnlock()

	return selected
}
// schedPods is the set of scheduler pods that are currently known to be active, plus the one
// that has been selected for use.
type schedPods struct {
// mu guards current and pods.
mu sync.RWMutex
// current is the selected scheduler, or nil if pods is empty. It is recomputed by reconcile.
current *SchedulerInfo
// pods holds the active scheduler pods, by UID.
pods map[types.UID]*SchedulerInfo
}
// schedulerNamespace is the namespace in which StartSchedulerWatcher watches for scheduler pods.
const schedulerNamespace string = "kube-system"
// schedulerLabelSelector returns the label selector matching pods whose "name" label is equal to
// schedulerName.
func schedulerLabelSelector(schedulerName string) string {
	const labelKey = "name"
	return fmt.Sprintf("%s=%s", labelKey, schedulerName)
}
// StartSchedulerWatcher starts watching the scheduler pods - those in schedulerNamespace matching
// schedulerLabelSelector(schedulerName) - and returns a SchedulerTracker that provides the
// currently selected one.
//
// A pod is tracked from the moment it is active (see isActivePod) until it either stops being
// active or is deleted.
//
// The error from starting the watch, if any, is returned as-is.
func StartSchedulerWatcher(
	ctx context.Context,
	parentLogger *zap.Logger,
	kubeClient *kubernetes.Clientset,
	metrics watch.Metrics,
	schedulerName string,
) (*SchedulerTracker, error) {
	logger := parentLogger.Named("watch-schedulers")

	tracked := &schedPods{
		mu:      sync.RWMutex{},
		current: nil,
		pods:    make(map[types.UID]*SchedulerInfo),
	}

	watchConfig := watch.Config{
		ObjectNameLogField: "pod",
		Metrics: watch.MetricsConfig{
			Metrics:  metrics,
			Instance: "Scheduler Pod",
		},
		// We don't need to be super responsive to scheduler changes.
		//
		// FIXME: make these configurable.
		RetryRelistAfter: util.NewTimeRange(time.Second, 4, 5),
		RetryWatchAfter:  util.NewTimeRange(time.Second, 4, 5),
	}

	accessors := watch.Accessors[*corev1.PodList, corev1.Pod]{
		Items: func(list *corev1.PodList) []corev1.Pod { return list.Items },
	}

	handlers := watch.HandlerFuncs[*corev1.Pod]{
		AddFunc: func(pod *corev1.Pod, preexisting bool) {
			if !isActivePod(pod) {
				return
			}
			info := newSchedulerInfo(pod)
			logger.Info("New scheduler, already ready", zap.Object("scheduler", info))
			tracked.add(logger, &info)
		},
		UpdateFunc: func(oldPod, newPod *corev1.Pod) {
			wasActive := isActivePod(oldPod)
			nowActive := isActivePod(newPod)

			// Only transitions matter; there's nothing to do if the pod's state is unchanged.
			switch {
			case !wasActive && nowActive:
				info := newSchedulerInfo(newPod)
				logger.Info("Existing scheduler became ready", zap.Object("scheduler", info))
				tracked.add(logger, &info)
			case wasActive && !nowActive:
				info := newSchedulerInfo(newPod)
				logger.Info("Existing scheduler no longer ready", zap.Object("scheduler", info))
				tracked.remove(logger, &info)
			}
		},
		DeleteFunc: func(pod *corev1.Pod, mayBeStale bool) {
			// Pods that weren't active at the time of deletion weren't being tracked.
			if !isActivePod(pod) {
				return
			}
			info := newSchedulerInfo(pod)
			logger.Info("Previously-ready scheduler deleted", zap.Object("scheduler", info))
			tracked.remove(logger, &info)
		},
	}

	store, err := watch.Watch(
		ctx,
		logger.Named("watch"),
		kubeClient.CoreV1().Pods(schedulerNamespace),
		watchConfig,
		accessors,
		watch.InitModeSync,
		metav1.ListOptions{LabelSelector: schedulerLabelSelector(schedulerName)},
		handlers,
	)
	if err != nil {
		return nil, err
	}

	tracker := &SchedulerTracker{
		sp:   tracked,
		Stop: store.Stop,
	}
	return tracker, nil
}
// add records the pod as an active scheduler (replacing any previous entry with the same UID),
// and then re-selects the current scheduler. It acquires s.mu.
func (s *schedPods) add(logger *zap.Logger, pod *SchedulerInfo) {
	s.mu.Lock()
	defer s.mu.Unlock()

	uid := pod.UID
	s.pods[uid] = pod

	s.reconcile(logger)
}
// remove forgets the scheduler pod with the same UID as the given one (if there is any), and then
// re-selects the current scheduler. It acquires s.mu.
func (s *schedPods) remove(logger *zap.Logger, pod *SchedulerInfo) {
	s.mu.Lock()
	defer s.mu.Unlock()

	uid := pod.UID
	delete(s.pods, uid)

	s.reconcile(logger)
}
// reconcile refreshes the value of s.current based on s.pods, logging how the selection changed.
// s.mu MUST be exclusively locked while calling reconcile.
//
// The most recently created pod in s.pods is selected; if s.pods is empty, s.current becomes nil.
func (s *schedPods) reconcile(logger *zap.Logger) {
	var selected *SchedulerInfo

	// There's *basically* guaranteed to be ≤ 2 scheduler pods because the scheduler deployment has
	// replicas=1, so "just" looping here is fine; it's not worth a more complex data structure.
	for _, candidate := range s.pods {
		// Skip the candidate unless it's the first one we've seen, or it was created more
		// recently than whatever we've picked so far.
		// The ordering isn't *too* important here, but we need to pick one to be consistent, and
		// preferring a newer scheduler (remember: the pod is 'Ready') is likely to be more correct.
		if selected != nil && !selected.CreationTimestamp.Before(candidate.CreationTimestamp) {
			continue
		}
		selected = candidate
	}

	previous := s.current
	switch {
	case previous != nil && selected != nil:
		count := len(s.pods)
		if previous.UID != selected.UID {
			logger.Info("Scheduler pod selection changed", zap.Int("count", count), zap.Object("scheduler", selected))
		} else {
			logger.Info("Scheduler pod selection is unchanged", zap.Int("count", count), zap.Object("scheduler", selected))
		}
	case previous != nil: // ... and selected == nil
		logger.Warn("No scheduler pod available anymore")
	case selected != nil: // ... and previous == nil
		logger.Info("Scheduler pod now available (there was none before)", zap.Object("scheduler", selected))
	default: // both are nil
		logger.Warn("No scheduler pod available (still)")
	}

	s.current = selected
}
package agent
import (
"context"
"encoding/json"
"fmt"
"maps"
"slices"
"time"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
vmclient "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/watch"
)
// vmEvent describes a change to a VM, as submitted by the handlers of the watcher started by
// startVMWatcher.
type vmEvent struct {
// kind is the type of change: added, updated, or deleted.
kind vmEventKind
// vmInfo is the information about the VM that the event refers to.
vmInfo api.VmInfo
// podName and podIP refer to the VM's pod - presumably its runner pod; confirm against
// makeVMEvent.
podName string
podIP string
// if present, the ID of the endpoint associated with the VM. May be empty.
endpointID string
}
// Names of labels carrying the IDs of the endpoint and of the project associated with a VM.
//
// NOTE(review): presumably read from the VM object's metadata.labels; confirm against the code
// using them.
const (
endpointLabel = "neon/endpoint-id"
projectLabel = "neon/project-id"
)
// MarshalLogObject implements zapcore.ObjectMarshaler
//
// The fields "kind", "podName", "podIP" and "endpointID" are added as strings, followed by
// "vmInfo", which is encoded via reflection. Any error from encoding "vmInfo" is returned.
func (ev vmEvent) MarshalLogObject(enc zapcore.ObjectEncoder) error {
	enc.AddString("kind", string(ev.kind))
	enc.AddString("podName", ev.podName)
	enc.AddString("podIP", ev.podIP)
	enc.AddString("endpointID", ev.endpointID)

	// vmInfo goes last, so that the plain string fields are always present.
	return enc.AddReflected("vmInfo", ev.vmInfo)
}
// vmEventKind is the type of change that a vmEvent describes.
type vmEventKind string
const (
// vmEventAdded is used for VMs that have become our responsibility: either because they were
// newly seen, or because an update made them so.
vmEventAdded vmEventKind = "added"
// vmEventUpdated is used for updates to VMs that were, and still are, our responsibility.
vmEventUpdated vmEventKind = "updated"
// vmEventDeleted is used for VMs that are no longer our responsibility: either because they
// were deleted, or because an update made them so.
vmEventDeleted vmEventKind = "deleted"
)
// startVMWatcher starts a watch on the VirtualMachine objects in all namespaces, and returns the
// resulting store.
//
// For every VM that is added, updated or deleted, the per-VM metrics are updated first. Then, if
// the VM is our responsibility (see vmIsOurResponsibility), a vmEvent is passed to submitEvent.
// An update that changes whether the VM is our responsibility is submitted as an "added" or
// "deleted" event, rather than as an "updated" one. Failures to create the event are logged, and
// nothing is submitted.
//
// note: unlike startPodWatcher, we aren't able to use a field selector on VM status.node (currently; NeonVM v0.4.6)
func startVMWatcher(
	ctx context.Context,
	parentLogger *zap.Logger,
	config *Config,
	vmClient *vmclient.Clientset,
	metrics watch.Metrics,
	perVMMetrics PerVMMetrics,
	nodeName string,
	submitEvent func(vmEvent),
) (*watch.Store[vmapi.VirtualMachine], error) {
	logger := parentLogger.Named("vm-watch")

	// emit creates the event of the given kind for the VM and submits it. If the event can't be
	// created, failureMsg is logged instead.
	emit := func(vm *vmapi.VirtualMachine, kind vmEventKind, failureMsg string) {
		event, err := makeVMEvent(logger, vm, kind)
		if err != nil {
			logger.Error(failureMsg, util.VMNameFields(vm), zap.Error(err))
			return
		}
		submitEvent(event)
	}

	handlers := watch.HandlerFuncs[*vmapi.VirtualMachine]{
		AddFunc: func(vm *vmapi.VirtualMachine, preexisting bool) {
			setVMMetrics(&perVMMetrics, vm, nodeName)

			if !vmIsOurResponsibility(vm, config, nodeName) {
				return
			}
			emit(vm, vmEventAdded, "Failed to create vmEvent for added VM")
		},
		UpdateFunc: func(oldVM, newVM *vmapi.VirtualMachine) {
			updateVMMetrics(&perVMMetrics, oldVM, newVM, nodeName)

			wasOurs := vmIsOurResponsibility(oldVM, config, nodeName)
			nowOurs := vmIsOurResponsibility(newVM, config, nodeName)

			const failureMsg = "Failed to create vmEvent for updated VM"
			switch {
			case !wasOurs && !nowOurs:
				// Not our VM, before or after; nothing to do.
			case !wasOurs:
				// The VM has just become our responsibility.
				emit(newVM, vmEventAdded, failureMsg)
			case !nowOurs:
				// The VM has stopped being our responsibility; report the VM as it was.
				emit(oldVM, vmEventDeleted, failureMsg)
			default:
				emit(newVM, vmEventUpdated, failureMsg)
			}
		},
		DeleteFunc: func(vm *vmapi.VirtualMachine, maybeStale bool) {
			deleteVMMetrics(&perVMMetrics, vm, nodeName)

			if !vmIsOurResponsibility(vm, config, nodeName) {
				return
			}
			emit(vm, vmEventDeleted, "Failed to create vmEvent for deleted VM")
		},
	}

	watchConfig := watch.Config{
		ObjectNameLogField: "virtualmachine",
		Metrics: watch.MetricsConfig{
			Metrics:  metrics,
			Instance: "VirtualMachines",
		},
		// We want to be relatively snappy; don't wait for too long before retrying.
		RetryRelistAfter: util.NewTimeRange(time.Millisecond, 500, 1000),
		RetryWatchAfter:  util.NewTimeRange(time.Millisecond, 500, 1000),
	}

	accessors := watch.Accessors[*vmapi.VirtualMachineList, vmapi.VirtualMachine]{
		Items: func(list *vmapi.VirtualMachineList) []vmapi.VirtualMachine { return list.Items },
	}

	return watch.Watch(
		ctx,
		logger.Named("watch"),
		vmClient.NeonvmV1().VirtualMachines(corev1.NamespaceAll),
		watchConfig,
		accessors,
		watch.InitModeDefer,
		metav1.ListOptions{},
		handlers,
	)
}
// makeVMEvent builds the vmEvent of the given kind for vm.
//
// It returns an error if api.ExtractVmInfo fails. The pod name and IP are taken from the VM's
// status; the endpoint ID comes from the VM's endpointLabel label and is left empty if the label
// is absent.
func makeVMEvent(logger *zap.Logger, vm *vmapi.VirtualMachine, kind vmEventKind) (vmEvent, error) {
	vmInfo, err := api.ExtractVmInfo(logger, vm)
	if err != nil {
		return vmEvent{}, fmt.Errorf("Error extracting VM info: %w", err)
	}

	event := vmEvent{
		kind:    kind,
		vmInfo:  *vmInfo,
		podName: vm.Status.PodName,
		podIP:   vm.Status.PodIP,
		// indexing a nil map yields the zero value, so this is "" when there are no labels
		endpointID: vm.Labels[endpointLabel],
	}
	return event, nil
}
// extractAutoscalingBounds extracts the ScalingBounds from a VM's autoscaling
// annotation, for the purpose of exposing it in per-VM metrics.
//
// It returns nil if the annotation is missing or cannot be parsed as JSON.
//
// We're not reusing api.ExtractVmInfo even though it also looks at the bounds
// annotation, because its data is less precise - CPU and memory values might
// come from the VM spec without us knowing.
func extractAutoscalingBounds(vm *vmapi.VirtualMachine) *api.ScalingBounds {
	raw, found := vm.Annotations[api.AnnotationAutoscalingBounds]
	if !found {
		return nil
	}

	parsed := new(api.ScalingBounds)
	if json.Unmarshal([]byte(raw), parsed) != nil {
		// a malformed annotation is treated the same as a missing one
		return nil
	}
	return parsed
}
// pair is a generic two-element tuple. In this file it is used to list (value type, value) entries
// when building per-VM metrics.
type pair[T1 any, T2 any] struct {
	first T1
	second T2
}
// makeVMMetric builds a single vmMetric with the given value for vm.
//
// The labels are built by makePerVMMetricsLabels from the VM's namespace and name, the values of
// its endpointLabel and projectLabel labels (empty if absent), and valType.
func makeVMMetric(vm *vmapi.VirtualMachine, valType vmResourceValueType, val float64) vmMetric {
	return vmMetric{
		labels: makePerVMMetricsLabels(
			vm.Namespace,
			vm.Name,
			vm.Labels[endpointLabel],
			vm.Labels[projectLabel],
			valType,
		),
		value: val,
	}
}
// makeVMCPUMetrics builds the CPU metrics for vm, in this order:
//
//  1. min / max / use from the VM spec
//  2. the CPU value from the VM status, if it is set
//  3. min / max from the autoscaling bounds annotation, if extractAutoscalingBounds returns one
func makeVMCPUMetrics(vm *vmapi.VirtualMachine) []vmMetric {
	var result []vmMetric
	record := func(valType vmResourceValueType, cpu float64) {
		result = append(result, makeVMMetric(vm, valType, cpu))
	}

	// metrics from spec
	fromSpec := []pair[vmResourceValueType, vmapi.MilliCPU]{
		{vmResourceValueSpecMin, vm.Spec.Guest.CPUs.Min},
		{vmResourceValueSpecMax, vm.Spec.Guest.CPUs.Max},
		{vmResourceValueSpecUse, vm.Spec.Guest.CPUs.Use},
	}
	for _, entry := range fromSpec {
		record(entry.first, entry.second.AsFloat64())
	}

	// metrics from status
	if vm.Status.CPUs != nil {
		record(vmResourceValueStatusUse, vm.Status.CPUs.AsFloat64())
	}

	// metrics from autoscaling bounds annotation
	bounds := extractAutoscalingBounds(vm)
	if bounds == nil {
		return result
	}
	fromBounds := []pair[vmResourceValueType, resource.Quantity]{
		{vmResourceValueAutoscalingMin, bounds.Min.CPU},
		{vmResourceValueAutoscalingMax, bounds.Max.CPU},
	}
	for _, entry := range fromBounds {
		// avoid using resource.Quantity.AsApproximateFloat64() since it's quite inaccurate
		record(entry.first, vmapi.MilliCPUFromResourceQuantity(entry.second).AsFloat64())
	}
	return result
}
// makeVMMemMetrics builds the memory metrics for vm, with all values in bytes, in this order:
//
//  1. min / max / use from the VM spec (number of memory slots multiplied by the slot size)
//  2. the memory size from the VM status, if it is set
//  3. min / max from the autoscaling bounds annotation, if extractAutoscalingBounds returns one
func makeVMMemMetrics(vm *vmapi.VirtualMachine) []vmMetric {
	var result []vmMetric
	record := func(valType vmResourceValueType, numBytes int64) {
		result = append(result, makeVMMetric(vm, valType, float64(numBytes)))
	}

	// metrics from spec
	bytesPerSlot := vm.Spec.Guest.MemorySlotSize.Value()
	fromSpec := []pair[vmResourceValueType, int32]{
		{vmResourceValueSpecMin, vm.Spec.Guest.MemorySlots.Min},
		{vmResourceValueSpecMax, vm.Spec.Guest.MemorySlots.Max},
		{vmResourceValueSpecUse, vm.Spec.Guest.MemorySlots.Use},
	}
	for _, entry := range fromSpec {
		record(entry.first, bytesPerSlot*int64(entry.second))
	}

	// metrics from status
	if vm.Status.MemorySize != nil {
		record(vmResourceValueStatusUse, vm.Status.MemorySize.Value())
	}

	// metrics from autoscaling bounds annotation
	bounds := extractAutoscalingBounds(vm)
	if bounds == nil {
		return result
	}
	fromBounds := []pair[vmResourceValueType, resource.Quantity]{
		{vmResourceValueAutoscalingMin, bounds.Min.Mem},
		{vmResourceValueAutoscalingMax, bounds.Max.Mem},
	}
	for _, entry := range fromBounds {
		record(entry.first, entry.second.Value())
	}
	return result
}
// makeVMRestartMetrics makes metrics related to VM restarts. Currently, it
// only includes one metric: the restart count from the VM's status.
//
// Unlike the CPU and memory metrics, the labels are built with an empty value type.
func makeVMRestartMetrics(vm *vmapi.VirtualMachine) []vmMetric {
	restartCount := vmMetric{
		labels: makePerVMMetricsLabels(
			vm.Namespace,
			vm.Name,
			vm.Labels[endpointLabel],
			vm.Labels[projectLabel],
			"",
		),
		value: float64(vm.Status.RestartCount),
	}
	return []vmMetric{restartCount}
}
// setVMMetrics sets the CPU, memory, and restart count gauges in perVMMetrics to the current
// values for vm.
//
// It does nothing if the VM's status says it's not on the node named nodeName.
func setVMMetrics(perVMMetrics *PerVMMetrics, vm *vmapi.VirtualMachine, nodeName string) {
	if vm.Status.Node != nodeName {
		return
	}

	for _, metric := range makeVMCPUMetrics(vm) {
		perVMMetrics.cpu.With(metric.labels).Set(metric.value)
	}
	for _, metric := range makeVMMemMetrics(vm) {
		perVMMetrics.memory.With(metric.labels).Set(metric.value)
	}
	for _, metric := range makeVMRestartMetrics(vm) {
		perVMMetrics.restartCount.With(metric.labels).Set(metric.value)
	}
}
// updateVMMetrics brings the gauges in perVMMetrics up to date after a VM changed from oldVM to
// newVM.
//
// If the VM is on this node (nodeName) both before and after, the metrics are updated in place:
// label sets that only the old VM produced are deleted, and every metric for the new VM is set.
// Otherwise, it falls back to deleteVMMetrics for the old VM followed by setVMMetrics for the new
// one, each of which is a no-op for a VM that isn't on this node.
func updateVMMetrics(perVMMetrics *PerVMMetrics, oldVM, newVM *vmapi.VirtualMachine, nodeName string) {
	wasOnNode := oldVM.Status.Node == nodeName
	isOnNode := newVM.Status.Node == nodeName
	if !(wasOnNode && isOnNode) {
		// this case we don't need an in-place metric update. Either we just have
		// to add the new metrics, or delete the old ones, or nothing!
		deleteVMMetrics(perVMMetrics, oldVM, nodeName)
		setVMMetrics(perVMMetrics, newVM, nodeName)
		return
	}

	reconcile := func(gauge *prometheus.GaugeVec, stale, fresh []vmMetric) {
		for _, old := range stale {
			// this is a linear search, but since we have small number (~10) of
			// different metrics for each vm, this should be fine.
			stillPresent := slices.ContainsFunc(fresh, func(candidate vmMetric) bool {
				return maps.Equal(old.labels, candidate.labels)
			})
			if !stillPresent {
				gauge.Delete(old.labels)
			}
		}
		for _, metric := range fresh {
			gauge.With(metric.labels).Set(metric.value)
		}
	}

	reconcile(perVMMetrics.cpu, makeVMCPUMetrics(oldVM), makeVMCPUMetrics(newVM))
	reconcile(perVMMetrics.memory, makeVMMemMetrics(oldVM), makeVMMemMetrics(newVM))
	reconcile(perVMMetrics.restartCount, makeVMRestartMetrics(oldVM), makeVMRestartMetrics(newVM))
}
// deleteVMMetrics removes the CPU, memory, and restart count gauge entries in perVMMetrics whose
// labels match the metrics that vm currently produces.
//
// It does nothing if the VM's status says it's not on the node named nodeName.
func deleteVMMetrics(perVMMetrics *PerVMMetrics, vm *vmapi.VirtualMachine, nodeName string) {
	if vm.Status.Node != nodeName {
		return
	}

	for _, metric := range makeVMCPUMetrics(vm) {
		perVMMetrics.cpu.Delete(metric.labels)
	}
	for _, metric := range makeVMMemMetrics(vm) {
		perVMMetrics.memory.Delete(metric.labels)
	}
	for _, metric := range makeVMRestartMetrics(vm) {
		perVMMetrics.restartCount.Delete(metric.labels)
	}
}
package api
import (
"encoding/json"
"errors"
"fmt"
"reflect"
"go.uber.org/zap/zapcore"
"k8s.io/apimachinery/pkg/api/resource"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/util"
)
/////////////////////////////////
// (Autoscaler) Agent Messages //
/////////////////////////////////
// PluginProtoVersion represents a single version of the agent<->scheduler plugin protocol
//
// Each version of the agent<->scheduler plugin protocol is named independently from releases of the
// repository containing this code. Names follow semver, although this does not necessarily
// guarantee support - for example, the plugin may only support a single version, even though others
// may appear to be semver-compatible.
//
// Version compatibility is documented in the neighboring file VERSIONING.md.
type PluginProtoVersion uint32

const (
	// PluginProtoV1_0 represents v1.0 of the agent<->scheduler plugin protocol - the initial
	// version.
	//
	// Last used in release version v0.1.8.
	PluginProtoV1_0 PluginProtoVersion = iota + 1 // start from one: zero is kept out of the valid versions, for backwards compatibility with pre-versioned messages

	// PluginProtoV1_1 represents v1.1 of the agent<->scheduler plugin protocol.
	//
	// Changes from v1.0:
	//
	// * Allows a nil value of the AgentRequest.Metrics field.
	//
	// Last used in release version v0.6.0.
	PluginProtoV1_1

	// PluginProtoV2_0 represents v2.0 of the agent<->scheduler plugin protocol.
	//
	// Changes from v1.1:
	//
	// * Supports fractional CPU
	//
	// Last used in release version v0.19.x.
	PluginProtoV2_0

	// PluginProtoV2_1 represents v2.1 of the agent<->scheduler plugin protocol.
	//
	// Changes from v2.0:
	//
	// * added AgentRequest.LastPermit
	//
	// Last used in release version v0.21.0.
	PluginProtoV2_1

	// PluginProtoV3_0 represents v3.0 of the agent<->scheduler plugin protocol.
	//
	// Changes from v2.1:
	//
	// * Removes PluginResponse.ComputeUnit (agent is now responsible for source of truth)
	//
	// Last used in release version v0.22.0.
	PluginProtoV3_0

	// PluginProtoV4_0 represents v4.0 of the agent<->scheduler plugin protocol.
	//
	// Changes from v3.0:
	//
	// * Memory quantities now use "number of bytes" instead of "number of memory slots"
	// * Adds AgentRequest.ComputeUnit
	//
	// Last used in release version v0.27.0.
	PluginProtoV4_0

	// PluginProtoV5_0 represents v5.0 of the agent<->scheduler plugin protocol.
	//
	// Changes from v4.0:
	//
	// * Removed AgentRequest.metrics fields loadAvg5M and memoryUsageBytes
	//
	// Currently the latest version.
	PluginProtoV5_0

	// latestPluginProtoVersion represents the latest version of the agent<->scheduler plugin
	// protocol
	//
	// This value is kept private because it should not be used externally; any desired
	// functionality that could be implemented with it should instead be a method on
	// PluginProtoVersion.
	latestPluginProtoVersion PluginProtoVersion = iota // excluding +1 makes it equal to previous
)

// String returns the name of the version, e.g. "v2.1".
//
// The zero value is formatted as "<invalid: zero>". Any other value without a name here is
// formatted relative to the latest known version, as "<unknown = LATEST + N>".
func (v PluginProtoVersion) String() string {
	switch v {
	case 0:
		return "<invalid: zero>"
	case PluginProtoV1_0:
		return "v1.0"
	case PluginProtoV1_1:
		return "v1.1"
	case PluginProtoV2_0:
		return "v2.0"
	case PluginProtoV2_1:
		return "v2.1"
	case PluginProtoV3_0:
		return "v3.0"
	case PluginProtoV4_0:
		return "v4.0"
	case PluginProtoV5_0:
		return "v5.0"
	}

	// Not a version we have a name for. %v formats the latest version via this same method; %d
	// formats the distance from it as a plain number.
	return fmt.Sprintf("<unknown = %v + %d>", latestPluginProtoVersion, v-latestPluginProtoVersion)
}

// IsValid returns whether the protocol version is valid. The zero value is not valid.
//
// Note that this only rejects zero; versions newer than the latest known one are still reported
// as valid.
func (v PluginProtoVersion) IsValid() bool {
	return v != 0
}

// AllowsNilMetrics returns whether this version of the protocol allows the autoscaler-agent to send
// a nil metrics field.
//
// This is true for version v1.1 and greater.
func (v PluginProtoVersion) AllowsNilMetrics() bool {
	return v >= PluginProtoV1_1
}

// SupportsFractionalCPU returns whether this version of the protocol supports fractional CPU.
//
// This is true for version v2.0 and greater.
func (v PluginProtoVersion) SupportsFractionalCPU() bool {
	return v >= PluginProtoV2_0
}

// PluginSendsComputeUnit returns whether this version of the protocol expects the scheduler plugin
// to send the value of the Compute Unit in its PluginResponse.
//
// This is true for all versions below v3.0.
func (v PluginProtoVersion) PluginSendsComputeUnit() bool {
	return v < PluginProtoV3_0
}

// AgentSendsComputeUnit returns whether this version of the protocol expects the autoscaler-agent
// to send the value of its configured Compute Unit in its AgentRequest.
//
// This is true for version v4.0 and greater.
func (v PluginProtoVersion) AgentSendsComputeUnit() bool {
	return v >= PluginProtoV4_0
}

// RepresentsMemoryAsBytes returns whether this version of the protocol uses byte quantities to
// refer to memory amounts, rather than a number of memory slots.
//
// This is true for version v4.0 and greater.
func (v PluginProtoVersion) RepresentsMemoryAsBytes() bool {
	return v >= PluginProtoV4_0
}

// IncludesExtendedMetrics returns whether this version of the protocol includes the AgentRequest's
// metrics loadAvg5M and memoryUsageBytes.
//
// This is true for all versions below v5.0.
func (v PluginProtoVersion) IncludesExtendedMetrics() bool {
	return v < PluginProtoV5_0
}
// AgentRequest is the type of message sent from an autoscaler-agent to the scheduler plugin
//
// All AgentRequests expect a PluginResponse.
type AgentRequest struct {
	// ProtoVersion is the version of the protocol that the autoscaler-agent is expecting to use
	//
	// If the scheduler does not support this version, then it will respond with a 400 status.
	ProtoVersion PluginProtoVersion `json:"protoVersion"`
	// Pod is the namespaced name of the pod making the request
	Pod util.NamespacedName `json:"pod"`
	// ComputeUnit gives the value of the agent's configured compute unit to use for the VM.
	//
	// If the requested resources are not a multiple of ComputeUnit, the scheduler plugin will make
	// a best-effort attempt to return a value satisfying the request. Any approved increases will
	// be a multiple of ComputeUnit, but otherwise the plugin does not check.
	//
	// This field was added in protocol version v4.0; see PluginProtoVersion.AgentSendsComputeUnit.
	ComputeUnit Resources `json:"computeUnit"`
	// Resources gives a requested or notified change in resources allocated to the VM.
	//
	// The requested amount MAY be equal to the current amount, in which case it serves as a
	// notification that the VM should no longer be contributing to resource pressure.
	//
	// TODO: allow passing nil here if nothing's changed (i.e., the request would be the same as the
	// previous request)
	Resources Resources `json:"resources"`
	// LastPermit indicates the last permit that the agent has received from the scheduler plugin.
	// In case of a failure, the new running scheduler uses LastPermit to recover the previous state.
	// LastPermit may be nil.
	//
	// This field was added in protocol version v2.1.
	LastPermit *Resources `json:"lastPermit"`
	// Metrics provides information about the VM's current load, so that the scheduler may
	// prioritize which pods to migrate
	//
	// In some protocol versions, this field may be nil; see PluginProtoVersion.AllowsNilMetrics.
	Metrics *Metrics `json:"metrics"`
}
// Metrics gives the information pulled from vector.dev that the scheduler may use to prioritize
// which pods it should migrate.
type Metrics struct {
	// LoadAverage1Min is always serialized, under the key "loadAvg1M".
	LoadAverage1Min float32 `json:"loadAvg1M"`
	// DEPRECATED. Will be removed in an upcoming release.
	//
	// Omitted from the JSON when nil. This field is not part of protocol v5.0 and later; see
	// PluginProtoVersion.IncludesExtendedMetrics.
	LoadAverage5Min *float32 `json:"loadAvg5M,omitempty"`
	// DEPRECATED. Will be removed in an upcoming release.
	//
	// Omitted from the JSON when nil. This field is not part of protocol v5.0 and later; see
	// PluginProtoVersion.IncludesExtendedMetrics.
	MemoryUsageBytes *float32 `json:"memoryUsageBytes,omitempty"`
}
// ProtocolRange returns a VersionRange exactly equal to r.ProtoVersion, i.e. one where both the
// lower and upper bound are the request's protocol version.
func (r AgentRequest) ProtocolRange() VersionRange[PluginProtoVersion] {
	version := r.ProtoVersion
	return VersionRange[PluginProtoVersion]{Min: version, Max: version}
}
// Bytes represents a number of bytes, with custom marshaling / unmarshaling that goes through
// resource.Quantity in order to have simplified values over wire
type Bytes uint64

// BytesFromResourceQuantity converts resource.Quantity into Bytes
//
// The conversion uses the quantity's integer Value(); that int64 is converted directly to the
// unsigned Bytes type.
func BytesFromResourceQuantity(r resource.Quantity) Bytes {
	return Bytes(r.Value())
}

// ToResourceQuantity converts a Bytes to resource.Quantity - typically used for formatting and/or
// serialization
//
// The returned quantity uses the resource.BinarySI format.
func (b Bytes) ToResourceQuantity() *resource.Quantity {
	return resource.NewQuantity(int64(b), resource.BinarySI)
}

// AsFloat64 converts a Bytes into float64 of the same amount
func (b Bytes) AsFloat64() float64 {
	return float64(b)
}

// UnmarshalJSON implements json.Unmarshaler by decoding the data as a resource.Quantity and
// converting that with BytesFromResourceQuantity.
func (b *Bytes) UnmarshalJSON(data []byte) error {
	var parsed resource.Quantity
	if err := json.Unmarshal(data, &parsed); err != nil {
		return err
	}
	*b = BytesFromResourceQuantity(parsed)
	return nil
}

// MarshalJSON implements json.Marshaler. Values below 1024 are written as plain JSON integers;
// everything else is written in its resource.Quantity form.
func (b Bytes) MarshalJSON() ([]byte, error) {
	// To (temporarily) support multiple API versions, we should output smaller values as integers.
	// Otherwise, resource.Quantity will always format as a string, which is incompatible with
	// earlier versions of the agent<->scheduler plugin API.
	if b >= 1024 {
		return json.Marshal(b.ToResourceQuantity())
	}
	return json.Marshal(uint64(b))
}

// Format implements fmt.Formatter. With %#v, the raw number of bytes is written; with any other
// verb or flags, the value is written in its resource.Quantity string form.
func (b Bytes) Format(state fmt.State, verb rune) {
	var text string
	if verb == 'v' && state.Flag('#') {
		text = fmt.Sprintf("%v", uint64(b))
	} else {
		text = b.ToResourceQuantity().String()
	}
	//nolint:errcheck // can't do anything about the write error
	state.Write([]byte(text))
}
// Resources represents an amount of CPU and memory
//
// When used in an AgentRequest, it represents the desired total amount of resources. When
// a resource is increasing, the autoscaler-agent "requests" the change to confirm that the
// resources are available. When decreasing, the autoscaler-agent is expected to use Resources to
// "notify" the scheduler -- i.e., the resource amount should have already been decreased. When
// a resource stays at the same amount, the associated AgentRequest serves to indicate that the
// autoscaler-agent is "satisfied" with its current resources, and should no longer contribute to
// any existing resource pressure.
//
// When used in a PluginResponse (as a Permit), then the Resources serves to inform the
// autoscaler-agent of the amount it has been permitted to use, subject to node resource limits.
//
// In all cases, each resource type is considered separately from the others.
type Resources struct {
	VCPU vmapi.MilliCPU `json:"vCPUs"`
	// Mem gives the number of bytes of memory requested
	Mem Bytes `json:"mem"`
}

// MarshalLogObject implements zapcore.ObjectMarshaler, so that Resources can be used with zap.Object
//
// Both fields are logged as strings, using their default (%v) formatting.
func (r Resources) MarshalLogObject(enc zapcore.ObjectEncoder) error {
	enc.AddString("vCPU", fmt.Sprint(r.VCPU))
	enc.AddString("mem", fmt.Sprint(r.Mem))
	return nil
}

// ValidateNonZero checks that neither of the Resources fields are equal to zero, returning an error
// if either is. If both are zero, the error is about VCPU.
func (r Resources) ValidateNonZero() error {
	switch {
	case r.VCPU == 0:
		return errors.New("vCPUs must be non-zero")
	case r.Mem == 0:
		return errors.New("mem must be non-zero")
	default:
		return nil
	}
}

// CheckValuesAreReasonablySized returns an error if VCPU is below 50 or above 512*1000. As the
// error messages indicate, those correspond to 0.05 and 512 CPUs.
//
// Only VCPU is checked; Mem is not looked at.
func (r Resources) CheckValuesAreReasonablySized() error {
	switch {
	case r.VCPU < 50:
		return errors.New("VCPU is smaller than 0.05")
	case r.VCPU > 512*1000:
		return errors.New("VCPU is bigger than 512")
	}
	return nil
}

// HasFieldGreaterThan returns true if and only if there is a field F where r.F > cmp.F
func (r Resources) HasFieldGreaterThan(cmp Resources) bool {
	if r.VCPU > cmp.VCPU {
		return true
	}
	return r.Mem > cmp.Mem
}

// HasFieldLessThan returns true if and only if there is a field F where r.F < cmp.F
func (r Resources) HasFieldLessThan(cmp Resources) bool {
	return r.VCPU < cmp.VCPU || r.Mem < cmp.Mem
}

// Min returns a new Resources value with each field F as the minimum of r.F and cmp.F
func (r Resources) Min(cmp Resources) Resources {
	// r is a copy (value receiver), so it can be updated in place and returned
	r.VCPU = min(r.VCPU, cmp.VCPU)
	r.Mem = min(r.Mem, cmp.Mem)
	return r
}

// Max returns a new Resources value with each field F as the maximum of r.F and cmp.F
func (r Resources) Max(cmp Resources) Resources {
	r.VCPU = max(r.VCPU, cmp.VCPU)
	r.Mem = max(r.Mem, cmp.Mem)
	return r
}

// Add returns the result of adding the two Resources
func (r Resources) Add(other Resources) Resources {
	r.VCPU += other.VCPU
	r.Mem += other.Mem
	return r
}

// SaturatingSub returns the result of subtracting r - other, with values that *would* underflow
// instead set to zero.
func (r Resources) SaturatingSub(other Resources) Resources {
	r.VCPU = util.SaturatingSub(r.VCPU, other.VCPU)
	r.Mem = util.SaturatingSub(r.Mem, other.Mem)
	return r
}

// Mul returns the result of multiplying each resource by factor
func (r Resources) Mul(factor uint16) Resources {
	r.VCPU *= vmapi.MilliCPU(factor)
	r.Mem *= Bytes(factor)
	return r
}

// AbsDiff returns a new Resources with each field F as the absolute value of the difference between
// r.F and cmp.F
func (r Resources) AbsDiff(cmp Resources) Resources {
	r.VCPU = util.AbsDiff(r.VCPU, cmp.VCPU)
	r.Mem = util.AbsDiff(r.Mem, cmp.Mem)
	return r
}

// IncreaseFrom returns a MoreResources with each field F true when r.F > old.F.
func (r Resources) IncreaseFrom(old Resources) MoreResources {
	var increased MoreResources
	increased.Cpu = r.VCPU > old.VCPU
	increased.Memory = r.Mem > old.Mem
	return increased
}

// ConvertToAllocation produces the Allocation equivalent to these Resources
//
// The CPU amount is converted to a float by going through resource.Quantity; the memory amount is
// the same number of bytes.
func (r Resources) ConvertToAllocation() Allocation {
	var alloc Allocation
	alloc.Cpu = r.VCPU.ToResourceQuantity().AsApproximateFloat64()
	alloc.Mem = uint64(r.Mem)
	return alloc
}
/////////////////////////////////
// (Scheduler) Plugin Messages //
/////////////////////////////////
// PluginResponse is the scheduler plugin's reply to an AgentRequest.
type PluginResponse struct {
	// Permit provides an upper bound on the resources that the VM is now allowed to consume
	//
	// If the request's Resources were less than or equal its current resources, then the Permit
	// will exactly equal those resources. Otherwise, it may contain resource allocations anywhere
	// between the current and requested resources, inclusive.
	Permit Resources `json:"permit"`
	// Migrate, if present, notifies the autoscaler-agent that its VM will be migrated away,
	// alongside whatever other information may be useful.
	//
	// Omitted from the JSON when nil.
	Migrate *MigrateResponse `json:"migrate,omitempty"`
}
// MigrateResponse, when provided, is a notification to the autoscaler-agent that its VM will be
// migrated.
//
// After receiving a MigrateResponse, the autoscaler-agent MUST NOT change its resource allocation.
//
// TODO: fill this with more information as required
type MigrateResponse struct{}
// MoreResources holds the data associated with a MoreResourcesRequest
//
// It is a pair of per-resource flags; Not and And combine them field by field.
type MoreResources struct {
	// Cpu is true if the vm-monitor is requesting more CPU
	Cpu bool `json:"cpu"`
	// Memory is true if the vm-monitor is requesting more memory
	Memory bool `json:"memory"`
}

// Not returns the field-wise logical "not" of m
func (m MoreResources) Not() MoreResources {
	// m is a copy (value receiver), so flipping it in place doesn't affect the caller's value
	m.Cpu = !m.Cpu
	m.Memory = !m.Memory
	return m
}

// And returns the field-wise logical "and" of m and cmp
func (m MoreResources) And(cmp MoreResources) MoreResources {
	m.Cpu = m.Cpu && cmp.Cpu
	m.Memory = m.Memory && cmp.Memory
	return m
}
////////////////////////////////////
// Controller <-> Runner Messages //
////////////////////////////////////
// VCPUChange is used to notify the runner that there was a change in its CPUs.
// The runner uses this info to adjust the qemu cgroup.
type VCPUChange struct {
	// VCPUs has no JSON tag, so it is serialized under the field name "VCPUs"
	VCPUs vmapi.MilliCPU
}
// VCPUCgroup is used in the runner to reply to the controller.
// It represents the vCPU usage as controlled by the cgroup.
type VCPUCgroup struct {
	// VCPUs has no JSON tag, so it is serialized under the field name "VCPUs"
	VCPUs vmapi.MilliCPU
}
// RunnerProtoVersion is a similar version type to PluginProtoVersion, but for controller <-> runner
// communications. See the PluginProtoVersion comment for details.
type RunnerProtoVersion uint32

const (
	// RunnerProtoV1 is the only version declared here. Like the other protocol version types, it
	// starts at one, so the zero value does not name a version.
	RunnerProtoV1 RunnerProtoVersion = iota + 1
)

// SupportsCgroupFractionalCPU returns whether v is at least RunnerProtoV1.
func (v RunnerProtoVersion) SupportsCgroupFractionalCPU() bool {
	return v >= RunnerProtoV1
}
////////////////////////////////////
// Agent <-> Monitor Messages //
////////////////////////////////////
// Allocation represents the resources that a VM has been granted.
//
// It is the form of Resources used in agent <-> monitor messages; see
// Resources.ConvertToAllocation.
type Allocation struct {
	// Number of vCPUs
	Cpu float64 `json:"cpu"`
	// Number of bytes
	Mem uint64 `json:"mem"`
}
// ** Types sent by monitor **
// UpscaleRequest is sent to the agent as a way to request immediate upscale.
// Since the monitor cannot control whether the agent will choose to upscale the VM,
// the request itself does not return anything. If an upscale is granted, the agent
// will notify the monitor via an UpscaleNotification.
type UpscaleRequest struct{}
// UpscaleConfirmation is sent to the agent to confirm that the monitor successfully
// upscaled, meaning it increased its filecache and/or cgroup memory limits. The agent
// does not need to respond.
type UpscaleConfirmation struct{}
// DownscaleResult is sent to the agent to indicate if downscaling was successful. The
// agent does not need to respond.
//
// Neither field has a JSON tag, so they are serialized under their Go names.
type DownscaleResult struct {
	// Ok indicates whether downscaling was successful
	Ok bool
	// NOTE(review): presumably a human-readable description of the outcome - confirm against the
	// monitor's implementation
	Status string
}
// ** Types sent by agent **
// UpscaleNotification is sent to the monitor to inform it that it has been granted a greater
// allocation. Once the monitor is done applying this new allocation (i.e, increasing
// file cache size, cgroup memory limits) it should reply with an UpscaleConfirmation.
type UpscaleNotification struct {
	// Granted is the allocation that has been granted
	Granted Allocation `json:"granted"`
}
// DownscaleRequest is sent to the monitor as a request to downscale its resource usage.
// Once the monitor has downscaled or failed to do so, it should respond with a
// DownscaleResult.
type DownscaleRequest struct {
	// Target is the allocation that the monitor is asked to downscale to
	Target Allocation `json:"target"`
}
// ** Types shared by agent and monitor **
// InvalidMessage can be sent by either party whenever they receive a message they
// cannot deserialize properly.
type InvalidMessage struct {
	// Error is the error text, serialized under the key "error"
	Error string `json:"error"`
}
// InternalError can be sent by either party to signal that an error occurred carrying
// out the other party's request, for example, the monitor erroring while trying
// to downscale. The receiving party can then log the error or propagate it as they
// see fit.
type InternalError struct {
	// Error is the error text, serialized under the key "error"
	Error string `json:"error"`
}
// HealthCheck is sent as part of a bidirectional heartbeat between the monitor and
// agent. The check is initiated by the agent.
type HealthCheck struct{}
// SerializeMonitorMessage prepares a message for serialization. Any data passed
// to the monitor should be serialized with this function. As of protocol v1.0,
// the following types may be sent to the monitor, and thus passed in:
//   - DownscaleRequest
//   - UpscaleNotification
//   - InvalidMessage
//   - InternalError
//   - HealthCheck
//
// The result is a JSON object with the message under "content", the name of its type under "type",
// and id under "id". Content of any other type (including pointers to the types above) results in
// an error.
func SerializeMonitorMessage(content any, id uint64) ([]byte, error) {
	var typeName string
	switch content.(type) {
	case DownscaleRequest:
		typeName = "DownscaleRequest"
	case UpscaleNotification:
		typeName = "UpscaleNotification"
	case InvalidMessage:
		typeName = "InvalidMessage"
	case InternalError:
		typeName = "InternalError"
	case HealthCheck:
		typeName = "HealthCheck"
	default:
		return nil, fmt.Errorf("unknown message type \"%s\"", reflect.TypeOf(content))
	}

	// The final value that gets sent over the wire
	envelope := struct {
		Content any    `json:"content"`
		Type    string `json:"type"`
		Id      uint64 `json:"id"`
	}{
		Content: content,
		Type:    typeName,
		Id:      id,
	}
	return json.Marshal(envelope)
}
// MonitorProtoVersion represents a single version of the agent<->monitor protocol
//
// Each version of the agent<->monitor protocol is named independently from releases of the
// repository containing this code. Names follow semver, although this does not necessarily
// guarantee support - for example, the monitor may only support versions above v1.1.
//
// Version compatibility is documented in the neighboring file VERSIONING.md.
type MonitorProtoVersion uint32

const (
	// MonitorProtoV1_0 represents v1.0 of the agent<->monitor protocol - the initial version.
	//
	// Currently the latest version.
	//
	// Note: this is declared without an explicit type, so it is an untyped integer constant
	// (with value 1) rather than a MonitorProtoVersion.
	MonitorProtoV1_0 = iota + 1

	// latestMonitorProtoVersion represents the latest version of the agent<->Monitor protocol
	//
	// This value is kept private because it should not be used externally; any desired
	// functionality that could be implemented with it should instead be a method on
	// MonitorProtoVersion.
	latestMonitorProtoVersion MonitorProtoVersion = iota // excluding +1 makes it equal to previous
)

// String returns the name of the version, e.g. "v1.0".
//
// The zero value is formatted as "<invalid: zero>". Any other value without a name here is
// formatted relative to the latest known version, as "<unknown = LATEST + N>".
func (v MonitorProtoVersion) String() string {
	switch v {
	case 0:
		return "<invalid: zero>"
	case MonitorProtoV1_0:
		return "v1.0"
	}

	// Not a version we have a name for. %v formats the latest version via this same method; %d
	// formats the distance from it as a plain number.
	return fmt.Sprintf("<unknown = %v + %d>", latestMonitorProtoVersion, v-latestMonitorProtoVersion)
}
// MonitorProtocolResponse is sent back by the monitor after figuring out what protocol version we
// should use.
type MonitorProtocolResponse struct {
	// If `Error` is nil, contains the value of the settled on protocol version.
	// Otherwise, will be set to 0 (MonitorProtoVersion's zero value).
	//
	// Because of omitempty, a zero Version is left out of the JSON entirely.
	Version MonitorProtoVersion `json:"version,omitempty"`
	// Will be nil if no error occurred.
	Error *string `json:"error,omitempty"`
}
package api
// Generic version handling
import (
"fmt"
"golang.org/x/exp/constraints"
)
// VersionRange is a helper type to represent a range of versions.
//
// The bounds are inclusive, representing all versions v with Min <= v <= Max.
//
// This type is sent directly to the monitor during the creation of a new
// Dispatcher as part of figuring out which protocol to use.
type VersionRange[V constraints.Ordered] struct {
	Min V `json:"min"`
	Max V `json:"max"`
}

// String formats the range as "MIN to MAX", or as just the single version when both bounds are
// equal.
func (r VersionRange[V]) String() string {
	if r.Min != r.Max {
		return fmt.Sprintf("%v to %v", r.Min, r.Max)
	}
	return fmt.Sprintf("%v", r.Min)
}

// LatestSharedVersion returns the latest version covered by both VersionRanges, if there is one.
//
// If either range is invalid, or no such version exists (i.e. the ranges are disjoint), then the
// returned values will be (0, false).
func (r VersionRange[V]) LatestSharedVersion(cmp VersionRange[V]) (_ V, ok bool) {
	// The overlap of the two ranges, if any, goes from the greater of the lower bounds to the
	// lesser of the upper bounds.
	lowest := max(r.Min, cmp.Min)
	highest := min(r.Max, cmp.Max)

	if highest >= lowest {
		return highest, true
	}

	var none V
	return none, false
}
// API-relevant types extracted from NeonVM VMs
package api
import (
"encoding/json"
"errors"
"fmt"
"github.com/samber/lo"
"github.com/tychoish/fun/erc"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/util"
)
// Keys of the labels and annotations on VM objects that control autoscaling behavior.
//
// The Label* values are label keys; the helpers below treat a label as enabled only when its
// value is exactly "true". The Annotation* values are annotation keys.
const (
	// LabelEnableAutoMigration is checked by HasAutoMigrationEnabled
	LabelEnableAutoMigration = "autoscaling.neon.tech/auto-migration-enabled"
	// LabelTestingOnlyAlwaysMigrate is checked by HasAlwaysMigrateLabel
	LabelTestingOnlyAlwaysMigrate = "autoscaling.neon.tech/testing-only-always-migrate"
	// LabelEnableAutoscaling is checked by HasAutoscalingEnabled
	LabelEnableAutoscaling = "autoscaling.neon.tech/enabled"
	// AnnotationAutoscalingBounds is the annotation holding the JSON-encoded ScalingBounds
	AnnotationAutoscalingBounds = "autoscaling.neon.tech/bounds"
	AnnotationAutoscalingConfig = "autoscaling.neon.tech/config"
	AnnotationBillingEndpointID = "autoscaling.neon.tech/billing-endpoint-id"
)
// hasTrueLabel returns true iff the object has a label named labelName whose value is exactly
// "true".
func hasTrueLabel(obj metav1.ObjectMetaAccessor, labelName string) bool {
	// a missing label (or a nil label map) yields "", which is not "true"
	return obj.GetObjectMeta().GetLabels()[labelName] == "true"
}
// HasAutoscalingEnabled returns true iff the object has the label that enables autoscaling
// (LabelEnableAutoscaling), and it's set to "true"
func HasAutoscalingEnabled(obj metav1.ObjectMetaAccessor) bool {
	return hasTrueLabel(obj, LabelEnableAutoscaling)
}
// HasAutoMigrationEnabled returns true iff the object has the label that enables "automatic"
// scheduler-triggered migration (LabelEnableAutoMigration), and it's set to "true"
func HasAutoMigrationEnabled(obj metav1.ObjectMetaAccessor) bool {
	return hasTrueLabel(obj, LabelEnableAutoMigration)
}
// HasAlwaysMigrateLabel returns true iff the object has the testing-only "always migrate" label
// (LabelTestingOnlyAlwaysMigrate), and it's set to "true"
func HasAlwaysMigrateLabel(obj metav1.ObjectMetaAccessor) bool {
	return hasTrueLabel(obj, LabelTestingOnlyAlwaysMigrate)
}
// VmInfo is the subset of vmapi.VirtualMachineSpec that the scheduler plugin and autoscaler agent
// care about. It takes various labels and annotations into account, so certain fields might be
// different from what's strictly in the VirtualMachine object.
type VmInfo struct {
	Name string `json:"name"`
	Namespace string `json:"namespace"`
	// Cpu gives the CPU amounts for the VM; see VmCpuInfo
	Cpu VmCpuInfo `json:"cpu"`
	// Mem gives the memory amounts for the VM, in memory slots, plus the slot size; see VmMemInfo
	Mem VmMemInfo `json:"mem"`
	// Config holds the autoscaling-specific configuration for the VM; see VmConfig
	Config VmConfig `json:"config"`
	// CurrentRevision is left out of the JSON when nil
	CurrentRevision *vmapi.RevisionWithTime `json:"currentRevision,omitempty"`
}
// VmCpuInfo is the CPU part of VmInfo: the scaling bounds and the current allocation.
type VmCpuInfo struct {
// Min is the lower CPU bound for the VM
Min vmapi.MilliCPU `json:"min"`
// Max is the upper CPU bound for the VM
Max vmapi.MilliCPU `json:"max"`
// Use is the amount of CPU the VM is currently using, as reported by VmInfo.Using
Use vmapi.MilliCPU `json:"use"`
}
// NewVmCpuInfo builds a VmCpuInfo by copying the Min, Max and Use values out of cpus.
func NewVmCpuInfo(cpus vmapi.CPUs) VmCpuInfo {
	var info VmCpuInfo
	info.Min, info.Max, info.Use = cpus.Min, cpus.Max, cpus.Use
	return info
}
// VmMemInfo is the memory part of VmInfo. Bounds and usage are counted in memory slots; multiply
// by SlotSize to get an amount of memory (as done by VmInfo.Min, VmInfo.Max and VmInfo.Using).
type VmMemInfo struct {
// Min is the minimum number of memory slots available
Min uint16 `json:"min"`
// Max is the maximum number of memory slots available
Max uint16 `json:"max"`
// Use is the number of memory slots currently plugged in the VM
Use uint16 `json:"use"`
// SlotSize is the size of a single memory slot
SlotSize Bytes `json:"slotSize"`
}
// NewVmMemInfo builds a VmMemInfo from the VM's memory slot counts and the size of one slot.
//
// The slot counts are converted to uint16 and the slot size to Bytes.
func NewVmMemInfo(memSlots vmapi.MemorySlots, memSlotSize resource.Quantity) VmMemInfo {
	var info VmMemInfo
	info.Min = uint16(memSlots.Min)
	info.Max = uint16(memSlots.Max)
	info.Use = uint16(memSlots.Use)
	info.SlotSize = Bytes(memSlotSize.Value())
	return info
}
// VmConfig stores the autoscaling-specific "extra" configuration derived from labels and
// annotations on the VM object.
//
// This is separate from the bounds information stored in VmInfo (even though that's also derived
// from annotations), because VmConfig is meant to store values that either qualitatively change the
// handling for a VM (e.g., AutoMigrationEnabled) or are expected to largely be the same for most VMs
// (e.g., ScalingConfig).
type VmConfig struct {
// AutoMigrationEnabled indicates to the scheduler plugin that it's allowed to trigger migration
// for this VM. This defaults to false because otherwise we might disrupt VMs that don't have
// adequate networking support to preserve connections across live migration.
AutoMigrationEnabled bool `json:"autoMigrationEnabled"`
// AlwaysMigrate is a test-only debugging flag that, if present in the VM's labels, will always
// prompt it to migrate, regardless of whether the VM actually *needs* to.
AlwaysMigrate bool `json:"alwaysMigrate"`
// ScalingEnabled is true iff the object has the LabelEnableAutoscaling label set to "true".
ScalingEnabled bool `json:"scalingEnabled"`
// ScalingConfig holds the per-VM overrides parsed from the AnnotationAutoscalingConfig
// annotation. It is nil when the annotation is absent.
ScalingConfig *ScalingConfig `json:"scalingConfig,omitempty"`
}
// Using returns the Resources this VmInfo reports as currently in use: the CPU in use, and the
// number of memory slots in use multiplied by the slot size.
func (vm VmInfo) Using() Resources {
	memInUse := vm.Mem.SlotSize * Bytes(vm.Mem.Use)
	return Resources{VCPU: vm.Cpu.Use, Mem: memInUse}
}
// SetUsing updates vm.Cpu.Use and vm.Mem.Use from r. The memory amount is converted into a number
// of slots by dividing it by vm.Mem.SlotSize.
func (vm *VmInfo) SetUsing(r Resources) {
	slotsInUse := r.Mem / vm.Mem.SlotSize

	vm.Cpu.Use = r.VCPU
	vm.Mem.Use = uint16(slotsInUse)
}
// Min returns the Resources that make up the lower bound for this VM: the minimum CPU, and the
// minimum number of memory slots multiplied by the slot size.
func (vm VmInfo) Min() Resources {
	minMem := vm.Mem.SlotSize * Bytes(vm.Mem.Min)
	return Resources{VCPU: vm.Cpu.Min, Mem: minMem}
}
// Max returns the Resources that make up the upper bound for this VM: the maximum CPU, and the
// maximum number of memory slots multiplied by the slot size.
func (vm VmInfo) Max() Resources {
	maxMem := vm.Mem.SlotSize * Bytes(vm.Mem.Max)
	return Resources{VCPU: vm.Cpu.Max, Mem: maxMem}
}
// NamespacedName returns the VM's namespace and name bundled as a util.NamespacedName.
func (vm VmInfo) NamespacedName() util.NamespacedName {
	var name util.NamespacedName
	name.Namespace, name.Name = vm.Namespace, vm.Name
	return name
}
// ExtractVmInfo builds a VmInfo from a VirtualMachine object, using the resources from its spec
// and the labels/annotations on the object itself.
//
// On success, CurrentRevision is filled in from the VM's status. Any failure from the shared
// extraction logic is returned wrapped.
func ExtractVmInfo(logger *zap.Logger, vm *vmapi.VirtualMachine) (*VmInfo, error) {
	vmLogger := logger.With(util.VMNameFields(vm))

	extracted, err := extractVmInfoGeneric(vmLogger, vm.Name, vm, vm.Spec.Resources())
	if err != nil {
		return nil, fmt.Errorf("error extracting VM info: %w", err)
	}

	extracted.CurrentRevision = vm.Status.CurrentRevision
	return extracted, nil
}
// ExtractVmInfoFromPod builds a VmInfo from a VM's pod.
//
// The resources are decoded from the JSON stored in the pod's
// vmapi.VirtualMachineResourcesAnnotation annotation, and the VM name comes from the pod's
// vmapi.VirtualMachineNameLabel label. Labels and annotations are read from the pod itself.
func ExtractVmInfoFromPod(logger *zap.Logger, pod *corev1.Pod) (*VmInfo, error) {
	podLogger := logger.With(util.PodNameFields(pod))

	var resources vmapi.VirtualMachineResources
	rawResources := []byte(pod.Annotations[vmapi.VirtualMachineResourcesAnnotation])
	if err := json.Unmarshal(rawResources, &resources); err != nil {
		return nil, fmt.Errorf("Error unmarshaling %q: %w",
			vmapi.VirtualMachineResourcesAnnotation, err)
	}

	return extractVmInfoGeneric(podLogger, pod.Labels[vmapi.VirtualMachineNameLabel], pod, resources)
}
// extractVmInfoGeneric is the shared implementation behind ExtractVmInfo and ExtractVmInfoFromPod.
//
// It combines the given resources with the labels and annotations found on obj:
//   - the migration/autoscaling labels populate VmInfo.Config;
//   - AnnotationAutoscalingBounds, if present, is parsed, validated and replaces the CPU/memory bounds;
//   - AnnotationAutoscalingConfig, if present, is parsed, validated and stored as the scaling config.
//
// An error is returned when an annotation cannot be parsed or validated, when the resulting
// minimum or maximum is not reasonably sized, or when the minimum exceeds the maximum. Current
// usage outside of [min, max] only produces a warning in the log.
func extractVmInfoGeneric(
	logger *zap.Logger,
	vmName string,
	obj metav1.ObjectMetaAccessor,
	resources vmapi.VirtualMachineResources,
) (*VmInfo, error) {
	meta := obj.GetObjectMeta()
	annotations := meta.GetAnnotations()

	info := VmInfo{
		Name:      vmName,
		Namespace: meta.GetNamespace(),
		Cpu:       NewVmCpuInfo(resources.CPUs),
		Mem:       NewVmMemInfo(resources.MemorySlots, resources.MemorySlotSize),
		Config: VmConfig{
			AutoMigrationEnabled: HasAutoMigrationEnabled(obj),
			AlwaysMigrate:        HasAlwaysMigrateLabel(obj),
			ScalingEnabled:       HasAutoscalingEnabled(obj),
			ScalingConfig:        nil, // filled in below if the config annotation exists
		},
		CurrentRevision: nil, // left for the caller to fill in
	}

	// Bounds annotation: overrides the min/max taken from the resources.
	if rawBounds, found := annotations[AnnotationAutoscalingBounds]; found {
		var bounds ScalingBounds
		err := json.Unmarshal([]byte(rawBounds), &bounds)
		if err != nil {
			return nil, fmt.Errorf("Error unmarshaling annotation %q: %w", AnnotationAutoscalingBounds, err)
		}
		if err = bounds.Validate(&resources.MemorySlotSize); err != nil {
			return nil, fmt.Errorf("Bad scaling bounds in annotation %q: %w", AnnotationAutoscalingBounds, err)
		}
		info.applyBounds(bounds)
	}

	// Config annotation: per-VM overrides of the scaling configuration.
	if rawConfig, found := annotations[AnnotationAutoscalingConfig]; found {
		var overrides ScalingConfig
		err := json.Unmarshal([]byte(rawConfig), &overrides)
		if err != nil {
			return nil, fmt.Errorf("Error unmarshaling annotation %q: %w", AnnotationAutoscalingConfig, err)
		}
		if err = overrides.ValidateOverrides(); err != nil {
			return nil, fmt.Errorf("Bad scaling config in annotation %q: %w", AnnotationAutoscalingConfig, err)
		}
		info.Config.ScalingConfig = &overrides
	}

	lower, current, upper := info.Min(), info.Using(), info.Max()

	// resource.Quantity can't be validated with kubebuilder, so the size checks happen here.
	if err := lower.CheckValuesAreReasonablySized(); err != nil {
		return nil, fmt.Errorf("min resources are invalid: %w", err)
	}
	if err := upper.CheckValuesAreReasonablySized(); err != nil {
		return nil, fmt.Errorf("max resources are invalid: %w", err)
	}

	// min <= max is a hard requirement.
	if lower.HasFieldGreaterThan(upper) {
		return nil, fmt.Errorf("min resources %+v has field greater than maximum %+v", lower, upper)
	}

	// min <= using <= max is only reported, never rejected.
	switch {
	case current.HasFieldLessThan(lower):
		logger.Warn(
			"Current usage has field less than minimum",
			zap.Object("using", current), zap.Object("min", lower),
		)
	case current.HasFieldGreaterThan(upper):
		logger.Warn(
			"Current usage has field greater than maximum",
			zap.Object("using", current), zap.Object("max", upper),
		)
	}

	return &info, nil
}
// EqualScalingBounds reports whether vm and cmp have the same minimum and the same maximum
// Resources, as computed by Min and Max.
func (vm VmInfo) EqualScalingBounds(cmp VmInfo) bool {
	if vm.Min() != cmp.Min() {
		return false
	}
	return vm.Max() == cmp.Max()
}
// applyBounds overwrites the CPU and memory bounds of vm with the ones from b. Memory amounts are
// converted to a number of slots by dividing by vm.Mem.SlotSize.
func (vm *VmInfo) applyBounds(b ScalingBounds) {
	slotSize := vm.Mem.SlotSize
	minMem := BytesFromResourceQuantity(b.Min.Mem)
	maxMem := BytesFromResourceQuantity(b.Max.Mem)

	vm.Cpu.Min = vmapi.MilliCPUFromResourceQuantity(b.Min.CPU)
	vm.Cpu.Max = vmapi.MilliCPUFromResourceQuantity(b.Max.CPU)

	// FIXME: the uint16 conversions are incorrect if b.{Min,Max}.Mem.Value() is greater than
	// (2^16-1) * vm.Mem.SlotSize.
	vm.Mem.Min = uint16(minMem / slotSize)
	vm.Mem.Max = uint16(maxMem / slotSize)
}
// ScalingBounds is the type that we deserialize from the "autoscaling.neon.tech/bounds" annotation
//
// The fields are plain (non-pointer) values, so a field that was left out of the annotation is
// indistinguishable from one explicitly set to zero. Both cases are rejected by Validate, which
// requires every field to be set and non-zero.
type ScalingBounds struct {
// Min is the lower bound on the VM's resources
Min ResourceBounds `json:"min"`
// Max is the upper bound on the VM's resources
Max ResourceBounds `json:"max"`
}
// ResourceBounds is one side (minimum or maximum) of a ScalingBounds: an amount of CPU and an
// amount of memory, each given as a Kubernetes resource quantity.
type ResourceBounds struct {
// CPU must be non-zero to pass validation
CPU resource.Quantity `json:"cpu"`
// Mem must be greater than zero and divisible by the VM's memory slot size to pass validation
Mem resource.Quantity `json:"mem"`
}
// Validate checks that both sides of the ScalingBounds hold reasonable values: every field must
// be set and non-zero, and memory amounts must be a multiple of memSlotSize.
//
// All problems found are collected and returned together; nil means the bounds are valid.
func (b ScalingBounds) Validate(memSlotSize *resource.Quantity) error {
	collector := new(erc.Collector)

	b.Min.validate(collector, ".min", memSlotSize)
	b.Max.validate(collector, ".max", memSlotSize)

	return collector.Resolve()
}
// validate checks a single side of the scaling bounds and records every problem found in ec.
//
// path is the JSON path of b inside the annotation (e.g. ".min"); it is used as a prefix in error
// messages. memSlotSize is the size of one VM memory slot, which b.Mem must be a multiple of.
//
// The checks are:
//   - CPU must be non-zero;
//   - Mem must be greater than zero;
//   - Mem must be divisible by memSlotSize.
//
// A nil, zero or negative memSlotSize is reported as an error instead of being used as a divisor:
// the modulo below would otherwise panic with an integer divide by zero (this can happen when the
// VM resources were decoded from JSON that does not set the slot size).
//
// TODO: This could be made better - see:
// https://github.com/neondatabase/autoscaling/pull/190#discussion_r1169405645
func (b ResourceBounds) validate(ec *erc.Collector, path string, memSlotSize *resource.Quantity) {
	errAt := func(field string, err error) error {
		return fmt.Errorf("error at %s%s: %w", path, field, err)
	}

	if b.CPU.IsZero() {
		ec.Add(errAt(".cpu", errors.New("must be set to a non-zero value")))
	}

	switch {
	case b.Mem.IsZero() || b.Mem.Value() < 0:
		ec.Add(errAt(".mem", errors.New("must be set to a value greater than zero")))
	case memSlotSize == nil || memSlotSize.Value() <= 0:
		// Divisibility can't be checked without a usable slot size; fail validation rather
		// than panic.
		ec.Add(errAt(".mem", errors.New("cannot be validated because VM memory slot size is not greater than zero")))
	case b.Mem.Value()%memSlotSize.Value() != 0:
		ec.Add(errAt(".mem", fmt.Errorf("must be divisible by VM memory slot size %s", memSlotSize)))
	}
}
// ScalingConfig provides bits of configuration for how the autoscaler-agent makes scaling decisions
//
// Every field is a pointer so that "not set" (nil) can be told apart from a zero value: this is
// what allows WithOverrides to replace only the fields that a per-VM override actually sets, and
// what allows validation to report missing required fields.
type ScalingConfig struct {
// LoadAverageFractionTarget sets the desired fraction of current CPU that the load average
// should be. For example, with a value of 0.7, we'd want load average to sit at 0.7 × CPU,
// scaling CPU to make this happen.
//
// When specifying the autoscaler-agent config, this field is required. For an individual VM, if
// this field is left out the settings will fall back on the global default.
LoadAverageFractionTarget *float64 `json:"loadAverageFractionTarget,omitempty"`
// MemoryUsageFractionTarget sets the maximum fraction of total memory that postgres allocations
// (MemoryUsage) must fit into. This doesn't count the LFC memory.
// This memory may also be viewed as "unreclaimable" (contrary to e.g. page cache).
//
// For example, with a value of 0.75 on a 4GiB VM, we will try to upscale if the unreclaimable
// memory usage exceeds 3GiB.
//
// When specifying the autoscaler-agent config, this field is required. For an individual VM, if
// this field is left out the settings will fall back on the global default.
MemoryUsageFractionTarget *float64 `json:"memoryUsageFractionTarget,omitempty"`
// MemoryTotalFractionTarget sets the maximum fraction of total memory that postgres allocations
// PLUS LFC memory (MemoryUsage + MemoryCached) must fit into.
//
// Compared with MemoryUsageFractionTarget, this value can be set higher (e.g. 0.9 vs 0.75),
// because we can tolerate higher fraction of consumption for both in-VM memory consumers.
MemoryTotalFractionTarget *float64 `json:"memoryTotalFractionTarget,omitempty"`
// EnableLFCMetrics, if true, enables fetching additional metrics about the Local File Cache
// (LFC) to provide as input to the scaling algorithm.
//
// When specifying the autoscaler-agent config, this field is required. False is a safe default.
// For an individual VM, if this field is left out the settings will fall back on the global
// default.
EnableLFCMetrics *bool `json:"enableLFCMetrics,omitempty"`
// LFCToMemoryRatio dictates the amount of memory in any given Compute Unit that will be
// allocated to the LFC. For example, if the LFC is sized at 75% of memory, then this value
// would be 0.75.
//
// Validation only checks that this field is present when used as a default; its value is not
// range-checked.
LFCToMemoryRatio *float64 `json:"lfcToMemoryRatio,omitempty"`
// LFCMinWaitBeforeDownscaleMinutes dictates the minimum duration we must wait before lowering
// the goal CU based on LFC working set size.
// For example, a value of 15 means we will not allow downscaling below the working set size
// over the past 15 minutes. This allows us to accommodate spiky workloads without flushing the
// cache every time.
LFCMinWaitBeforeDownscaleMinutes *int `json:"lfcMinWaitBeforeDownscaleMinutes,omitempty"`
// LFCWindowSizeMinutes dictates the minimum duration we must use during internal calculations
// of the rate of increase in LFC working set size.
LFCWindowSizeMinutes *int `json:"lfcWindowSizeMinutes,omitempty"`
}
// WithOverrides produces a copy of defaults in which every field that is set (non-nil) in
// overrides takes the overriding value, while all remaining fields keep the default.
//
// Overriding values are copied, so the result does not share pointers with overrides.
//
// overrides may be nil, in which case defaults is returned as-is.
func (defaults ScalingConfig) WithOverrides(overrides *ScalingConfig) ScalingConfig {
	if overrides == nil {
		return defaults
	}

	if v := overrides.LoadAverageFractionTarget; v != nil {
		defaults.LoadAverageFractionTarget = lo.ToPtr(*v)
	}
	if v := overrides.MemoryUsageFractionTarget; v != nil {
		defaults.MemoryUsageFractionTarget = lo.ToPtr(*v)
	}
	if v := overrides.MemoryTotalFractionTarget; v != nil {
		defaults.MemoryTotalFractionTarget = lo.ToPtr(*v)
	}
	if v := overrides.EnableLFCMetrics; v != nil {
		defaults.EnableLFCMetrics = lo.ToPtr(*v)
	}
	if v := overrides.LFCToMemoryRatio; v != nil {
		defaults.LFCToMemoryRatio = lo.ToPtr(*v)
	}
	if v := overrides.LFCWindowSizeMinutes; v != nil {
		defaults.LFCWindowSizeMinutes = lo.ToPtr(*v)
	}
	if v := overrides.LFCMinWaitBeforeDownscaleMinutes; v != nil {
		defaults.LFCMinWaitBeforeDownscaleMinutes = lo.ToPtr(*v)
	}

	return defaults
}
// ValidateDefaults checks that the ScalingConfig is safe to use as default settings.
//
// This is more strict than ValidateOverrides, where some fields need not be specified.
// Refer to the comments on ScalingConfig for more - each field specifies whether it is required,
// and when.
//
// It runs the shared validation with every field treated as required.
func (c *ScalingConfig) ValidateDefaults() error {
return c.validate(true)
}
// ValidateOverrides checks that the ScalingConfig is safe to use to override preexisting settings.
//
// This is less strict than ValidateDefaults, because with ValidateOverrides even required fields
// are optional.
//
// It runs the shared validation without requiring any field to be present; fields that are
// present are still range-checked.
func (c *ScalingConfig) ValidateOverrides() error {
return c.validate(false)
}
// validate is the shared implementation of ValidateDefaults and ValidateOverrides.
//
// Fields that are set are range-checked:
//   - loadAverageFractionTarget must be in [0, 2) - the upper bound isn't *strictly* needed, but
//     it's a good safety check;
//   - memoryUsageFractionTarget and memoryTotalFractionTarget must be in [0, 1).
//
// When requireAll is true, every field that is left unset is additionally reported as missing.
// All problems are collected and returned together.
func (c *ScalingConfig) validate(requireAll bool) error {
	ec := &erc.Collector{}

	// checkFraction validates one fraction field: value must be in [0, upperLimit) when set, and
	// must be set when requireAll is true. upperLimitFormat is the message for the upper bound.
	checkFraction := func(field string, value *float64, upperLimit float64, upperLimitFormat string) {
		switch {
		case value != nil:
			erc.Whenf(ec, *value < 0.0, "%s must be set to value >= 0", field)
			erc.Whenf(ec, *value >= upperLimit, upperLimitFormat, field)
		case requireAll:
			ec.Add(fmt.Errorf("%s is a required field", field))
		}
	}

	checkFraction(".loadAverageFractionTarget", c.LoadAverageFractionTarget, 2.0, "%s must be set to value < 2 ")
	checkFraction(".memoryUsageFractionTarget", c.MemoryUsageFractionTarget, 1.0, "%s must be set to value < 1 ")
	checkFraction(".memoryTotalFractionTarget", c.MemoryTotalFractionTarget, 1.0, "%s must be set to value < 1 ")

	if requireAll {
		// The remaining fields are only checked for presence.
		required := []struct {
			missing bool
			field   string
		}{
			{c.EnableLFCMetrics == nil, ".enableLFCMetrics"},
			{c.LFCToMemoryRatio == nil, ".lfcToMemoryRatio"},
			{c.LFCWindowSizeMinutes == nil, ".lfcWindowSizeMinutes"},
			{c.LFCMinWaitBeforeDownscaleMinutes == nil, ".lfcMinWaitBeforeDownscaleMinutes"},
		}
		for _, r := range required {
			erc.Whenf(ec, r.missing, "%s is a required field", r.field)
		}
	}

	// heads-up! some functions elsewhere depend on the concrete return type of this function.
	return ec.Resolve()
}
package billing
import (
"context"
"fmt"
"github.com/Azure/azure-sdk-for-go/sdk/azcore"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/policy"
"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
)
// AzureAuthSharedKey holds a storage account name together with its key.
//
// NOTE(review): nothing in this file references this type (the default client authenticates with
// azidentity.NewDefaultAzureCredential); presumably it is meant for shared-key authentication -
// confirm against callers.
type AzureAuthSharedKey struct {
AccountName string `json:"accountName"`
AccountKey string `json:"accountKey"`
}
// AzureBlobStorageClientConfig configures an AzureClient created by NewAzureBlobStorageClient.
type AzureBlobStorageClientConfig struct {
// In Azure a Container is close to a bucket in AWS S3
Container string `json:"container"`
// Files will be created with name starting with PrefixInContainer
PrefixInContainer string `json:"prefixInContainer"`
// Example Endpoint: "https://MYSTORAGEACCOUNT.blob.core.windows.net/"
Endpoint string `json:"endpoint"`
//
// Unexported attributes follow this comment.
//
// Use generateKey for tests.
// Otherwise, keep empty.
// When nil, NewAzureBlobStorageClient installs a default based on PrefixInContainer.
generateKey func() string
// Use getClient for tests.
// Otherwise keep empty.
// When nil, NewAzureBlobStorageClient installs a default that connects to Endpoint.
getClient func() (*azblob.Client, error)
}
// AzureError wraps an error that came from talking to Azure Blob Storage.
type AzureError struct {
	// Err is the underlying error; it must not be nil.
	Err error
}

// Error implements the error interface, prefixing the underlying message.
func (e AzureError) Error() string {
	cause := e.Err.Error()
	return fmt.Sprintf("Azure Blob error: %s", cause)
}

// Unwrap exposes the underlying error to errors.Is and errors.As.
func (e AzureError) Unwrap() error {
	return e.Err
}
// AzureClient uploads billing payloads to Azure Blob Storage. It has the LogFields and send
// methods required by the Client interface. Create it with NewAzureBlobStorageClient.
type AzureClient struct {
// cfg is the configuration the client was created with, including the key generator
cfg AzureBlobStorageClientConfig
// c is the underlying Azure SDK client used for uploads
c *azblob.Client
}
// LogFields returns the client's container, prefix and endpoint as inline log fields.
func (c AzureClient) LogFields() zap.Field {
	marshal := func(enc zapcore.ObjectEncoder) error {
		enc.AddString("container", c.cfg.Container)
		enc.AddString("prefixInContainer", c.cfg.PrefixInContainer)
		enc.AddString("endpoint", c.cfg.Endpoint)
		return nil
	}
	return zap.Inline(zapcore.ObjectMarshalerFunc(marshal))
}
// generateKey returns the blob name to use for the next upload, by calling the generator stored
// in the client's config. The generator must be non-nil; NewAzureBlobStorageClient guarantees
// this by installing a default when none was provided.
func (c AzureClient) generateKey() string {
return c.cfg.generateKey()
}
// send gzip-compresses payload and uploads it to the configured container under a freshly
// generated key. The trace ID is not used.
//
// A compression failure is returned as-is; an upload failure is returned as an AzureError.
func (c AzureClient) send(ctx context.Context, payload []byte, _ TraceID) error {
	compressed, err := compress(payload)
	if err != nil {
		return err
	}

	blobName := c.generateKey()
	_, err = c.c.UploadBuffer(ctx, c.cfg.Container, blobName, compressed,
		&azblob.UploadBufferOptions{}, //nolint:exhaustruct // It's part of Azure SDK
	)
	return handleAzureError(err)
}
// defaultGenerateKey returns the key generator used outside of tests: every call produces a new
// key from keyTemplate, rooted at the config's PrefixInContainer.
func defaultGenerateKey(cfg AzureBlobStorageClientConfig) func() string {
	prefix := cfg.PrefixInContainer
	return func() string {
		return keyTemplate(prefix)
	}
}
// defaultGetClient returns the client factory used outside of tests.
//
// The factory authenticates with the default Azure credential chain and creates a blob client for
// cfg.Endpoint. A failure to obtain the credential is returned as-is; a failure to create the
// client is returned as an AzureError value - the same (non-pointer) form that handleAzureError
// produces, so that errors.As with an AzureError target matches errors from both places.
func defaultGetClient(cfg AzureBlobStorageClientConfig) func() (*azblob.Client, error) {
	return func() (*azblob.Client, error) {
		//nolint:exhaustruct // It's part of Azure SDK
		clientOptions := &azblob.ClientOptions{
			ClientOptions: azcore.ClientOptions{
				Telemetry: policy.TelemetryOptions{ApplicationID: "neon-autoscaler"},
			},
		}

		credential, err := azidentity.NewDefaultAzureCredential(nil)
		if err != nil {
			return nil, err
		}

		client, err := azblob.NewClient(cfg.Endpoint, credential, clientOptions)
		if err != nil {
			return nil, AzureError{Err: err}
		}
		return client, nil
	}
}
// NewAzureBlobStorageClient creates an AzureClient from cfg.
//
// The unexported generateKey and getClient hooks are replaced by their defaults when nil, and the
// underlying blob client is then obtained from getClient. Its error, if any, is returned unchanged.
func NewAzureBlobStorageClient(cfg AzureBlobStorageClientConfig) (*AzureClient, error) {
	if cfg.generateKey == nil {
		cfg.generateKey = defaultGenerateKey(cfg)
	}
	if cfg.getClient == nil {
		cfg.getClient = defaultGetClient(cfg)
	}

	blobClient, err := cfg.getClient()
	if err != nil {
		return nil, err
	}

	result := &AzureClient{cfg: cfg, c: blobClient}
	return result, nil
}
// handleAzureError wraps a non-nil err into an AzureError, and passes nil through as nil.
func handleAzureError(err error) error {
	if err != nil {
		return AzureError{Err: err}
	}
	return nil
}
package billing
import (
"bytes"
"compress/gzip"
"context"
"encoding/json"
"fmt"
"math/rand"
"net/http"
"os"
"time"
awsconfig "github.com/aws/aws-sdk-go-v2/config"
"github.com/aws/aws-sdk-go-v2/service/s3"
"github.com/lithammer/shortuuid"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
)
// hostname is the name of this host, determined once in init.
var hostname string

// init resolves the hostname. If the OS can't provide it, a placeholder of the form
// "unknown-<n>" with a random n below 1000 is used instead.
func init() {
	name, err := os.Hostname()
	if err != nil {
		name = fmt.Sprintf("unknown-%d", rand.Intn(1000))
	}
	hostname = name
}

// GetHostname returns the hostname to be used for enriching billing events (see Enrich())
//
// This function MUST NOT be run before init has finished.
func GetHostname() string {
	return hostname
}
// Client is a destination that billing events can be sent to via Send.
//
// Because send is unexported, only types in this package can implement it; in this file those
// are HTTPClient, S3Client and AzureClient.
type Client interface {
// LogFields returns log fields describing where this client sends events
LogFields() zap.Field
// send delivers an already-serialized batch of events
send(ctx context.Context, payload []byte, traceID TraceID) error
}
// TraceID identifies a single attempt to send events. HTTPClient forwards it in the "x-trace-id"
// request header; the S3 and Azure clients ignore it.
type TraceID string
// GenerateTraceID returns a new TraceID made from a freshly generated short UUID.
func GenerateTraceID() TraceID {
return TraceID(shortuuid.New())
}
// HTTPClient sends billing events to an HTTP endpoint.
type HTTPClient struct {
	// URL is the full address that events are POSTed to
	URL string
	// httpc is the HTTP client used to make the requests
	httpc *http.Client
}

// NewHTTPClient creates an HTTPClient that posts to the "/usage_events" path under url, using c
// to make the requests.
func NewHTTPClient(url string, c *http.Client) HTTPClient {
	var client HTTPClient
	client.URL = fmt.Sprintf("%s/usage_events", url)
	client.httpc = c
	return client
}
// send POSTs payload as JSON to the client's URL, passing traceID in the "x-trace-id" header.
//
// Failing to build or perform the request yields a RequestError; any response status other than
// 200 OK yields an UnexpectedStatusCodeError.
func (c HTTPClient) send(ctx context.Context, payload []byte, traceID TraceID) error {
	body := bytes.NewReader(payload)
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.URL, body)
	if err != nil {
		return RequestError{Err: err}
	}
	req.Header.Set("x-trace-id", string(traceID))
	req.Header.Set("content-type", "application/json")

	resp, err := c.httpc.Do(req)
	if err != nil {
		return RequestError{Err: err}
	}
	defer resp.Body.Close()

	// Retrying is deliberately not done here; if it's ever wanted, it belongs in an HTTP
	// handler that retries, rather than in this method.
	if status := resp.StatusCode; status != http.StatusOK {
		return UnexpectedStatusCodeError{StatusCode: status}
	}
	return nil
}
// LogFields returns a single "url" log field holding the address that events are POSTed to.
func (c HTTPClient) LogFields() zap.Field {
return zap.String("url", c.URL)
}
// S3ClientConfig configures an S3Client created by NewS3Client.
type S3ClientConfig struct {
// Bucket is the bucket that objects are uploaded to
Bucket string `json:"bucket"`
// Region is the AWS region used when loading the SDK configuration
Region string `json:"region"`
// PrefixInBucket is the prefix that every generated object key starts with
PrefixInBucket string `json:"prefixInBucket"`
// Endpoint, if non-empty, overrides the base endpoint of the S3 client
Endpoint string `json:"endpoint"`
}
// S3Client uploads billing payloads to an S3 bucket. It has the LogFields and send methods
// required by the Client interface. Create it with NewS3Client.
type S3Client struct {
// cfg is the configuration the client was created with
cfg S3ClientConfig
// client is the underlying AWS SDK client used for uploads
client *s3.Client
}
// S3Error wraps an error that occurred while setting up or using the S3 client.
type S3Error struct {
	// Err is the underlying error; it must not be nil.
	Err error
}

// Error implements the error interface, prefixing the underlying message.
func (e S3Error) Error() string {
	cause := e.Err.Error()
	return fmt.Sprintf("S3 error: %s", cause)
}

// Unwrap exposes the underlying error to errors.Is and errors.As.
func (e S3Error) Unwrap() error {
	return e.Err
}
// NewS3Client creates an S3Client from cfg.
//
// Loading the AWS configuration is limited to 5 seconds, in case it performs hidden IO. The
// resulting client always uses path-style addressing (required for minio), and uses cfg.Endpoint
// as its base endpoint when that is non-empty. A configuration failure is returned as an S3Error.
func NewS3Client(ctx context.Context, cfg S3ClientConfig) (*S3Client, error) {
	loadCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()

	awsCfg, err := awsconfig.LoadDefaultConfig(loadCtx, awsconfig.WithRegion(cfg.Region))
	if err != nil {
		return nil, S3Error{Err: err}
	}

	applyOptions := func(o *s3.Options) {
		o.UsePathStyle = true // required for minio
		if cfg.Endpoint != "" {
			o.BaseEndpoint = &cfg.Endpoint
		}
	}

	return &S3Client{
		cfg:    cfg,
		client: s3.NewFromConfig(awsCfg, applyOptions),
	}, nil
}
// keyTemplate generates a new, unique object key under prefix, partitioned by date.
//
// The key is made of the prefix, the current date split into year=/month=/day= path segments, the
// current time of day, and a freshly generated short UUID.
//
// Example: prefixInContainer/year=2021/month=01/day=26/hh:mm:ssZ_{uuid}.ndjson.gz
//
// The time of day carries a literal "Z" suffix, i.e. it claims to be UTC. The current time is
// therefore converted to UTC before formatting, so that both the timestamp and the date
// partitions are correct regardless of the host's local time zone.
func keyTemplate(prefix string) string {
	now := time.Now().UTC()
	id := shortuuid.New()

	return fmt.Sprintf("%s/year=%d/month=%02d/day=%02d/%s_%s.ndjson.gz",
		prefix,
		now.Year(), now.Month(), now.Day(),
		now.Format("15:04:05Z"),
		id,
	)
}
// generateKey returns the object key to use for the next upload: a new key from keyTemplate,
// rooted at the configured PrefixInBucket.
func (c S3Client) generateKey() string {
return keyTemplate(c.cfg.PrefixInBucket)
}
// LogFields returns the client's bucket, prefix, region and endpoint as inline log fields.
func (c S3Client) LogFields() zap.Field {
	marshal := func(enc zapcore.ObjectEncoder) error {
		enc.AddString("bucket", c.cfg.Bucket)
		enc.AddString("prefixInBucket", c.cfg.PrefixInBucket)
		enc.AddString("region", c.cfg.Region)
		enc.AddString("endpoint", c.cfg.Endpoint)
		return nil
	}
	return zap.Inline(zapcore.ObjectMarshalerFunc(marshal))
}
// compress returns payload encoded as a complete gzip stream.
func compress(payload []byte) ([]byte, error) {
	var out bytes.Buffer
	zw := gzip.NewWriter(&out)

	if _, err := zw.Write(payload); err != nil {
		return nil, err
	}
	// The writer has to be closed before the buffer is read, so that the stream is complete.
	if err := zw.Close(); err != nil {
		return nil, err
	}

	return out.Bytes(), nil
}
// send gzip-compresses payload and stores it in the configured bucket under a freshly generated
// key. The trace ID is not used.
//
// Source of truth for the storage format:
// https://github.com/neondatabase/cloud/issues/11199#issuecomment-1992549672
//
// Both compression and upload failures are returned as an S3Error.
func (c S3Client) send(ctx context.Context, payload []byte, _ TraceID) error {
	objectKey := c.generateKey()

	compressed, err := compress(payload)
	if err != nil {
		return S3Error{Err: err}
	}

	input := &s3.PutObjectInput{ //nolint:exhaustruct // AWS SDK
		Bucket: &c.cfg.Bucket,
		Key:    &objectKey,
		Body:   bytes.NewReader(compressed),
	}
	if _, err = c.client.PutObject(ctx, input); err != nil {
		return S3Error{Err: err}
	}
	return nil
}
// Enrich fills in the event's Type and, unless one is already set, its IdempotencyKey, so that
// users of this API don't have to set them manually. The (same) event is returned.
//
// A generated idempotency key consists of the time, the hostname and the event's position in its
// batch.
func Enrich[E Event](now time.Time, hostname string, countInBatch, batchSize int, event E) E {
	event.setType()

	if key := event.getIdempotencyKey(); *key == "" {
		// RFC3339 with microsecond precision: millis could collide, nanos are more than needed.
		// Everything is in UTC, so the offset is left out.
		timestamp := now.UTC().Format("2006-01-02T15:04:05.999999Z")
		*key = fmt.Sprintf("%s-%s-%d/%d", timestamp, hostname, countInBatch, batchSize)
	}

	return event
}
// Send attempts to push the events to the remote endpoint, serialized as a JSON object with a
// single "events" array. Nothing is sent when events is empty.
//
// A serialization failure is returned as a JSONError. Any other error comes from the client's
// send method: RequestError or UnexpectedStatusCodeError for HTTPClient, S3Error for S3Client,
// and AzureError (or a bare compression error) for AzureClient.
func Send[E Event](ctx context.Context, client Client, traceID TraceID, events []E) error {
	if len(events) == 0 {
		return nil
	}

	batch := struct {
		Events []E `json:"events"`
	}{Events: events}

	payload, err := json.Marshal(batch)
	if err != nil {
		return JSONError{Err: err}
	}

	return client.send(ctx, payload, traceID)
}
// JSONError is returned by Send when the events could not be serialized.
type JSONError struct {
	// Err is the underlying error; it must not be nil.
	Err error
}

// Error implements the error interface, prefixing the underlying message.
func (e JSONError) Error() string {
	cause := e.Err.Error()
	return fmt.Sprintf("Error marshaling events: %s", cause)
}

// Unwrap exposes the underlying error to errors.Is and errors.As.
func (e JSONError) Unwrap() error {
	return e.Err
}
// RequestError is returned by HTTPClient when a request could not be built or performed.
type RequestError struct {
	// Err is the underlying error; it must not be nil.
	Err error
}

// Error implements the error interface, prefixing the underlying message.
func (e RequestError) Error() string {
	cause := e.Err.Error()
	return fmt.Sprintf("Error making request: %s", cause)
}

// Unwrap exposes the underlying error to errors.Is and errors.As.
func (e RequestError) Unwrap() error {
	return e.Err
}
// UnexpectedStatusCodeError is returned by HTTPClient when the response status is not 200 OK.
type UnexpectedStatusCodeError struct {
	// StatusCode is the HTTP status code that was received
	StatusCode int
}

// Error implements the error interface, reporting the received status code.
func (e UnexpectedStatusCodeError) Error() string {
	code := e.StatusCode
	return fmt.Sprintf("Unexpected HTTP status code %d", code)
}
package billing
import (
"time"
)
// Event is the type constraint for billing events: it is satisfied by *AbsoluteEvent and
// *IncrementalEvent only, and can only be used as a constraint on type parameters.
type Event interface {
*AbsoluteEvent | *IncrementalEvent
// eventMethods must be separate from Event so that we can assert that *AbsoluteEvent and
// *IncrementalEvent both implement it - Go does not allow converting to a value of type Event
// because it contains "*AbsoluteEvent | *IncrementalEvent", and such constraints can only be
// used inside of generics.
eventMethods
}
// eventMethods is a requirement for Event, but exists separately so that we can assert that the
// event types implement it.
//
// The reason this interface even exists in the first place is because we're not allowed to assume
// that a type E implementing Event actually has the common fields from AbsoluteEvent and
// IncrementalEvent, even though it's constrained to either of those types.
type eventMethods interface {
// setType sets the event's Type field to the value matching its concrete type
setType()
// getIdempotencyKey returns a pointer to the event's IdempotencyKey field, so that the key can
// be both read and set through it
getIdempotencyKey() *string
}
// Compile-time assertions that both event types implement eventMethods.
var (
_ eventMethods = (*AbsoluteEvent)(nil)
_ eventMethods = (*IncrementalEvent)(nil)
)
// AbsoluteEvent is a billing event whose Type is "absolute" once setType has been called (which
// Enrich does).
type AbsoluteEvent struct {
	IdempotencyKey string    `json:"idempotency_key"`
	MetricName     string    `json:"metric"`
	Type           string    `json:"type"`
	TenantID       string    `json:"tenant_id"`
	TimelineID     string    `json:"timeline_id"`
	Time           time.Time `json:"time"`
	Value          int       `json:"value"`
}

// setType implements eventMethods
func (e *AbsoluteEvent) setType() {
	const eventType = "absolute"
	e.Type = eventType
}

// getIdempotencyKey implements eventMethods
func (e *AbsoluteEvent) getIdempotencyKey() *string {
	key := &e.IdempotencyKey
	return key
}
// IncrementalEvent is a billing event whose Type is "incremental" once setType has been called
// (which Enrich does). It covers the span from StartTime to StopTime.
type IncrementalEvent struct {
	IdempotencyKey string    `json:"idempotency_key"`
	MetricName     string    `json:"metric"`
	Type           string    `json:"type"`
	EndpointID     string    `json:"endpoint_id"`
	StartTime      time.Time `json:"start_time"`
	StopTime       time.Time `json:"stop_time"`
	Value          int       `json:"value"`
}

// setType implements eventMethods
func (e *IncrementalEvent) setType() {
	const eventType = "incremental"
	e.Type = eventType
}

// getIdempotencyKey implements eventMethods
func (e *IncrementalEvent) getIdempotencyKey() *string {
	key := &e.IdempotencyKey
	return key
}
package plugin
import (
"encoding/json"
"errors"
"fmt"
"os"
"golang.org/x/exp/slices"
"k8s.io/apimachinery/pkg/api/resource"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
)
//////////////////
// CONFIG TYPES //
//////////////////
// Config is the scheduler plugin's configuration, as decoded from its JSON config file.
type Config struct {
// NodeConfig defines our policies around node resources and scoring
NodeConfig nodeConfig `json:"nodeConfig"`
// SchedulerName informs the scheduler of its name, so that it can identify pods that a previous
// version handled.
SchedulerName string `json:"schedulerName"`
// EventQueueWorkers sets the number of worker threads responsible for handling items from the
// event queue.
EventQueueWorkers int `json:"eventQueueWorkers"`
// StartupEventHandlingTimeoutSeconds gives the maximum duration, in seconds, that we are
// allowed to wait to finish handling all of the initial events generated by reading the cluster
// state on startup.
//
// If event processing takes longer than this time, then plugin creation will fail, and the
// scheduler pod will retry.
StartupEventHandlingTimeoutSeconds int `json:"startupEventHandlingTimeoutSeconds"`
// RandomizeScores, if true, will cause the scheduler to score a node with a random number in
// the range [minScore + 1, trueScore], instead of the trueScore
RandomizeScores bool `json:"randomizeScores"`
// MigrationDeletionRetrySeconds gives the duration, in seconds, we should wait between retrying
// a failed attempt to delete a VirtualMachineMigration that's finished.
MigrationDeletionRetrySeconds uint `json:"migrationDeletionRetrySeconds"`
// DoMigration, if provided, allows VM migration to be disabled
//
// This flag is intended to be temporary, just until NeonVM supports migrations and we can
// re-enable it.
//
// When not provided, migration is enabled.
DoMigration *bool `json:"doMigration"`
// K8sNodeGroupLabel, if provided, gives the label to use when recording k8s node groups in the
// metrics (like for autoscaling_plugin_node_{cpu,mem}_resources_current)
K8sNodeGroupLabel string `json:"k8sNodeGroupLabel"`
// K8sAvailabilityZoneLabel, if provided, gives the label to use when recording nodes'
// availability zones in the metrics (like for autoscaling_plugin_node_{cpu,mem}_resources_current)
K8sAvailabilityZoneLabel string `json:"k8sAvailabilityZoneLabel"`
// IgnoreNamespaces, if provided, gives a list of namespaces that the plugin should completely
// ignore, as if pods from those namespaces do not exist.
//
// This is specifically designed for our "overprovisioning" namespace, which creates paused pods
// to trigger cluster-autoscaler.
//
// The only exception to this rule is during Filter method calls, where we do still count the
// resources from such pods. The reason to do that is so that these overprovisioning pods can be
// evicted, which will allow cluster-autoscaler to trigger scale-up.
IgnoreNamespaces []string `json:"ignoreNamespaces"`
// DumpState, if provided, enables a server to dump internal state
DumpState *dumpStateConfig `json:"dumpState"`
// JSONString is the JSON string that was used to generate this config struct
JSONString string `json:"-"`
}
// nodeConfig defines the policies around node resources (per-resource limits) and node scoring.
// All three scoring parameters must be between 0 and 1, inclusive.
type nodeConfig struct {
// Cpu configures how much of a node's CPU may be allocated to VMs
Cpu resourceConfig `json:"cpu"`
// Memory configures how much of a node's memory may be allocated to VMs
Memory resourceConfig `json:"memory"`
// Details about node scoring:
// See also: https://www.desmos.com/calculator/wg8s0yn63s
// In the desmos, the value f(x,s) gives the score (from 0 to 1) of a node that's x amount full
// (where x is a fraction from 0 to 1), with a total size that is equal to the maximum size node
// times s (i.e. s (or: "scale") gives the ratio between this nodes's size and the biggest one).
// MinUsageScore gives the ratio of the score at the minimum usage (i.e. 0) relative to the
// score at the midpoint, which will have the maximum.
//
// This corresponds to y₀ in the desmos link above.
MinUsageScore float64 `json:"minUsageScore"`
// MaxUsageScore gives the ratio of the score at the maximum usage (i.e. full) relative to the
// score at the midpoint, which will have the maximum.
//
// This corresponds to y₁ in the desmos link above.
MaxUsageScore float64 `json:"maxUsageScore"`
// ScorePeak gives the fraction at which the "target" or highest score should be, with the score
// sloping down on either side towards MinUsageScore at 0 and MaxUsageScore at 1.
//
// This corresponds to xₚ in the desmos link.
ScorePeak float64 `json:"scorePeak"`
}
// resourceConfig configures the amount of a particular resource we're willing to allocate to VMs,
// via the soft limit (Watermark).
//
// NOTE(review): this comment used to also mention a hard limit configured "via System", but the
// struct has no such field - confirm where the hard limit comes from.
type resourceConfig struct {
// Watermark is the fraction of non-system resource allocation above which we should be
// migrating VMs away to reduce usage
//
// Validation requires the value to be greater than 0 and at most 1.
//
// NOTE(review): this comment used to say that an empty watermark is set equal to the "hard"
// limit from system resources. Despite the omitempty tag, validation rejects a value of 0, so
// leaving it out does not pass validation - confirm which behavior is intended.
//
// The word "watermark" was originally used by @zoete as a temporary stand-in term during a
// meeting, and so it has intentionally been made permanent to spite the concept of "temporary" 😛
Watermark float32 `json:"watermark,omitempty"`
}
// migrationEnabled reports whether VM migration is allowed: it is, unless DoMigration is
// explicitly set to false.
func (c *Config) migrationEnabled() bool {
	if c.DoMigration == nil {
		return true
	}
	return *c.DoMigration
}
///////////////////////
// CONFIG VALIDATION //
///////////////////////
// validate checks the config and stops at the first problem found.
//
// If the returned error is not nil, the string is a JSON path to the invalid value; otherwise the
// string is empty.
func (c *Config) validate() (string, error) {
	if subPath, err := c.NodeConfig.validate(); err != nil {
		return fmt.Sprintf("nodeConfig.%s", subPath), err
	}

	switch {
	case c.SchedulerName == "":
		return "schedulerName", errors.New("string cannot be empty")
	case c.EventQueueWorkers <= 0:
		return "eventQueueWorkers", errors.New("value must be > 0")
	case c.StartupEventHandlingTimeoutSeconds <= 0:
		return "startupEventHandlingTimeoutSeconds", errors.New("value must be > 0")
	}

	// DumpState is optional, and only validated when provided.
	if c.DumpState != nil {
		if subPath, err := c.DumpState.validate(); err != nil {
			return fmt.Sprintf("dumpState.%s", subPath), err
		}
	}

	if c.MigrationDeletionRetrySeconds == 0 {
		return "migrationDeletionRetrySeconds", errors.New("value must be > 0")
	}

	return "", nil
}
// validate checks the per-node config: first the cpu and memory resource configs, then that each
// of the scoring parameters is a fraction in [0, 1].
//
// If the returned error is not nil, the string is the JSON path (relative to this object) of the
// invalid value.
func (c *nodeConfig) validate() (string, error) {
	cpuPath, cpuErr := c.Cpu.validate()
	if cpuErr != nil {
		return fmt.Sprintf("cpu.%s", cpuPath), cpuErr
	}

	memPath, memErr := c.Memory.validate()
	if memErr != nil {
		return fmt.Sprintf("memory.%s", memPath), memErr
	}

	// Checked in this order so the first offending field is the one reported.
	fractions := []struct {
		path  string
		value float64
	}{
		{path: "minUsageScore", value: c.MinUsageScore},
		{path: "maxUsageScore", value: c.MaxUsageScore},
		{path: "scorePeak", value: c.ScorePeak},
	}
	for _, f := range fractions {
		if f.value < 0 || f.value > 1 {
			return f.path, errors.New("value must be between 0 and 1, inclusive")
		}
	}

	return "", nil
}
// validate checks that Watermark is a fraction in the half-open range (0, 1].
//
// If the returned error is not nil, the string is the JSON path of the invalid value.
func (c *resourceConfig) validate() (string, error) {
	w := c.Watermark
	switch {
	case w <= 0.0:
		return "watermark", errors.New("value must be > 0")
	case w > 1.0:
		return "watermark", errors.New("value must be <= 1")
	default:
		return "", nil
	}
}
////////////////////
// CONFIG READING //
////////////////////
// DefaultConfigPath is the path of the JSON config file; it is the value passed to ReadConfig by
// the program's entry point.
const DefaultConfigPath = "/etc/scheduler-plugin-config/autoscale-enforcer-config.json"
// ReadConfig opens the file at path, decodes it as JSON into a Config (rejecting unknown fields),
// and validates the result.
//
// On failure the returned error says which stage failed; for validation errors it includes the
// JSON path of the offending value rather than the file path.
func ReadConfig(path string) (*Config, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("Error opening config file %q: %w", path, err)
	}
	defer f.Close()

	decoder := json.NewDecoder(f)
	decoder.DisallowUnknownFields()

	cfg := new(Config)
	if err := decoder.Decode(cfg); err != nil {
		return nil, fmt.Errorf("Error decoding JSON config in %q: %w", path, err)
	}

	jsonPath, err := cfg.validate()
	if err != nil {
		return nil, fmt.Errorf("Invalid config at %s: %w", jsonPath, err)
	}

	return cfg, nil
}
//////////////////////////////////////
// HELPER METHODS FOR USING CONFIGS //
//////////////////////////////////////
// ignoredNamespace reports whether the namespace is listed in IgnoreNamespaces, i.e. whether
// items in it should be treated as if they don't exist.
func (c *Config) ignoredNamespace(namespace string) bool {
	position := slices.Index(c.IgnoreNamespaces, namespace)
	return position >= 0
}
// vCpuLimits builds the initial CPU resource state for a node with the given total CPU.
//
// Total is the quantity in milli-CPUs; Watermark is Cpu.Watermark times that total. All other
// fields start at zero.
func (c *nodeConfig) vCpuLimits(total *resource.Quantity) nodeResourceState[vmapi.MilliCPU] {
	milli := total.MilliValue()
	watermark := c.Cpu.Watermark * float32(milli)

	return nodeResourceState[vmapi.MilliCPU]{
		Total:                vmapi.MilliCPU(milli),
		Watermark:            vmapi.MilliCPU(watermark),
		Reserved:             0,
		Buffer:               0,
		CapacityPressure:     0,
		PressureAccountedFor: 0,
	}
}
// memoryLimits builds the initial memory resource state for a node with the given total memory.
//
// Total is the quantity in bytes; Watermark is Memory.Watermark times that total. All other
// fields start at zero.
func (c *nodeConfig) memoryLimits(total *resource.Quantity) nodeResourceState[api.Bytes] {
	totalBytes := total.Value()

	// Do the multiplication in float64: float32 has only a 24-bit mantissa, so byte counts above
	// 16 MiB are not exactly representable and the watermark would be off by up to tens of KiB on
	// large nodes.
	watermark := float64(c.Memory.Watermark) * float64(totalBytes)

	return nodeResourceState[api.Bytes]{
		Total:                api.Bytes(totalBytes),
		Watermark:            api.Bytes(watermark),
		Reserved:             0,
		Buffer:               0,
		CapacityPressure:     0,
		PressureAccountedFor: 0,
	}
}
package plugin
import (
"context"
"errors"
"fmt"
"net"
"net/http"
"time"
"github.com/samber/lo"
"go.uber.org/zap"
"golang.org/x/exp/slices"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
// dumpStateConfig configures the HTTP server that exposes a dump of the plugin's internal state.
type dumpStateConfig struct {
// Port is the TCP port the dump-state server listens on. Must be non-zero.
Port uint16 `json:"port"`
// TimeoutSeconds bounds how long a single request may spend producing the state dump.
// Must be non-zero.
TimeoutSeconds uint `json:"timeoutSeconds"`
}
// validate checks that both Port and TimeoutSeconds are non-zero, reporting Port first.
//
// If the returned error is not nil, the string is the JSON path of the invalid value.
func (c *dumpStateConfig) validate() (string, error) {
	switch {
	case c.Port == 0:
		return "port", errors.New("value must be > 0")
	case c.TimeoutSeconds == 0:
		return "timeoutSeconds", errors.New("value must be > 0")
	}
	return "", nil
}
// stateDump is the JSON response body of the dump-state server.
type stateDump struct {
// Stopped is true if the shutdown context had already been canceled when the dump was requested.
Stopped bool `json:"stopped"`
// BuildInfo is the result of util.GetBuildInfo() at the time of the dump.
BuildInfo util.BuildInfo `json:"buildInfo"`
// State is the snapshot of the plugin's internal state.
State pluginStateDump `json:"state"`
}
// startDumpStateServer binds a TCP listener on the configured DumpState port (all IPv4 addresses)
// and serves a dump of the plugin's state at GET "/" from a background goroutine.
//
// shutdownCtx is only used to report, in each response, whether shutdown has started; it does not
// stop the server. logger is used for the request handler and for reporting the server exiting.
//
// Returns an error only if binding the listener fails; the error wraps the underlying cause.
func (p *AutoscaleEnforcer) startDumpStateServer(shutdownCtx context.Context, logger *zap.Logger) error {
	// Manually start the TCP listener so we can minimize errors in the background thread.
	addr := net.TCPAddr{IP: net.IPv4zero, Port: int(p.state.conf.DumpState.Port)}
	listener, err := net.ListenTCP("tcp", &addr)
	if err != nil {
		// Format via String() (pointer receiver) so the address reads as host:port, and wrap the
		// cause so callers can see *why* binding failed.
		return fmt.Errorf("Error binding to %v: %w", addr.String(), err)
	}

	go func() {
		mux := http.NewServeMux()
		util.AddHandler(logger, mux, "/", http.MethodGet, "<empty>", func(ctx context.Context, _ *zap.Logger, body *struct{}) (*stateDump, int, error) {
			timeout := time.Duration(p.state.conf.DumpState.TimeoutSeconds) * time.Second

			startTime := time.Now()
			ctx, cancel := context.WithTimeout(ctx, timeout)
			defer cancel()

			state, err := p.dumpState(ctx, shutdownCtx.Err() != nil)
			if err == nil {
				return state, 200, nil
			}

			// errors.Is(nil, ...) is false, so no separate nil check on ctx.Err() is needed.
			if errors.Is(ctx.Err(), context.DeadlineExceeded) {
				totalDuration := time.Since(startTime)
				return nil, 500, fmt.Errorf("timed out after %s while getting state", totalDuration)
			}

			// some other type of cancel; 400 is a little weird, but there isn't a great
			// option here.
			return nil, 400, fmt.Errorf("error while getting state: %w", err)
		})

		// note: we don't shut down this server. It should be possible to continue fetching the
		// internal state after shutdown has started.
		server := &http.Server{Handler: mux}
		if err := server.Serve(listener); err != nil {
			logger.Error("dump-state server exited", zap.Error(err))
		}
	}()

	return nil
}
// dumpState snapshots the plugin state and wraps it with the current build info and the given
// stopped flag.
//
// Any error from taking the snapshot (e.g. ctx ending before the state lock is acquired) is
// returned unchanged.
func (p *AutoscaleEnforcer) dumpState(ctx context.Context, stopped bool) (*stateDump, error) {
	snapshot, err := p.state.dump(ctx)
	if err != nil {
		return nil, err
	}

	result := stateDump{
		Stopped:   stopped,
		BuildInfo: util.GetBuildInfo(),
		State:     *snapshot,
	}
	return &result, nil
}
// keyed is a single key/value pair. The state dump uses slices of keyed to serialize map contents
// as sorted lists.
type keyed[K any, V any] struct {
Key K `json:"key"`
Value V `json:"value"`
}
// pluginStateDump is the serializable snapshot of pluginState, as produced by pluginState.dump.
type pluginStateDump struct {
// OngoingMigrationDeletions holds the entries of pluginState.ongoingMigrationDeletions, sorted
// by key.
OngoingMigrationDeletions []keyed[util.NamespacedName, int] `json:"ongoingMigrationDeletions"`
// Nodes holds one dump per known node, sorted by key.
Nodes []keyed[string, nodeStateDump] `json:"nodes"`
// Pods lists every known pod by name, together with the address of its state object.
Pods []podNameAndPointer `json:"pods"`
MaxTotalReservableCPU vmapi.MilliCPU `json:"maxTotalReservableCPU"`
MaxTotalReservableMem api.Bytes `json:"maxTotalReservableMem"`
// Conf is a copy of the plugin's config at the time of the dump.
Conf Config `json:"config"`
}
// podNameAndPointer identifies an object in the dump by both its pod name and the address of the
// in-memory object, so that references between dumped objects can be matched up.
type podNameAndPointer struct {
// Obj is the address of the referenced object, formatted by makePointerString.
Obj pointerString `json:"obj"`
PodName util.NamespacedName `json:"podName"`
}
// pointerString is a pointer's address rendered as text (see makePointerString).
type pointerString string
// nodeStateDump is the serializable snapshot of a nodeState, as produced by nodeState.dump.
type nodeStateDump struct {
// Obj is the address of the nodeState this was dumped from.
Obj pointerString `json:"obj"`
Name string `json:"name"`
NodeGroup string `json:"nodeGroup"`
AvailabilityZone string `json:"availabilityZone"`
CPU nodeResourceState[vmapi.MilliCPU] `json:"cpu"`
Mem nodeResourceState[api.Bytes] `json:"mem"`
// Pods holds one dump per pod on the node, sorted by key.
Pods []keyed[util.NamespacedName, podStateDump] `json:"pods"`
// Mq mirrors the node's mq slice in its original order; nil entries are preserved as nil.
Mq []*podNameAndPointer `json:"mq"`
}
// podStateDump is the serializable snapshot of a podState, as produced by podState.dump.
type podStateDump struct {
// Obj is the address of the podState this was dumped from.
Obj pointerString `json:"obj"`
Name util.NamespacedName `json:"name"`
// Node is the address of the nodeState the pod refers to.
Node pointerString `json:"node"`
CPU podResourceState[vmapi.MilliCPU] `json:"cpu"`
Mem podResourceState[api.Bytes] `json:"mem"`
// VM is a copy of the pod's VM-specific state, or nil if the pod has none.
VM *vmPodState `json:"vm"`
}
// makePointerString renders the address held by t using fmt's %p verb.
func makePointerString[T any](t *T) pointerString {
	address := fmt.Sprintf("%p", t)
	return pointerString(address)
}
// sortSliceByPodName sorts slice in place by the NamespacedName that name extracts from each
// element: first by Namespace, then by Name within the same namespace.
func sortSliceByPodName[T any](slice []T, name func(T) util.NamespacedName) {
	slices.SortFunc(slice, func(a, b T) (less bool) {
		aName := name(a)
		bName := name(b)
		// Lexicographic comparison on (Namespace, Name). Requiring *both* fields to be smaller is
		// not a valid ordering (e.g. {a,z} and {b,a} would each be "not less" than the other while
		// still being ordered relative to third elements), which gives unpredictable sort output.
		if aName.Namespace != bName.Namespace {
			return aName.Namespace < bName.Namespace
		}
		return aName.Name < bName.Name
	})
}
// dump takes a snapshot of the plugin state while holding the state lock.
//
// The lock is acquired with TryLock(ctx); if that fails, its error is returned and no snapshot is
// taken. Map contents are flattened into slices and sorted so the output order is stable.
func (s *pluginState) dump(ctx context.Context) (*pluginStateDump, error) {
	if lockErr := s.lock.TryLock(ctx); lockErr != nil {
		return nil, lockErr
	}
	defer s.lock.Unlock()

	type nodeEntry = keyed[string, nodeStateDump]
	type deletionEntry = keyed[util.NamespacedName, int]

	// All known pods, by name.
	podList := make([]podNameAndPointer, 0, len(s.pods))
	for _, pod := range s.pods {
		podList = append(podList, podNameAndPointer{Obj: makePointerString(pod), PodName: pod.name})
	}
	sortSliceByPodName(podList, func(entry podNameAndPointer) util.NamespacedName { return entry.PodName })

	// All known nodes, by map key.
	nodeList := make([]nodeEntry, 0, len(s.nodes))
	for key, node := range s.nodes {
		nodeList = append(nodeList, nodeEntry{Key: key, Value: node.dump()})
	}
	slices.SortFunc(nodeList, func(x, y nodeEntry) (less bool) {
		return x.Key < y.Key
	})

	// Ongoing migration deletions, by name.
	deletionList := make([]deletionEntry, 0, len(s.ongoingMigrationDeletions))
	for key, count := range s.ongoingMigrationDeletions {
		deletionList = append(deletionList, deletionEntry{Key: key, Value: count})
	}
	sortSliceByPodName(deletionList, func(entry deletionEntry) util.NamespacedName { return entry.Key })

	snapshot := pluginStateDump{
		OngoingMigrationDeletions: deletionList,
		Nodes:                     nodeList,
		Pods:                      podList,
		MaxTotalReservableCPU:     s.maxTotalReservableCPU,
		MaxTotalReservableMem:     s.maxTotalReservableMem,
		Conf:                      *s.conf,
	}
	return &snapshot, nil
}
// dump produces the serializable snapshot of this node: its identifying fields, resource state,
// its pods (sorted by name), and its mq entries in their existing order.
func (s *nodeState) dump() nodeStateDump {
	type podEntry = keyed[util.NamespacedName, podStateDump]

	podList := make([]podEntry, 0, len(s.pods))
	for key, pod := range s.pods {
		podList = append(podList, podEntry{Key: key, Value: pod.dump()})
	}
	sortSliceByPodName(podList, func(entry podEntry) util.NamespacedName { return entry.Key })

	queue := make([]*podNameAndPointer, 0, len(s.mq))
	for _, item := range s.mq {
		// nil slots are kept so positions in the dump match positions in s.mq.
		if item == nil {
			queue = append(queue, nil)
			continue
		}
		queue = append(queue, &podNameAndPointer{Obj: makePointerString(item), PodName: item.Name})
	}

	return nodeStateDump{
		Obj:              makePointerString(s),
		Name:             s.name,
		NodeGroup:        s.nodeGroup,
		AvailabilityZone: s.availabilityZone,
		CPU:              s.cpu,
		Mem:              s.mem,
		Pods:             podList,
		Mq:               queue,
	}
}
// dump produces the serializable snapshot of this pod. The VM field is a copy of the pod's VM
// state when it has one, and nil otherwise.
func (s *podState) dump() podStateDump {
	var vmCopy *vmPodState
	if hasVM := s.vm != nil; hasVM {
		vmCopy = lo.ToPtr(s.vm.dump())
	}

	snapshot := podStateDump{
		Obj:  makePointerString(s),
		Name: s.name,
		Node: makePointerString(s.node),
		CPU:  s.cpu,
		Mem:  s.mem,
		VM:   vmCopy,
	}
	return snapshot
}
// dump returns a copy of the VM state that does not share its Metrics or MigrationState pointers
// with the original. Of MigrationState, only Name is copied.
func (s *vmPodState) dump() vmPodState {
	copied := vmPodState{
		Name:           s.Name,
		MemSlotSize:    s.MemSlotSize,
		Config:         s.Config,
		Metrics:        nil,
		MqIndex:        s.MqIndex,
		MigrationState: nil,
	}

	// Both pointer fields may be nil; duplicate the pointee only when present.
	if s.Metrics != nil {
		copied.Metrics = lo.ToPtr(*s.Metrics)
	}
	if s.MigrationState != nil {
		copied.MigrationState = &podMigrationState{
			Name: s.MigrationState.Name,
		}
	}

	return copied
}
package plugin
import (
"context"
"hash/fnv"
"time"
"github.com/tychoish/fun/pubsub"
)
// queueItem wraps a queued value with the time it was enqueued.
type queueItem[T any] struct {
item T
// addTime is when the item was added; it is used to observe queue latency when the item is
// dequeued.
addTime time.Time
}
// eventQueueSet is a fixed set of queues. Items are assigned to a queue by hashing their key, so
// items with the same key always land in the same queue.
type eventQueueSet[T any] struct {
queues []*pubsub.Queue[queueItem[T]]
// metrics records queue depth, total adds, and dequeue latency.
metrics PromMetrics
}
// newEventQueueSet creates a set of size unlimited queues that report to the given metrics.
func newEventQueueSet[T any](size int, metrics PromMetrics) eventQueueSet[T] {
	set := eventQueueSet[T]{
		queues:  make([]*pubsub.Queue[queueItem[T]], size),
		metrics: metrics,
	}
	for slot := range set.queues {
		set.queues[slot] = pubsub.NewUnlimitedQueue[queueItem[T]]()
	}
	return set
}
// enqueue adds item to the queue selected by the FNV-64 hash of key, stamping it with the current
// time.
//
// The add counter is incremented for every attempt. The depth gauge is incremented before the add
// (so a concurrent dequeue can never decrement it first) and rolled back if the add fails, so that
// it keeps matching the number of items actually in the queues.
//
// Returns the error from the underlying queue's Add, if any.
func (s eventQueueSet[T]) enqueue(key string, item T) error {
	hasher := fnv.New64()
	// nb: Hash guarantees that Write never returns an error
	_, _ = hasher.Write([]byte(key))
	idx := int(hasher.Sum64() % uint64(len(s.queues)))

	s.metrics.eventQueueDepth.Inc()
	s.metrics.eventQueueAddsTotal.Inc()

	entry := queueItem[T]{
		item:    item,
		addTime: time.Now(),
	}
	if err := s.queues[idx].Add(entry); err != nil {
		// Nothing was queued, so undo the depth increment; otherwise the gauge drifts upwards
		// permanently on every failed add.
		s.metrics.eventQueueDepth.Dec()
		return err
	}
	return nil
}
// wait blocks until an item is available on queue idx (or the queue's Wait returns an error, e.g.
// because ctx ended) and returns that item.
//
// On success the depth gauge is decremented and the time the item spent queued is observed, in
// seconds. On error the metrics are left untouched.
func (s eventQueueSet[T]) wait(ctx context.Context, idx int) (T, error) {
	entry, err := s.queues[idx].Wait(ctx)
	if err != nil {
		return entry.item, err
	}

	s.metrics.eventQueueDepth.Dec()
	queuedFor := time.Since(entry.addTime)
	s.metrics.eventQueueLatency.Observe(queuedFor.Seconds())

	return entry.item, nil
}
package plugin
import (
"context"
"fmt"
"math/rand"
"sync/atomic"
"time"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
scheme "k8s.io/client-go/kubernetes/scheme"
rest "k8s.io/client-go/rest"
"k8s.io/kubernetes/pkg/scheduler/framework"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
vmclient "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/watch"
)
// Name is the plugin's name, as returned by AutoscaleEnforcer.Name().
const Name = "AutoscaleEnforcer"
// LabelPluginCreatedMigration is a label key under the autoscaling.neon.tech domain.
// NOTE(review): presumably set on migrations created by this plugin — confirm at the use sites.
const LabelPluginCreatedMigration = "autoscaling.neon.tech/created-by-scheduler"
// AutoscaleEnforcer is the scheduler plugin to coordinate autoscaling
type AutoscaleEnforcer struct {
// logger is the base logger for the plugin's methods; each method derives its own from it.
logger *zap.Logger
// handle is the scheduler framework handle the plugin was created with.
handle framework.Handle
// vmClient is the NeonVM API client, configured to use JSON rather than protobuf.
vmClient *vmclient.Clientset
// state is the plugin's view of nodes and pods; access is guarded by state.lock.
state pluginState
// metrics is populated by makePrometheusRegistry during initialization.
metrics PromMetrics
// nodeStore provides access to the current-ish state of Nodes in the cluster. If something's
// missing, it can be updated with Relist().
nodeStore IndexedNodeStore
}
// Abbreviations, because these types are pretty verbose.
//
// IndexedVMStore is a watch store of VirtualMachines indexed by a NameIndex; IndexedNodeStore is a
// watch store of Nodes indexed by a FlatNameIndex.
type IndexedVMStore = watch.IndexedStore[vmapi.VirtualMachine, *watch.NameIndex[vmapi.VirtualMachine]]
type IndexedNodeStore = watch.IndexedStore[corev1.Node, *watch.FlatNameIndex[corev1.Node]]
// Compile-time checks that AutoscaleEnforcer actually implements the interfaces we want it to.
//
// Each line assigns a typed nil *AutoscaleEnforcer to a blank variable of the interface type, so a
// missing or mismatched method becomes a build error rather than a runtime surprise.
var _ framework.Plugin = (*AutoscaleEnforcer)(nil)
var _ framework.PreFilterPlugin = (*AutoscaleEnforcer)(nil)
var _ framework.PostFilterPlugin = (*AutoscaleEnforcer)(nil)
var _ framework.FilterPlugin = (*AutoscaleEnforcer)(nil)
var _ framework.ScorePlugin = (*AutoscaleEnforcer)(nil)
var _ framework.ReservePlugin = (*AutoscaleEnforcer)(nil)
// NewAutoscaleEnforcerPlugin returns a plugin factory in the shape the scheduler framework
// expects. The factory captures ctx, logger and config and forwards them, together with the
// framework-supplied object and handle, to makeAutoscaleEnforcerPlugin.
func NewAutoscaleEnforcerPlugin(ctx context.Context, logger *zap.Logger, config *Config) func(runtime.Object, framework.Handle) (framework.Plugin, error) {
	factory := func(obj runtime.Object, h framework.Handle) (framework.Plugin, error) {
		return makeAutoscaleEnforcerPlugin(ctx, logger, obj, h, config)
	}
	return factory
}
// makeAutoscaleEnforcerPlugin produces the initial AutoscaleEnforcer plugin to be used by the
// scheduler. It is the implementation behind the factory returned by NewAutoscaleEnforcerPlugin.
//
// In order, it:
//
//  1. creates the NeonVM client and the plugin value;
//  2. optionally starts the 'dump state' server (if config.DumpState is set);
//  3. sets up metrics and the event queues;
//  4. starts the node, pod, VM and VM Migration watchers, which queue their initial events;
//  5. starts config.EventQueueWorkers workers to process the queues, and the prometheus server;
//  6. waits (up to StartupEventHandlingTimeoutSeconds) for all initial pod events to be handled;
//  7. starts the permit handler and the state-lock deadlock checker.
//
// Any failure along the way is returned as an error and no plugin is returned.
func makeAutoscaleEnforcerPlugin(
ctx context.Context,
logger *zap.Logger,
_obj runtime.Object,
h framework.Handle,
config *Config,
) (framework.Plugin, error) {
// obj can be used for taking in configuration. it's a bit tricky to figure out, and we don't
// quite need it yet.
logger.Info("Initializing plugin")
// create the NeonVM client
if err := vmapi.AddToScheme(scheme.Scheme); err != nil {
return nil, err
}
vmConfig := rest.CopyConfig(h.KubeConfig())
// The handler's ContentType is not the default "application/json" (it's protobuf), so we need
// to set it back to JSON because NeonVM doesn't support protobuf.
vmConfig.ContentType = "application/json"
vmClient, err := vmclient.NewForConfig(vmConfig)
if err != nil {
return nil, fmt.Errorf("Error creating NeonVM client: %w", err)
}
p := AutoscaleEnforcer{
logger: logger.Named("plugin"),
handle: h,
vmClient: vmClient,
state: pluginState{
lock: util.NewChanMutex(),
ongoingMigrationDeletions: make(map[util.NamespacedName]int),
pods: make(map[util.NamespacedName]*podState),
nodes: make(map[string]*nodeState),
maxTotalReservableCPU: 0, // set during event handling
maxTotalReservableMem: 0, // set during event handling
conf: config,
},
metrics: PromMetrics{}, //nolint:exhaustruct // set by makePrometheusRegistry
nodeStore: IndexedNodeStore{}, //nolint:exhaustruct // set below
}
// The dump-state server is optional, and is started before anything else so that state can be
// inspected even while the rest of initialization is in progress.
if p.state.conf.DumpState != nil {
logger.Info("Starting 'dump state' server")
if err := p.startDumpStateServer(ctx, logger.Named("dump-state")); err != nil {
return nil, fmt.Errorf("Error starting 'dump state' server: %w", err)
}
}
// makePrometheusRegistry sets p.metrics, which we need to do before calling
// newEventQueueSet or handling events, because we set metrics in eventQueueSet and for each
// node as watch events get handled.
promReg := p.makePrometheusRegistry()
// Start watching Pod/VM events, adding them to a shared queue to process them in order
queueSet := newEventQueueSet[func()](config.EventQueueWorkers, p.metrics)
// pushToQueue enqueues f under key; events with the same key go to the same queue and so are
// handled in the order they were submitted. Enqueue failures are logged, not returned.
pushToQueue := func(logger *zap.Logger, key string, f func()) {
if err := queueSet.enqueue(key, f); err != nil {
logger.Warn("Error adding to pod/VM event queue", zap.Error(err))
}
}
// A note about event handling:
//
// Before returning from this function, we want to make sure that we're caught up to the watch
// events generated by initially reading the cluster state (i.e. the initial List()).
//
// Doing this is non-trivial, so we accomplish it in pieces:
//
// 1. Using watch.WatchModeSync to force queueing events *before* returning from creating the
// watcher (note: and therefore, before any start to be handled); and
// 2. For each event created from the initial List(), increment a counter to track the number of
// these events, and decrement it as events are handled.
//
// The initial state building is complete when the counter reaches zero, at which point we close
// the channel that this function will wait on.
var initEventsCount atomic.Int32
// initEvents is assigned further down, once the total count is known. The queued callbacks
// capture the variable and only read it when run by a worker, and workers are started after
// the assignment.
var initEvents *eventCounter
incEventCount := func() { initEventsCount.Add(1) }
hlogger := logger.Named("handlers")
nwc := nodeWatchCallbacks{
submitNodeDeletion: func(logger *zap.Logger, nodeName string) {
pushToQueue(logger, nodeName, func() { p.handleNodeDeletion(hlogger, nodeName) })
},
}
pwc := podWatchCallbacks{
submitStarted: func(logger *zap.Logger, pod *corev1.Pod, preexisting bool) {
// Only preexisting pods count towards the initial events we wait for below.
if preexisting {
incEventCount()
}
pushToQueue(logger, pod.Name, func() {
p.handleStarted(hlogger, pod, preexisting)
if preexisting {
initEvents.dec()
}
})
},
submitDeletion: func(logger *zap.Logger, name util.NamespacedName) {
// NOTE: It's important that the name we use here is the same as the one we use for
// submitStarted - otherwise we can end up with out of order handling for start/stop
// events.
pushToQueue(logger, name.Name, func() { p.handleDeletion(hlogger, name) })
},
// NOTE(review): the two migration callbacks key the queue by the migration's name (not the
// pod's) and pass the callback's own logger rather than hlogger — confirm both are intended.
submitStartMigration: func(logger *zap.Logger, podName, migrationName util.NamespacedName, source bool) {
pushToQueue(logger, migrationName.Name, func() { p.handlePodStartMigration(logger, podName, migrationName, source) })
},
submitEndMigration: func(logger *zap.Logger, podName, migrationName util.NamespacedName) {
pushToQueue(logger, migrationName.Name, func() { p.handlePodEndMigration(logger, podName, migrationName) })
},
}
vwc := vmWatchCallbacks{
submitConfigUpdated: func(logger *zap.Logger, pod util.NamespacedName, newCfg api.VmConfig) {
pushToQueue(logger, pod.Name, func() { p.handleVMConfigUpdated(hlogger, pod, newCfg) })
},
submitBoundsChanged: func(logger *zap.Logger, vm *api.VmInfo, podName string) {
pushToQueue(logger, vm.Name, func() { p.handleUpdatedScalingBounds(hlogger, vm, podName) })
},
submitNonAutoscalingVmUsageChanged: func(logger *zap.Logger, vm *api.VmInfo, podName string) {
pushToQueue(logger, vm.Name, func() { p.handleNonAutoscalingUsageChange(hlogger, vm, podName) })
},
}
mwc := migrationWatchCallbacks{
submitMigrationFinished: func(vmm *vmapi.VirtualMachineMigration) {
// When cleaning up migrations, we don't want to process those events synchronously.
// So instead, we'll spawn a goroutine to delete the completed migration.
go p.cleanupMigration(hlogger, vmm)
},
}
watchMetrics := watch.NewMetrics("autoscaling_plugin_watchers")
logger.Info("Starting node watcher")
nodeStore, err := p.watchNodeEvents(ctx, logger, watchMetrics, nwc)
if err != nil {
return nil, fmt.Errorf("Error starting node watcher: %w", err)
}
p.nodeStore = watch.NewIndexedStore(nodeStore, watch.NewFlatNameIndex[corev1.Node]())
logger.Info("Starting pod watcher")
podStore, err := p.watchPodEvents(ctx, logger, watchMetrics, pwc)
if err != nil {
return nil, fmt.Errorf("Error starting pod watcher: %w", err)
}
podIndex := watch.NewIndexedStore(podStore, watch.NewNameIndex[corev1.Pod]())
logger.Info("Starting VM watcher")
_, err = p.watchVMEvents(ctx, logger, watchMetrics, vwc, podIndex)
if err != nil {
return nil, fmt.Errorf("Error starting VM watcher: %w", err)
}
logger.Info("Starting VM Migration watcher")
if _, err := p.watchMigrationEvents(ctx, logger, watchMetrics, mwc); err != nil {
return nil, fmt.Errorf("Error starting VM Migration watcher: %w", err)
}
watchMetrics.MustRegister(promReg)
// Set up tracking the initial events, now that we know the count:
totalQueued := initEventsCount.Load()
initEvents = newEventCounter(totalQueued)
// Start handling the queued events. Each handled initial event calls initEvents.dec(), and
// once the remaining count reaches zero the channel returned by initEvents.done() is closed to
// mark initial event handling as complete.
//
// NOTE(review): if totalQueued is zero, nothing ever calls initEvents.dec(); whether the wait
// below then completes depends on how newEventCounter treats a zero count — confirm.
for i := 0; i < config.EventQueueWorkers; i += 1 {
// copy the loop variable to avoid it escaping pre Go 1.22
go func(ctx context.Context, idx int) {
for {
callback, err := queueSet.wait(ctx, idx) // NB: wait pulls from the front of the queue
if err != nil {
logger.Info("Stopped waiting on pod/VM queue", zap.Error(err))
break
}
callback()
}
}(ctx, i)
}
if err := util.StartPrometheusMetricsServer(ctx, logger.Named("prometheus"), 9100, promReg); err != nil {
return nil, fmt.Errorf("Error starting prometheus server: %w", err)
}
// Wait for all the initial events to be handled.
logger.Info("Waiting on initial events processing to be done", zap.Int32("count", totalQueued))
initEventsTimeout := time.Second * time.Duration(p.state.conf.StartupEventHandlingTimeoutSeconds)
select {
case <-initEvents.done():
// Done
case <-time.After(initEventsTimeout):
return nil, fmt.Errorf(
"Timed out waiting on initial events processing to complete after %s (%d remaining)",
initEventsTimeout,
initEvents.getRemaining(),
)
}
logger.Info("Initial events processing complete")
if err := p.startPermitHandler(ctx, logger.Named("agent-handler")); err != nil {
return nil, fmt.Errorf("permit handler: %w", err)
}
// Periodically check that we're not deadlocked
go func() {
// If the checker panics, re-raise it through the logger so that it is recorded with context.
defer func() {
if err := recover(); err != nil {
logger.Panic("deadlock checker for AutoscaleEnforcer.state.lock panicked", zap.String("error", fmt.Sprint(err)))
}
}()
p.state.lock.DeadlockChecker(time.Second, 5*time.Second)(ctx)
}()
logger.Info("Plugin initialization complete")
return &p, nil
}
// eventCounter is a monotonically decreasing event counter that closes a channel once all events
// have been completed with dec().
//
// Used to make sure we've processed all the initial events before returning from
// makeAutoscaleEnforcerPlugin().
type eventCounter struct {
	remaining  atomic.Int32
	signalDone chan struct{}
}

// newEventCounter creates a counter expecting remaining calls to dec().
//
// If there is nothing to wait for (remaining <= 0), the counter starts out already done.
// Otherwise no call to dec() would ever bring the count to exactly zero, and anything waiting on
// done() would block forever.
func newEventCounter(remaining int32) *eventCounter {
	c := &eventCounter{
		remaining:  atomic.Int32{},
		signalDone: make(chan struct{}),
	}

	if remaining <= 0 {
		// Clamp to zero so that later (unexpected) dec() calls only ever move the count further
		// below zero and can never trigger a second close of the channel.
		remaining = 0
		close(c.signalDone)
	}

	c.remaining.Store(remaining)
	return c
}

// dec records that one event has completed. The done channel is closed by the call that brings
// the remaining count to exactly zero.
func (c *eventCounter) dec() {
	r := c.remaining.Add(-1)
	if r == 0 {
		close(c.signalDone)
	}
}

// getRemaining returns the current number of events still outstanding.
func (c *eventCounter) getRemaining() int32 {
	return c.remaining.Load()
}

// done returns a channel that is closed once all events have completed.
func (c *eventCounter) done() <-chan struct{} {
	return c.signalDone
}
// Name returns the name of the AutoscaleEnforcer plugin, i.e. the package-level Name constant.
//
// Required for framework.Plugin
func (e *AutoscaleEnforcer) Name() string {
return Name
}
// getVmInfo is a helper for the plugin-related functions. It extracts the VM info for a pod that
// is owned by a NeonVM virtual machine.
//
// This function returns nil, nil if the pod is not associated with a NeonVM virtual machine.
//
// If extraction fails, a Warning event with reason "ExtractVmInfo" is recorded against the pod
// (using action as the event's action) and the wrapped error is returned.
func (e *AutoscaleEnforcer) getVmInfo(logger *zap.Logger, pod *corev1.Pod, action string) (*api.VmInfo, error) {
	if util.TryPodOwnerVirtualMachine(pod) == nil {
		return nil, nil
	}

	info, err := api.ExtractVmInfoFromPod(logger, pod)
	if err == nil {
		return info, nil
	}

	e.handle.EventRecorder().Eventf(
		pod,             // regarding
		nil,             // related
		"Warning",       // eventtype
		"ExtractVmInfo", // reason
		action,          // action
		"Failed to extract autoscaling info about VM: %s", // note
		err,
	)
	return nil, fmt.Errorf("Error extracting VM info: %w", err)
}
// checkSchedulerName asserts that the SchedulerName field of a Pod matches what we're expecting.
//
// Returns nil on a match. On a mismatch, the problem is logged and a framework.Status with code
// framework.Error describing it is returned, for the caller to return in turn.
func (e *AutoscaleEnforcer) checkSchedulerName(logger *zap.Logger, pod *corev1.Pod) *framework.Status {
	expected, actual := e.state.conf.SchedulerName, pod.Spec.SchedulerName
	if expected == actual {
		return nil
	}

	err := fmt.Errorf(
		"Mismatched SchedulerName for pod: our config has %q, but the pod has %q",
		expected, actual,
	)
	logger.Error("Pod failed scheduler name check", zap.Error(err))
	return framework.NewStatus(framework.Error, err.Error())
}
// PreFilter is called at the start of any Pod's filter cycle. We use it in combination with
// PostFilter (which is only called on failure) to provide metrics for pods that are rejected by
// this process.
//
// It only records metrics: it always returns a nil result and a nil (success) status. ctx and
// state are unused.
func (e *AutoscaleEnforcer) PreFilter(
ctx context.Context,
state *framework.CycleState,
pod *corev1.Pod,
) (_ *framework.PreFilterResult, status *framework.Status) {
ignored := e.state.conf.ignoredNamespace(pod.Namespace)
e.metrics.IncMethodCall("PreFilter", pod, ignored)
// status is a named result so that the deferred call sees the value actually being returned.
defer func() {
e.metrics.IncFailIfNotSuccess("PreFilter", pod, ignored, status)
}()
return nil, nil
}
// PreFilterExtensions is required for framework.PreFilterPlugin, and can return nil if it's not
// used. This plugin does not use it, so it always returns nil.
func (e *AutoscaleEnforcer) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}
// PostFilter is used by us for metrics on filter cycles that reject a Pod by filtering out all
// applicable nodes.
//
// Quoting the docs for PostFilter:
//
// > These plugins are called after Filter phase, but only when no feasible nodes were found for the
// > pod.
//
// It records the call in metrics, logs the rejection, and always returns a nil result with a nil
// (success) status. ctx, state and filteredNodeStatusMap are unused.
//
// Required for framework.PostFilterPlugin
func (e *AutoscaleEnforcer) PostFilter(
	ctx context.Context,
	state *framework.CycleState,
	pod *corev1.Pod,
	filteredNodeStatusMap framework.NodeToStatusMap,
) (_ *framework.PostFilterResult, status *framework.Status) {
	ignored := e.state.conf.ignoredNamespace(pod.Namespace)
	e.metrics.IncMethodCall("PostFilter", pod, ignored)
	// status is a named result so that the deferred call sees the value actually being returned.
	defer func() {
		e.metrics.IncFailIfNotSuccess("PostFilter", pod, ignored, status)
	}()

	// Tag the log line with this method's own name, consistent with the metrics above and with how
	// the other plugin methods label their loggers; it was previously labelled "Filter".
	logger := e.logger.With(zap.String("method", "PostFilter"), util.PodNameFields(pod))
	logger.Error("Pod rejected by all Filter method calls")

	return nil, nil // PostFilterResult is optional, nil Status is success.
}
// Filter gives our plugin a chance to signal that a pod shouldn't be put onto a particular node
//
// The pod is allowed on the node only if, for both vCPU and memory, the resources attributed to
// the pods listed in nodeInfo plus the resources for this pod do not exceed the node's total.
//
// Returns nil (success) if the pod is allowed; framework.Unschedulable if it does not fit;
// framework.UnschedulableAndUnresolvable if the pod's VM info can't be extracted; and
// framework.Error if the pod's scheduler name doesn't match or the node's state can't be fetched.
//
// Required for framework.FilterPlugin
func (e *AutoscaleEnforcer) Filter(
	ctx context.Context,
	state *framework.CycleState,
	pod *corev1.Pod,
	nodeInfo *framework.NodeInfo,
) (status *framework.Status) {
	ignored := e.state.conf.ignoredNamespace(pod.Namespace)
	e.metrics.IncMethodCall("Filter", pod, ignored)
	// status is a named result so that the deferred call sees the value actually being returned.
	defer func() {
		e.metrics.IncFailIfNotSuccess("Filter", pod, ignored, status)
	}()

	nodeName := nodeInfo.Node().Name // TODO: nodes also have namespaces? are they used at all?

	logger := e.logger.With(zap.String("method", "Filter"), zap.String("node", nodeName), util.PodNameFields(pod))
	logger.Info("Handling Filter request")

	if ignored {
		logger.Warn("Received Filter request for pod in ignored namespace, continuing anyways.")
	}

	vmInfo, err := e.getVmInfo(logger, pod, "Filter")
	if err != nil {
		logger.Error("Error getting VM info for Pod", zap.Error(err))
		return framework.NewStatus(
			framework.UnschedulableAndUnresolvable,
			fmt.Sprintf("Error getting pod vmInfo: %s", err),
		)
	}

	// VM pods are accounted by what the VM is using; other pods by their own resources.
	var podResources api.Resources
	if vmInfo != nil {
		podResources = vmInfo.Using()
	} else {
		podResources = extractPodResources(pod)
	}

	// Check that the SchedulerName matches what we're expecting
	if status := e.checkSchedulerName(logger, pod); status != nil {
		return status
	}

	e.state.lock.Lock()
	defer e.state.lock.Unlock()

	node, err := e.state.getOrFetchNodeState(ctx, logger, e.metrics, e.nodeStore, nodeName)
	if err != nil {
		logger.Error("Error getting node state", zap.Error(err))
		return framework.NewStatus(
			framework.Error,
			fmt.Sprintf("Error getting node state: %s", err),
		)
	}

	// The pod will get resources according to vmInfo.{Cpu,Mem}.Use reserved for it when it does get
	// scheduled. Now we can check whether this node has capacity for the pod.
	//
	// Technically speaking, the VM pods in nodeInfo might not match what we have recorded for the
	// node -- simply because during preemption, the scheduler tries to see whether it could
	// schedule the pod if other stuff was preempted, and gives us what the state WOULD be after
	// preemption.
	//
	// So we have to actually count up the resource usage of all pods in nodeInfo:
	var nodeTotal api.Resources

	// As we process all pods, we should record all the pods that aren't present in both nodeInfo
	// and e.state's maps, so that we can log any inconsistencies instead of silently using
	// *potentially* bad data. Some differences are expected, but on the whole this extra
	// information should be helpful.
	missedPods := make(map[util.NamespacedName]struct{})
	for name := range node.pods {
		missedPods[name] = struct{}{}
	}

	var includedIgnoredPods []util.NamespacedName

	for _, podInfo := range nodeInfo.Pods {
		pn := util.NamespacedName{Name: podInfo.Pod.Name, Namespace: podInfo.Pod.Namespace}
		if podState, ok := e.state.pods[pn]; ok {
			// Known pod: use what we have reserved for it.
			nodeTotal.VCPU += podState.cpu.Reserved
			nodeTotal.Mem += podState.mem.Reserved
			delete(missedPods, pn)
		} else {
			name := util.GetNamespacedName(podInfo.Pod)

			if util.PodCompleted(podInfo.Pod) {
				logger.Warn(
					"Skipping completed Pod in Filter node's pods",
					zap.Object("pod", name),
					zap.String("phase", string(podInfo.Pod.Status.Phase)),
				)
				continue
			}

			if !e.state.conf.ignoredNamespace(podInfo.Pod.Namespace) {
				// FIXME: this gets us duplicated "pod" fields. Not great. But we're using
				// logger.With pretty pervasively, and it's hard to avoid this while using that.
				// For now, we can get around this by including the pod name in an error.
				logger.Error(
					"Unknown-but-not-ignored Pod in Filter node's pods",
					zap.Object("pod", name),
					zap.Error(fmt.Errorf("Pod %v is unknown but not ignored", name)),
				)
			} else {
				includedIgnoredPods = append(includedIgnoredPods, name)
			}

			// We *also* need to count pods in ignored namespaces
			resources := extractPodResources(podInfo.Pod)
			nodeTotal.VCPU += resources.VCPU
			nodeTotal.Mem += resources.Mem
		}
	}

	if len(missedPods) != 0 {
		missedPodsList := make([]util.NamespacedName, 0, len(missedPods))
		for name := range missedPods {
			missedPodsList = append(missedPodsList, name)
		}
		logger.Warn("Some known Pods weren't included in Filter NodeInfo", zap.Objects("missedPods", missedPodsList))
	}

	kind := "non-VM"
	if vmInfo != nil {
		kind = "VM"
	}

	// makeMsg renders one line of the verdict, e.g.
	// "node vCPU usage 3 + VM pod vCPU 1 <= node max 8".
	makeMsg := func(resource, compareOp string, nodeUse, podUse, nodeMax any) string {
		return fmt.Sprintf(
			"node %s usage %v + %s pod %s %v %s node max %v",
			resource, nodeUse, kind, resource, podUse, compareOp, nodeMax,
		)
	}

	allowing := true

	cpuCompare := "<="
	if nodeTotal.VCPU+podResources.VCPU > node.cpu.Total {
		cpuCompare = ">"
		allowing = false
	}
	cpuMsg := makeMsg("vCPU", cpuCompare, nodeTotal.VCPU, podResources.VCPU, node.cpu.Total)

	memCompare := "<="
	if nodeTotal.Mem+podResources.Mem > node.mem.Total {
		memCompare = ">"
		allowing = false
	}
	// Label the memory verdict as memory; it was previously (mis)labelled "vCPU", which made the
	// two verdict lines indistinguishable in the logs.
	memMsg := makeMsg("memory", memCompare, nodeTotal.Mem, podResources.Mem, node.mem.Total)

	message := "Allowing Pod"
	logFunc := logger.Info
	if !allowing {
		message = "Rejecting Pod"
		logFunc = logger.Warn
	}

	logFunc(
		message,
		zap.Objects("includedIgnoredPods", includedIgnoredPods),
		zap.Object("verdict", verdictSet{
			cpu: cpuMsg,
			mem: memMsg,
		}),
	)

	if !allowing {
		return framework.NewStatus(framework.Unschedulable, "Not enough resources for pod")
	}
	return nil
}
// Score allows our plugin to express which nodes should be preferred for scheduling new pods onto.
//
// Even though this function is given (pod, node) pairs, our scoring is only really dependent on
// values of the node. However, we have special handling for when the pod no longer fits in the node
// (even though it might have during the Filter plugin) - we can't return a failure, because that
// would cause *all* scheduling of the pod to fail, so we instead return the minimum score.
//
// The scores might not be consistent with each other, due to ongoing changes in the node. That's
// ok, because nothing relies on strict correctness here, and they should be approximately correct
// anyways.
//
// The returned score is the smaller of a CPU-based score and a memory-based score, each computed
// from the fraction of the node's total that is not "remaining reservable" and scaled by the
// node's total relative to the largest total tracked in e.state.
//
// Returns (framework.MinNodeScore, status) if the scheduler name check fails, (0, Error status) if
// VM info or node state could not be obtained, and (score, nil) otherwise.
//
// Required for framework.ScorePlugin
func (e *AutoscaleEnforcer) Score(
ctx context.Context,
state *framework.CycleState,
pod *corev1.Pod,
nodeName string,
) (_ int64, status *framework.Status) {
ignored := e.state.conf.ignoredNamespace(pod.Namespace)
e.metrics.IncMethodCall("Score", pod, ignored)
// The deferred call reads the named result 'status', so it observes whatever is returned below.
defer func() {
e.metrics.IncFailIfNotSuccess("Score", pod, ignored, status)
}()
logger := e.logger.With(zap.String("method", "Score"), zap.String("node", nodeName), util.PodNameFields(pod))
logger.Info("Handling Score request")
// scoreLen is the width of the framework's score range; it is used below to convert the
// floating-point score into the framework's integer range.
scoreLen := framework.MaxNodeScore - framework.MinNodeScore
// Double-check that the SchedulerName matches what we're expecting
//
// note: this 'status' shadows the named result, but returning it still assigns the result.
if status := e.checkSchedulerName(logger, pod); status != nil {
return framework.MinNodeScore, status
}
vmInfo, err := e.getVmInfo(logger, pod, "Score")
if err != nil {
logger.Error("Error getting VM info for Pod", zap.Error(err))
return 0, framework.NewStatus(framework.Error, "Error getting info for pod")
}
// note: vmInfo may be nil here if the pod does not correspond to a NeonVM virtual machine
// Everything from here on reads shared plugin state, so it is done while holding the lock.
e.state.lock.Lock()
defer e.state.lock.Unlock()
// Score by total resources available:
node, err := e.state.getOrFetchNodeState(ctx, logger, e.metrics, e.nodeStore, nodeName)
if err != nil {
logger.Error("Error getting node state", zap.Error(err))
return 0, framework.NewStatus(framework.Error, "Error fetching state for node")
}
// Special case: return minimum score if we don't have room
overbudget, verdict := e.speculativeReserve(node, vmInfo, pod, false, func(_ verdictSet, _ bool) bool {
return false // never actually accept the pod; we're just doing this to ask if it's over-budget.
})
if overbudget {
score := framework.MinNodeScore
logger.Warn(
"No room on node, giving minimum score (typically handled by Filter method)",
zap.Int64("score", score),
zap.Object("verdict", verdict),
)
return score, nil
}
cpuRemaining := node.remainingReservableCPU()
cpuTotal := node.cpu.Total
memRemaining := node.remainingReservableMem()
memTotal := node.mem.Total
// *Fraction is the share of the node's total that is NOT remaining reservable.
cpuFraction := 1 - cpuRemaining.AsFloat64()/cpuTotal.AsFloat64()
memFraction := 1 - memRemaining.AsFloat64()/memTotal.AsFloat64()
// *Scale is this node's total relative to the maximum tracked across nodes in e.state.
cpuScale := node.cpu.Total.AsFloat64() / e.state.maxTotalReservableCPU.AsFloat64()
memScale := node.mem.Total.AsFloat64() / e.state.maxTotalReservableMem.AsFloat64()
nodeConf := e.state.conf.NodeConfig
// Refer to the comments in nodeConfig for more. Also, see: https://www.desmos.com/calculator/wg8s0yn63s
//
// calculateScore is piecewise linear in 'fraction': it evaluates to MinUsageScore at fraction=0,
// rises to 1 at fraction=ScorePeak, and falls to MaxUsageScore at fraction=1. The result is then
// multiplied by 'scale'. It returns both the float score and its integer conversion.
calculateScore := func(fraction, scale float64) (float64, int64) {
y0 := nodeConf.MinUsageScore
y1 := nodeConf.MaxUsageScore
xp := nodeConf.ScorePeak
score := float64(1) // if fraction == nodeConf.ScorePeak
if fraction < nodeConf.ScorePeak {
score = y0 + (1-y0)/xp*fraction
} else if fraction > nodeConf.ScorePeak {
score = y1 + (1-y1)/(1-xp)*(1-fraction)
}
score *= scale
return score, framework.MinNodeScore + int64(float64(scoreLen)*score)
}
cpuFScore, cpuIScore := calculateScore(cpuFraction, cpuScale)
memFScore, memIScore := calculateScore(memFraction, memScale)
// The more constrained resource determines the node's score.
score := min(cpuIScore, memIScore)
logger.Info(
"Scored pod placement for node",
zap.Int64("score", score),
zap.Object("verdict", verdictSet{
cpu: fmt.Sprintf(
"%d remaining reservable of %d total => fraction=%g, scale=%g => score=(%g :: %d)",
cpuRemaining, cpuTotal, cpuFraction, cpuScale, cpuFScore, cpuIScore,
),
mem: fmt.Sprintf(
"%d remaining reservable of %d total => fraction=%g, scale=%g => score=(%g :: %d)",
memRemaining, memTotal, memFraction, memScale, memFScore, memIScore,
),
}),
)
return score, nil
}
// NormalizeScore weights scores uniformly in the range [minScore, trueScore], where
// minScore is framework.MinNodeScore + 1.
//
// For every entry in scores whose score is non-zero, the score is replaced in place with a value
// picked by rand.Intn from [minScore, trueScore]. Entries with a score of 0 are left untouched.
//
// The returned status is always nil (success); the named result exists so that the deferred metrics
// call can observe it.
func (e *AutoscaleEnforcer) NormalizeScore(
	ctx context.Context,
	state *framework.CycleState,
	pod *corev1.Pod,
	scores framework.NodeScoreList,
) (status *framework.Status) {
	ignored := e.state.conf.ignoredNamespace(pod.Namespace)
	e.metrics.IncMethodCall("NormalizeScore", pod, ignored)
	defer func() {
		e.metrics.IncFailIfNotSuccess("NormalizeScore", pod, ignored, status)
	}()

	logger := e.logger.With(zap.String("method", "NormalizeScore"), util.PodNameFields(pod))
	logger.Info("Handling NormalizeScore request")

	// Iterate by index and write through scores[i]: the range *value* variable is a per-iteration
	// copy of the element, so assigning to it would never update the list that the scheduler reads
	// back after this call.
	for i := range scores {
		nodeScore := scores[i].Score
		nodeName := scores[i].Name

		// rand.Intn will panic if we pass in 0
		if nodeScore == 0 {
			logger.Info("Ignoring node as it was assigned a score of 0", zap.String("node", nodeName))
			continue
		}

		// This is different from framework.MinNodeScore. We use framework.MinNodeScore
		// to indicate that a pod should not be placed on a node. The lowest
		// actual score we assign a node is thus framework.MinNodeScore + 1
		minScore := framework.MinNodeScore + 1

		// We want to pick a score in the range [minScore, score], so use
		// score _+ 1_ - minscore, as rand.Intn picks a number in the _half open_
		// range [0, n)
		newScore := int64(rand.Intn(int(nodeScore+1-minScore))) + minScore
		logger.Info(
			"Randomly choosing newScore from range [minScore, trueScore]",
			zap.String("node", nodeName),
			zap.Int64("newScore", newScore),
			zap.Int64("minScore", minScore),
			zap.Int64("trueScore", nodeScore),
		)

		scores[i].Score = newScore
	}

	return nil
}
// ScoreExtensions is required for framework.ScorePlugin. It hands back the enforcer itself (so that
// NormalizeScore gets used to randomize scores) only when the RandomizeScores config option is set;
// otherwise it returns nil, which is allowed when the extension is not used.
func (e *AutoscaleEnforcer) ScoreExtensions() framework.ScoreExtensions {
	if !e.state.conf.RandomizeScores {
		return nil
	}
	return e
}
// Reserve is called when a pod will (probably) be bound to a node. It is our opportunity to
// (a) set aside the resources the pod needs on that node and (b) turn the pod away if the node
// does not have enough.
//
// Pods in ignored namespaces are always allowed. A nil status means success.
//
// Required for framework.ReservePlugin
func (e *AutoscaleEnforcer) Reserve(
	ctx context.Context,
	state *framework.CycleState,
	pod *corev1.Pod,
	nodeName string,
) (status *framework.Status) {
	inIgnoredNamespace := e.state.conf.ignoredNamespace(pod.Namespace)
	e.metrics.IncMethodCall("Reserve", pod, inIgnoredNamespace)
	defer func() {
		e.metrics.IncFailIfNotSuccess("Reserve", pod, inIgnoredNamespace, status)
	}()

	reqLogger := e.logger.With(
		zap.String("method", "Reserve"),
		zap.String("node", nodeName),
		util.PodNameFields(pod),
	)
	if migration := util.TryPodOwnerVirtualMachineMigration(pod); migration != nil {
		reqLogger = reqLogger.With(zap.Object("virtualmachinemigration", *migration))
	}
	reqLogger.Info("Handling Reserve request")

	// We generally don't expect plugin requests for ignored resources; allow the Pod regardless.
	if inIgnoredNamespace {
		reqLogger.Warn("Ignoring Reserve request for pod in ignored namespace")
		return nil
	}

	// Make sure the pod was actually meant for this scheduler.
	if nameStatus := e.checkSchedulerName(reqLogger, pod); nameStatus != nil {
		return nameStatus
	}

	opts := reserveOptions{
		// Denying is possible in principle, but ends up being less reliable.
		// See https://github.com/neondatabase/autoscaling/issues/869
		allowDeny: false,
		// No buffer: any later change from the autoscaler-agent has to come through us anyway.
		includeBuffer: false,
		preexisting:   false,
	}
	reserved, verdict, err := e.reserveResources(ctx, reqLogger, pod, "Reserve", opts)
	if err != nil {
		return framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
	}

	if !reserved {
		reqLogger.Error("Rejecting reserve Pod (not enough resources)", zap.Object("verdict", verdict))
		return framework.NewStatus(framework.Unschedulable, "Not enough resources to reserve Pod")
	}

	reqLogger.Info("Allowing reserve Pod", zap.Object("verdict", verdict))
	return nil // nil status == success
}
// Unreserve tells us that a pod is no longer headed for being bound to a node, which lets us give
// back whatever resources were reserved on its behalf.
//
// Required for framework.ReservePlugin.
//
// Note: per the ReservePlugin documentation, Unreserve (a) has to be idempotent and (b) might be
// invoked for a pod that never had a matching Reserve call.
func (e *AutoscaleEnforcer) Unreserve(
	ctx context.Context,
	state *framework.CycleState,
	pod *corev1.Pod,
	nodeName string,
) {
	inIgnoredNamespace := e.state.conf.ignoredNamespace(pod.Namespace)
	e.metrics.IncMethodCall("Unreserve", pod, inIgnoredNamespace)

	namespacedName := util.GetNamespacedName(pod)

	reqLogger := e.logger.With(
		zap.String("method", "Unreserve"),
		zap.String("node", nodeName),
		util.PodNameFields(pod),
	)
	reqLogger.Info("Handling Unreserve request")

	// We generally don't expect plugin requests for ignored resources; nothing to release.
	if inIgnoredNamespace {
		reqLogger.Warn("Ignoring Unreserve request for pod in ignored namespace")
		return
	}

	extraFields, kind, migrating, verdict := e.unreserveResources(reqLogger, namespacedName)
	message := fmt.Sprintf("Unreserved %s Pod", kind)
	reqLogger.With(extraFields...).Info(
		message,
		zap.Bool("migrating", migrating),
		zap.Object("verdict", verdict),
	)
}
package plugin
// defines prometheus metrics and provides the server, via (*AutoscaleEnforcer).startPrometheusServer()
import (
"strconv"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
corev1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/scheduler/framework"
"github.com/neondatabase/autoscaling/pkg/util"
)
// PromMetrics holds the prometheus collectors used by the scheduler plugin. The collectors are
// created and registered by (*AutoscaleEnforcer).makePrometheusRegistry.
type PromMetrics struct {
// pluginCalls counts calls to scheduler plugin extension points.
pluginCalls *prometheus.CounterVec
// pluginCallFails counts unsuccessful calls to scheduler plugin extension points.
pluginCallFails *prometheus.CounterVec
// resourceRequests counts resource requests received by the plugin, labeled by status code.
resourceRequests *prometheus.CounterVec
// validResourceRequests counts resource requests by result (status code, node, has_metrics).
validResourceRequests *prometheus.CounterVec
// nodeCPUResources exposes the current value of each CPU 'nodeResourceState' field, per node.
nodeCPUResources *prometheus.GaugeVec
// nodeMemResources exposes the current value (in bytes) of each memory 'nodeResourceState'
// field, per node.
nodeMemResources *prometheus.GaugeVec
// migrationCreations counts successful VirtualMachineMigration Create requests.
migrationCreations prometheus.Counter
// migrationDeletions counts successful VirtualMachineMigration Delete requests, by phase.
migrationDeletions *prometheus.CounterVec
// migrationCreateFails counts failed VirtualMachineMigration Create requests.
migrationCreateFails prometheus.Counter
// migrationDeleteFails counts failed VirtualMachineMigration Delete requests, by phase.
migrationDeleteFails *prometheus.CounterVec
// reserveShouldDeny counts the times the plugin should deny a reservation.
reserveShouldDeny *prometheus.CounterVec
// eventQueueDepth is the current sum depth of all event queues.
eventQueueDepth prometheus.Gauge
// eventQueueAddsTotal counts events added to event queues.
eventQueueAddsTotal prometheus.Counter
// eventQueueLatency records how long (in seconds) an item stays in an event queue before being
// processed.
eventQueueLatency prometheus.Histogram
}
// makePrometheusRegistry builds a fresh prometheus registry, registers the stock Go and process
// collectors plus every plugin-specific collector on it, stores the plugin collectors in e.metrics,
// and returns the registry.
//
// Collectors are registered one at a time (rather than through a single variadic MustRegister
// call): the calls are cheap, and a panic then points at exactly the metric that caused it.
func (e *AutoscaleEnforcer) makePrometheusRegistry() *prometheus.Registry {
	registry := prometheus.NewRegistry()

	// Stock collectors are registered directly.
	registry.MustRegister(collectors.NewGoCollector())
	registry.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))

	// Small constructors for the collector kinds that appear more than once. Each one creates the
	// collector, registers it via util.RegisterMetric(), and returns it so that it can be assigned
	// straight into the PromMetrics struct.
	counter := func(name, help string) prometheus.Counter {
		opts := prometheus.CounterOpts{Name: name, Help: help}
		return util.RegisterMetric(registry, prometheus.NewCounter(opts))
	}
	counterVec := func(name, help string, labels ...string) *prometheus.CounterVec {
		opts := prometheus.CounterOpts{Name: name, Help: help}
		return util.RegisterMetric(registry, prometheus.NewCounterVec(opts, labels))
	}
	gaugeVec := func(name, help string, labels ...string) *prometheus.GaugeVec {
		opts := prometheus.GaugeOpts{Name: name, Help: help}
		return util.RegisterMetric(registry, prometheus.NewGaugeVec(opts, labels))
	}

	e.metrics = PromMetrics{
		pluginCalls: counterVec(
			"autoscaling_plugin_extension_calls_total",
			"Number of calls to scheduler plugin extension points",
			"method", "desired_availability_zone", "ignored_namespace",
		),
		pluginCallFails: counterVec(
			"autoscaling_plugin_extension_call_fails_total",
			"Number of unsuccessful calls to scheduler plugin extension points",
			"method", "desired_availability_zone", "ignored_namespace", "status",
		),
		resourceRequests: counterVec(
			"autoscaling_plugin_resource_requests_total",
			"Number of resource requests received by the scheduler plugin",
			"code",
		),
		validResourceRequests: counterVec(
			"autoscaling_plugin_resource_requests_results_total",
			"Number of resource requests to the scheduler plugin with various results",
			"code", "node", "has_metrics",
		),
		nodeCPUResources: gaugeVec(
			"autoscaling_plugin_node_cpu_resources_current",
			"Current amount of CPU for 'nodeResourceState' fields",
			"node", "node_group", "availability_zone", "field",
		),
		nodeMemResources: gaugeVec(
			"autoscaling_plugin_node_mem_resources_current",
			"Current amount of memory (in bytes) for 'nodeResourceState' fields",
			"node", "node_group", "availability_zone", "field",
		),
		migrationCreations: counter(
			"autoscaling_plugin_migrations_created_total",
			"Number of successful VirtualMachineMigration Create requests by the plugin",
		),
		migrationDeletions: counterVec(
			"autoscaling_plugin_migrations_deleted_total",
			"Number of successful VirtualMachineMigration Delete requests by the plugin",
			"phase",
		),
		migrationCreateFails: counter(
			"autoscaling_plugin_migration_create_fails_total",
			"Number of failed VirtualMachineMigration Create requests by the plugin",
		),
		migrationDeleteFails: counterVec(
			"autoscaling_plugin_migration_delete_fails_total",
			"Number of failed VirtualMachineMigration Delete requests by the plugin",
			"phase",
		),
		reserveShouldDeny: counterVec(
			"autoscaling_plugin_reserve_should_deny_total",
			"Number of times the plugin should deny a reservation",
			"availability_zone", "node", "node_group",
		),
		eventQueueDepth: util.RegisterMetric(registry, prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "autoscaling_plugin_eventqueue_depth",
			Help: "Current sum depth of all event queues",
		})),
		eventQueueAddsTotal: counter(
			"autoscaling_plugin_eventqueue_adds_total",
			"Total number of events added to event queues",
		),
		eventQueueLatency: util.RegisterMetric(registry, prometheus.NewHistogram(prometheus.HistogramOpts{
			Name:    "autoscaling_plugin_eventqueue_duration_seconds",
			Help:    "How long in seconds an item stays in an event queue before being processed",
			Buckets: prometheus.ExponentialBuckets(10e-9, 10, 12),
		})),
	}

	return registry
}
// IncMethodCall bumps the pluginCalls counter for one call to the given plugin extension point,
// labeled with the pod's preferred availability zone (if present) and whether the pod's namespace
// is ignored.
func (m *PromMetrics) IncMethodCall(method string, pod *corev1.Pod, ignored bool) {
	az := util.PodPreferredAZIfPresent(pod)
	ignoredLabel := strconv.FormatBool(ignored)
	m.pluginCalls.WithLabelValues(method, az, ignoredLabel).Inc()
}
// IncFailIfNotSuccess increments the pluginCallFails counter for the given plugin extension point
// if (and only if) status is not a success. Successful calls are not recorded here.
//
// The counter is labeled with the method, the pod's preferred availability zone (if present),
// whether the pod's namespace is ignored, and the string form of the status code.
//
// NOTE(review): callers pass a nil status to mean success; this relies on
// (*framework.Status).IsSuccess being safe to call on a nil receiver — confirm.
func (m *PromMetrics) IncFailIfNotSuccess(method string, pod *corev1.Pod, ignored bool, status *framework.Status) {
	if status.IsSuccess() {
		return
	}

	m.pluginCallFails.
		WithLabelValues(method, util.PodPreferredAZIfPresent(pod), strconv.FormatBool(ignored), status.Code().String()).
		Inc()
}
// IncReserveShouldDeny bumps the reserveShouldDeny counter, labeled with the pod's preferred
// availability zone (if present) and the name and node group of the node.
func (m *PromMetrics) IncReserveShouldDeny(pod *corev1.Pod, node *nodeState) {
	az := util.PodPreferredAZIfPresent(pod)
	counter := m.reserveShouldDeny.WithLabelValues(az, node.name, node.nodeGroup)
	counter.Inc()
}
package plugin
// Implementation of a metrics-based migration priority queue over vmPodStates
import (
"container/heap"
)
// migrationQueue is a priority queue of VM pods, ordered by (*vmPodState).isBetterMigrationTarget.
//
// It implements container/heap.Interface. Each element's MqIndex field is kept equal to its
// current position in the slice, or set to -1 when it is removed from the queue.
type migrationQueue []*vmPodState
///////////////////////
// package-local API //
///////////////////////
// addOrUpdate makes sure vm is in the queue at the right position: a VM that is already present
// (MqIndex != -1) gets its position re-established, and one that is absent gets pushed.
func (mq *migrationQueue) addOrUpdate(vm *vmPodState) {
	if idx := vm.MqIndex; idx != -1 {
		heap.Fix(mq, idx)
		return
	}
	heap.Push(mq, vm)
}
// isNextInQueue reports whether vm is the element that the next heap.Pop would return.
//
// heap.Pop is documented as equivalent to heap.Remove(h, 0), so being the next pop target is the
// same thing as sitting at index zero.
func (mq migrationQueue) isNextInQueue(vm *vmPodState) bool {
	const headIndex = 0
	return vm.MqIndex == headIndex
}
// removeIfPresent takes vm out of the queue and marks it as absent (MqIndex = -1). It does nothing
// when vm is not currently in the queue.
func (mq *migrationQueue) removeIfPresent(vm *vmPodState) {
	if vm.MqIndex == -1 {
		return
	}
	_ = heap.Remove(mq, vm.MqIndex) // the removed element is vm itself; nothing to do with it
	vm.MqIndex = -1
}
//////////////////////////////////////
// container/heap.Interface methods //
//////////////////////////////////////
// Len returns the number of VMs in the queue. Part of container/heap.Interface.
func (mq migrationQueue) Len() int {
	return len(mq)
}
// Less orders the element at i before the one at j when it is the better migration target.
// Part of container/heap.Interface.
func (mq migrationQueue) Less(i, j int) bool {
	candidate, other := mq[i], mq[j]
	return candidate.isBetterMigrationTarget(other)
}
// Swap exchanges the elements at i and j and updates each one's MqIndex to its new position.
// Part of container/heap.Interface.
func (mq migrationQueue) Swap(i, j int) {
	atI, atJ := mq[i], mq[j]
	mq[i], mq[j] = atJ, atI
	atJ.MqIndex = i
	atI.MqIndex = j
}
// Push appends v (which must be a *vmPodState) to the end of the queue and records its index.
// Part of container/heap.Interface; use heap.Push (or addOrUpdate) rather than calling it directly.
func (mq *migrationQueue) Push(v any) {
	vm := v.(*vmPodState)
	vm.MqIndex = len(*mq)
	*mq = append(*mq, vm)
}
// Pop removes and returns the last element of the underlying slice, marking it as absent from the
// queue. Part of container/heap.Interface.
//
// Adapted from the example at https://pkg.go.dev/container/heap
func (mq *migrationQueue) Pop() any {
	queue := *mq
	last := len(queue) - 1
	vm := queue[last]
	queue[last] = nil // drop the reference so the slot doesn't keep the VM alive
	vm.MqIndex = -1   // for safety
	*mq = queue[:last]
	return vm
}
package plugin
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"strconv"
"time"
"github.com/tychoish/fun/srv"
"go.uber.org/zap"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
)
const (
// MaxHTTPBodySize is the maximum number of bytes of a request body that the resource request
// handler will read when decoding an agent request.
MaxHTTPBodySize int64 = 1 << 10 // 1 KiB
// ContentTypeJSON is the Content-Type set on successful (JSON) responses.
ContentTypeJSON string = "application/json"
// ContentTypeError is the Content-Type set on error responses whose body is a plain message.
ContentTypeError string = "text/plain"
)
// The scheduler plugin currently supports v3.0 to v5.0 of the agent<->scheduler plugin protocol.
//
// Requests whose protocol range shares no version with [MinPluginProtocolVersion,
// MaxPluginProtocolVersion] are rejected by handleAgentRequest.
//
// If you update either of these values, make sure to also update VERSIONING.md.
const (
// MinPluginProtocolVersion is the oldest protocol version the plugin accepts.
MinPluginProtocolVersion api.PluginProtoVersion = api.PluginProtoV3_0
// MaxPluginProtocolVersion is the newest protocol version the plugin accepts.
MaxPluginProtocolVersion api.PluginProtoVersion = api.PluginProtoV5_0
)
// startPermitHandler runs the server for handling each resourceRequest from a pod
//
// It builds an HTTP handler that accepts POST requests with a JSON-encoded api.AgentRequest body
// (at most MaxHTTPBodySize bytes are read), passes them to handleAgentRequest, and writes back
// either the JSON-encoded response or a plain-text error message with the returned status code.
//
// The server listens on 0.0.0.0:10299. It is started with ctx and then added to the orchestrator
// found in ctx; an error is returned if either step fails.
func (e *AutoscaleEnforcer) startPermitHandler(ctx context.Context, logger *zap.Logger) error {
mux := http.NewServeMux()
mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
logger := logger // copy locally, so that we can add fields and refer to it in defers
// finalStatus is the status code recorded in the resourceRequests metric when the handler exits.
var finalStatus int
// Registered first, so it runs last -- after the panic handler below has had the chance to
// set finalStatus to 500.
defer func() {
e.metrics.resourceRequests.WithLabelValues(strconv.Itoa(finalStatus)).Inc()
}()
// Catch any potential panics and report them as 500s
defer func() {
if err := recover(); err != nil {
msg := "request handler panicked"
logger.Error(msg, zap.String("error", fmt.Sprint(err)))
finalStatus = 500
w.WriteHeader(finalStatus)
_, _ = w.Write([]byte(msg))
}
}()
if r.Method != "POST" {
finalStatus = 400
w.WriteHeader(400)
_, _ = w.Write([]byte("must be POST"))
return
}
defer r.Body.Close()
var req api.AgentRequest
// Only read up to MaxHTTPBodySize bytes of the body; anything longer fails to decode.
jsonDecoder := json.NewDecoder(io.LimitReader(r.Body, MaxHTTPBodySize))
if err := jsonDecoder.Decode(&req); err != nil {
logger.Warn("Received bad JSON in request", zap.Error(err))
w.Header().Add("Content-Type", ContentTypeError)
finalStatus = 400
w.WriteHeader(400)
_, _ = w.Write([]byte("bad JSON"))
return
}
// From here on, every log line for this request carries the pod, client address and request.
logger = logger.With(
zap.Object("pod", req.Pod),
zap.String("client", r.RemoteAddr),
zap.Any("request", req),
)
resp, statusCode, err := e.handleAgentRequest(logger, req)
finalStatus = statusCode
if err != nil {
// 5xx responses are logged as errors; everything else is only a warning.
logFunc := logger.Warn
if 500 <= statusCode && statusCode < 600 {
logFunc = logger.Error
}
logFunc(
"Responding to autoscaler-agent request with error",
zap.Int("status", statusCode),
zap.Error(err),
)
w.Header().Add("Content-Type", ContentTypeError)
w.WriteHeader(statusCode)
_, _ = w.Write([]byte(err.Error()))
return
}
responseBody, err := json.Marshal(&resp)
if err != nil {
// The resulting panic is caught by the deferred recover above and reported as a 500.
logger.Panic("Failed to encode response JSON", zap.Error(err))
}
w.Header().Add("Content-Type", ContentTypeJSON)
w.WriteHeader(statusCode)
_, _ = w.Write(responseBody)
})
orca := srv.GetOrchestrator(ctx)
logger.Info("Starting resource request server")
hs := srv.HTTP("resource-request", 5*time.Second, &http.Server{Addr: "0.0.0.0:10299", Handler: mux})
if err := hs.Start(ctx); err != nil {
return fmt.Errorf("Error starting resource request server: %w", err)
}
if err := orca.Add(hs); err != nil {
return fmt.Errorf("Error adding resource request server to orchestrator: %w", err)
}
return nil
}
// handleAgentRequest validates and processes a single request from an autoscaler-agent.
//
// Returns body (if successful), status code, error (if unsuccessful)
//
// The request is first validated without holding the state lock (protocol version, presence of
// metrics, non-zero compute unit). The lock is then taken for the rest of the call, during which
// the pod is looked up, its metrics are updated, its resources are adjusted via handleResources,
// and -- if required -- a migration is started.
//
// Every call increments the validResourceRequests metric with the final status code, the node name
// (or "<none>" if the request failed before the node was known), and whether metrics were provided.
func (e *AutoscaleEnforcer) handleAgentRequest(
logger *zap.Logger,
req api.AgentRequest,
) (_ *api.PluginResponse, status int, _ error) {
nodeName := "<none>" // override this later if we have a node name
// Uses the named result 'status', so it records whichever status code is finally returned.
defer func() {
hasMetrics := req.Metrics != nil
e.metrics.validResourceRequests.
WithLabelValues(strconv.Itoa(status), nodeName, strconv.FormatBool(hasMetrics)).
Inc()
}()
// Before doing anything, check that the version is within the range we're expecting.
expectedProtoRange := api.VersionRange[api.PluginProtoVersion]{
Min: MinPluginProtocolVersion,
Max: MaxPluginProtocolVersion,
}
if !req.ProtoVersion.IsValid() {
return nil, 400, fmt.Errorf("Invalid protocol version %v", req.ProtoVersion)
}
reqProtoRange := req.ProtocolRange()
if _, ok := expectedProtoRange.LatestSharedVersion(reqProtoRange); !ok {
return nil, 400, fmt.Errorf(
"Protocol version mismatch: Need %v but got %v", expectedProtoRange, reqProtoRange,
)
}
// if req.Metrics is nil, check that the protocol version allows that.
if req.Metrics == nil && !req.ProtoVersion.AllowsNilMetrics() {
return nil, 400, fmt.Errorf("nil metrics not supported for protocol version %v", req.ProtoVersion)
}
// check that req.ComputeUnit has no zeros
if err := req.ComputeUnit.ValidateNonZero(); err != nil {
return nil, 400, fmt.Errorf("computeUnit fields must be non-zero: %w", err)
}
// check that nil-ness of req.Metrics.{LoadAverage5Min,MemoryUsageBytes} match what's expected
// for the protocol version.
if req.Metrics != nil {
if (req.Metrics.LoadAverage5Min != nil) != (req.Metrics.MemoryUsageBytes != nil) {
return nil, 400, fmt.Errorf("presence of metrics.loadAvg5M must match presence of metrics.memoryUsageBytes")
} else if req.Metrics.LoadAverage5Min == nil && req.ProtoVersion.IncludesExtendedMetrics() {
return nil, 400, fmt.Errorf("nil metrics.{loadAvg5M,memoryUsageBytes} not supported for protocol version %v", req.ProtoVersion)
} else if req.Metrics.LoadAverage5Min != nil && !req.ProtoVersion.IncludesExtendedMetrics() {
return nil, 400, fmt.Errorf("non-nil metrics.{loadAvg5M,memoryUsageBytes} not supported for protocol version %v", req.ProtoVersion)
}
}
// Everything below reads or modifies shared plugin state, so it is done while holding the lock.
e.state.lock.Lock()
defer e.state.lock.Unlock()
pod, ok := e.state.pods[req.Pod]
if !ok {
logger.Warn("Received request for Pod we don't know") // pod already in the logger's context
return nil, 404, errors.New("pod not found")
}
if pod.vm == nil {
logger.Error("Received request for non-VM Pod")
return nil, 400, errors.New("pod is not associated with a VM")
}
// Check that req.ComputeUnit.Mem is divisible by the VM's memory slot size
if req.ComputeUnit.Mem%pod.vm.MemSlotSize != 0 {
return nil, 400, fmt.Errorf(
"computeUnit is not divisible by VM memory slot size: %v not divisible by %v",
req.ComputeUnit,
pod.vm.MemSlotSize,
)
}
// If the request was actually sending a quantity of *memory slots*, rather than bytes, then
// multiply the requested memory by the VM's memory slot size to convert it into bytes.
if !req.ProtoVersion.RepresentsMemoryAsBytes() {
req.Resources.Mem *= pod.vm.MemSlotSize
}
node := pod.node
nodeName = node.name // set nodeName for deferred metrics
// Also, now that we know which VM this refers to (and which node it's on), add that to the logger for later.
logger = logger.With(zap.Object("virtualmachine", pod.vm.Name), zap.String("node", nodeName))
// note: because of short-circuiting, the pod's metrics are only updated when pod.vm.MigrationState
// is nil.
mustMigrate := pod.vm.MigrationState == nil &&
// Check whether the pod *will* migrate, then update its resources, and THEN start its
// migration, using the possibly-changed resources.
e.updateMetricsAndCheckMustMigrate(logger, pod.vm, node, req.Metrics)
supportsFractionalCPU := req.ProtoVersion.SupportsFractionalCPU()
// note: 'status' here is the named result (it is already declared in this scope), so the
// deferred metrics call above sees the value assigned by this statement.
verdict, permit, status, err := e.handleResources(
pod,
node,
req.ComputeUnit,
req.Resources,
req.LastPermit,
mustMigrate,
supportsFractionalCPU,
)
if err != nil {
return nil, status, err
}
var migrateDecision *api.MigrateResponse
if mustMigrate {
created, err := e.startMigration(context.Background(), logger, pod)
if err != nil {
return nil, 500, fmt.Errorf("Error starting migration for pod %v: %w", pod.name, err)
}
// We should only signal to the autoscaler-agent that we've started migrating if we actually
// *created* the migration. We're not *supposed* to receive requests for a VM that's already
// migrating, so receiving one means that *something*'s gone wrong.
//
// NOTE(review): the original comment was cut off after "If that's on us, we should try to
// avoid"; presumably the intent is to avoid telling the agent that it is migrating when we did
// not create the migration ourselves -- confirm.
if created {
migrateDecision = &api.MigrateResponse{}
}
}
status = 200
resp := api.PluginResponse{
Permit: permit,
Migrate: migrateDecision,
}
logger.Info(
"Handled agent request",
zap.Object("verdict", verdict),
zap.Int("status", status),
zap.Any("response", resp),
)
return &resp, status, nil
}
// handleResources applies an agent's resource request for pod (running on node) and returns, in
// order: a verdict describing what happened for CPU and memory, the resources the pod is permitted
// to use afterwards, an HTTP status code, and an error if the request was rejected.
//
//   - cu is the agent's compute unit; its fields are passed on as the per-resource factors (the CPU
//     factor is replaced by 1000 when the agent does not support fractional CPU).
//   - req is the amount of resources requested.
//   - lastPermit, if non-nil, is the permit that the agent reported last receiving.
//   - startingMigration is forwarded to the resource transition for each resource.
//
// Requests are rejected with status 400 when a fractional CPU amount is requested under a protocol
// version without fractional CPU support, or when a pod that is currently migrating asks for
// anything other than what it already has reserved.
func (e *AutoscaleEnforcer) handleResources(
	pod *podState,
	node *nodeState,
	cu api.Resources,
	req api.Resources,
	lastPermit *api.Resources,
	startingMigration bool,
	supportsFractionalCPU bool,
) (verdictSet, api.Resources, int, error) {
	if req.VCPU%1000 != 0 && !supportsFractionalCPU {
		return verdictSet{}, api.Resources{}, 400,
			errors.New("agent requested fractional CPU with protocol version that does not support it")
	}

	// While migrating, the only acceptable "request" is one for exactly what's already reserved:
	// the agent has been told about the migration and shouldn't be asking for changes.
	if pod.vm.currentlyMigrating() {
		unchanged := req.VCPU == pod.cpu.Reserved && req.Mem == pod.mem.Reserved
		if !unchanged {
			return verdictSet{}, api.Resources{}, 400,
				errors.New("cannot change resources: agent has already been informed that pod is migrating")
		}

		const msg = "No change because pod is migrating"
		current := api.Resources{VCPU: pod.cpu.Reserved, Mem: pod.mem.Reserved}
		return verdictSet{cpu: msg, mem: msg}, current, 200, nil
	}

	cpuFactor, memFactor := cu.VCPU, cu.Mem
	if !supportsFractionalCPU {
		cpuFactor = 1000
	}

	var (
		prevCPUPermit *vmapi.MilliCPU
		prevMemPermit *api.Bytes
	)
	if lastPermit != nil {
		prevCPUPermit, prevMemPermit = &lastPermit.VCPU, &lastPermit.Mem
	}

	cpuTransition := makeResourceTransitioner(&node.cpu, &pod.cpu)
	cpuVerdict := cpuTransition.handleRequested(req.VCPU, prevCPUPermit, startingMigration, cpuFactor)

	memTransition := makeResourceTransitioner(&node.mem, &pod.mem)
	memVerdict := memTransition.handleRequested(req.Mem, prevMemPermit, startingMigration, memFactor)

	// The permit is whatever ended up reserved for the pod after both transitions.
	granted := api.Resources{VCPU: pod.cpu.Reserved, Mem: pod.mem.Reserved}
	return verdictSet{cpu: cpuVerdict, mem: memVerdict}, granted, 200, nil
}
// updateMetricsAndCheckMustMigrate stores the newly reported metrics on vm, updates vm's position
// in node's migration queue (unless vm is currently migrating), and reports whether vm must now be
// migrated.
//
// The statement order matters: the migration conditions are evaluated against the *previous*
// metrics and queue position, before vm.Metrics is overwritten and the queue is updated.
func (e *AutoscaleEnforcer) updateMetricsAndCheckMustMigrate(
logger *zap.Logger,
vm *vmPodState,
node *nodeState,
metrics *api.Metrics,
) bool {
// This pod should migrate if (a) it's allowed to migrate, (b) node resource usage is high
// enough that we should migrate *something*, and (c) it's next up in the priority queue.
// We will give it a chance later to veto if the metrics have changed too much.
//
// Alternatively, "the pod is marked to always migrate" causes it to migrate even if none of
// the above conditions are met, so long as it has *previously* provided metrics.
canMigrate := vm.Config.AutoMigrationEnabled && e.state.conf.migrationEnabled()
shouldMigrate := node.mq.isNextInQueue(vm) && node.tooMuchPressure(logger)
// note: vm.Metrics still holds the previous metrics at this point.
forcedMigrate := vm.Config.AlwaysMigrate && vm.Metrics != nil
logger.Info("Updating pod metrics", zap.Any("metrics", metrics))
oldMetrics := vm.Metrics
vm.Metrics = metrics
if vm.currentlyMigrating() {
return false // don't do anything else; it's already migrating.
}
// Re-establish the pod's position in the queue now that its metrics have changed.
node.mq.addOrUpdate(vm)
// nb: forcedMigrate takes priority over canMigrate
if (!canMigrate || !shouldMigrate) && !forcedMigrate {
return false
}
// Give the pod a chance to veto migration if its metrics have significantly changed...
var veto error
if oldMetrics != nil && !forcedMigrate {
veto = vm.checkOkToMigrate(*oldMetrics)
}
// ... but override the veto if it's still the best candidate anyways.
stillFirst := node.mq.isNextInQueue(vm)
if forcedMigrate || stillFirst || veto == nil {
if veto != nil {
logger.Info("Pod attempted veto of self migration, still highest priority", zap.NamedError("veto", veto))
}
return true
} else {
logger.Warn("Pod vetoed self migration", zap.NamedError("veto", veto))
return false
}
}
package plugin
// Definitions and helper functions for managing plugin state
import (
"context"
"errors"
"fmt"
"time"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/watch"
"github.com/neondatabase/autoscaling/pkg/util/xact"
)
// pluginState stores the private state for the plugin, used both within and outside of the
// predefined scheduler plugin points
//
// Accessing the individual fields MUST be done while holding the lock, with some exceptions.
type pluginState struct {
// lock guards the fields below (except where noted otherwise).
lock util.ChanMutex
// ongoingMigrationDeletions is keyed by namespaced name.
// NOTE(review): presumably the value counts in-progress deletions of the named
// VirtualMachineMigration -- confirm against the code that updates it.
ongoingMigrationDeletions map[util.NamespacedName]int
// pods stores the state of each pod we know about, keyed by the pod's namespaced name.
pods map[util.NamespacedName]*podState
// nodes stores the state of each node we know about, keyed by node name.
nodes map[string]*nodeState
// maxTotalReservableCPU stores the maximum value of any node's totalReservableCPU(), so that we
// can appropriately scale our scoring
maxTotalReservableCPU vmapi.MilliCPU
// maxTotalReservableMem is the same as maxTotalReservableCPU, but for bytes of memory instead
// of CPU
maxTotalReservableMem api.Bytes
// conf stores the current configuration, and is nil if the configuration has not yet been set
//
// Proper initialization of the plugin guarantees conf is not nil.
//
// conf MAY be accessed without holding the lock; it MUST not be modified.
conf *Config
}
// nodeState is the information that we track for a particular node
type nodeState struct {
// name is the name of the node, guaranteed by kubernetes to be unique
name string
// nodeGroup, if present, gives the node group that this node belongs to.
nodeGroup string
// availabilityZone, if present, gives the availability zone that this node is in.
availabilityZone string
// cpu tracks the state of vCPU resources -- what's available and how
cpu nodeResourceState[vmapi.MilliCPU]
// mem tracks the state of bytes of memory -- what's available and how
mem nodeResourceState[api.Bytes]
// pods tracks all the VM pods assigned to this node
//
// This includes both bound pods (i.e., pods fully committed to the node) and reserved pods
// (still may be unreserved)
pods map[util.NamespacedName]*podState
// mq is the priority queue tracking which pods should be chosen first for migration
mq migrationQueue
}
// nodeResourceStateField pairs the name of a nodeResourceState field with its current value. It is
// produced by (*nodeResourceState).fields, and valueName is used as the "field" label when
// exporting per-node metrics.
type nodeResourceStateField[T any] struct {
// valueName is the name of the field (e.g. "Total").
valueName string
// value is the field's value at the time fields() was called.
value T
}
// fields returns a snapshot of every field of the resource state, each paired with its name, in
// declaration order.
func (s *nodeResourceState[T]) fields() []nodeResourceStateField[T] {
	snapshot := []nodeResourceStateField[T]{
		{valueName: "Total", value: s.Total},
		{valueName: "Watermark", value: s.Watermark},
		{valueName: "Reserved", value: s.Reserved},
		{valueName: "Buffer", value: s.Buffer},
		{valueName: "CapacityPressure", value: s.CapacityPressure},
		{valueName: "PressureAccountedFor", value: s.PressureAccountedFor},
	}
	return snapshot
}
// updateMetrics publishes the node's current CPU and memory resource state to the per-node gauges.
func (s *nodeState) updateMetrics(metrics PromMetrics) {
	name, group, az := s.name, s.nodeGroup, s.availabilityZone

	s.cpu.updateMetrics(metrics.nodeCPUResources, name, group, az, vmapi.MilliCPU.AsFloat64)
	s.mem.updateMetrics(metrics.nodeMemResources, name, group, az, api.Bytes.AsFloat64)
}
// updateMetrics sets one gauge per field of the resource state, labeled with the node's name, node
// group, availability zone and the field's name. convert turns a value of type T into the float64
// that the gauge stores.
func (s *nodeResourceState[T]) updateMetrics(
	metric *prometheus.GaugeVec,
	nodeName string,
	nodeGroup string,
	availabilityZone string,
	convert func(T) float64,
) {
	all := s.fields()
	for i := range all {
		gauge := metric.WithLabelValues(nodeName, nodeGroup, availabilityZone, all[i].valueName)
		gauge.Set(convert(all[i].value))
	}
}
// removeMetrics deletes this node's entries from both the CPU and the memory per-node gauges.
func (s *nodeState) removeMetrics(metrics PromMetrics) {
	perNodeGauges := [...]*prometheus.GaugeVec{metrics.nodeCPUResources, metrics.nodeMemResources}

	// Only the valueNames are needed here, and they're the same for CPU and memory -- so there's
	// no particular reason for this to be CPU.
	named := s.cpu.fields()

	for _, gauge := range perNodeGauges {
		for i := range named {
			gauge.DeleteLabelValues(s.name, s.nodeGroup, s.availabilityZone, named[i].valueName)
		}
	}
}
// nodeResourceState describes the state of a resource allocated to a node
type nodeResourceState[T any] struct {
// Total is the total amount of T available on the node. This value does not change.
Total T `json:"total"`
// Watermark is the amount of T reserved to pods above which we attempt to reduce usage via
// migration.
Watermark T `json:"watermark"`
// Reserved is the current amount of T reserved to pods. It SHOULD be less than or equal to
// Total, and we take active measures to reduce it once it is above Watermark.
//
// Reserved MAY be greater than Total on scheduler restart (because of buffering with VM scaling
// maximums), but (Reserved - Buffer) MUST be less than Total. In general, (Reserved - Buffer)
// SHOULD be less than or equal to Total, but this can be temporarily violated after restart.
//
// For more information, refer to the ARCHITECTURE.md file in this directory.
//
// Reserved is always exactly equal to the sum of all of this node's pods' Reserved T.
Reserved T `json:"reserved"`
// Buffer *mostly* matters during startup. It tracks the total amount of T that we don't
// *expect* is currently in use, but is still reserved to the pods because we can't prevent the
// autoscaler-agents from making use of it.
//
// Buffer is always exactly equal to the sum of all this node's pods' Buffer for T.
Buffer T `json:"buffer"`
// CapacityPressure is -- roughly speaking -- the amount of T that we're currently denying to
// pods in this node when they request it, due to not having space in remainingReservableCPU().
// This value is exactly equal to the sum of each pod's CapacityPressure.
//
// This value is used alongside the "logical pressure" (equal to Reserved - Watermark, if
// nonzero) in tooMuchPressure() to determine if more pods should be migrated off the node to
// free up pressure.
CapacityPressure T `json:"capacityPressure"`
// PressureAccountedFor gives the total pressure expected to be relieved by ongoing migrations.
// This is equal to the sum of Reserved + CapacityPressure for all pods currently migrating.
//
// The value may be larger than CapacityPressure.
PressureAccountedFor T `json:"pressureAccountedFor"`
}
// podState is the information we track for an individual pod, which may or may not be associated
// with a VM
//
// Whether the pod is a VM pod is determined by the vm field: it is non-nil for VM pods and nil
// otherwise (see kind()).
type podState struct {
// name is the namespace'd name of the pod
//
// name will not change after initialization, so it can be accessed without holding a lock.
name util.NamespacedName
// node provides information about the node that this pod is bound to or reserved onto.
node *nodeState
// cpu is the current state of this pod's vCPU utilization and pressure
cpu podResourceState[vmapi.MilliCPU]
// mem is the current state of this pod's memory utilization and pressure, in bytes
mem podResourceState[api.Bytes]
// vm stores the extra information associated with VMs. It is nil for non-VM pods.
vm *vmPodState
}
// vmPodState is the extra, VM-specific information tracked for a pod that belongs to a VM. It is
// stored in podState.vm.
type vmPodState struct {
// Name is the name of the VM, as given by the owner reference for the VM or VM migration that
// owns this pod
Name util.NamespacedName
// MemSlotSize stores the value of the VM's .Spec.Guest.MemorySlotSize, for compatibility with
// earlier versions of the agent<->plugin protocol.
MemSlotSize api.Bytes
// Config stores the values of per-VM settings for this VM
Config api.VmConfig
// Metrics is the most recent Metrics update we received for this pod. A nil pointer means that
// we have not yet received Metrics.
Metrics *api.Metrics
// MqIndex stores this pod's index in the migrationQueue. This value is -1 iff Metrics is nil or
// the pod is currently migrating.
MqIndex int
// MigrationState gives current information about an ongoing migration, if this pod is currently
// migrating. It is nil when the pod is not migrating (see currentlyMigrating()).
MigrationState *podMigrationState
}
// podMigrationState tracks the information about an ongoing VM pod's migration
//
// It is stored in vmPodState.MigrationState for as long as the migration is ongoing.
type podMigrationState struct {
// Name gives the name of the VirtualMachineMigration that this pod is involved in
Name util.NamespacedName
}
// podResourceState describes the state of a single resource (counted in units of T) for one pod.
// It is the per-pod counterpart of nodeResourceState.
type podResourceState[T any] struct {
// Reserved is the amount of T that this pod has reserved. It is guaranteed that the pod is
// using AT MOST Reserved T.
Reserved T `json:"reserved"`
// Buffer is the amount of T that we've included in Reserved to account for the possibility of
// unilateral increases by the autoscaler-agent
//
// This value is only nonzero during startup (between initial state load and first communication
// from the autoscaler-agent), and MUST be less than or equal to Reserved.
//
// After the first communication from the autoscaler-agent, we update Reserved to match its
// value, and set Buffer to zero.
Buffer T `json:"buffer"`
// CapacityPressure is this pod's contribution to this pod's node's CapacityPressure for this
// resource
CapacityPressure T `json:"capacityPressure"`
// Min and Max give the minimum and maximum values of this resource that the VM may use.
//
// For non-VM pods, speculativeReserve sets both to the pod's requested amount.
Min T `json:"min"`
Max T `json:"max"`
}
// kind returns a short label for the pod, for use in log messages: "VM" if the pod has
// VM-specific state attached, and "non-VM" otherwise.
func (p *podState) kind() string {
	if p.vm == nil {
		return "non-VM"
	}
	return "VM"
}
// logFields returns the zap fields identifying this pod: always the "pod" name, followed by the
// "virtualmachine" name if the pod is a VM pod.
func (p *podState) logFields() []zap.Field {
	fields := []zap.Field{zap.Object("pod", p.name)}
	if p.vm != nil {
		fields = append(fields, zap.Object("virtualmachine", p.vm.Name))
	}
	return fields
}
// remainingReservableCPU returns the remaining CPU that can be allocated to VM pods
//
// This is the node's total CPU minus what is currently reserved, computed with a saturating
// subtraction.
func (s *nodeState) remainingReservableCPU() vmapi.MilliCPU {
	total, reserved := s.cpu.Total, s.cpu.Reserved
	return util.SaturatingSub(total, reserved)
}
// remainingReservableMem returns the remaining number of bytes of memory that can be allocated to
// VM pods
//
// This is the node's total memory minus what is currently reserved, computed with a saturating
// subtraction.
func (s *nodeState) remainingReservableMem() api.Bytes {
	total, reserved := s.mem.Total, s.mem.Reserved
	return util.SaturatingSub(total, reserved)
}
// tooMuchPressure is used to signal whether the node should start migrating pods out in order to
// relieve some of the pressure
//
// If both CPU and memory are reserved at or below their watermarks, the answer is immediately
// false. Otherwise, for each resource, the "logical pressure" (amount reserved above the
// watermark) plus CapacityPressure is compared against PressureAccountedFor plus the "logical
// slack" (Buffer plus the amount reserved below the watermark). There is too much pressure if
// that comparison comes out greater for either resource.
//
// The inputs to the decision and its result are logged at debug level via logger.
func (s *nodeState) tooMuchPressure(logger *zap.Logger) bool {
	// Fast path: neither resource is above its watermark. Both comparisons are inclusive -- being
	// exactly at the watermark means zero logical pressure, for memory just as for CPU.
	if s.cpu.Reserved <= s.cpu.Watermark && s.mem.Reserved <= s.mem.Watermark {
		type okPair[T any] struct {
			Reserved  T
			Watermark T
		}

		logger.Debug(
			"tooMuchPressure = false (clearly)",
			zap.Any("cpu", okPair[vmapi.MilliCPU]{Reserved: s.cpu.Reserved, Watermark: s.cpu.Watermark}),
			zap.Any("mem", okPair[api.Bytes]{Reserved: s.mem.Reserved, Watermark: s.mem.Watermark}),
		)
		return false
	}

	// info collects the intermediate values for one resource, so that they can be logged together.
	type info[T any] struct {
		LogicalPressure T
		LogicalSlack    T
		Capacity        T
		AccountedFor    T
		TooMuch         bool
	}

	var cpu info[vmapi.MilliCPU]
	var mem info[api.Bytes]

	cpu.LogicalPressure = util.SaturatingSub(s.cpu.Reserved, s.cpu.Watermark)
	mem.LogicalPressure = util.SaturatingSub(s.mem.Reserved, s.mem.Watermark)

	// Account for existing slack in the system, to counteract capacityPressure that hasn't been
	// updated yet
	cpu.LogicalSlack = s.cpu.Buffer + util.SaturatingSub(s.cpu.Watermark, s.cpu.Reserved)
	mem.LogicalSlack = s.mem.Buffer + util.SaturatingSub(s.mem.Watermark, s.mem.Reserved)

	cpu.TooMuch = cpu.LogicalPressure+s.cpu.CapacityPressure > s.cpu.PressureAccountedFor+cpu.LogicalSlack
	mem.TooMuch = mem.LogicalPressure+s.mem.CapacityPressure > s.mem.PressureAccountedFor+mem.LogicalSlack

	result := cpu.TooMuch || mem.TooMuch

	logger.Debug(
		fmt.Sprintf("tooMuchPressure = %v", result),
		zap.Any("cpu", cpu),
		zap.Any("mem", mem),
	)

	return result
}
// checkOkToMigrate allows us to check that it's still ok to start migrating a pod, after it was
// previously selected for migration
//
// A returned error indicates that the pod's resource usage has changed enough that we should try to
// migrate something else first. The error provides justification for this.
//
// NOTE: this is currently an unimplemented stub. oldMetrics is ignored and the result is always
// nil.
func (s *vmPodState) checkOkToMigrate(oldMetrics api.Metrics) error {
// TODO. Note: s.Metrics may be nil.
return nil
}
// currentlyMigrating reports whether the VM pod has an ongoing migration recorded, i.e. whether
// its MigrationState is set.
func (s *vmPodState) currentlyMigrating() bool {
	migration := s.MigrationState
	return migration != nil
}
// getOrFetchNodeState returns the nodeState for nodeName, building and storing it in s.nodes first
// if it isn't already tracked there.
//
// If the node is not yet tracked, the Node object is looked up in store. If it is missing from the
// store as well, a relist of the store is requested and awaited -- bounded by a fixed 5 second
// timeout and by ctx -- before looking it up again.
//
// this method can only be called while holding a lock. If we don't have the necessary information
// locally, then the lock is released temporarily while we wait on the store's relist.
//
// A lock will ALWAYS be held on return from this function.
//
// An error is returned if the relist times out, if ctx is done before the relist finishes, if the
// node still cannot be found after relisting, or if buildInitialNodeState fails.
func (s *pluginState) getOrFetchNodeState(
ctx context.Context,
logger *zap.Logger,
metrics PromMetrics,
store IndexedNodeStore,
nodeName string,
) (*nodeState, error) {
logger = logger.With(zap.String("node", nodeName))
// Fast path: we already have state for this node.
if n, ok := s.nodes[nodeName]; ok {
logger.Debug("Using stored information for node")
return n, nil
}
logger.Info("Node has not yet been processed, fetching from store")
accessor := func(index *watch.FlatNameIndex[corev1.Node]) (*corev1.Node, bool) {
return index.Get(nodeName)
}
// Before unlocking, try to get the node from the store.
node, ok := store.GetIndexed(accessor)
if !ok {
logger.Warn("Node is missing from local store. Relisting to try getting it from API server")
s.lock.Unlock() // Unlock to let other goroutines progress while we get the data we need
var locked bool // In order to prevent double-unlock panics, we always lock on return.
// On the early-return (error) paths below, the lock has not been re-acquired yet, so this
// deferred function takes it. On the success path, locked is set before we re-lock
// explicitly, which turns this into a no-op.
defer func() {
if !locked {
s.lock.Lock()
}
}()
// Use a reasonable timeout on the relist request, so that if the store is broken, we won't
// block forever.
//
// FIXME: make this configurable
timeout := 5 * time.Second
timer := time.NewTimer(timeout)
defer timer.Stop()
// Wait for whichever comes first: the relist completing, the timeout, or ctx being done.
select {
case <-store.Relist():
case <-timer.C:
message := "Timed out waiting on Node store relist"
logger.Error(message, zap.Duration("timeout", timeout))
return nil, errors.New(message)
case <-ctx.Done():
err := ctx.Err()
message := "Context expired while waiting on Node store relist"
logger.Error(message, zap.Error(err))
return nil, errors.New(message)
}
node, ok = store.GetIndexed(accessor)
if !ok {
// Either the node is already gone, or there's a deeper problem.
message := "Could not find Node, even after relist"
logger.Error(message)
return nil, errors.New(message)
}
logger.Info("Found node after relisting")
// Re-lock and process API result
locked = true
s.lock.Lock()
// It's possible that the node was already added. Don't double-process nodes if we don't have
// to.
if n, ok := s.nodes[nodeName]; ok {
logger.Warn("Local information for node became available while waiting on relist, using it instead")
return n, nil
}
}
// From here on the lock is held, whether or not we had to relist.
n, err := buildInitialNodeState(logger, node, s.conf)
if err != nil {
return nil, err
}
// update maxTotalReservableCPU and maxTotalReservableMem if there's new maxima
if n.cpu.Total > s.maxTotalReservableCPU {
s.maxTotalReservableCPU = n.cpu.Total
}
if n.mem.Total > s.maxTotalReservableMem {
s.maxTotalReservableMem = n.mem.Total
}
n.updateMetrics(metrics)
s.nodes[nodeName] = n
return n, nil
}
// buildInitialNodeState constructs the nodeState for a Node from the resources it reports and
// the plugin's configuration.
//
// For both CPU and memory, the node's Allocatable amount is used if the node reports one, falling
// back to its Capacity otherwise. An error is returned if the node reports neither for a resource.
//
// The node group and availability zone are read from the node's labels, using the label names
// given in conf (if set). A missing label is logged as a warning and leaves the value empty.
//
// This function is expected to be called while holding the plugin state's lock; it does not touch
// the lock itself.
//
// Note: buildInitialNodeState does not take any of the pods or VMs on the node into account; it
// only examines the total resources available to the node.
func buildInitialNodeState(logger *zap.Logger, node *corev1.Node, conf *Config) (*nodeState, error) {
	// nodeQuantity returns the node's Allocatable amount of the named resource if it has one,
	// otherwise its Capacity amount, otherwise (nil, false).
	//
	// The resource is looked up in the maps directly, rather than via the ResourceList.Cpu() /
	// .Memory() accessors, because those never return nil: a missing entry comes back as a pointer
	// to a zero Quantity, which makes "not reported" indistinguishable from "zero" and would
	// leave the Capacity fallback and the error paths below unreachable.
	nodeQuantity := func(name corev1.ResourceName) (*resource.Quantity, bool) {
		// Use Allocatable by default ...
		if q, ok := node.Status.Allocatable[name]; ok {
			return &q, true
		}
		// ... but use Capacity if Allocatable is not available
		if q, ok := node.Status.Capacity[name]; ok {
			return &q, true
		}
		return nil, false
	}

	// cpuQ = "cpu, as a K8s resource.Quantity"
	cpuQ, ok := nodeQuantity(corev1.ResourceCPU)
	if !ok {
		return nil, errors.New("Node has no Allocatable or Capacity CPU limits")
	}
	cpu := conf.NodeConfig.vCpuLimits(cpuQ)

	// memQ = "mem, as a K8s resource.Quantity"
	memQ, ok := nodeQuantity(corev1.ResourceMemory)
	if !ok {
		return nil, errors.New("Node has no Allocatable or Capacity Memory limits")
	}
	mem := conf.NodeConfig.memoryLimits(memQ)

	var nodeGroup string
	if conf.K8sNodeGroupLabel != "" {
		var hasLabel bool
		nodeGroup, hasLabel = node.Labels[conf.K8sNodeGroupLabel]
		if !hasLabel {
			logger.Warn("Node does not have node group label", zap.String("label", conf.K8sNodeGroupLabel))
		}
	}

	var availabilityZone string
	if conf.K8sAvailabilityZoneLabel != "" {
		var hasLabel bool
		availabilityZone, hasLabel = node.Labels[conf.K8sAvailabilityZoneLabel]
		if !hasLabel {
			logger.Warn("Node does not have availability zone label", zap.String("label", conf.K8sAvailabilityZoneLabel))
		}
	}

	n := &nodeState{
		name:             node.Name,
		nodeGroup:        nodeGroup,
		availabilityZone: availabilityZone,
		cpu:              cpu,
		mem:              mem,
		pods:             make(map[util.NamespacedName]*podState),
		mq:               migrationQueue{},
	}

	type resourceInfo[T any] struct {
		Total     T
		Watermark T
	}

	logger.Info(
		"Built initial node state",
		zap.Any("cpu", resourceInfo[vmapi.MilliCPU]{
			Total:     n.cpu.Total,
			Watermark: n.cpu.Watermark,
		}),
		// NB: the log key is "memSlots", but the values logged here are in bytes (api.Bytes).
		zap.Any("memSlots", resourceInfo[api.Bytes]{
			Total:     n.mem.Total,
			Watermark: n.mem.Watermark,
		}),
	)

	return n, nil
}
// extractPodResources returns the total CPU and memory requested by the pod, summed over the
// resource requests of each container in pod.Spec.Containers.
func extractPodResources(pod *corev1.Pod) api.Resources {
	var total api.Resources

	for i := range pod.Spec.Containers {
		requests := pod.Spec.Containers[i].Resources.Requests

		// We count requests (rather than limits) because that matches what cluster-autoscaler
		// uses.
		//
		// NB: .Cpu() and .Memory() return a pointer to a value equal to zero if the resource is
		// not present, so it's fine to add them unconditionally.
		total.VCPU += vmapi.MilliCPUFromResourceQuantity(*requests.Cpu())
		total.Mem += api.BytesFromResourceQuantity(*requests.Memory())
	}

	return total
}
// handleNodeDeletion removes all state tracked for the node named nodeName: the node itself, its
// metrics, and the global records of any pods that were still on it.
//
// If there is no state for the node, a warning is logged and nothing else happens.
//
// This method acquires e.state.lock for its duration.
func (e *AutoscaleEnforcer) handleNodeDeletion(logger *zap.Logger, nodeName string) {
	logger = logger.With(
		zap.String("action", "Node deletion"),
		zap.String("node", nodeName),
	)

	logger.Info("Handling deletion of Node")

	e.state.lock.Lock()
	defer e.state.lock.Unlock()

	node, ok := e.state.nodes[nodeName]
	if !ok {
		// Nothing is tracked for this node, so there's nothing to clean up. We must return here:
		// node is nil, and everything below dereferences it.
		logger.Warn("Cannot find node in nodeMap")
		return
	}

	// Only build the dump if it's actually going to be logged.
	if logger.Core().Enabled(zapcore.DebugLevel) {
		logger.Debug("Dump final node state", zap.Any("state", node.dump()))
	}

	// For any pods still on the node, remove them from the global state:
	for name, pod := range node.pods {
		logger.Warn(
			fmt.Sprintf("Found %s pod still on node at time of deletion", pod.kind()),
			pod.logFields()...,
		)
		delete(e.state.pods, name)
	}

	node.removeMetrics(e.metrics)

	delete(e.state.nodes, nodeName)
	logger.Info("Deleted node")
}
// handleStarted updates the state according to a pod that's already started, but may or may not
// have been scheduled via the plugin.
//
// We need to handle this so that we maintain an accurate view of the resource usage in the cluster;
// otherwise, we might (a) ignore resources from pods that weren't scheduled here, or (b) fail to
// include pods that *were* scheduled here, but had spurious Unreserves.
// (for more, see: https://github.com/neondatabase/autoscaling/pull/435)
//
// preexisting is passed through to reserveResources unchanged.
func (e *AutoscaleEnforcer) handleStarted(logger *zap.Logger, pod *corev1.Pod, preexisting bool) {
	fields := []zap.Field{
		zap.String("action", "Pod started"),
		zap.String("node", pod.Spec.NodeName),
		util.PodNameFields(pod),
	}
	if migrationName := util.TryPodOwnerVirtualMachineMigration(pod); migrationName != nil {
		fields = append(fields, zap.Object("virtualmachinemigration", *migrationName))
	}
	logger = logger.With(fields...)

	logger.Info("Handling Pod start event")

	opts := reserveOptions{
		// pod already started, out of our control - we don't have a mechanism to deny it
		allowDeny: false,
		// this may be a preexisting VM. If so, we should include it in "buffer" as long it's
		// supposed to be handled by us (otherwise, the "buffer" will never be resolved)
		includeBuffer: pod.Spec.SchedulerName == e.state.conf.SchedulerName,
		preexisting:   preexisting,
	}

	// The results are deliberately discarded: the pod is already running, and reserveResources
	// logs any error itself before returning it.
	_, _, _ = e.reserveResources(context.TODO(), logger, pod, "Pod started", opts)
}
// reserveOptions controls how reserveResources treats the pod it is reserving for.
type reserveOptions struct {
// allowDeny sets whether reserveResources may reject the pod when reserving it would put the
// node over budget. If false, the reservation is made regardless (and a warning is logged).
allowDeny bool
// includeBuffer sets whether a VM pod's reservation may include Buffer. If false, the pod is
// reserved at its current usage with a Buffer of zero.
includeBuffer bool
// preexisting marks the pod as one that already existed. It suppresses the "Pod has bypassed
// neon scheduler" warning in reserveResources.
preexisting bool
}
// reserveResources attempts to set aside resources on the node for the pod.
//
// If opts.allowDeny is false, reserveResources is not "allowed" to reject the pod if there isn't
// enough room - it must instead set aside resources that don't exist.
//
// If an unexpected error occurs, the first two return values are unspecified, and the error will be
// non-nil. Otherwise, 'ok' will indicate whether the pod was accepted and the verdictSet will
// provide messages describing the result, suitable for being logged.
//
// If the pod is already present in the global state, nothing is changed and the pod is reported
// as accepted, with an empty verdictSet.
//
// reserveResources panics if called for a pod in an ignored namespace. It acquires e.state.lock;
// the lock may be temporarily released while fetching the node's state.
func (e *AutoscaleEnforcer) reserveResources(
	ctx context.Context,
	logger *zap.Logger,
	pod *corev1.Pod,
	action string,
	opts reserveOptions,
) (ok bool, _ *verdictSet, _ error) {
	nodeName := pod.Spec.NodeName

	if e.state.conf.ignoredNamespace(pod.Namespace) {
		panic(fmt.Errorf("reserveResources called with ignored pod %v", util.GetNamespacedName(pod)))
	}

	vmInfo, err := e.getVmInfo(logger, pod, action)
	if err != nil {
		const msg = "Error getting VM info for Pod"
		logger.Error(msg, zap.Error(err))
		return false, nil, fmt.Errorf("%s: %w", msg, err)
	}

	e.state.lock.Lock()
	defer e.state.lock.Unlock()

	// If the pod already exists, nothing to do
	if _, tracked := e.state.pods[util.GetNamespacedName(pod)]; tracked {
		logger.Info("Pod already exists in global state")
		return true, &verdictSet{cpu: "", mem: ""}, nil
	}

	// The pod is not in our state. If, on top of that, all of the following hold, then the pod has
	// bypassed neon scheduler, which might be a sign of a bug or misbehavior:
	//  - pod is not a preexisting pod
	//  - pod is assigned to autoscaler scheduler
	//  - pod has the node name
	bypassed := !opts.preexisting &&
		pod.Spec.SchedulerName == e.state.conf.SchedulerName &&
		nodeName != ""
	if bypassed {
		logger.Warn("Pod has bypassed neon scheduler")
	}

	// Get information about the node
	node, err := e.state.getOrFetchNodeState(ctx, logger, e.metrics, e.nodeStore, nodeName)
	if err != nil {
		const msg = "Failed to get state for node"
		logger.Error(msg, zap.Error(err))
		return false, nil, fmt.Errorf("%s: %w", msg, err)
	}

	// accept decides whether the speculative reservation should be kept, and logs the outcome.
	accept := func(verdict verdictSet, overBudget bool) bool {
		if overBudget {
			e.metrics.IncReserveShouldDeny(pod, node)
		}

		verdictField := zap.Object("verdict", verdict)

		switch {
		case overBudget && opts.allowDeny:
			logger.Error("Can't reserve resources for Pod (not enough available)", verdictField)
			return false
		case opts.allowDeny:
			logger.Info("Allowing reserve resources for Pod", verdictField)
		case overBudget:
			// want to deny, but can't
			logger.Warn("Reserved resources for Pod above totals", verdictField)
		default:
			// don't want to deny, but also couldn't if we wanted to
			logger.Info("Reserved resources for Pod", verdictField)
		}
		return true
	}

	accepted, verdict := e.speculativeReserve(node, vmInfo, pod, opts.includeBuffer, accept)
	return accepted, &verdict, nil
}
// speculativeReserve reserves the pod, and then calls accept() to see whether the pod should
// actually be added.
//
// If accept() returns false, no changes to the state will be made.
//
// The pod's resource state is built from vmInfo if it is non-nil, and from the pod's container
// resource requests (see extractPodResources) otherwise. includeBuffer only has an effect for VM
// pods: if it is false, the pod is reserved at its current usage with no Buffer.
//
// accept() is called with the verdicts for CPU and memory, and with whether either resource
// reported being over budget. The returned 'ok' is the value returned by accept(); the returned
// verdictSet is the one that was passed to it.
//
// NOTE(review): this reads and writes node and e.state.pods without locking, so presumably the
// caller must hold e.state.lock (reserveResources does) -- confirm for any new callers.
func (e *AutoscaleEnforcer) speculativeReserve(
node *nodeState,
vmInfo *api.VmInfo,
pod *corev1.Pod,
includeBuffer bool,
accept func(verdict verdictSet, overBudget bool) bool,
) (ok bool, _ verdictSet) {
// Construct the speculative state of the pod
//
// We'll pass this into (resourceTransitioner).handleReserve(), but only commit the changes if
// the caller allows us to.
var cpuState podResourceState[vmapi.MilliCPU]
var memState podResourceState[api.Bytes]
var vmState *vmPodState
if vmInfo != nil {
vmState = &vmPodState{
Name: vmInfo.NamespacedName(),
MemSlotSize: vmInfo.Mem.SlotSize,
Config: vmInfo.Config,
Metrics: nil,
MqIndex: -1,
MigrationState: nil,
}
// initially build the resource states assuming that we're including buffer, and then update
// later to remove it if that turns out not to be right.
cpuState = podResourceState[vmapi.MilliCPU]{
Reserved: vmInfo.Max().VCPU,
Buffer: util.SaturatingSub(vmInfo.Max().VCPU, vmInfo.Using().VCPU),
CapacityPressure: 0,
Min: vmInfo.Min().VCPU,
Max: vmInfo.Max().VCPU,
}
memState = podResourceState[api.Bytes]{
Reserved: vmInfo.Max().Mem,
Buffer: util.SaturatingSub(vmInfo.Max().Mem, vmInfo.Using().Mem),
CapacityPressure: 0,
Min: vmInfo.Min().Mem,
Max: vmInfo.Max().Mem,
}
// If scaling isn't enabled *or* the pod is involved in an ongoing migration *or* the caller
// has opted out of setting Buffer, then we can be more precise about usage.
//
// Buffer exists to handle scaling that may happen due to a prior scheduler's approval.
// If scaling is disabled, we don't have to worry about this, and if there's an ongoing
// migration, scaling is forbidden.
migrating := util.TryPodOwnerVirtualMachineMigration(pod) != nil
if !vmInfo.Config.ScalingEnabled || migrating || !includeBuffer {
cpuState.Buffer = 0
cpuState.Reserved = vmInfo.Using().VCPU
memState.Buffer = 0
memState.Reserved = vmInfo.Using().Mem
}
} else {
// Non-VM pod: its usage is fixed at what it requested, so Reserved, Min and Max are all the
// same and there is no Buffer.
res := extractPodResources(pod)
cpuState = podResourceState[vmapi.MilliCPU]{
Reserved: res.VCPU,
Buffer: 0,
CapacityPressure: 0,
Min: res.VCPU,
Max: res.VCPU,
}
memState = podResourceState[api.Bytes]{
Reserved: res.Mem,
Buffer: 0,
CapacityPressure: 0,
Min: res.Mem,
Max: res.Mem,
}
}
podName := util.GetNamespacedName(pod)
ps := &podState{
name: podName,
node: node,
cpu: cpuState,
mem: memState,
vm: vmState,
}
// Speculatively try reserving the pod.
//
// handleReserve() is given the xact's Value() in place of the node's resource state, and
// Commit() is only called further down, once accept() has returned true.
nodeXactCPU := xact.New(&node.cpu)
nodeXactMem := xact.New(&node.mem)
cpuOverBudget, cpuVerdict := makeResourceTransitioner(nodeXactCPU.Value(), &ps.cpu).handleReserve()
memOverBudget, memVerdict := makeResourceTransitioner(nodeXactMem.Value(), &ps.mem).handleReserve()
overBudget := cpuOverBudget || memOverBudget
verdict := verdictSet{
cpu: cpuVerdict,
mem: memVerdict,
}
// When at least one resource is over budget, prefix each resource's verdict with a short marker
// saying which of them was the problem.
const verdictNotEnough = "NOT ENOUGH"
const verdictOk = "OK"
if overBudget {
cpuShortVerdict := verdictNotEnough
if !cpuOverBudget {
cpuShortVerdict = verdictOk
}
verdict.cpu = fmt.Sprintf("%s: %s", cpuShortVerdict, verdict.cpu)
memShortVerdict := verdictNotEnough
if !memOverBudget {
memShortVerdict = verdictOk
}
verdict.mem = fmt.Sprintf("%s: %s", memShortVerdict, verdict.mem)
}
// Returning here without calling Commit() is the "rejected" path.
if !accept(verdict, overBudget) {
return false, verdict
}
nodeXactCPU.Commit()
nodeXactMem.Commit()
// Record the pod both on its node and in the global pods map.
node.pods[podName] = ps
e.state.pods[podName] = ps
node.updateMetrics(e.metrics)
return true, verdict
}
// handleDeletion removes the state tracked for the pod named podName, via unreserveResources, and
// logs the result.
//
// This method is /basically/ the same as e.Unreserve, but the API is different and it has different
// logs, so IMO it's worthwhile to have this separate.
func (e *AutoscaleEnforcer) handleDeletion(logger *zap.Logger, podName util.NamespacedName) {
	logger = logger.With(
		zap.String("action", "VM deletion"),
		zap.Object("pod", podName),
	)
	logger.Info("Handling deletion of VM pod")

	extraFields, podKind, wasMigrating, verdict := e.unreserveResources(logger, podName)

	logger = logger.With(extraFields...)
	msg := fmt.Sprintf("Deleted %s Pod", podKind)
	logger.Info(msg, zap.Bool("migrating", wasMigrating), zap.Object("verdict", verdict))
}
// unreserveResources is *essentially* the inverse of reserveResources, but with two main
// differences:
//
//  1. unreserveResources cannot "deny" unreserving, whereas reserveResources may choose whether to
//     accept the additional reservation.
//  2. unreserveResources returns additional information for logging.
//
// Also note that because unreserveResources is expected to be called by the plugin's Unreserve()
// method, it may be called for pods that no longer exist.
//
// The return values are, in order: extra log fields identifying the pod's node (and VM, if any),
// the pod's kind(), whether the pod was migrating at the time, and the verdicts for CPU and
// memory. If the pod is not found in the global pods map, a warning is logged and all return
// values are zero.
//
// This method acquires e.state.lock for its duration.
func (e *AutoscaleEnforcer) unreserveResources(
	logger *zap.Logger,
	podName util.NamespacedName,
) (_ []zap.Field, kind string, migrating bool, _ verdictSet) {
	e.state.lock.Lock()
	defer e.state.lock.Unlock()

	ps, found := e.state.pods[podName]
	if !found {
		logger.Warn("Cannot find Pod in global pods map")
		return nil, "", false, verdictSet{}
	}

	node := ps.node
	isVM := ps.vm != nil

	fields := []zap.Field{zap.String("node", node.name)}
	if isVM {
		fields = append(fields, zap.Object("virtualmachine", ps.vm.Name))
	}

	// Mark the resources as no longer reserved
	wasMigrating := isVM && ps.vm.currentlyMigrating()
	verdict := verdictSet{
		cpu: makeResourceTransitioner(&node.cpu, &ps.cpu).handleDeleted(wasMigrating),
		mem: makeResourceTransitioner(&node.mem, &ps.mem).handleDeleted(wasMigrating),
	}

	// Delete our record of the pod
	delete(e.state.pods, podName)
	delete(node.pods, podName)
	if isVM {
		node.mq.removeIfPresent(ps.vm)
	}

	node.updateMetrics(e.metrics)

	return fields, ps.kind(), wasMigrating, verdict
}
// handleVMConfigUpdated records newCfg as the per-VM config for the VM pod named podName, and
// applies the consequences of any settings that were switched off:
//
//   - if auto-migration went from enabled to disabled, the VM is removed from its node's
//     migration queue;
//   - if scaling went from enabled to disabled, the pod's and node's resource state are updated
//     accordingly, and the node's metrics are refreshed.
//
// If the pod is unknown, or is not a VM pod, an error is logged and nothing is changed.
//
// This method acquires e.state.lock for its duration.
func (e *AutoscaleEnforcer) handleVMConfigUpdated(logger *zap.Logger, podName util.NamespacedName, newCfg api.VmConfig) {
	logger = logger.With(
		zap.String("action", "VM config updated"),
		zap.Object("pod", podName),
	)

	logger.Info("Handling updated config for VM pod")

	e.state.lock.Lock()
	defer e.state.lock.Unlock()

	ps, ok := e.state.pods[podName]
	if !ok {
		logger.Error("Cannot find Pod in global pods map")
		return
	}
	logger = logger.With(zap.String("node", ps.node.name))
	if ps.vm == nil {
		logger.Error("handleVMConfigUpdated called for non-VM Pod")
		return
	}
	logger = logger.With(zap.Object("virtualmachine", ps.vm.Name))

	// Broadly, we want to update the value of the vmPodState.Config field.
	// But *also*, if autoscaling is newly disabled, we should update the pod/node state.
	// And if auto-migration is disabled, we should remove the VM from the migration queue.

	oldCfg := ps.vm.Config
	ps.vm.Config = newCfg

	// worth logging all of this in case we hit issues.
	logger.Info("Config updated for VM", zap.Any("oldCfg", oldCfg), zap.Any("newCfg", newCfg))

	if oldCfg.AutoMigrationEnabled && !newCfg.AutoMigrationEnabled {
		ps.node.mq.removeIfPresent(ps.vm)
	}

	if oldCfg.ScalingEnabled && !newCfg.ScalingEnabled {
		cpuVerdict := makeResourceTransitioner(&ps.node.cpu, &ps.cpu).
			handleAutoscalingDisabled()
		memVerdict := makeResourceTransitioner(&ps.node.mem, &ps.mem).
			handleAutoscalingDisabled()

		ps.node.updateMetrics(e.metrics)

		logger.Info(
			"Disabled autoscaling for VM pod",
			zap.Object("verdict", verdictSet{
				cpu: cpuVerdict,
				mem: memVerdict,
			}),
		)
	}
}
// handlePodStartMigration records that the VM pod named podName has become involved in the
// migration named migrationName.
//
// The pod's resource state is transitioned for the start of the migration (source is passed
// through to that transition), the VM is removed from its node's migration queue if it was in
// it, and the pod's MigrationState is set.
//
// If the pod is unknown, or is not a VM pod, an error is logged and nothing is changed.
//
// This method acquires e.state.lock for its duration.
func (e *AutoscaleEnforcer) handlePodStartMigration(logger *zap.Logger, podName, migrationName util.NamespacedName, source bool) {
	logger = logger.With(
		zap.String("action", "VM pod start migration"),
		zap.Object("pod", podName),
		zap.Object("virtualmachinemigration", migrationName),
	)

	logger.Info("Handling VM pod migration start")

	e.state.lock.Lock()
	defer e.state.lock.Unlock()

	ps, found := e.state.pods[podName]
	if !found {
		logger.Error("Cannot find Pod in global pods map")
		return
	}

	node := ps.node
	logger = logger.With(zap.String("node", node.name))

	vm := ps.vm
	if vm == nil {
		logger.Error("handlePodStartMigration called for non-VM Pod")
		return
	}
	logger = logger.With(zap.Object("virtualmachine", vm.Name))

	// Reset buffer to zero, remove from migration queue (if in it), and set pod's migrationState
	verdict := verdictSet{
		cpu: makeResourceTransitioner(&node.cpu, &ps.cpu).handleStartMigration(source),
		mem: makeResourceTransitioner(&node.mem, &ps.mem).handleStartMigration(source),
	}

	node.mq.removeIfPresent(vm)
	vm.MigrationState = &podMigrationState{Name: migrationName}

	node.updateMetrics(e.metrics)

	logger.Info("Handled start of migration involving pod", zap.Object("verdict", verdict))
}
// handlePodEndMigration records that the VM pod named podName is no longer migrating, by clearing
// its MigrationState. migrationName is only used for logging.
//
// If the pod is unknown, or is not a VM pod, an error is logged and nothing is changed.
//
// This method acquires e.state.lock for its duration.
func (e *AutoscaleEnforcer) handlePodEndMigration(logger *zap.Logger, podName, migrationName util.NamespacedName) {
	logger = logger.With(
		zap.String("action", "VM pod end migration"),
		zap.Object("pod", podName),
		zap.Object("virtualmachinemigration", migrationName),
	)

	logger.Info("Handling VM pod migration end")

	e.state.lock.Lock()
	defer e.state.lock.Unlock()

	ps, found := e.state.pods[podName]
	if !found {
		logger.Error("Cannot find Pod in global pods map")
		return
	}
	logger = logger.With(zap.String("node", ps.node.name))

	vm := ps.vm
	if vm == nil {
		logger.Error("handlePodEndMigration called for non-VM Pod")
		return
	}
	logger = logger.With(zap.Object("virtualmachine", vm.Name))

	vm.MigrationState = nil

	//nolint:gocritic // NOTE: not *currently* needed, but this should be kept here as a reminder, in case that changes.
	// ps.node.updateMetrics(e.metrics)

	logger.Info("Recorded end of migration for VM pod")
}
// handleUpdatedScalingBounds applies the minimum and maximum CPU and memory from vm to the state
// of the VM's pod, and refreshes the node's metrics.
//
// unqualifiedPodName is the pod's name without its namespace; the namespace is taken from vm.
//
// If the pod is unknown, or is not a VM pod, an error is logged and nothing is changed.
//
// This method acquires e.state.lock for its duration.
func (e *AutoscaleEnforcer) handleUpdatedScalingBounds(logger *zap.Logger, vm *api.VmInfo, unqualifiedPodName string) {
	podName := util.NamespacedName{Namespace: vm.Namespace, Name: unqualifiedPodName}

	logger = logger.With(
		zap.String("action", "VM updated scaling bounds"),
		zap.Object("pod", podName),
		zap.Object("virtualmachine", vm.NamespacedName()),
	)

	logger.Info("Handling updated scaling bounds for VM")

	e.state.lock.Lock()
	defer e.state.lock.Unlock()

	ps, found := e.state.pods[podName]
	if !found {
		logger.Error("Cannot find Pod in global pods map")
		return
	}

	node := ps.node
	logger = logger.With(zap.String("node", node.name))

	if ps.vm == nil {
		logger.Error("handleUpdatedScalingBounds called for non-VM Pod")
		return
	}

	verdict := verdictSet{
		cpu: handleUpdatedLimits(&node.cpu, &ps.cpu, vm.Cpu.Min, vm.Cpu.Max),
		mem: handleUpdatedLimits(&node.mem, &ps.mem, vm.Min().Mem, vm.Max().Mem),
	}

	node.updateMetrics(e.metrics)

	logger.Info("Updated scaling bounds for VM pod", zap.Object("verdict", verdict))
}
// handleNonAutoscalingUsageChange updates the state of the VM's pod to reflect the CPU and memory
// that vm reports it is currently using, and refreshes the node's metrics.
//
// unqualifiedPodName is the pod's name without its namespace; the namespace is taken from vm.
//
// If the pod is unknown, an error is logged and nothing is changed.
//
// This method acquires e.state.lock for its duration.
func (e *AutoscaleEnforcer) handleNonAutoscalingUsageChange(logger *zap.Logger, vm *api.VmInfo, unqualifiedPodName string) {
	e.state.lock.Lock()
	defer e.state.lock.Unlock()

	podName := util.NamespacedName{Namespace: vm.Namespace, Name: unqualifiedPodName}

	logger = logger.With(
		zap.String("action", "non-autoscaling VM usage change"),
		zap.Object("pod", podName),
		zap.Object("virtualmachine", vm.NamespacedName()),
	)

	ps, found := e.state.pods[podName]
	if !found {
		logger.Error("Cannot find Pod in global pods map")
		return
	}

	node := ps.node

	verdict := verdictSet{
		cpu: makeResourceTransitioner(&node.cpu, &ps.cpu).handleNonAutoscalingUsageChange(vm.Using().VCPU),
		mem: makeResourceTransitioner(&node.mem, &ps.mem).handleNonAutoscalingUsageChange(vm.Using().Mem),
	}

	node.updateMetrics(e.metrics)

	logger.Info("Updated non-autoscaling VM usage", zap.Object("verdict", verdict))
}
// cleanupMigration deletes the VirtualMachineMigration vmm, retrying until the deletion succeeds
// or the object is found to no longer exist.
//
// While it runs, the deletion is counted in e.state.ongoingMigrationDeletions (under
// e.state.lock), and an error is logged if the count shows another deletion for the same
// migration. Successful and failed delete requests are counted in metrics, labelled with the
// migration's phase.
//
// Between failed attempts, this sleeps for conf.MigrationDeletionRetrySeconds seconds. There is no
// limit on the number of attempts, and the delete requests are made with context.TODO(), so this
// cannot be cancelled by the caller.
//
// NB: expected to be run in its own thread.
func (e *AutoscaleEnforcer) cleanupMigration(logger *zap.Logger, vmm *vmapi.VirtualMachineMigration) {
vmmName := util.GetNamespacedName(vmm)
logger = logger.With(
// note: use the "virtualmachinemigration" key here for just the name, because it mirrors
// what we log in startMigration.
zap.Object("virtualmachinemigration", vmmName),
// also include the VM, for better association.
zap.Object("virtualmachine", util.NamespacedName{
Name: vmm.Spec.VmName,
Namespace: vmm.Namespace,
}),
)
// Failed migrations should be noisy. Everything to do with cleaning up a failed migration
// should be logged at "Warn" or higher.
var logInfo func(string, ...zap.Field)
if vmm.Status.Phase == vmapi.VmmSucceeded {
logInfo = logger.Info
} else {
logInfo = logger.Warn
}
logInfo(
"Going to delete VirtualMachineMigration",
// Explicitly include "phase" here because we have metrics for it.
zap.String("phase", string(vmm.Status.Phase)),
// ... and then log the rest of the information about the migration:
zap.Any("spec", vmm.Spec),
zap.Any("status", vmm.Status),
)
// mark the operation as ongoing
//
// (wrapped in a function so that the lock is released as soon as the count is updated)
func() {
e.state.lock.Lock()
defer e.state.lock.Unlock()
newCount := e.state.ongoingMigrationDeletions[vmmName] + 1
if newCount != 1 {
// context included by logger
logger.Error(
"More than one ongoing deletion for VirtualMachineMigration",
zap.Int("count", newCount),
)
}
e.state.ongoingMigrationDeletions[vmmName] = newCount
}()
// ... and remember to clean up when we're done:
defer func() {
e.state.lock.Lock()
defer e.state.lock.Unlock()
newCount := e.state.ongoingMigrationDeletions[vmmName] - 1
if newCount == 0 {
// We were the only ongoing deletion; drop the entry entirely rather than storing zero.
delete(e.state.ongoingMigrationDeletions, vmmName)
} else {
// context included by logger
logger.Error(
"More than one ongoing deletion for VirtualMachineMigration",
zap.Int("count", newCount),
)
e.state.ongoingMigrationDeletions[vmmName] = newCount
}
}()
// Continually retry the operation, until we're successful (or the VM doesn't exist anymore)
retryWait := time.Second * time.Duration(e.state.conf.MigrationDeletionRetrySeconds)
for {
logInfo("Attempting to delete VirtualMachineMigration")
err := e.vmClient.NeonvmV1().
VirtualMachineMigrations(vmmName.Namespace).
Delete(context.TODO(), vmmName.Name, metav1.DeleteOptions{})
// Note that the success case is handled first here, unlike the usual `err != nil` check.
if err == nil /* NB! This condition is inverted! */ {
logInfo("Successfully deleted VirtualMachineMigration")
e.metrics.migrationDeletions.WithLabelValues(string(vmm.Status.Phase)).Inc()
return
} else if apierrors.IsNotFound(err) {
// The object is already gone, so there's nothing left for us to delete.
logger.Warn("Deletion was handled for us; VirtualMachineMigration no longer exists")
return
}
logger.Error(
"Failed to delete VirtualMachineMigration, will try again after delay",
zap.Duration("delay", retryWait),
zap.Error(err),
)
e.metrics.migrationDeleteFails.WithLabelValues(string(vmm.Status.Phase)).Inc()
// retry after a delay
time.Sleep(retryWait)
continue
}
}
// isBetterMigrationTarget reports whether s should be preferred over other as a VM to migrate.
//
// A VM with metrics is always preferred over one without; if neither has metrics, neither is
// preferred. When both have metrics, the VM with the strictly lower 1-minute load average is
// preferred.
func (s *vmPodState) isBetterMigrationTarget(other *vmPodState) bool {
	switch {
	// TODO: this deprioritizes VMs whose metrics we can't collect. Maybe we don't want that?
	case s.Metrics == nil:
		return false
	case other.Metrics == nil:
		return true
	default:
		// TODO - this is just a first-pass approximation. Maybe it's ok for now? Maybe it's not. Idk.
		return s.Metrics.LoadAverage1Min < other.Metrics.LoadAverage1Min
	}
}
// startMigration creates a VirtualMachineMigration for the pod's VM, unless the pod is already
// migrating or a migration object with the expected name already exists.
//
// This method can only be called while holding e.state.lock. The lock is released temporarily
// while we send requests to the API server.
//
// A lock will ALWAYS be held on return from this function.
//
// created is true only when the Create request succeeded. Finding that the migration already
// exists is not an error; it returns (false, nil).
func (e *AutoscaleEnforcer) startMigration(ctx context.Context, logger *zap.Logger, pod *podState) (created bool, _ error) {
	if pod.vm.currentlyMigrating() {
		// The lock was never released on this path, so it is still held.
		return false, fmt.Errorf("Pod is already migrating")
	}

	// Release the lock for the API request(s); the deferred Lock re-acquires it on every return
	// path from here on.
	e.state.lock.Unlock()
	defer e.state.lock.Lock()

	vmmName := util.NamespacedName{
		Name:      fmt.Sprintf("schedplugin-%s", pod.vm.Name.Name),
		Namespace: pod.name.Namespace,
	}

	logger = logger.With(zap.Object("virtualmachinemigration", vmmName))
	logger.Info("Starting VirtualMachineMigration for VM")

	// Look for an existing migration first. Strictly speaking the Create request below could tell
	// us the same thing (via apierrors.IsAlreadyExists), but doing a separate Get makes the logs
	// significantly clearer.
	_, err := e.vmClient.NeonvmV1().
		VirtualMachineMigrations(pod.name.Namespace).
		Get(ctx, vmmName.Name, metav1.GetOptions{})
	switch {
	case err == nil:
		logger.Warn("VirtualMachineMigration already exists, nothing to do")
		return false, nil
	case !apierrors.IsNotFound(err):
		// NotFound is the expected outcome; anything else is an unexpected failure.
		logger.Error("Unexpected error doing Get request to check if migration already exists", zap.Error(err))
		return false, fmt.Errorf("Error checking if migration exists: %w", err)
	}

	versionLabel := util.GetBuildInfo().GitInfo
	// FIXME: make this not depend on GetBuildInfo() internals.
	if versionLabel == "<unknown>" {
		versionLabel = "unknown"
	}

	labels := map[string]string{
		// NB: There's requirements on what constitutes a valid label. Thankfully, the output of
		// `git describe` always will.
		//
		// See also:
		// https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#syntax-and-character-set
		LabelPluginCreatedMigration: versionLabel,
	}

	spec := vmapi.VirtualMachineMigrationSpec{
		VmName: pod.vm.Name.Name,

		// FIXME: NeonVM's VirtualMachineMigrationSpec has a bunch of boolean fields that aren't
		// pointers, which means we need to explicitly set them when using the Go API.
		PreventMigrationToSameHost: true,
		CompletionTimeout:          3600,
		Incremental:                true,
		AutoConverge:               true,
		MaxBandwidth:               resource.MustParse("1Gi"),
		AllowPostCopy:              false,
	}

	vmm := &vmapi.VirtualMachineMigration{
		ObjectMeta: metav1.ObjectMeta{
			// TODO: it's maybe possible for this to run into name length limits? Unclear what we
			// should do if that happens.
			Name:      vmmName.Name,
			Namespace: pod.name.Namespace,
			Labels:    labels,
		},
		Spec: spec,
	}

	logger.Info("Migration doesn't already exist, creating one for VM", zap.Any("spec", vmm.Spec))
	_, err = e.vmClient.NeonvmV1().
		VirtualMachineMigrations(pod.name.Namespace).
		Create(ctx, vmm, metav1.CreateOptions{})
	if err != nil {
		e.metrics.migrationCreateFails.Inc()
		// log here, while the logger's fields are in scope
		logger.Error("Unexpected error doing Create request for new migration", zap.Error(err))
		return false, fmt.Errorf("Error creating migration: %w", err)
	}

	e.metrics.migrationCreations.Inc()
	logger.Info("VM migration request successful")
	return true, nil
}
package plugin
// this file primarily contains the type resourceTransitioner[T], for handling a number of operations
// on resources, and pretty-formatting summaries of the operations. There are also other, unrelated
// methods to perform similar functionality.
//
// resourceTransitioners are created with the makeResourceTransitioner function.
//
// Handling requested resources from the autoscaler-agent is done with the handleRequested method,
// and changes from VM deletion are handled by handleDeleted.
import (
"errors"
"fmt"
"go.uber.org/zap/zapcore"
"golang.org/x/exp/constraints"
"github.com/neondatabase/autoscaling/pkg/util"
)
// resourceTransitioner maintains the current state of its resource and handles the transition
// into a new state. A resource is associated with a pod, and the pod is associated with a node.
//
// Both fields are pointers: the methods on resourceTransitioner use a value receiver, but they
// still update the referenced node and pod state in place.
type resourceTransitioner[T constraints.Unsigned] struct {
// node represents the current resource state of the node
node *nodeResourceState[T]
// pod represents the current resource state of the pod.
// pod belongs to the node.
pod *podResourceState[T]
}
// makeResourceTransitioner builds a resourceTransitioner that operates on the given node and pod
// resource states. The pointers are stored as-is, not copied.
func makeResourceTransitioner[T constraints.Unsigned](
	node *nodeResourceState[T], pod *podResourceState[T],
) resourceTransitioner[T] {
	var transitioner resourceTransitioner[T]
	transitioner.node, transitioner.pod = node, pod
	return transitioner
}
// resourceState represents a resource state in its pod and its node. This is not necessarily the
// current state. It represents the resource state at a point in time.
//
// The node and pod states are held by value (not by pointer), so a resourceState does not change
// when the states it was copied from are later updated.
type resourceState[T constraints.Unsigned] struct {
node nodeResourceState[T]
pod podResourceState[T]
}
// snapshotState returns a copy of the node and pod state that r currently points to, so that it
// can be compared against the state after a later modification.
func (r resourceTransitioner[T]) snapshotState() resourceState[T] {
	return resourceState[T]{
		node: *r.node,
		pod:  *r.pod,
	}
}
// verdictSet represents a set of verdicts from some operation, for ease of logging
type verdictSet struct {
// cpu is the verdict string that is logged under the "cpu" key
cpu string
// mem is the verdict string that is logged under the "mem" key
mem string
}
// MarshalLogObject implements zapcore.ObjectMarshaler
//
// The verdicts are written as two string fields, "cpu" followed by "mem". It never fails.
func (s verdictSet) MarshalLogObject(enc zapcore.ObjectEncoder) error {
	fields := [...]struct{ key, value string }{
		{key: "cpu", value: s.cpu},
		{key: "mem", value: s.mem},
	}
	for _, field := range fields {
		enc.AddString(field.key, field.value)
	}
	return nil
}
// handleReserve adds the resources from the pod to the node, reporting if the node was over-budget
//
// Unlike handleRequested, this method should be called to add a NEW pod to the node.
//
// This is used in combination with Xact to speculatively *try* reserving a pod, and then revert if
// it would result in being over-budget.
//
// The reservation itself always goes through; overbudget is true when, afterwards, the node's
// Reserved exceeds its Total. verdict is a pretty-formatted summary for logging.
func (r resourceTransitioner[T]) handleReserve() (overbudget bool, verdict string) {
	// describe formats the verdict for all outcomes that are expected when reserving a new pod.
	// Buffer amounts are only shown if the pod had any buffer to begin with.
	describe := func(before, after resourceState[T]) string {
		if before.pod.Buffer == 0 {
			return fmt.Sprintf(
				"node reserved %v + %v -> %v of total %v",
				before.node.Reserved, after.pod.Reserved, after.node.Reserved, before.node.Total,
			)
		}

		return fmt.Sprintf(
			"node reserved %v [buffer %v] + %v [buffer %v] -> %v [buffer %v] of total %v",
			// node reserved %v [buffer %v] + %v [buffer %v] ->
			before.node.Reserved, before.node.Buffer, after.pod.Reserved, after.pod.Buffer,
			// -> %v [buffer %v] of total %v
			after.node.Reserved, after.node.Buffer, before.node.Total,
		)
	}

	// mustNotHappen produces a callback for outcomes that should be impossible here; if one is
	// ever invoked, it panics with the given message.
	mustNotHappen := func(message string) verdictCallback[T] {
		return func(resourceState[T], resourceState[T]) string {
			panic(errors.New(message))
		}
	}

	// Currently, the caller provides the requested value via the Pod's Reserved field.
	// handleRequestedGeneric works in terms of a change from the pod's current reservation, so we
	// represent the increase from zero to pod.Reserved explicitly: zero out the Pod's value and
	// pass the requested amount separately.
	amount := r.pod.Reserved
	r.pod.Reserved = 0

	opts := requestedOptions[T]{
		// by setting factor and forceApprovalMinimum to the requested amount, we force that
		// handleRequestedGeneric MUST reserve exactly that amount.
		// Then, we leave it up to the caller to accept/reject by returning whether the node was
		// overbudget, at the very end.
		factor:               amount,
		forceApprovalMinimum: amount,

		// only used for migrations
		convertIncreaseIntoPressure: false,

		// Yes, add buffer, because this is for reserving a pod for the first time. If the pod
		// was already known, it's the caller's responsibility to set buffer appropriately.
		addBuffer: true,

		callbackNoChange:                  describe,
		callbackDecreaseAutoApproved:      mustNotHappen("got 'decrease approved' from logic to reserve new pod"),
		callbackIncreaseTurnedToPressure:  describe,
		callbackIncreaseRejected:          mustNotHappen("got 'increase rejected' from logic to reserve new pod, but it is infallible"),
		callbackIncreasePartiallyApproved: mustNotHappen("got 'partially approved' from logic to reserve new pod, but it is infallible"),
		callbackIncreaseFullyApproved:     describe,
	}

	verdict = r.handleRequestedGeneric(amount, opts)
	return r.node.Reserved > r.node.Total, verdict
}
// handleRequested updates r.pod and r.node with changes to match the requested resources, within
// what's possible given the remaining resources.
//
// Any permitted increases are required to be a multiple of factor.
//
// If lastPermit is non-nil, requests up to *lastPermit must be approved. If startingMigration is
// true, increases are not granted and are instead recorded as capacity pressure.
//
// Unlike handleReserve, this method should be called to update the resources for a preexisting pod
// on the node.
//
// A pretty-formatted summary of the outcome is returned as the verdict, for logging.
func (r resourceTransitioner[T]) handleRequested(
	requested T,
	lastPermit *T,
	startingMigration bool,
	factor T,
) (verdict string) {
	// describeNormal formats the verdict for every outcome except "increase turned to pressure".
	describeNormal := func(before, after resourceState[T]) string {
		// Buffer annotations are only included if the pod had buffer before this request.
		podBufferNote, nodeBufferNoteBefore, nodeBufferNoteAfter := "", "", ""
		if before.pod.Buffer != 0 {
			podBufferNote = fmt.Sprintf(" [buffer %d]", before.pod.Buffer)
			nodeBufferNoteBefore = fmt.Sprintf(" [buffer %d]", before.node.Buffer)
			nodeBufferNoteAfter = fmt.Sprintf(" [buffer %d]", after.node.Buffer)
		}

		// Only mention the requested amount if it isn't what the pod ended up with.
		wantedNote := ""
		if after.pod.Reserved != requested {
			wantedNote = fmt.Sprintf(" (wanted %d)", requested)
		}

		return fmt.Sprintf(
			"Register %d%s -> %d%s (pressure %d -> %d); "+
				"node reserved %d%s -> %d%s (of %d), "+
				"node capacityPressure %d -> %d (%d -> %d spoken for)",
			// Register %d%s -> %d%s (pressure %d -> %d)
			before.pod.Reserved, podBufferNote, after.pod.Reserved, wantedNote, before.pod.CapacityPressure, after.pod.CapacityPressure,
			// node reserved %d%s -> %d%s (of %d)
			before.node.Reserved, nodeBufferNoteBefore, after.node.Reserved, nodeBufferNoteAfter, before.node.Total,
			// node capacityPressure %d -> %d (%d -> %d spoken for)
			before.node.CapacityPressure, after.node.CapacityPressure, before.node.PressureAccountedFor, after.node.PressureAccountedFor,
		)
	}

	// describeMigration formats the verdict for an increase that was denied because the pod is
	// starting migration.
	describeMigration := func(before, after resourceState[T]) string {
		return fmt.Sprintf(
			"Denying increase %d -> %d because the pod is starting migration; "+
				"node capacityPressure %d -> %d (%d -> %d spoken for)",
			// Denying increase %d -> %d because ...
			before.pod.Reserved, requested,
			// node capacityPressure %d -> %d (%d -> %d spoken for)
			before.node.CapacityPressure, after.node.CapacityPressure, before.node.PressureAccountedFor, after.node.PressureAccountedFor,
		)
	}

	// Without a previous permit, the minimum stays at the zero value.
	var approvalFloor T
	if lastPermit != nil {
		approvalFloor = *lastPermit
	}

	opts := requestedOptions[T]{
		factor:               factor,
		forceApprovalMinimum: approvalFloor,

		// Can't increase during migrations.
		//
		// But we _will_ add the pod's request to the node's pressure, noting that its migration
		// will resolve it.
		convertIncreaseIntoPressure: startingMigration,

		// don't add buffer to the node; autoscaler-agent requests should reset it.
		addBuffer: false,

		callbackNoChange:                  describeNormal,
		callbackDecreaseAutoApproved:      describeNormal,
		callbackIncreaseTurnedToPressure:  describeMigration,
		callbackIncreaseRejected:          describeNormal,
		callbackIncreasePartiallyApproved: describeNormal,
		callbackIncreaseFullyApproved:     describeNormal,
	}

	return r.handleRequestedGeneric(requested, opts)
}
// requestedOptions is the set of parameters, other than the requested amount itself, that
// controls a single call to handleRequestedGeneric().
type requestedOptions[T constraints.Unsigned] struct {
// factor provides a multiple binding the result of any increases from handleRequestedGeneric()
//
// For handling autoscaler-agent requests, this is the value of a compute unit's worth of that
// resource (e.g. 0.25 CPU or 1 GiB memory).
// For initially reserving a Pod, factor is set equal to the total additional resources, which
// turns handleRequestedGeneric() into a binary function that either grants the entire request,
// or none of it.
factor T
// forceApprovalMinimum sets the threshold above which handleRequestedGeneric() is allowed to
// reject the request - i.e. if the request is less than or equal to forceApprovalMinimum, it
// must be approved.
//
// This is typically set to a non-zero value when reserving resources for a Pod that has already
// been scheduled (so there's nothing we can do about it), or when handling an autoscaler-agent
// request that provides what a previous scheduler approved (via lastPermit).
forceApprovalMinimum T
// convertIncreaseIntoPressure causes handleRequestedGeneric() to reject any requested increases
// in reserved resources, and instead add the amount of the increase to the CapacityPressure of
// the Pod and Node.
convertIncreaseIntoPressure bool
// addBuffer causes handleRequestedGeneric() to additionally add the pod's Buffer field to the
// node, under the assumption that the Buffer is completely new.
//
// Note that if addBuffer is true, buffer will be added *even if the reservation is rejected*.
addBuffer bool
// The callbacks below generate the verdict string. handleRequestedGeneric() picks one of them
// based on the outcome of the request and calls it with the state from before and after the
// request was handled.
callbackNoChange verdictCallback[T]
callbackDecreaseAutoApproved verdictCallback[T]
callbackIncreaseTurnedToPressure verdictCallback[T]
callbackIncreaseRejected verdictCallback[T]
callbackIncreasePartiallyApproved verdictCallback[T]
callbackIncreaseFullyApproved verdictCallback[T]
}
// verdictCallback produces a verdict string (a summary for logging) from the resource state before
// (oldState) and after (newState) an operation.
type verdictCallback[T constraints.Unsigned] func(oldState, newState resourceState[T]) string
// handleRequestedGeneric is the shared implementation behind handleReserve and handleRequested.
//
// It updates r.pod and r.node to reflect a request for `requested` total resources for the pod,
// and returns the verdict produced by whichever callback in opts matches the outcome:
//
//   - requested <= pod.Reserved: always applied; the pod's capacity pressure is cleared.
//     The verdict comes from callbackNoChange or callbackDecreaseAutoApproved.
//   - requested > pod.Reserved with opts.convertIncreaseIntoPressure: nothing is reserved; the
//     increase is recorded as capacity pressure (callbackIncreaseTurnedToPressure).
//   - requested > pod.Reserved otherwise: the increase is granted up to the larger of (a) what is
//     left on the node, rounded down to a multiple of opts.factor, and (b) what is needed to reach
//     opts.forceApprovalMinimum. Whatever is not granted becomes capacity pressure. The verdict
//     comes from callbackIncreaseRejected (nothing granted), callbackIncreasePartiallyApproved,
//     or callbackIncreaseFullyApproved.
//
// Finally, if the pod has non-zero Buffer, it is either added to the node (opts.addBuffer) or
// removed from the node and reset on the pod.
func (r resourceTransitioner[T]) handleRequestedGeneric(
	requested T,
	opts requestedOptions[T],
) (verdict string) {
	oldState := r.snapshotState()

	var verdictGenerator verdictCallback[T]

	if requested <= r.pod.Reserved {
		// Decrease "requests" are actually just notifications it's already happened
		r.node.Reserved -= r.pod.Reserved - requested
		r.pod.Reserved = requested
		// pressure is now zero, because the pod no longer wants to increase resources.
		r.pod.CapacityPressure = 0
		r.node.CapacityPressure -= oldState.pod.CapacityPressure

		// Compare against the snapshot, not r.pod.Reserved: the latter was just overwritten with
		// requested, so comparing against it would classify every decrease as "no change".
		if requested == oldState.pod.Reserved {
			verdictGenerator = opts.callbackNoChange
		} else /* requested < oldState.pod.Reserved */ {
			verdictGenerator = opts.callbackDecreaseAutoApproved
		}
	} else if opts.convertIncreaseIntoPressure /* implied: requested > pod.Reserved */ {
		r.pod.CapacityPressure = requested - r.pod.Reserved
		r.node.CapacityPressure = r.node.CapacityPressure + r.pod.CapacityPressure - oldState.pod.CapacityPressure

		verdictGenerator = opts.callbackIncreaseTurnedToPressure
	} else /* implied: requested > pod.Reserved && !opts.convertIncreaseIntoPressure */ {
		// The following comment was made 2022-11-28 (updated 2023-04-06, 2024-05-DD): (TODO: set date)
		//
		// Note: this function as currently written will actively cause the autoscaler-agent to use
		// resources that are uneven w.r.t. the number of compute units they represent.
		//
		// For example, we might have a request to go from 3CPU/3Gi -> 4CPU/4Gi but we only allow
		// 4CPU/3Gi, which would instead be 4 compute units of CPU but 3 compute units of memory.
		// When the autoscaler-agent receives the permit, it naively carries it out, giving itself a
		// resource allocation that isn't a multiple of compute units.
		//
		// This obviously isn't great. However, this *is* the most resilient solution, and it is
		// significantly simpler to implement, so it is the one I went with. As it currently stands,
		// the autoscaler-agent is still expected to submit requests that are multiples of compute
		// units, so the system should *eventually* stabilize (provided that the autoscaler-agent is
		// not violating its own guarantees). This allows us to gracefully handle many kinds of
		// stressors. Handling the resources separately *from the scheduler's point of view* makes
		// it much, much easier to deal with.
		//
		// Please think carefully before changing this.

		// note: it's entirely possible to have Reserved > Total, under a variety of
		// undesirable-but-impossible-to-prevent circumstances.
		remainingReservable := util.SaturatingSub(r.node.Total, r.node.Reserved)

		increase := requested - r.pod.Reserved

		// Increases are bounded by what's left in the node, rounded down to the nearest multiple of
		// the factor.
		//
		// NB: opts.factor must be non-zero here; integer division by zero panics.
		maxIncrease := (remainingReservable / opts.factor) * opts.factor
		// ... but we must allow at least opts.forceApprovalMinimum
		increaseFromForceApproval := util.SaturatingSub(opts.forceApprovalMinimum, r.pod.Reserved)
		maxIncrease = max(maxIncrease, increaseFromForceApproval)

		if increase > maxIncrease /* increases are bound by what's left in the node */ {
			r.pod.CapacityPressure = increase - maxIncrease
			// adjust node pressure accordingly. We can have old < new or old > new, so we shouldn't
			// directly += or -= (implicitly relying on overflow).
			r.node.CapacityPressure = r.node.CapacityPressure - oldState.pod.CapacityPressure + r.pod.CapacityPressure

			increase = maxIncrease // cap at maxIncrease.

			// If there's no room for any increase at all, the request was rejected outright,
			// rather than partially approved.
			if maxIncrease == 0 {
				verdictGenerator = opts.callbackIncreaseRejected
			} else {
				verdictGenerator = opts.callbackIncreasePartiallyApproved
			}
		} else {
			// If we're not capped by maxIncrease, relieve pressure coming from this pod
			r.node.CapacityPressure -= r.pod.CapacityPressure
			r.pod.CapacityPressure = 0

			verdictGenerator = opts.callbackIncreaseFullyApproved
		}

		r.pod.Reserved += increase
		r.node.Reserved += increase
	}

	if r.pod.Buffer != 0 {
		if opts.addBuffer {
			r.node.Buffer += r.pod.Buffer
		} else /* !opts.addBuffer - buffer is only needed until the first request, so we can reset it */ {
			r.node.Buffer -= r.pod.Buffer
			r.pod.Buffer = 0
		}
	}

	newState := r.snapshotState()
	return verdictGenerator(oldState, newState)
}
// handleDeleted updates r.node with changes to match the removal of r.pod
//
// The pod's Reserved, CapacityPressure and (if non-zero) Buffer are subtracted from the node. If
// currentlyMigrating is true, the node's PressureAccountedFor is also reduced by the pod's
// Reserved plus CapacityPressure. r.pod itself is left untouched.
//
// A pretty-formatted summary of the changes is returned as the verdict, for logging.
func (r resourceTransitioner[T]) handleDeleted(currentlyMigrating bool) (verdict string) {
	before := r.snapshotState()

	r.node.Reserved -= r.pod.Reserved
	r.node.CapacityPressure -= r.pod.CapacityPressure
	if currentlyMigrating {
		r.node.PressureAccountedFor -= r.pod.Reserved + r.pod.CapacityPressure
	}

	// Buffer is only mentioned in the verdict if the pod actually had some.
	podBufferNote, nodeBufferNoteBefore, nodeBufferNoteAfter := "", "", ""
	if podBuffer := r.pod.Buffer; podBuffer != 0 {
		r.node.Buffer -= podBuffer

		podBufferNote = fmt.Sprintf(" [buffer %d]", podBuffer)
		nodeBufferNoteBefore = fmt.Sprintf(" [buffer %d]", before.node.Buffer)
		nodeBufferNoteAfter = fmt.Sprintf(" [buffer %d]", r.node.Buffer)
	}

	return fmt.Sprintf(
		"pod had %d%s; node reserved %d%s -> %d%s, "+
			"node capacityPressure %d -> %d (%d -> %d spoken for)",
		// pod had %d%s; node reserved %d%s -> %d%s
		r.pod.Reserved, podBufferNote, before.node.Reserved, nodeBufferNoteBefore, r.node.Reserved, nodeBufferNoteAfter,
		// node capacityPressure %d -> %d (%d -> %d spoken for)
		before.node.CapacityPressure, r.node.CapacityPressure, before.node.PressureAccountedFor, r.node.PressureAccountedFor,
	)
}
// handleNonAutoscalingUsageChange sets the pod's Reserved to newUsage and shifts the node's
// Reserved by the same difference.
//
// A pretty-formatted summary of the changes is returned as the verdict, for logging.
func (r resourceTransitioner[T]) handleNonAutoscalingUsageChange(newUsage T) (verdict string) {
	before := r.snapshotState()

	// T is unsigned, so for a decrease this difference wraps around; adding the wrapped value to
	// the node's total still produces the intended result.
	delta := newUsage - before.pod.Reserved
	r.node.Reserved += delta
	r.pod.Reserved = newUsage

	return fmt.Sprintf(
		"pod reserved (%v -> %v), node reserved (%v -> %v)",
		before.pod.Reserved, r.pod.Reserved, before.node.Reserved, r.node.Reserved,
	)
}
// handleAutoscalingDisabled updates r.node with changes to clear any buffer and capacityPressure
// from r.pod
//
// The pod's Buffer and CapacityPressure are both zero afterwards.
//
// A pretty-formatted summary of the changes is returned as the verdict, for logging.
func (r resourceTransitioner[T]) handleAutoscalingDisabled() (verdict string) {
	before := r.snapshotState()

	// buffer is included in reserved, so Reserved and Buffer are both reduced by the pod's buffer,
	// on the node as well as on the pod.
	buffer := r.pod.Buffer
	r.node.Reserved -= buffer
	r.node.Buffer -= buffer
	r.pod.Reserved -= buffer
	r.pod.Buffer -= buffer

	r.node.CapacityPressure -= r.pod.CapacityPressure
	r.pod.CapacityPressure = 0

	// The node's buffer change is only reported if the pod had any buffer.
	nodeBufferChange := ""
	if before.pod.Buffer != 0 {
		nodeBufferChange = fmt.Sprintf(" [buffer %d -> %d]", before.node.Buffer, r.node.Buffer)
	}

	return fmt.Sprintf(
		"pod had buffer %d, capacityPressure %d; "+
			"node reserved %d -> %d%s, capacityPressure %d -> %d",
		// pod had buffer %d, capacityPressure %d;
		before.pod.Buffer, before.pod.CapacityPressure,
		// node reserved %d -> %d%s, capacityPressure %d -> %d
		before.node.Reserved, r.node.Reserved, nodeBufferChange, before.node.CapacityPressure, r.node.CapacityPressure,
	)
}
// handleStartMigration updates r.node with changes to clear any buffer and capacityPressure from
// r.pod.
//
// It also increases the node's PressureAccountedFor by the pod's (post-buffer-removal) Reserved,
// because any pressure generated by the pod will be resolved once the migration completes and the
// pod gets deleted.
//
// NOTE: the source parameter is currently not consulted; PressureAccountedFor is increased
// regardless of its value.
//
//nolint:unparam // linter complains about 'source'. FIXME: needs more work to figure this out.
func (r resourceTransitioner[T]) handleStartMigration(source bool) (verdict string) {
	// Apart from the PressureAccountedFor update, this is basically the same as
	// handleAutoscalingDisabled.
	before := r.snapshotState()

	// Reserved and Buffer both drop by the pod's buffer, on the node as well as on the pod.
	buffer := r.pod.Buffer
	r.node.Reserved -= buffer
	r.node.Buffer -= buffer
	r.pod.Reserved -= buffer
	r.pod.Buffer -= buffer

	r.node.CapacityPressure -= r.pod.CapacityPressure
	r.pod.CapacityPressure = 0

	r.node.PressureAccountedFor += r.pod.Reserved

	return fmt.Sprintf(
		"pod had buffer %d, capacityPressure %d; "+
			"node reserved %d -> %d, capacityPressure %d -> %d, pressureAccountedFor %d -> %d",
		// pod had buffer %d, capacityPressure %d;
		before.pod.Buffer, before.pod.CapacityPressure,
		// node reserved %d -> %d, capacityPressure %d -> %d, pressureAccountedFor %d -> %d
		before.node.Reserved, r.node.Reserved, before.node.CapacityPressure, r.node.CapacityPressure, before.node.PressureAccountedFor, r.node.PressureAccountedFor,
	)
}
// handleUpdatedLimits records new scaling bounds (newMin, newMax) for the pod.
//
// If neither bound changed, nothing is modified. If the maximum changed, the Reserved and Buffer
// values of both the pod and the node are recalculated to match. pod.Min and pod.Max are then set
// to the new bounds.
//
// A pretty-formatted summary of the changes is returned as the verdict, for logging.
func handleUpdatedLimits[T constraints.Unsigned](
	node *nodeResourceState[T],
	pod *podResourceState[T],
	newMin T,
	newMax T,
) (verdict string) {
	oldMin, oldMax := pod.Min, pod.Max
	if newMin == oldMin && newMax == oldMax {
		return fmt.Sprintf("limits unchanged (min = %d, max = %d)", newMin, newMax)
	}

	// If the maximum bound has changed, then we should update {node,pod}.Buffer based on it so
	// that we can make a best-effort attempt to avoid overcommitting. This solution can't be
	// perfect (because we're intentionally not using the "hard" limits provided by NeonVM, which
	// would be overly conservative).
	// However. This solution should be *good enough* - the cases it protects against are already
	// exceptionally rare, and the imperfections even more so.
	//
	// To be clear, the cases we're worried about are things like the following sequence of events:
	//
	//   1. VM is at 4 CPU (of max 4)
	//   2. Scheduler dies, autoscaler-agent loses contact
	//   3. autoscaler-agent downscales to 2 CPU
	//   4. VM Cpu.Max gets set to 2 (autoscaler-agent misses this)
	//   5. Scheduler appears, observes Cpu.Max = 2
	//   6. VM Cpu.Max gets set to 4
	//   7. autoscaler-agent observes Cpu.Max is still 4
	//   8. autoscaler-agent scales VM up to 4 CPU, which it is able to do because a previous
	//      scheduler approved 4 CPU.
	//      <-- INCONSISTENT STATE -->
	//   9. autoscaler-agent reconnects with scheduler, informing it that it's using 4 CPU
	//
	// Again: we can't handle this perfectly with the current system. However, a good best-effort
	// attempt to prevent this is worthwhile here. (realistically, the things we can't prevent would
	// require a "perfect storm" of other failures in order to be relevant - which is good!)
	var bufferVerdict string
	if newMax != oldMax {
		podReservedBefore, podBufferBefore := pod.Reserved, pod.Buffer
		nodeReservedBefore, nodeBufferBefore := node.Reserved, node.Buffer

		// Recalculate Reserved and Buffer from scratch because it's easier than doing the math
		// directly.
		//
		// Note that we don't want to reserve *below* what we think the VM is using if the bounds
		// decrease; it may be that the autoscaler-agent has not yet reacted to that.
		using := podReservedBefore - podBufferBefore
		pod.Reserved = max(newMax, using)
		pod.Buffer = pod.Reserved - using
		node.Reserved = nodeReservedBefore + pod.Reserved - podReservedBefore
		node.Buffer = nodeBufferBefore + pod.Buffer - podBufferBefore

		bufferVerdict = fmt.Sprintf(
			". no contact yet: pod reserved %d -> %d (buffer %d -> %d), node reserved %d -> %d (buffer %d -> %d)",
			podReservedBefore, pod.Reserved, podBufferBefore, pod.Buffer,
			nodeReservedBefore, node.Reserved, nodeBufferBefore, node.Buffer,
		)
	}

	pod.Min, pod.Max = newMin, newMax

	return fmt.Sprintf("updated min %d -> %d, max %d -> %d%s", oldMin, newMin, oldMax, newMax, bufferVerdict)
}
package plugin
// Implementation of watching for Pod deletions and changes to a VM's scaling settings (either
// whether it's disabled, or the scaling bounds themselves).
import (
"context"
"reflect"
"time"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/watch"
)
// nodeWatchCallbacks is the set of callbacks invoked by watchNodeEvents.
type nodeWatchCallbacks struct {
// submitNodeDeletion is called for each Node delete event, with the node watcher's logger and the
// name of the deleted Node.
submitNodeDeletion func(*zap.Logger, string)
}
// watchNodeEvents watches for any deleted Nodes, so that we can clean up the resources that were
// associated with them.
//
// Only delete events are handled: each one is logged and passed on to
// callbacks.submitNodeDeletion. The store (or error) from watch.Watch is returned as-is.
func (e *AutoscaleEnforcer) watchNodeEvents(
	ctx context.Context,
	parentLogger *zap.Logger,
	metrics watch.Metrics,
	callbacks nodeWatchCallbacks,
) (*watch.Store[corev1.Node], error) {
	logger := parentLogger.Named("node-watch")

	watchLogger := logger.Named("watch")
	nodesClient := e.handle.ClientSet().CoreV1().Nodes()

	config := watch.Config{
		ObjectNameLogField: "node",
		Metrics: watch.MetricsConfig{
			Metrics:  metrics,
			Instance: "Nodes",
		},
		// FIXME: make these configurable.
		RetryRelistAfter: util.NewTimeRange(time.Second, 3, 5),
		RetryWatchAfter:  util.NewTimeRange(time.Second, 3, 5),
	}

	accessors := watch.Accessors[*corev1.NodeList, corev1.Node]{
		Items: func(list *corev1.NodeList) []corev1.Node { return list.Items },
	}

	onDelete := func(node *corev1.Node, _ bool) {
		logger.Info("Received delete event for node", zap.String("node", node.Name))
		callbacks.submitNodeDeletion(logger, node.Name)
	}

	return watch.Watch(
		ctx,
		watchLogger,
		nodesClient,
		config,
		accessors,
		watch.InitModeSync, // Doesn't matter because AddFunc is nil and node store is only used for events.
		metav1.ListOptions{},
		watch.HandlerFuncs[*corev1.Node]{DeleteFunc: onDelete},
	)
}
// podWatchCallbacks is the set of callbacks invoked by watchPodEvents. Each callback is passed
// the pod watcher's logger as its first argument.
type podWatchCallbacks struct {
// submitStarted is called when a Pod is seen running: either on an add event for a Pod that is
// already running, or on an update from pending to running (with preexisting = false).
submitStarted func(_ *zap.Logger, _ *corev1.Pod, preexisting bool)
// submitDeletion is called when a Pod becomes completed, or when a Pod that is not completed is
// deleted.
submitDeletion func(*zap.Logger, util.NamespacedName)
// submitStartMigration is called when an update shows a Pod has gained a VirtualMachineMigration
// owner. source is true if the Pod has no VirtualMachine owner.
submitStartMigration func(_ *zap.Logger, podName, migrationName util.NamespacedName, source bool)
// submitEndMigration is called when an update shows a Pod no longer has the
// VirtualMachineMigration owner it previously had.
submitEndMigration func(_ *zap.Logger, podName, migrationName util.NamespacedName)
}
// watchPodEvents continuously tracks a handful of Pod-related events that we care about: pods
// starting to run, pod deletion or completion (for VM and non-VM pods), and pods joining or
// leaving a migration.
//
// Pods in ignored namespaces are logged and otherwise skipped.
//
// This method starts its own goroutine, and guarantees that we have started listening for FUTURE
// events once it returns (unless it returns error).
//
// Events occurring before this method is called will not be sent.
func (e *AutoscaleEnforcer) watchPodEvents(
	ctx context.Context,
	parentLogger *zap.Logger,
	metrics watch.Metrics,
	callbacks podWatchCallbacks,
) (*watch.Store[corev1.Pod], error) {
	logger := parentLogger.Named("pod-watch")

	onAdd := func(pod *corev1.Pod, preexisting bool) {
		name := util.GetNamespacedName(pod)

		if e.state.conf.ignoredNamespace(pod.Namespace) {
			logger.Info("Received add event for ignored Pod", zap.Object("pod", name))
			return
		}

		// Generate events for pods when they become running; anything else is of no interest yet.
		if pod.Status.Phase != corev1.PodRunning {
			return
		}

		if !preexisting {
			// Generally pods shouldn't be immediately running, so we log this as a warning. If
			// it was preexisting, then it'll be handled on the initial cluster read already (but
			// we generate the events anyways so that we definitely don't miss anything).
			logger.Warn("Received add event for new Pod already running", zap.Object("pod", name))
		}
		callbacks.submitStarted(logger, pod, preexisting)
	}

	onUpdate := func(oldPod *corev1.Pod, newPod *corev1.Pod) {
		name := util.GetNamespacedName(newPod)

		if e.state.conf.ignoredNamespace(newPod.Namespace) {
			logger.Info("Received update event for ignored Pod", zap.Object("pod", name))
			return
		}

		// Check if a pod is now running.
		wasPending := oldPod.Status.Phase == corev1.PodPending
		nowRunning := newPod.Status.Phase == corev1.PodRunning
		if wasPending && nowRunning {
			logger.Info("Received update event for Pod now running", zap.Object("pod", name))
			callbacks.submitStarted(logger, newPod, false)
		}

		// Check if pod is "completed" - handle that the same as deletion.
		if !util.PodCompleted(oldPod) && util.PodCompleted(newPod) {
			logger.Info("Received update event for completion of Pod", zap.Object("pod", name))
			callbacks.submitDeletion(logger, name)
			return // no other handling worthwhile if the pod's done.
		}

		// Check if the pod is part of a new migration, or if a migration it *was* part of has now
		// ended.
		oldMigration := util.TryPodOwnerVirtualMachineMigration(oldPod)
		newMigration := util.TryPodOwnerVirtualMachineMigration(newPod)

		switch {
		case oldMigration == nil && newMigration != nil:
			isSource := util.TryPodOwnerVirtualMachine(newPod) == nil
			callbacks.submitStartMigration(logger, name, *newMigration, isSource)
		case oldMigration != nil && newMigration == nil:
			callbacks.submitEndMigration(logger, name, *oldMigration)
		}
	}

	onDelete := func(pod *corev1.Pod, _ bool) {
		name := util.GetNamespacedName(pod)

		if e.state.conf.ignoredNamespace(pod.Namespace) {
			logger.Info("Received delete event for ignored Pod", zap.Object("pod", name))
			return
		}

		// Deletion of an already-completed pod is only logged, not submitted.
		if util.PodCompleted(pod) {
			logger.Info("Received delete event for completed Pod", zap.Object("pod", name))
			return
		}

		logger.Info("Received delete event for Pod", zap.Object("pod", name))
		callbacks.submitDeletion(logger, name)
	}

	watchLogger := logger.Named("watch")
	podsClient := e.handle.ClientSet().CoreV1().Pods(corev1.NamespaceAll)

	config := watch.Config{
		ObjectNameLogField: "pod",
		Metrics: watch.MetricsConfig{
			Metrics:  metrics,
			Instance: "Pods",
		},
		// We want to be up-to-date in tracking deletions, so that our reservations are correct.
		//
		// FIXME: make these configurable.
		RetryRelistAfter: util.NewTimeRange(time.Millisecond, 250, 750),
		RetryWatchAfter:  util.NewTimeRange(time.Millisecond, 250, 750),
	}

	accessors := watch.Accessors[*corev1.PodList, corev1.Pod]{
		Items: func(list *corev1.PodList) []corev1.Pod { return list.Items },
	}

	return watch.Watch(
		ctx,
		watchLogger,
		podsClient,
		config,
		accessors,
		watch.InitModeSync, // required so that events are queued before watchPodEvents() returns
		metav1.ListOptions{},
		watch.HandlerFuncs[*corev1.Pod]{
			AddFunc:    onAdd,
			UpdateFunc: onUpdate,
			DeleteFunc: onDelete,
		},
	)
}
// vmWatchCallbacks is the set of callbacks passed to watchVMEvents.
//
// NOTE(review): the descriptions below are based on the field names and signatures only; confirm
// against the handlers in watchVMEvents.
type vmWatchCallbacks struct {
// submitConfigUpdated presumably reports a changed VmConfig (newCfg) for the VM's pod.
submitConfigUpdated func(_ *zap.Logger, podName util.NamespacedName, newCfg api.VmConfig)
// submitBoundsChanged presumably reports changed scaling bounds, with the VM's new info.
submitBoundsChanged func(_ *zap.Logger, _ *api.VmInfo, podName string)
// submitNonAutoscalingVmUsageChanged presumably reports a usage change for a VM that isn't
// autoscaled, with the VM's new info.
submitNonAutoscalingVmUsageChanged func(_ *zap.Logger, _ *api.VmInfo, podName string)
}
// watchVMEvents watches for changes in VMs: signaling when scaling becomes disabled and updating
// stored information when scaling bounds change.
//
// The reason we care about when scaling is disabled is that if we don't, we can run into the
// following race condition:
//
// 1. VM created with autoscaling enabled
// 2. Scheduler restarts and reads the state of the cluster. It records the difference between the
// VM's current and maximum usage as "buffer"
// 3. Before the autoscaler-agent runner for the VM connects to the scheduler, the VM's label to
// enable autoscaling is removed, and the autoscaler-agent's runner exits.
// 4. final state: The scheduler retains buffer for a VM that can't scale.
//
// To avoid (4) occurring, we track events where autoscaling is disabled for a VM and remove its
// "buffer" when that happens. There's still some other possibilities for race conditions (FIXME),
// but those are a little harder to handle - in particular:
//
// 1. Scheduler exits
// 2. autoscaler-agent runner downscales
// 3. Scheduler starts, reads cluster state
// 4. VM gets autoscaling disabled
// 5. Scheduler removes the VM's buffer
// 6. Before noticing that event, the autoscaler-agent upscales the VM and informs the scheduler of
// its current allocation (which it can do, because it was approved by a previous scheduler).
// 7. The scheduler denies what it sees as upscaling.
//
// This one requires a very unlikely sequence of events to occur, that should be appropriately
// handled by cancelled contexts in *almost all* cases.
//
// Mechanically, this method starts a watch on VirtualMachine objects in all namespaces and returns
// the resulting store, or the error from watch.Watch. Only update events are handled; depending on
// what changed between the old and new object, the handler may call
// callbacks.submitConfigUpdated, callbacks.submitNonAutoscalingVmUsageChanged, and
// callbacks.submitBoundsChanged (checked in that order).
//
// podIndex is only used to find the VM's runner pod when emitting a Kubernetes event about a
// failure to extract the new VM's info.
func (e *AutoscaleEnforcer) watchVMEvents(
ctx context.Context,
parentLogger *zap.Logger,
metrics watch.Metrics,
callbacks vmWatchCallbacks,
podIndex watch.IndexedStore[corev1.Pod, *watch.NameIndex[corev1.Pod]],
) (*watch.Store[vmapi.VirtualMachine], error) {
logger := parentLogger.Named("vm-watch")
return watch.Watch(
ctx,
logger.Named("watch"),
e.vmClient.NeonvmV1().VirtualMachines(corev1.NamespaceAll),
watch.Config{
ObjectNameLogField: "virtualmachine",
Metrics: watch.MetricsConfig{
Metrics: metrics,
Instance: "VirtualMachines",
},
// FIXME: make these durations configurable.
RetryRelistAfter: util.NewTimeRange(time.Millisecond, 250, 750),
RetryWatchAfter: util.NewTimeRange(time.Millisecond, 250, 750),
},
watch.Accessors[*vmapi.VirtualMachineList, vmapi.VirtualMachine]{
Items: func(list *vmapi.VirtualMachineList) []vmapi.VirtualMachine { return list.Items },
},
watch.InitModeSync, // Doesn't matter because AddFunc is nil, and vmStore is only used for events.
metav1.ListOptions{},
watch.HandlerFuncs[*vmapi.VirtualMachine]{
UpdateFunc: func(oldVM, newVM *vmapi.VirtualMachine) {
// Updates for VMs in ignored namespaces are logged and otherwise dropped.
if e.state.conf.ignoredNamespace(newVM.Namespace) {
logger.Info("Received update event for ignored VM", util.VMNameFields(newVM))
return
}
newInfo, err := api.ExtractVmInfo(logger, newVM)
if err != nil {
// Try to get the runner pod associated with the VM, if we can, but don't worry
// about it if we can't.
var runnerPod runtime.Object
if podName := newVM.Status.PodName; podName != "" {
// NB: index.Get returns nil if not found, so we only have a non-nil
// runnerPod if it's currently known.
rp, _ := podIndex.GetIndexed(func(index *watch.NameIndex[corev1.Pod]) (*corev1.Pod, bool) {
return index.Get(newVM.Namespace, podName)
})
// avoid typed nils by only assigning if non-nil
// See <https://github.com/neondatabase/autoscaling/issues/689> for more.
if rp != nil {
runnerPod = rp
}
}
logger.Error("Failed to extract VM info in update for new VM", util.VMNameFields(newVM), zap.Error(err))
e.handle.EventRecorder().Eventf(
newVM, // regarding
runnerPod, // related
"Warning", // eventtype
"ExtractVmInfo", // reason
"HandleVmUpdate", // action
"Failed to extract autoscaling info about VM: %s", // note
err,
)
return
}
// Unlike the new object, a failure on the old object is only logged; no Kubernetes event is
// emitted for it.
oldInfo, err := api.ExtractVmInfo(logger, oldVM)
if err != nil {
logger.Error("Failed to extract VM info in update for old VM", util.VMNameFields(oldVM), zap.Error(err))
return
}
// All of the callbacks below identify the VM by its pod name, so there's nothing to submit
// until the VM has one.
if newVM.Status.PodName == "" {
logger.Info("Skipping update for VM because .status.podName is empty", util.VMNameFields(newVM))
return
}
// Config changes are always reported, independent of the usage and bounds checks below.
if !reflect.DeepEqual(oldInfo.Config, newInfo.Config) {
logger.Info("Received config update for VM", util.VMNameFields(newVM))
name := util.NamespacedName{Namespace: newInfo.Namespace, Name: newVM.Status.PodName}
callbacks.submitConfigUpdated(logger, name, newInfo.Config)
}
// Usage changes are only reported here when autoscaling is disabled on at least one side of
// the update (old or new).
if (!oldInfo.Config.ScalingEnabled || !newInfo.Config.ScalingEnabled) && oldInfo.Using() != newInfo.Using() {
podName := util.NamespacedName{Namespace: newInfo.Namespace, Name: newVM.Status.PodName}
logger.Info("Received update changing usage for VM", zap.Object("old", oldInfo.Using()), zap.Object("new", newInfo.Using()))
callbacks.submitNonAutoscalingVmUsageChanged(logger, newInfo, podName.Name)
}
// If the pod changed, then we're going to handle a deletion event for the old pod,
// plus creation event for the new pod. Don't worry about it - because all VM
// information comes from this watch.Store anyways, there's no possibility of missing
// an update.
if oldVM.Status.PodName != newVM.Status.PodName {
return
}
// If bounds didn't change, then no need to update
if oldInfo.EqualScalingBounds(*newInfo) {
return
}
callbacks.submitBoundsChanged(logger, newInfo, newVM.Status.PodName)
},
},
)
}
// migrationWatchCallbacks is the set of callbacks invoked by watchMigrationEvents.
type migrationWatchCallbacks struct {
// submitMigrationFinished is called with the updated migration object when an update event
// changes its phase to Succeeded or Failed.
submitMigrationFinished func(*vmapi.VirtualMachineMigration)
}
// watchMigrationEvents *only* looks at migrations that were created by the scheduler plugin (or a
// previous version of it).
//
// Finished migrations don't auto-delete, and because our naming is deterministic, we won't be able
// to create a new migration for the same VM until the old one's gone. This watch exists to trigger
// that cleanup: whenever an update moves a migration into the Succeeded or Failed phase, the new
// object is handed to callbacks.submitMigrationFinished.
//
// Tracking whether a migration was created by the scheduler plugin is done by adding the label
// 'autoscaling.neon.tech/created-by-scheduler' to every migration we create.
func (e *AutoscaleEnforcer) watchMigrationEvents(
	ctx context.Context,
	parentLogger *zap.Logger,
	metrics watch.Metrics,
	callbacks migrationWatchCallbacks,
) (*watch.Store[vmapi.VirtualMachineMigration], error) {
	logger := parentLogger.Named("vmm-watch")

	// onUpdate is the only event handler: additions and deletions are not of interest here.
	onUpdate := func(oldObj, newObj *vmapi.VirtualMachineMigration) {
		if e.state.conf.ignoredNamespace(newObj.Namespace) {
			logger.Info(
				"Received update event for ignored VM Migration",
				zap.Object("virtualmachinemigration", util.GetNamespacedName(newObj)),
			)
			return
		}

		// Only the transition *into* a terminal phase matters; later updates to an
		// already-finished migration must not submit it again.
		phase := newObj.Status.Phase
		if phase == oldObj.Status.Phase {
			return
		}
		if phase == vmapi.VmmSucceeded || phase == vmapi.VmmFailed {
			callbacks.submitMigrationFinished(newObj)
		}
	}

	return watch.Watch(
		ctx,
		logger.Named("watch"),
		e.vmClient.NeonvmV1().VirtualMachineMigrations(corev1.NamespaceAll),
		watch.Config{
			ObjectNameLogField: "virtualmachinemigration",
			Metrics: watch.MetricsConfig{
				Metrics:  metrics,
				Instance: "VirtualMachineMigrations",
			},
			// FIXME: make these durations configurable.
			RetryRelistAfter: util.NewTimeRange(time.Second, 3, 5),
			RetryWatchAfter:  util.NewTimeRange(time.Second, 3, 5),
		},
		watch.Accessors[*vmapi.VirtualMachineMigrationList, vmapi.VirtualMachineMigration]{
			Items: func(list *vmapi.VirtualMachineMigrationList) []vmapi.VirtualMachineMigration { return list.Items },
		},
		watch.InitModeSync,
		metav1.ListOptions{
			// NB: Including just the label itself means that we select for objects that *have* the
			// label, without caring about the actual value.
			//
			// See also:
			// https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#set-based-requirement
			LabelSelector: LabelPluginCreatedMigration,
		},
		watch.HandlerFuncs[*vmapi.VirtualMachineMigration]{
			UpdateFunc: onUpdate,
		},
	)
}
package util
// Helper arithmetic methods
import (
"golang.org/x/exp/constraints"
)
// SaturatingSub computes x - y, clamping the result to zero when y is larger than x (rather than
// wrapping around, as plain unsigned subtraction would).
func SaturatingSub[T constraints.Unsigned](x, y T) T {
	if y > x {
		return 0
	}
	return x - y
}
// AbsDiff returns |x - y|, i.e. the larger of the two values minus the smaller one, so that the
// unsigned subtraction can never wrap around.
func AbsDiff[T constraints.Unsigned](x, y T) T {
	lo, hi := x, y
	if lo > hi {
		lo, hi = hi, lo
	}
	return hi - lo
}
// AtomicInt represents the shared interface provided by various atomic.<NAME> integers
//
// This interface type is primarily used by AtomicMax.
//
// The type parameter I is the plain integer type that the atomic value holds. The method set
// mirrors the typed integers in sync/atomic (presumably e.g. I = int64 for atomic.Int64).
type AtomicInt[I any] interface {
// Add adds delta to the stored value and returns the new value.
Add(delta I) (new I) //nolint:predeclared // same var names as methods
// CompareAndSwap replaces the stored value with new only if it currently equals old, reporting
// whether the swap happened.
CompareAndSwap(old, new I) (swapped bool) //nolint:predeclared // same var names as methods
// Load returns the stored value.
Load() I
// Store sets the stored value to val.
Store(val I)
// Swap sets the stored value to new and returns the previous value.
Swap(new I) (old I) //nolint:predeclared // same var names as methods
}
// AtomicMax atomically raises the value stored in a to i if i is larger, and returns the value
// that a held before the call took effect.
//
// The Go standard library (as of Go 1.20, when this was written) has no atomic maximum/minimum, so
// this uses the usual fallback for ISAs without such an instruction: Load, then CompareAndSwap,
// retrying whenever another writer got in between.
//
// This function is lock-free but not wait-free.
func AtomicMax[A AtomicInt[I], I constraints.Integer](a A, i I) I {
	prev := a.Load()
	// Keep going only while i would still raise the stored value and our swap lost a race.
	for prev < i && !a.CompareAndSwap(prev, i) {
		prev = a.Load()
	}
	return prev
}
package util
// A channel-based sync.Cond-like interface, with support for broadcast operations (but some
// additional restrictions). Refer to the documentation of Wait for detailed usage.
import (
"sync"
)
// NewBroadcaster creates a Broadcaster that has not yet broadcast any events.
func NewBroadcaster() *Broadcaster {
	b := Broadcaster{
		mu:   sync.Mutex{},
		ch:   make(chan struct{}),
		sent: 0,
	}
	return &b
}

// Broadcaster is the sending side of a channel-based, sync.Cond-like broadcast primitive.
//
// Receivers are created with NewReceiver; every call to Broadcast wakes all of them.
type Broadcaster struct {
	// mu guards ch and sent, as well as the viewed counter of every associated receiver.
	mu sync.Mutex
	// ch is closed (and then replaced) by the next call to Broadcast.
	ch chan struct{}
	// sent counts the number of calls to Broadcast so far.
	sent uint64
}

// BroadcastReceiver is the receiving side of a Broadcaster. See Wait for usage.
type BroadcastReceiver struct {
	b *Broadcaster
	// viewed is the value of b.sent as of creation or the most recent call to Awake.
	viewed uint64
}

// Broadcast sends a signal to all receivers
func (b *Broadcaster) Broadcast() {
	b.mu.Lock()
	defer b.mu.Unlock()

	// Swap in a fresh channel for the next event, then wake everyone waiting on the old one.
	waiting := b.ch
	b.ch = make(chan struct{})
	b.sent++
	close(waiting)
}

// NewReceiver creates a new BroadcastReceiver that will receive only future broadcasted events.
//
// It's generally not recommended to call (*BroadcastReceiver).Wait() on a single BroadcastReceiver
// from more than one thread at a time, although it *is* thread-safe.
func (b *Broadcaster) NewReceiver() BroadcastReceiver {
	b.mu.Lock()
	defer b.mu.Unlock()

	receiver := BroadcastReceiver{
		b:      b,
		viewed: b.sent,
	}
	return receiver
}

// closedChannel is a shared, already-closed channel, returned by Wait when there is an event that
// the receiver has not yet acknowledged.
var closedChannel = func() <-chan struct{} {
	done := make(chan struct{})
	close(done)
	return done
}()

// Wait returns a channel that will be closed once there has been an event broadcasted since
// the BroadcastReceiver was created, or the last call to Awake().
//
// Typical usage of Wait will involve selecting on the channel returned and calling Awake
// immediately in the branch handling the event, for example:
//
//	select {
//	case <-ctx.Done():
//	    return
//	case <-receiver.Wait():
//	    receiver.Awake()
//	    ...
//	}
func (r *BroadcastReceiver) Wait() <-chan struct{} {
	r.b.mu.Lock()
	defer r.b.mu.Unlock()

	if r.viewed != r.b.sent {
		// There's already an unacknowledged event; don't make the caller wait for another one.
		return closedChannel
	}
	return r.b.ch
}

// Awake marks the most recent broadcast event as received, so that the next call to Wait returns a
// channel that will only be closed once there's been a new event after this call to Awake.
func (r *BroadcastReceiver) Awake() {
	r.b.mu.Lock()
	defer r.b.mu.Unlock()

	r.viewed = r.b.sent
}
package util
// This file primarily exposes the GetBuildInfo function
import (
"runtime/debug"
)
// BuildGitInfo holds a pretty-formatted description of the repository and working tree at build
// time. It is set via the GIT_INFO argument in the Dockerfiles, to the output of:
//
//	git describe --long --dirty
//
// While public, this value is not expected to be used externally. You should use GetBuildInfo
// instead.
var BuildGitInfo string

// BuildInfo is a small summary of how the current binary was built.
//
// All strings are guaranteed to be non-empty.
type BuildInfo struct {
	GitInfo   string `json:"gitInfo"`
	GoVersion string `json:"goVersion"`
}

// GetBuildInfo makes a best-effort attempt to return some information about how the currently
// running binary was built. Any piece that can't be determined is reported as "<unknown>".
func GetBuildInfo() BuildInfo {
	// FIXME: the "<unknown>" string is depended upon by the plugin's VirtualMachineMigration
	// creation process. We should expose something better here.
	const unknown = "<unknown>"

	info := BuildInfo{
		GitInfo:   BuildGitInfo,
		GoVersion: unknown,
	}
	if info.GitInfo == "" {
		info.GitInfo = unknown
	}
	if bi, ok := debug.ReadBuildInfo(); ok && bi.GoVersion != "" {
		info.GoVersion = bi.GoVersion
	}
	return info
}
package util
// Implementation of a channel-based mutex, so that it can be combined with Context.Done and other
// select-able methods, without dealing with the hassle of creating separate goroutines
import (
"context"
"fmt"
"time"
)
// ChanMutex is a select-able mutex
//
// It is fair if and only if receiving on a channel is fair. As of Go 1.19/2022-01-17, receiving on
// a channel appears to be fair. However: this is a runtime implementation detail, and so it may
// change without notice in the future.
//
// Unlike sync.Mutex, ChanMutex requires initialization before use because it's basically just a
// channel.
//
// Also unlike sync.Mutex, a ChanMutex may be copied without issue (again, because it's just a
// channel).
type ChanMutex struct {
ch chan struct{}
}
// NewChanMutex creates a new ChanMutex
func NewChanMutex() ChanMutex {
ch := make(chan struct{}, 1)
ch <- struct{}{}
return ChanMutex{ch}
}
// Lock locks m
//
// This method is semantically equivalent to sync.Mutex.Lock
func (m *ChanMutex) Lock() {
if m.ch == nil {
panic("called Lock on uninitialized ChanMutex")
}
<-m.ch
}
// WaitLock is like Lock, but instead returns a channel
//
// If receiving on the channel succeeds, the caller "holds" the lock and must now be responsible for
// Unlock-ing it.
func (m *ChanMutex) WaitLock() <-chan struct{} {
if m.ch == nil {
panic("called WaitLock on uninitialized ChanMutex")
}
return m.ch
}
// TryLock blocks until locking m succeeds or the context is cancelled
//
// If the context is cancelled while waiting to lock m, the lock will be left unchanged and
// ctx.Err() will be returned.
func (m *ChanMutex) TryLock(ctx context.Context) error {
if m.ch == nil {
panic("called TryLock on uninitialized ChanMutex")
}
select {
case <-m.ch:
return nil
case <-ctx.Done():
return ctx.Err()
}
}
// Unlock unlocks m
//
// This method is semantically equivalent to sync.Mutex.Unlock
func (m *ChanMutex) Unlock() {
select {
case m.ch <- struct{}{}:
default:
panic("ChanMutex.Unlock called while already unlocked")
}
}
// DeadlockChecker creates a function that, when called, periodically attempts to acquire the lock,
// panicking if it fails
//
// The returned function exits when the context is done.
func (m *ChanMutex) DeadlockChecker(timeout, delay time.Duration) func(ctx context.Context) {
return func(ctx context.Context) {
for {
// Delay between checks
select {
case <-ctx.Done():
return
case <-time.After(delay):
}
select {
case <-ctx.Done():
return
case <-m.WaitLock():
m.Unlock()
case <-time.After(timeout):
panic(fmt.Errorf("likely deadlock detected, could not get lock after %s", timeout))
}
}
}
}
package util
// Utilities for errors
import (
"errors"
)
// RootError returns the root cause of the error, calling errors.Unwrap until it returns nil
func RootError(err error) error {
for {
next := errors.Unwrap(err)
if next == nil {
return err
}
err = next
}
}
package util
// Wrapper file for the AddHandler function
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"go.uber.org/zap"
)
// AddHandler is a helper function to wrap the handle function with JSON [de]serialization and check
// that the HTTP method is correct
//
// The wrapped handler is registered on mux at endpoint. Requests with a method other than method
// are rejected with 405; request bodies that don't decode as JSON into a T are rejected with 400
// (reqTypeName is only used to describe T in that log message).
//
// handle receives the request's context, a logger carrying the endpoint and the decoded request,
// and the decoded request itself. Its results are interpreted as follows:
//
//   - On a nil error, statusCode must be 200 and the *R is sent as the JSON response body. Any
//     other status is treated as an internal error and reported as a 500.
//   - On a non-nil error, statusCode should be 4xx or 5xx and the error's message is sent as the
//     response body. Any other status is invalid for an error response, and is likewise reported
//     as a 500.
//
// The logger passed in is annotated with the endpoint; the wrapper's own messages are logged under
// the name "http".
func AddHandler[T any, R any](
	logger *zap.Logger,
	mux *http.ServeMux,
	endpoint string,
	method string,
	reqTypeName string,
	handle func(context.Context, *zap.Logger, *T) (_ *R, statusCode int, _ error),
) {
	errBadMethod := []byte("request method must be " + method)

	logger = logger.With(zap.String("endpoint", endpoint))
	hlogger := logger.Named("http")

	mux.HandleFunc(endpoint, func(w http.ResponseWriter, r *http.Request) {
		if r.Method != method {
			w.WriteHeader(http.StatusMethodNotAllowed)
			_, _ = w.Write(errBadMethod)
			return
		}

		defer r.Body.Close()
		var req T
		if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
			hlogger.Error("Failed to read request body as JSON", zap.String("type", reqTypeName), zap.Error(err))
			w.WriteHeader(http.StatusBadRequest)
			_, _ = w.Write([]byte("bad JSON"))
			return
		}

		hlogger.Info(
			"Received request",
			zap.String("endpoint", endpoint),
			zap.String("client", r.RemoteAddr),
			zap.Any("request", req),
		)

		resp, status, err := handle(r.Context(), logger.With(zap.Any("request", req)), &req)

		if err == nil && status != http.StatusOK {
			err = errors.New("HTTP handler error: status != 200 OK, but no error message")
			status = http.StatusInternalServerError
		}

		var respBody []byte
		var respBodyFormatted zap.Field
		var logFunc func(string, ...zap.Field)

		if err != nil {
			if 500 <= status && status < 600 {
				logFunc = hlogger.Error
			} else if 400 <= status && status < 500 {
				logFunc = hlogger.Warn
			} else /* unexpected status */ {
				err = fmt.Errorf("HTTP handler error: invalid status %d for error response: %w", status, err)
				logFunc = hlogger.Error
				// Don't send the invalid status to the client: WriteHeader panics for codes that
				// aren't valid HTTP statuses (e.g. zero), and a 2xx/3xx would present the error
				// message as if it were a successful response. The original status is preserved
				// in the error message above.
				status = http.StatusInternalServerError
			}
			respBodyFormatted = zap.NamedError("response", err)
			respBody = []byte(err.Error())
		} else {
			if status == 0 {
				hlogger.Warn("non-error response with status = 0")
			}

			respBodyFormatted = zap.Any("response", resp)

			respBody, err = json.Marshal(resp)
			if err != nil {
				hlogger.Error("Failed to encode JSON response", respBodyFormatted)
				w.WriteHeader(http.StatusInternalServerError)
				_, _ = w.Write([]byte("Error encoding JSON response"))
				return
			}
			logFunc = hlogger.Info
		}

		logFunc(
			"Responding to request",
			zap.String("endpoint", endpoint), zap.Int("status", status), respBodyFormatted,
		)

		w.WriteHeader(status)
		_, _ = w.Write(respBody)
	})
}
package util
// Kubernetes-specific utility functions
import (
"strings"
corev1 "k8s.io/api/core/v1"
)
// PodReady reports whether the pod is marked as ready, according to the first condition of type
// PodReady in the pod's Status.Conditions. A pod without such a condition is not ready.
func PodReady(pod *corev1.Pod) bool {
	for _, cond := range pod.Status.Conditions {
		if cond.Type != corev1.PodReady {
			continue
		}
		return cond.Status == corev1.ConditionTrue
	}
	return false
}
// PodCompleted returns true iff all of the Pod's containers have stopped and will not be
// restarted, which is the case exactly when the pod's phase is Succeeded or Failed.
func PodCompleted(pod *corev1.Pod) bool {
	phase := pod.Status.Phase
	if phase == corev1.PodSucceeded {
		return true
	}
	return phase == corev1.PodFailed
}
// PodStartedBefore returns true iff Pod p started before Pod q, comparing the pods'
// Status.StartTime.
func PodStartedBefore(p, q *corev1.Pod) bool {
	pStart, qStart := p.Status.StartTime, q.Status.StartTime
	return pStart.Before(qStart)
}
// azForTerm returns the availability zone selected by the node selector term, or the empty string
// if there isn't one.
//
// A zone is only recognized from an expression that requires the "topology.kubernetes.io/zone"
// label to be "In" a set of exactly one value; the first such expression wins.
func azForTerm(term corev1.NodeSelectorTerm) string {
	const zoneLabel = "topology.kubernetes.io/zone"

	for _, expr := range term.MatchExpressions {
		if expr.Key != zoneLabel {
			continue
		}
		if expr.Operator != corev1.NodeSelectorOpIn {
			continue
		}
		if len(expr.Values) != 1 {
			continue
		}
		return expr.Values[0]
	}
	return ""
}
// PodPreferredAZIfPresent returns the desired availability zone of the Pod, if it has one, and the
// empty string otherwise.
//
// The zone is taken from the pod's node affinity: required terms are consulted first, then
// preferred ones, and the first term that names a zone decides.
func PodPreferredAZIfPresent(pod *corev1.Pod) string {
	if pod.Spec.Affinity == nil {
		return ""
	}
	nodeAffinity := pod.Spec.Affinity.NodeAffinity
	if nodeAffinity == nil {
		return ""
	}

	// Required affinities take precedence ...
	if required := nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution; required != nil {
		for _, term := range required.NodeSelectorTerms {
			if az := azForTerm(term); az != "" {
				return az
			}
		}
	}

	// ... and only then do we fall back to preferred ones.
	for _, preferred := range nodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution {
		if az := azForTerm(preferred.Preference); az != "" {
			return az
		}
	}

	// no AZ present
	return ""
}
// TryPodOwnerVirtualMachine returns the name of the VirtualMachine that owns the pod, if there is
// one that does. Otherwise returns nil.
func TryPodOwnerVirtualMachine(pod *corev1.Pod) *NamespacedName {
	for _, owner := range pod.OwnerReferences {
		// For NeonVM, *at time of writing*, the OwnerReference has an APIVersion of
		// "vm.neon.tech/v1". But:
		//
		//  1. It's good to be extra-safe around possible name collisions for the
		//     "VirtualMachine" kind, even though *practically* it's not going to happen;
		//  2. We can disambiguate with the APIVersion; and
		//  3. We don't want to match on a fixed version, in case we want to change the version
		//     number later.
		//
		// So, given that the format is "<NAME>/<VERSION>", matching on just the "<NAME>/" part
		// of the APIVersion gives the safety we want with the flexibility we need.
		if owner.Kind != "VirtualMachine" || !strings.HasPrefix(owner.APIVersion, "vm.neon.tech/") {
			continue
		}

		// note: OwnerReferences are not permitted to have a different namespace than the owned
		// object, so because VirtualMachines are namespaced, the owner must have the same
		// namespace as the Pod.
		return &NamespacedName{Namespace: pod.Namespace, Name: owner.Name}
	}
	return nil
}
// TryPodOwnerVirtualMachineMigration returns the name of the VirtualMachineMigration that owns the
// pod, if there is one. Otherwise returns nil.
//
// An owner counts as a VirtualMachineMigration if its kind matches and its APIVersion is any
// version in the "vm.neon.tech" group. The returned name uses the pod's namespace.
func TryPodOwnerVirtualMachineMigration(pod *corev1.Pod) *NamespacedName {
	for _, owner := range pod.OwnerReferences {
		if owner.Kind != "VirtualMachineMigration" || !strings.HasPrefix(owner.APIVersion, "vm.neon.tech/") {
			continue
		}
		return &NamespacedName{Namespace: pod.Namespace, Name: owner.Name}
	}
	return nil
}
package util
import (
"context"
"errors"
"fmt"
"net"
"net/http"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.uber.org/zap"
)
// RegisterMetric registers collector with reg via MustRegister and returns the same collector, so
// that a metric can be created and registered in a single expression.
//
// NOTE(review): MustRegister presumably panics on registration failure (per the prometheus client
// naming convention) - confirm against the library documentation.
func RegisterMetric[P prometheus.Collector](reg prometheus.Registerer, collector P) P {
reg.MustRegister(collector)
return collector
}
// Prometheus metrics server common to >1 component

// StartPrometheusMetricsServer starts the prometheus server in a background thread, serving reg at
// "/metrics" on the given port (all IPv4 addresses). It returns an error if binding on the port
// fails.
//
// The server is shut down once ctx is cancelled. Failures after startup are only logged.
func StartPrometheusMetricsServer(ctx context.Context, logger *zap.Logger, port uint16, reg *prometheus.Registry) error {
	// Separate binding from serving, so that we can catch any error in this thread, rather than the
	// server's.
	addr := &net.TCPAddr{IP: net.IPv4zero, Port: int(port)}
	listener, err := net.ListenTCP("tcp", addr)
	if err != nil {
		return fmt.Errorf("Error listening on TCP port %d: %w", port, err)
	}

	shutdownCtx, shutdown := context.WithCancel(ctx)

	mux := http.NewServeMux()
	mux.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{Registry: reg}))

	baseContext := context.Background()
	server := &http.Server{Handler: mux, BaseContext: func(net.Listener) context.Context { return baseContext }}

	// stopOnCancel waits for cancellation (or for the server to exit on its own) and then shuts
	// the server down.
	stopOnCancel := func() {
		<-shutdownCtx.Done()
		if err := server.Shutdown(context.Background()); err != nil {
			logger.Error("Error shutting down prometheus server", zap.Error(err))
		}
	}

	serve := func() {
		// shutdown the shutdown watcher if we exit before it
		defer shutdown()

		if err := server.Serve(listener); !errors.Is(err, http.ErrServerClosed) {
			logger.Error("Prometheus server exited with unexpected error", zap.Error(err))
		}
	}

	go stopOnCancel()
	go serve()

	return nil
}
package util
// same as k8s.io/apimachinery/pkg/types/namespacedname.go, but with JSON (de)serialization
import (
"fmt"
"go.uber.org/zap/zapcore"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// Separator is the character placed between the namespace and the name when a NamespacedName is
// formatted in its default ("<namespace>/<name>") form.
const Separator = '/'
// NamespacedName represents a resource name with the namespace it's in.
//
// When printed with '%v', NamespacedName is rendered as "<namespace>/<name>". Printing with
// '%+v' or '%#v' renders as it would normally.
type NamespacedName struct {
Namespace string `json:"namespace"`
Name string `json:"name"`
}
// GetNamespacedName returns the NamespacedName of obj, as given by the namespace and name in its
// object metadata.
func GetNamespacedName(obj metav1.ObjectMetaAccessor) NamespacedName {
	objMeta := obj.GetObjectMeta()
	name := NamespacedName{
		Namespace: objMeta.GetNamespace(),
		Name:      objMeta.GetName(),
	}
	return name
}
// Format implements fmt.Formatter.
//
// '%+v' and '%#v' reproduce what fmt would print for the struct by default; every other
// verb/flag combination produces the compact "<namespace>/<name>" form.
func (n NamespacedName) Format(state fmt.State, verb rune) {
	var rendered string

	switch {
	case verb == 'v' && state.Flag('+'):
		// Show fields, e.g. `{Namespace:foo Name:bar}`
		rendered = "{Namespace:" + n.Namespace + " Name:" + n.Name + "}"
	case verb == 'v' && state.Flag('#'):
		// Go syntax representation, e.g. `util.NamespacedName{Namespace:"foo", Name:"bar"}`
		rendered = fmt.Sprintf("util.NamespacedName{Namespace:%q, Name:%q}", n.Namespace, n.Name)
	default:
		// Pretty-printed representation, e.g. `foo/bar`
		rendered = n.Namespace + string(Separator) + n.Name
	}

	_, _ = state.Write([]byte(rendered))
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that NamespacedName can be used with zap.Object
//
// The object is encoded with two string fields, "namespace" and "name" (in that order). It always
// returns nil.
func (n NamespacedName) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddString("namespace", n.Namespace)
enc.AddString("name", n.Name)
return nil
}
// Construction of JSON patches. See https://jsonpatch.com/
package patch
import (
"strings"
)
// OpKind is the kind of operation being performed in a single step
//
// The value is the string used for the "op" field of the encoded operation.
type OpKind string
// The operation kinds of a JSON patch. See https://jsonpatch.com/ for what each one does.
const (
OpAdd OpKind = "add"
OpRemove OpKind = "remove"
OpReplace OpKind = "replace"
OpMove OpKind = "move"
OpCopy OpKind = "copy"
OpTest OpKind = "test"
)
// JSONPatch is a complete JSON patch: the list of operations that make it up.
type JSONPatch = []Operation
// Operation is a single step in the overall JSON patch
type Operation struct {
// Op is the kind of operation being performed in this step. See [OpKind] for more.
Op OpKind `json:"op"`
// Path is a [JSON pointer] to the target location of the operation.
//
// In general, nesting is separated by '/'s, with special characters escaped by '~'.
// [PathEscape] is provided to handle escaping, because it can get a little gnarly.
//
// As an example, if you want to add a field "foo" to the first element of an array,
// you'd use the path `/0/foo`. The jsonpatch website has more details (and clearer examples),
// refer there for more information: https://jsonpatch.com/#json-pointer
//
// [JSON pointer]: https://datatracker.ietf.org/doc/html/rfc6901/
Path string `json:"path"`
// From gives the source location for "copy" or "move" operations.
//
// It is left out of the encoded JSON when empty.
From string `json:"from,omitempty"`
// Value is the new value to use, for "add", "replace", or "test" operations.
//
// It is left out of the encoded JSON when nil.
Value any `json:"value,omitempty"`
}
// pathEscaper applies the two escapes used within a JSON pointer segment. The replacer works in a
// single pass over the input, so the '~' introduced by escaping '/' is never itself re-escaped.
var pathEscaper = strings.NewReplacer(
	"~", "~0",
	"/", "~1",
)

// PathEscape escapes a string for use in a segment of the Path field of an Operation: every '~'
// becomes "~0" and every '/' becomes "~1".
//
// This is useful, for example, when using arbitrary strings as map keys (like Kubernetes labels or
// annotations).
func PathEscape(s string) string {
	escaped := pathEscaper.Replace(s)
	return escaped
}
package util
import (
"net/http"
"net/http/pprof"
"time"
)
// MakePPROF creates (but does not start) an HTTP server for addr that serves the standard
// net/http/pprof handlers under "/debug/pprof/".
func MakePPROF(addr string) *http.Server {
	routes := []struct {
		pattern string
		handler http.HandlerFunc
	}{
		{"/debug/pprof/", pprof.Index},
		{"/debug/pprof/cmdline", pprof.Cmdline},
		{"/debug/pprof/profile", pprof.Profile},
		{"/debug/pprof/symbol", pprof.Symbol},
		{"/debug/pprof/trace", pprof.Trace},
	}

	mux := http.NewServeMux()
	for _, route := range routes {
		mux.HandleFunc(route.pattern, route.handler)
	}

	server := &http.Server{
		Addr:              addr,
		Handler:           mux,
		ReadHeaderTimeout: time.Second,
	}
	return server
}
package util
import "time"
// RecentCounter counts the events that happened within the most recent interval, by remembering
// the timestamp of each one.
type RecentCounter struct {
	// interval is how long an event stays "recent".
	interval time.Duration
	// timestamps holds the recorded event times, in the order they were added.
	timestamps []time.Time
}

// NewRecentCounter creates an empty RecentCounter covering the given interval.
func NewRecentCounter(interval time.Duration) *RecentCounter {
	rc := new(RecentCounter)
	rc.interval = interval
	rc.timestamps = make([]time.Time, 0)
	return rc
}

// cleanup drops the leading timestamps that are no longer within the interval, relative to now.
// A timestamp that is exactly one interval old is dropped as well.
func (rc *RecentCounter) cleanup(now time.Time) {
	cutoff := now.Add(-rc.interval)

	expired := 0
	for expired < len(rc.timestamps) && !rc.timestamps[expired].After(cutoff) {
		expired++
	}
	rc.timestamps = rc.timestamps[expired:]
}

// inc is the implementation of Inc, with the current time passed in explicitly to provide more
// flexibility around testing.
func (rc *RecentCounter) inc(now time.Time) {
	rc.cleanup(now)
	rc.timestamps = append(rc.timestamps, now)
}

// get is the implementation of Get, with the current time passed in explicitly to provide more
// flexibility around testing.
func (rc *RecentCounter) get(now time.Time) uint {
	rc.cleanup(now)
	count := len(rc.timestamps)
	return uint(count)
}

// Inc records an event at the current time, discarding events that are no longer recent.
func (rc *RecentCounter) Inc() {
	now := time.Now()
	rc.inc(now)
}

// Get returns the number of events recorded within the last interval.
func (rc *RecentCounter) Get() uint {
	now := time.Now()
	return rc.get(now)
}
package util
// Signalling primitives: single-signal sender/receiver pair and sync.Cond-ish exposed over a
// channel instead
import (
"sync"
)
// NewSingleSignalPair creates a connected sender/receiver pair that carries at most one value.
//
// Only the first of (Send, Close) has any effect: a first Send delivers its value and closes the
// channel; a first Close closes the channel without a value. All later calls are no-ops.
func NewSingleSignalPair[T any]() (SignalSender[T], SignalReceiver[T]) {
	var once sync.Once
	// Capacity of one, so that sending never has to wait for the receiver.
	sigCh := make(chan T, 1)

	sender := SignalSender[T]{
		send: func(data T) {
			once.Do(func() {
				sigCh <- data
				close(sigCh)
			})
		},
	}
	receiver := SignalReceiver[T]{
		sigCh: sigCh,
		closeSigCh: func() {
			once.Do(func() { close(sigCh) })
		},
	}
	return sender, receiver
}

// SignalSender is the sending half of the pair created by NewSingleSignalPair.
type SignalSender[T any] struct {
	send func(T)
}

// SignalReceiver is the receiving half of the pair created by NewSingleSignalPair.
type SignalReceiver[T any] struct {
	sigCh      chan T
	closeSigCh func()
}

// Send delivers data to the receiver, unless a value was already sent or the receiver was closed.
func (s SignalSender[T]) Send(data T) {
	s.send(data)
}

// Recv returns the channel on which the value (if any) is delivered. The channel is closed after
// the value was sent, or when Close is called.
func (s SignalReceiver[T]) Recv() <-chan T {
	return s.sigCh
}

// Close closes the channel returned by Recv, unless a value was already sent (in which case the
// channel is closed already). It is safe to call more than once.
func (s SignalReceiver[T]) Close() {
	s.closeSigCh()
}
// NewCondChannelPair creates a sender/receiver pair for a sync.Cond-like interface
//
// The differences from sync.Cond are that receiving is exposed through a channel (so it can be
// select-ed) and there is no equivalent to (*Cond).Broadcast()
func NewCondChannelPair() (CondChannelSender, CondChannelReceiver) {
	// A buffer of one lets a single signal stay pending until the receiver gets to it.
	shared := make(chan struct{}, 1)

	sender := CondChannelSender{ch: shared}
	receiver := CondChannelReceiver{ch: shared}
	return sender, receiver
}

// CondChannelSender is the sending half of a sync.Cond-like interface
type CondChannelSender struct {
	ch chan struct{}
}

// CondChannelReceiver is the receiving half of a sync.Cond-like interface
type CondChannelReceiver struct {
	ch chan struct{}
}

// Send performs a non-blocking notify of the associated CondChannelReceiver
//
// If there is currently a receiver waiting via Recv, then this will immediately wake them.
// Otherwise, the next receive on the channel returned by Recv will complete immediately. Sending
// while a signal is already pending has no further effect.
func (c *CondChannelSender) Send() {
	select {
	case c.ch <- struct{}{}:
	default:
		// a signal is already pending; nothing to add
	}
}

// Unsend cancels an existing signal that has been sent but not yet received.
//
// It returns whether there was a signal to be cancelled.
func (c *CondChannelSender) Unsend() bool {
	cancelled := false
	select {
	case <-c.ch:
		cancelled = true
	default:
	}
	return cancelled
}

// Consume removes any existing signal created by Send, requiring an additional Send to be made
// before the receiving on Recv will unblock
//
// This method is non-blocking.
func (c *CondChannelReceiver) Consume() {
	select {
	case <-c.ch:
	default:
		// nothing pending
	}
}

// Recv returns a channel for which receiving will complete either (a) immediately, if Send has been
// called without Consume or another receive since; or (b) as soon as Send is next called
//
// This method is non-blocking but receiving on the returned channel may block.
func (c *CondChannelReceiver) Recv() <-chan struct{} {
	return c.ch
}
package stack
// Originally taken from https://github.com/sharnoff/chord
// TODO - want to have some kind of "N skipped" when (a) there's lots of frames and (b) many of
// those frames are duplicates
import (
"runtime"
"strconv"
"sync"
)
// StackTrace represents a collected stack trace, possibly with a parent (i.e caller)
//
// StackTraces are designed to make it easy to track callers across goroutines. They are typically
// produced by [GetStackTrace]; refer to that function for more information.
type StackTrace struct {
	// Frames provides the frames of this stack trace. Each frame's caller is at the index following
	// it; the first frame is the direct caller.
	Frames []StackFrame
	// Parent, if not nil, provides the "parent" stack trace - typically the stack trace at the
	// point this goroutine was spawned.
	Parent *StackTrace
}

// StackFrame is an individual stack frame, contained in a [StackTrace], produced by
// [GetStackTrace].
type StackFrame struct {
	// Function provides the name of the function being called, or the empty string if unknown.
	Function string
	// File gives the name of the file, or an empty string if the file is unknown.
	File string
	// Line gives the line number (starting from 1), or zero if the line number is unknown.
	Line int
}

// GetStackTrace produces a StackTrace, optionally with a parent's stack trace to append.
//
// skip sets the number of initial calling stack frames to exclude. Setting skip to zero will
// produce a StackTrace where the first [StackFrame] represents the location where GetStackTrace was
// called.
func GetStackTrace(parent *StackTrace, skip uint) StackTrace {
	// +1 to skip the additional frame introduced by GetStackTrace itself
	return StackTrace{Frames: getFrames(skip + 1), Parent: parent}
}

// String produces a string representation of the stack trace, roughly similar to the default panic
// handler's.
//
// Each frame is written as the function name followed by a second, tab-indented line with the
// file and line number. The traces of any parents follow, each introduced by "called by ".
//
// For some examples of formatting, refer to the StackTrace tests.
func (st StackTrace) String() string {
	var buf []byte

	for cur, first := &st, true; cur != nil; cur, first = cur.Parent, false {
		if !first {
			buf = append(buf, "called by "...)
		}

		if len(cur.Frames) == 0 {
			buf = append(buf, "<empty stack>\n"...)
			continue
		}

		for _, f := range cur.Frames {
			// first line: the function
			if f.Function == "" {
				buf = append(buf, "<unknown function>"...)
			} else {
				buf = append(buf, f.Function...)
				buf = append(buf, "(...)"...)
			}
			buf = append(buf, "\n\t"...)

			// second line: the location. The line number is only shown if the file is known.
			switch {
			case f.File == "":
				buf = append(buf, "<unknown file>"...)
			case f.Line == 0:
				buf = append(buf, f.File...)
			default:
				buf = append(buf, f.File...)
				buf = append(buf, ':')
				buf = strconv.AppendInt(buf, int64(f.Line), 10)
			}
			buf = append(buf, '\n')
		}
	}

	return string(buf)
}

// pcBufPool holds reusable buffers of program counters for getFrames.
var pcBufPool = sync.Pool{
	New: func() any {
		pcs := make([]uintptr, 128)
		return &pcs
	},
}

// putPCBuffer returns buf to the pool, unless it has grown so large that keeping it around isn't
// worthwhile.
func putPCBuffer(buf *[]uintptr) {
	if len(*buf) >= 1024 {
		return
	}
	pcBufPool.Put(buf)
}

// getFrames collects the frames of the current call stack, leaving out the first skip frames of
// its caller (so with skip = 0, the first frame returned is the caller of getFrames).
func getFrames(skip uint) []StackFrame {
	skip += 2 // skip the frame introduced by this function and runtime.Callers

	pcBuf := pcBufPool.Get().(*[]uintptr)
	defer putPCBuffer(pcBuf)

	if len(*pcBuf) == 0 {
		panic("internal error: len(*pcBuf) == 0")
	}

	// read program counters into the buffer, repeating until buffer is big enough.
	//
	// This is O(n log n), where n is the true number of program counters.
	var pcs []uintptr
	for pcs == nil {
		n := runtime.Callers(0, *pcBuf)
		switch {
		case n == 0:
			panic("runtime.Callers(0, ...) returned zero")
		case n < len(*pcBuf):
			pcs = (*pcBuf)[:n]
		default:
			// The buffer was filled completely, so there may be more; retry with twice the room.
			*pcBuf = make([]uintptr, 2*len(*pcBuf))
		}
	}

	var frames []StackFrame
	iter := runtime.CallersFrames(pcs)
	for {
		frame, more := iter.Next()
		if skip > 0 {
			skip--
		} else {
			frames = append(frames, StackFrame{
				Function: frame.Function,
				File:     frame.File,
				Line:     frame.Line,
			})
		}
		if !more {
			break
		}
	}
	return frames
}
// Originally taken from https://github.com/ptxmac/multierrgroup
// Package taskgroup provides a mix of multierr and errgroup
// See documentation for https://pkg.go.dev/go.uber.org/multierr and https://pkg.go.dev/golang.org/x/sync/errgroup
package taskgroup
import (
"context"
"fmt"
"sync"
"go.uber.org/multierr"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/util/stack"
)
// Group manages goroutines and collects all the errors.
// See https://pkg.go.dev/golang.org/x/sync/errgroup#group for more information
type Group interface {
// Ctx returns the context associated with the group.
Ctx() context.Context
// Wait blocks until every task started with Go has finished, and returns their combined errors.
Wait() error
// Go starts f as a new task in its own goroutine. The logger passed to f is derived from the
// group's logger and the task's name.
Go(name string, f func(logger *zap.Logger) error)
}
// group is the implementation of Group returned by NewGroup.
type group struct {
// cancel cancels ctx. It is called when a task fails and when Wait returns.
cancel context.CancelFunc
// ctx is the context handed out by Ctx.
ctx context.Context
// logger is the base logger; each task gets a child logger named after the task.
logger *zap.Logger
// panicHandler, if non-nil, is called with the recovered value when a task panics.
panicHandler func(any)
// wg tracks the running tasks.
wg sync.WaitGroup
// errMutex guards err while tasks are running.
errMutex sync.Mutex
// err accumulates the errors of all failed tasks.
err error
}
// GroupOption customizes a group while it is being constructed by NewGroup.
type GroupOption func(*group)
// WithParentContext makes the group's context a cancellable child of ctx.
func WithParentContext(ctx context.Context) GroupOption {
	apply := func(g *group) {
		derived, stop := context.WithCancel(ctx)
		g.ctx = derived
		g.cancel = stop
	}
	return apply
}
// WithPanicHandler installs f as the function that is called with the recovered value whenever a
// task of the group panics.
func WithPanicHandler(f func(any)) GroupOption {
	apply := func(g *group) {
		g.panicHandler = f
	}
	return apply
}
// NewGroup builds a Group that logs through logger, applying opts in the order given.
//
// When no option has provided a context, the group's context is derived from
// context.Background().
func NewGroup(logger *zap.Logger, opts ...GroupOption) Group {
	g := &group{
		cancel:       nil, // filled in by WithParentContext
		ctx:          nil, // filled in by WithParentContext
		panicHandler: nil, // filled in by WithPanicHandler, if given
		logger:       logger,
		wg:           sync.WaitGroup{},
		errMutex:     sync.Mutex{},
		err:          nil,
	}

	for i := range opts {
		opts[i](g)
	}

	// No parent context was supplied: fall back to the background context.
	if g.ctx == nil {
		g.ctx, g.cancel = context.WithCancel(context.Background())
	}

	return g
}
// Ctx returns the group's context. It is canceled when Wait returns, and also as soon as any task
// started with Go fails.
func (g *group) Ctx() context.Context {
return g.ctx
}
// Wait blocks until all goroutines started with Go have completed, then cancels the group's
// context.
//
// The errors returned from the goroutines are combined into one using multierr; that combined
// error (nil if every task succeeded) is the return value.
func (g *group) Wait() error {
	g.wg.Wait()

	if cancel := g.cancel; cancel != nil {
		cancel()
	}

	return g.err
}
// call runs f, converting a panic inside f into an error.
//
// On panic, the group's panic handler (if any) is invoked with the recovered value, the panic is
// logged together with a stack trace, and an error of the form "panic: <value>" is returned.
func (g *group) call(f func() error) (err error) {
	defer func() {
		r := recover()
		if r == nil {
			return
		}

		if handler := g.panicHandler; handler != nil {
			handler(r)
		}

		// skip=1 leaves this deferred function's own frame out of the logged trace.
		trace := stack.GetStackTrace(nil, 1)
		g.logger.Error("Task panicked", zap.Any("payload", r), zap.String("stack", trace.String()))
		err = fmt.Errorf("panic: %v", r)
	}()

	return f()
}
// Go runs f in a new goroutine, passing it a logger named after the task.
//
// If f returns a non-nil error (or panics), the error is wrapped with the task's name, collected
// using multierr so that Wait returns it, logged, and the group's context is canceled.
func (g *group) Go(name string, f func(logger *zap.Logger) error) {
	g.wg.Add(1)

	go func() {
		defer g.wg.Done()

		taskLogger := g.logger.Named(name)
		taskErr := g.call(func() error {
			return f(taskLogger)
		})
		if taskErr == nil {
			return
		}

		wrapped := fmt.Errorf("task %s failed: %w", name, taskErr)

		g.errMutex.Lock()
		g.err = multierr.Append(g.err, wrapped)
		g.errMutex.Unlock()

		taskLogger.Error(wrapped.Error())

		if g.cancel != nil {
			g.cancel()
		}
	}()
}
package util
import (
"errors"
"math/rand"
"time"
)
// TimeRange describes a range of durations, expressed as a whole number of units, from which
// Random picks a value.
type TimeRange struct {
// min is the smallest number of units that Random may return.
min int
// max is the upper bound, in units. Random only returns it when max equals min.
max int
// units is the duration that min and max are multiples of.
units time.Duration
}
// NewTimeRange creates a TimeRange covering minTime to maxTime, both counted in units.
//
// Panics if minTime is negative, if both bounds are zero, or if maxTime is less than minTime.
func NewTimeRange(units time.Duration, minTime, maxTime int) *TimeRange {
	switch {
	case minTime < 0:
		panic(errors.New("bad time range: min < 0"))
	case minTime == 0 && maxTime == 0:
		panic(errors.New("bad time range: min and max = 0"))
	case maxTime < minTime:
		panic(errors.New("bad time range: max < min"))
	}

	tr := new(TimeRange)
	tr.min, tr.max, tr.units = minTime, maxTime, units
	return tr
}
// Random returns a random time.Duration within the range.
//
// The result is a whole number of units, at least min and strictly less than max. The one
// exception is a range whose bounds are equal, for which that single value is returned.
func (r TimeRange) Random() time.Duration {
	count := r.min
	if span := r.max - r.min; span != 0 {
		count += rand.Intn(span)
	}
	return time.Duration(count) * r.units
}
package util
// Helper for creating a zap.Field for a VM
import (
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
corev1 "k8s.io/api/core/v1"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
// nameFields pairs the name of a VM with the name of its pod, so that both can be logged together
// (see MarshalLogObject).
type nameFields struct {
// virtualmachine is logged under the "virtualmachine" key.
virtualmachine NamespacedName
// pod is logged under the "pod" key.
pod NamespacedName
}
// MarshalLogObject implements zapcore.ObjectMarshaler, adding the "virtualmachine" object followed
// by the "pod" object. The first error encountered, if any, is returned.
func (f nameFields) MarshalLogObject(enc zapcore.ObjectEncoder) error {
	err := enc.AddObject("virtualmachine", f.virtualmachine)
	if err == nil {
		err = enc.AddObject("pod", f.pod)
	}
	return err
}
// VMNameFields returns the log field(s) identifying vm.
//
// A VM without a pod name in its status is logged as a single "virtualmachine" object. Otherwise
// both the VM and its pod (assumed to be in the VM's namespace) are logged, inline.
func VMNameFields(vm *vmapi.VirtualMachine) zap.Field {
	vmName := GetNamespacedName(vm)

	podName := vm.Status.PodName
	if podName == "" {
		return zap.Object("virtualmachine", vmName)
	}

	return zap.Inline(nameFields{
		virtualmachine: vmName,
		pod:            NamespacedName{Namespace: vm.Namespace, Name: podName},
	})
}
// PodNameFields returns the log field(s) identifying pod.
//
// A pod without the VM name label is logged as a single "pod" object. Otherwise both the pod and
// the VM named by the label (in the pod's namespace) are logged, inline.
func PodNameFields(pod *corev1.Pod) zap.Field {
	podName := GetNamespacedName(pod)

	vmLabel, hasVM := pod.Labels[vmapi.VirtualMachineNameLabel]
	if !hasVM {
		return zap.Object("pod", podName)
	}

	return zap.Inline(nameFields{
		virtualmachine: NamespacedName{Namespace: pod.Namespace, Name: vmLabel},
		pod:            podName,
	})
}
package watch
// Metrics for Watch()
import (
"fmt"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/apimachinery/pkg/watch"
)
// Metrics holds some common prometheus collectors that are used by Watch
//
// The metrics used are:
//
// - client_calls_total (number of calls to k8s client.{Watch,List}, labeled by method)
// - relist_requests_total (number of "relist" requests from the Store)
// - events_total (number of K8s watch.Events that have occurred, including errors)
// - errors_total (number of errors, either error events or re-List errors, labeled by source: ["List", "Watch", "Watch.Event"])
// - alive_current (1 iff the watcher is currently running or failing, else 0)
// - failing_current (1 iff the watcher's last request failed *and* it's waiting to retry, else 0)
//
// Prefixes are typically of the form "COMPONENT_watchers" (e.g. "autoscaling_agent_watchers").
// Separate reporting per call to Watch is automatically done with the "watcher_instance" label
// attached to the metrics, using MetricsConfig.
//
// A brief note about "alive" and "failing": Reading from a pair of collectors is fundamentally
// racy. It may be possible to temporarily view "failing" but not "alive".
type Metrics struct {
// labels: watcher_instance, method
clientCallsTotal *prometheus.CounterVec
// labels: watcher_instance
relistRequestsTotal *prometheus.CounterVec
// labels: watcher_instance, type
eventsTotal *prometheus.CounterVec
// labels: watcher_instance, source
errorsTotal *prometheus.CounterVec
// labels: watcher_instance
aliveCurrent *prometheus.GaugeVec
// labels: watcher_instance
failingCurrent *prometheus.GaugeVec
// note: all usage of Metrics is by value, so this field gets copied in on each Watch call.
// It gives us a bit of state to use for the failing and unfailing functions.
isFailing bool
}
// MetricsConfig ties a set of Metrics to a single Watch call, identified by Instance.
type MetricsConfig struct {
Metrics
// Instance provides the value of the "watcher_instance" label that will be applied to all
// metrics collected for the Watch call
Instance string
}
// metricInstanceLabel is the name of the label, present on every collector in Metrics, whose
// value is MetricsConfig.Instance.
const metricInstanceLabel = "watcher_instance"
// NewMetrics creates a new set of metrics for one or many Watch calls
//
// All metrics' names will be prefixed with the provided string. Every collector carries the
// "watcher_instance" label; some have one additional label, as listed below.
func NewMetrics(prefix string) Metrics {
	// Small constructors, so that each collector below is a single line of name, help and labels.
	counter := func(suffix, help string, labels ...string) *prometheus.CounterVec {
		return prometheus.NewCounterVec(
			prometheus.CounterOpts{Name: prefix + suffix, Help: help},
			labels,
		)
	}
	gauge := func(suffix, help string, labels ...string) *prometheus.GaugeVec {
		return prometheus.NewGaugeVec(
			prometheus.GaugeOpts{Name: prefix + suffix, Help: help},
			labels,
		)
	}

	return Metrics{
		isFailing: false,

		clientCallsTotal: counter(
			"_client_calls_total",
			"Number of calls to k8s client.{Watch,List}, labeled by method",
			metricInstanceLabel, "method",
		),
		relistRequestsTotal: counter(
			"_relist_requests_total",
			"Number of internal manual relisting requests",
			metricInstanceLabel,
		),
		eventsTotal: counter(
			"_events_total",
			"Number of k8s watch.Events that have occurred, including errors, labeled by type",
			metricInstanceLabel, "type",
		),
		errorsTotal: counter(
			"_errors_total",
			"Number of errors, either error events or re-list errors, labeled by source",
			metricInstanceLabel, "source",
		),
		aliveCurrent: gauge(
			"_alive_current",
			"For each watcher, 1 iff the watcher is currently running or failing, else 0",
			metricInstanceLabel,
		),
		failingCurrent: gauge(
			"_failing_current",
			"For each watcher, 1 iff the watcher's last request failed *and* it's waiting to retry, else 0",
			metricInstanceLabel,
		),
	}
}
// MustRegister registers all the collectors in the Metrics with reg, one after the other, via
// reg.MustRegister.
func (m *Metrics) MustRegister(reg *prometheus.Registry) {
	collectors := [...]prometheus.Collector{
		m.clientCallsTotal,
		m.relistRequestsTotal,
		m.eventsTotal,
		m.errorsTotal,
		m.aliveCurrent,
		m.failingCurrent,
	}
	for _, c := range collectors {
		reg.MustRegister(c)
	}
}
///////////////////////////////////////////////
// Internal helper methods for MetricsConfig //
///////////////////////////////////////////////
// alive marks this watcher instance as running.
func (m *MetricsConfig) alive() {
	instance := m.Instance
	m.aliveCurrent.WithLabelValues(instance).Inc()
	// Adding zero creates the 'failing' series for this instance if it isn't present yet, without
	// changing its value.
	m.failingCurrent.WithLabelValues(instance).Add(0.0)
}

// unalive undoes a previous call to alive.
func (m *MetricsConfig) unalive() {
	gauge := m.aliveCurrent.WithLabelValues(m.Instance)
	gauge.Dec()
}

// failing marks this watcher instance as failing. Repeated calls without an unfailing in between
// only count once.
func (m *MetricsConfig) failing() {
	if m.isFailing {
		return
	}
	m.failingCurrent.WithLabelValues(m.Instance).Inc()
	m.isFailing = true
}

// unfailing marks this watcher instance as no longer failing. It does nothing if the instance was
// not marked as failing.
func (m *MetricsConfig) unfailing() {
	if !m.isFailing {
		return
	}
	m.failingCurrent.WithLabelValues(m.Instance).Dec()
	m.isFailing = false
}

// startList counts the start of a List call to the client.
func (m *MetricsConfig) startList() {
	counter := m.clientCallsTotal.WithLabelValues(m.Instance, "List")
	counter.Inc()
}

// startWatch counts the start of a Watch call to the client.
func (m *MetricsConfig) startWatch() {
	counter := m.clientCallsTotal.WithLabelValues(m.Instance, "Watch")
	counter.Inc()
}

// relistRequested counts a relist request.
func (m *MetricsConfig) relistRequested() {
	counter := m.relistRequestsTotal.WithLabelValues(m.Instance)
	counter.Inc()
}

// doneList records the outcome of a List call; only failures are counted.
func (m *MetricsConfig) doneList(err error) {
	if err == nil {
		return
	}
	m.errorsTotal.WithLabelValues(m.Instance, "List").Inc()
}

// doneWatch records the outcome of a Watch call; only failures are counted.
func (m *MetricsConfig) doneWatch(err error) {
	if err == nil {
		return
	}
	m.errorsTotal.WithLabelValues(m.Instance, "Watch").Inc()
}

// recordEvent counts a watch event by its type. Error events are additionally counted as errors
// with source "Watch.Event".
func (m *MetricsConfig) recordEvent(ty watch.EventType) {
	instance := m.Instance
	m.eventsTotal.WithLabelValues(instance, string(ty)).Inc()
	if ty != watch.Error {
		return
	}
	m.errorsTotal.WithLabelValues(instance, "Watch.Event").Inc()
}
package watch
import (
"context"
"errors"
"fmt"
stdruntime "runtime"
"sync"
"sync/atomic"
"time"
"go.uber.org/zap"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/watch"
"github.com/neondatabase/autoscaling/pkg/util"
)
// Client is implemented by the specific interfaces of kubernetes clients, like
// `Clientset.CoreV1().Pods(namespace)` or `..Nodes()`
//
// This interface should be *already implemented* by whatever the correct client is.
//
// The type parameter L is the list type returned by List (see also Accessors).
type Client[L any] interface {
List(context.Context, metav1.ListOptions) (L, error)
Watch(context.Context, metav1.ListOptions) (watch.Interface, error)
}
// Config is the miscellaneous configuration used by Watch
type Config struct {
// ObjectNameLogField determines the key given to the logger to use when describing the type
// being watched -- for example, "pod" or "virtualmachine"
//
// This can help with standardizing keys between the watcher and everything else using it.
ObjectNameLogField string
// Metrics will be used by the Watch call to report some information about its internal
// operations
//
// Refer to the Metrics and MetricsConfig types for more information.
Metrics MetricsConfig
// RetryRelistAfter gives a retry interval when a re-list fails. If left nil, then Watch will
// not retry: the watcher ends on the first failed re-list.
RetryRelistAfter *util.TimeRange
// RetryWatchAfter gives a retry interval when a non-initial watch fails. If left nil, then
// Watch will not retry: the watcher ends on the first failed re-watch.
RetryWatchAfter *util.TimeRange
}
// Accessors provides the "glue" functions for Watch to go from a list L (returned by the
// client's List) to the underlying slice of items []T
type Accessors[L any, T any] struct {
// Items extracts the items from a list. It is required: Watch panics if it is nil.
Items func(L) []T
}
// Object is implemented by pointers to T, where T is typically the resource that we're
// actually watching.
//
// Example implementers: *corev1.Pod, *corev1.Node
type Object[T any] interface {
// The pointer type itself (or any type whose underlying type is *T).
~*T
runtime.Object
metav1.ObjectMetaAccessor
}
// HandlerFuncs provides the set of callbacks to use for events from Watch
//
// Any callback left nil is replaced by a no-op in Watch.
type HandlerFuncs[P any] struct {
// AddFunc is called for each new object. preexisting is true for objects found by the initial
// listing, and false for objects added later (by a watch event or discovered by a re-list).
AddFunc func(obj P, preexisting bool)
// UpdateFunc is called with the previously stored object and its replacement.
UpdateFunc func(oldObj P, newObj P)
// DeleteFunc is called for each removed object. mayBeStale is true when the deletion was only
// discovered by a re-list (so obj is the last state that was stored), and false when it came
// from a watch event.
DeleteFunc func(obj P, mayBeStale bool)
}
// Index represents types that provide some kind of additional index on top of the base listing
//
// Indexing is functionally implemented in the same way that HandlerFuncs is, with the main
// difference being that more things are done for you with indexes. In particular, indexes can
// be added and removed after the Watch has already started, and the locking behavior is explicit.
type Index[T any] interface {
Add(obj *T)
Update(oldObj, newObj *T)
Delete(obj *T)
}
// InitMode dictates the behavior of Watch with respect to any initial calls to
// handlers.AddFunc before returning
//
// If set to InitModeSync, then AddFunc will be called while processing the initial listing,
// meaning that the returned Store is guaranteed to contain the state of the cluster (although it
// may update before any access).
//
// Otherwise, if set to InitModeDefer, then AddFunc will not be called until after Watch
// returns. Correspondingly, the Store will not update until then either.
type InitMode string
const (
// InitModeSync: handle the initial listing before Watch returns.
InitModeSync InitMode = "sync"
// InitModeDefer: handle the initial listing in the background, after Watch returns.
InitModeDefer InitMode = "defer"
)
// Watch starts a goroutine for watching events, using the provided HandlerFuncs as the
// callbacks for each type of event.
//
// The type C is the kubernetes client we use to get the objects, L representing a list of these,
// T representing the object type, and P as a pointer to T.
//
// Before returning, Watch makes an initial List request followed by an initial Watch request. If
// either of them fails -- or, with InitModeSync, if ctx is cancelled while AddFunc is being called
// for the initial items -- a nil Store is returned along with the error. Otherwise, all further
// processing happens in the background goroutine, which runs until ctx is done, the Store is
// stopped, or a re-list / re-watch fails while the corresponding retry range in config is nil.
//
// Watch panics if accessors.Items is nil. Handlers that are nil are replaced by no-ops.
func Watch[C Client[L], L metav1.ListMetaAccessor, T any, P Object[T]](
	ctx context.Context,
	logger *zap.Logger,
	client C,
	config Config,
	accessors Accessors[L, T],
	mode InitMode,
	listOpts metav1.ListOptions,
	handlers HandlerFuncs[P],
) (*Store[T], error) {
	if accessors.Items == nil {
		panic(errors.New("accessors.Items == nil"))
	}

	if handlers.AddFunc == nil {
		handlers.AddFunc = func(obj P, preexisting bool) {}
	}
	if handlers.UpdateFunc == nil {
		handlers.UpdateFunc = func(oldObj, newObj P) {}
	}
	if handlers.DeleteFunc == nil {
		handlers.DeleteFunc = func(obj P, mayBeStale bool) {}
	}

	// use a copy of the options for watching vs listing:
	// We want to avoid setting some values for the list requests - specifically, in order to
	// provide synchronization guarantees that the contents of the store are up-to-date strictly
	// *after* the start of an explicit Relist() request, we need to *not* set a resource version in
	// the request to get the most recent data.
	// For more, see: https://kubernetes.io/docs/reference/using-api/api-concepts/#resource-versions
	watchOpts := listOpts

	// Handling bookmarks means that sometimes the API server will be kind, allowing us to continue
	// the watch instead of resyncing.
	watchOpts.AllowWatchBookmarks = true

	// Perform an initial listing
	config.Metrics.startList()
	initialList, err := client.List(ctx, listOpts)
	config.Metrics.doneList(err)
	if err != nil {
		return nil, fmt.Errorf("Initial list failed: %w", err)
	}

	// set ResourceVersion so that the client.Watch request(s) show only the changes since we made
	// the initial list
	watchOpts.ResourceVersion = initialList.GetListMeta().GetResourceVersion()

	sendStop, stopSignal := util.NewSingleSignalPair[struct{}]()

	store := Store[T]{
		mutex:         sync.Mutex{},
		objects:       make(map[types.UID]*T),
		triggerRelist: make(chan struct{}, 1), // ensure sends are non-blocking
		relisted:      make(chan struct{}),
		nextIndexID:   0,
		indexes:       make(map[uint64]Index[T]),
		stopSignal:    sendStop,
		stopped:       atomic.Bool{},
		failing:       atomic.Bool{},
	}

	items := accessors.Items(initialList)

	var deferredAdds []T

	if mode == InitModeDefer {
		deferredAdds = items
	} else {
		for i := range items {
			obj := &items[i]
			uid := P(obj).GetObjectMeta().GetUID()
			store.objects[uid] = obj
			handlers.AddFunc(obj, true)

			// Check if the context has been cancelled. This can happen in practice if AddFunc may
			// take a long time to complete.
			if err := ctx.Err(); err != nil {
				return nil, err
			}
		}
	}
	items = nil // reset to allow GC

	// Start watching
	config.Metrics.startWatch()
	watcher, err := client.Watch(ctx, watchOpts)
	config.Metrics.doneWatch(err)
	if err != nil {
		return nil, fmt.Errorf("Initial watch failed: %w", err)
	}

	// Lock the store to pass it into the goroutine, so that we don't have to worry about immediate
	// operations on the store racing with any deferred additions.
	store.mutex.Lock()

	// With the successful Watch call underway, we hand off responsibility to a new goroutine.
	go func() {
		holdingInitialLock := true
		defer func() {
			if holdingInitialLock {
				store.mutex.Unlock()
			}
		}()

		// note: instead of deferring watcher.Stop() directly, wrapping it in an outer function
		// means that we'll always Stop the most recent watcher.
		defer func() {
			watcher.Stop()
		}()

		// explicitly stop on exit so that it's possible to know when the store is stopped
		defer store.Stop()

		config.Metrics.alive()
		defer config.Metrics.unalive()

		if len(deferredAdds) != 0 {
			logger.Info("Handling deferred adds")
		}

		// Handle any deferred calls to AddFunc
		// NB: This is only sound because we're still holding store.mutex; otherwise we'd have to
		// deal with possible racy operations (including adding an index).
		for i := range deferredAdds {
			obj := &deferredAdds[i]
			uid := P(obj).GetObjectMeta().GetUID()
			store.objects[uid] = obj
			handlers.AddFunc(obj, true)

			if err := ctx.Err(); err != nil {
				logger.Warn("Ending: because Context expired", zap.Error(ctx.Err()))
				return
			}
		}

		holdingInitialLock = false
		store.mutex.Unlock()

		defer config.Metrics.unfailing()

		logger.Info("All setup complete, entering event loop")

		for {
			// this is used exclusively for relisting, but must be defined up here so that our gotos
			// don't jump over variables.
			var signalRelistComplete []chan struct{}

			// Event loop: only left via return, or via goto to one of the labels below.
			for {
				select {
				case <-stopSignal.Recv():
					logger.Info("Ending: because we got a stop signal")
					return
				case <-ctx.Done():
					logger.Info("Ending: because Context expired", zap.Error(ctx.Err()))
					return
				case <-store.triggerRelist:
					config.Metrics.relistRequested()
					goto relist
				case event, ok := <-watcher.ResultChan():
					if !ok {
						logger.Info("Watcher ended gracefully, restarting")
						goto newWatcher
					}

					config.Metrics.recordEvent(event.Type)

					if event.Type == watch.Error {
						err := apierrors.FromObject(event.Object)
						// note: we can get 'too old resource version' errors when there's been a
						// lot of resource updates that our ListOptions filtered out.
						if apierrors.IsResourceExpired(err) {
							logger.Warn("Received error event", zap.Error(err))
						} else {
							logger.Error("Received error event", zap.Error(err))
						}
						goto relist
					}

					obj, ok := event.Object.(P)
					if !ok {
						var p P
						logger.Error(
							"Error casting event object to desired type",
							zap.String("eventType", string(event.Type)),
							zap.String("eventObjectType", fmt.Sprintf("%T", event.Object)),
							zap.String("desiredObjectType", fmt.Sprintf("%T", p)),
						)
						continue
					}

					meta := obj.GetObjectMeta()
					// Update ResourceVersion so subsequent calls to client.Watch won't include this
					// event, which we're currently processing.
					watchOpts.ResourceVersion = meta.GetResourceVersion()

					// Wrap the remainder in a function, so we can have deferred unlocks.
					uid := meta.GetUID()
					err := handleEvent(&store, handlers, event.Type, uid, obj)
					if err != nil {
						name := util.NamespacedName{Namespace: meta.GetNamespace(), Name: meta.GetName()}
						logger.Error(
							"failed to handle event",
							zap.Error(err),
							zap.String("UID", string(uid)),
							zap.Object(config.ObjectNameLogField, name),
						)
						goto relist
					}
				}
			}

		relist:
			// Every time we make a new request, we create a channel for it. That's because we need
			// to make sure that any user's call to Store.Relist() that happens *while* we're
			// actually making the request to K8s won't get overwritten by that request. Basically,
			// we need to make sure that relisting is only marked as complete if there was a request
			// that occurred *after* the call to Relist() returned.
			//
			// There's probably other ways we could do this - it's an area for possible improvement.
			//
			// Note: if we didn't do this at all, the alternative would be to ignore additional
			// relist requests, having them handled naturally as we get around to watching again.
			// This can amplify request failures - particularly if the K8s API server is overloaded.
			signalRelistComplete = make([]chan struct{}, 0, 1)

			// When we get to this point in the control flow, it's not guaranteed that the watcher
			// has stopped.
			//
			// As of 2023-12-05, the implementation of the API's watchers (internally handled by
			// k8s.io/apimachinery@.../pkg/watch/streamwatcher.go) explicitly allows multiple calls
			// to Stop().
			//
			// This all means that it's always safe for us to call Stop() here, and sometimes we
			// MUST call it here (to avoid leaking watchers after relisting), so it's worth just
			// always calling it.
			watcher.Stop()

			logger.Info("Relisting")
			for first := true; ; first = false {
				func() {
					store.mutex.Lock()
					defer store.mutex.Unlock()

					newRelistTriggered := false

					// Consume any additional relist request.
					// All usage of triggerRelist from within (*Store[T]).Relist() is asynchronous,
					// because triggerRelist has capacity=1 and has an item in it iff relisting has
					// been requested, so if Relist() *would* block on sending, the signal has
					// already been given.
					// That's all to say: Receiving only once from triggerRelist is sufficient.
					select {
					case <-store.triggerRelist:
						newRelistTriggered = true
						config.Metrics.relistRequested()
					default:
					}

					if first || newRelistTriggered {
						signalRelistComplete = append(signalRelistComplete, store.relisted)
						store.relisted = make(chan struct{})
					}
				}()

				config.Metrics.startList()
				relistList, err := client.List(ctx, listOpts) // don't include resource version, so it's guaranteed most recent
				config.Metrics.doneList(err)
				if err != nil {
					logger.Error("Relist failed", zap.Error(err))
					if config.RetryRelistAfter == nil {
						// The option checked here is RetryRelistAfter, so that is what the message
						// must name.
						logger.Info("Ending: because relist failed and RetryRelistAfter is nil")
						return
					}
					retryAfter := config.RetryRelistAfter.Random()
					logger.Info("Retrying relist after delay", zap.Duration("delay", retryAfter))

					store.failing.Store(true)
					config.Metrics.failing()

					select {
					case <-time.After(retryAfter):
						logger.Info("Relist delay reached, retrying", zap.Duration("delay", retryAfter))
						continue
					case <-ctx.Done():
						logger.Info("Ending: because Context expired", zap.Error(ctx.Err()))
						return
					case <-stopSignal.Recv():
						logger.Info("Ending: because we got a stop signal")
						return
					}
				}

				store.failing.Store(false)
				config.Metrics.unfailing()

				// err == nil, process relistList
				relistItems := accessors.Items(relistList)

				func() {
					store.mutex.Lock()
					defer store.mutex.Unlock()

					// Copy the current contents of objects, and start tracking which ones have
					// since been deleted.
					oldObjects := make(map[types.UID]*T)
					deleted := make(map[types.UID]struct{}) // set of UIDs that have been deleted
					for uid, obj := range store.objects {
						oldObjects[uid] = obj
						deleted[uid] = struct{}{} // initially mark everything as deleted, until we find it isn't
					}

					// Mark all items we still have as not deleted
					for i := range relistItems {
						uid := P(&relistItems[i]).GetObjectMeta().GetUID()
						delete(deleted, uid)
					}

					// Generate deletion events for all objects that are no longer present. We do
					// this first so that when there's externally-enforced uniqueness that isn't
					// unique *across time* (e.g. object names), users can still rely on uniqueness
					// at any time that handlers are called.
					for uid := range deleted {
						obj := store.objects[uid]
						delete(store.objects, uid)
						for _, index := range store.indexes {
							index.Delete(obj)
						}
						handlers.DeleteFunc(obj, true)
					}

					for i := range relistItems {
						obj := &relistItems[i]
						uid := P(obj).GetObjectMeta().GetUID()

						store.objects[uid] = obj
						oldObj, hasObj := oldObjects[uid]

						if hasObj {
							for _, index := range store.indexes {
								index.Update(oldObj, obj)
							}
							handlers.UpdateFunc(oldObj, obj)
						} else {
							for _, index := range store.indexes {
								index.Add(obj)
							}
							handlers.AddFunc(obj, false)
						}
					}
				}()

				// Update ResourceVersion, recreate watcher.
				watchOpts.ResourceVersion = relistList.GetListMeta().GetResourceVersion()
				logger.Info("Relist complete, restarting watcher")
				for _, ch := range signalRelistComplete {
					close(ch)
				}
				goto newWatcher
			}

		newWatcher:
			// In the loop, retry the API call to watch.
			//
			// It's possible that we attempt to watch with a resource version that's too old, in
			// which case the API call *does* succeed, but the first event is an error (which we use
			// to trigger relisting).
			for {
				config.Metrics.startWatch()
				watcher, err = client.Watch(ctx, watchOpts)
				config.Metrics.doneWatch(err)
				if err != nil {
					logger.Error("Re-watch failed", zap.Error(err))
					if config.RetryWatchAfter == nil {
						logger.Info("Ending: because re-watch failed and RetryWatchAfter is nil")
						return
					}
					retryAfter := config.RetryWatchAfter.Random()
					logger.Info("Retrying re-watch after delay", zap.Duration("delay", retryAfter))

					store.failing.Store(true)
					config.Metrics.failing()

					select {
					case <-time.After(retryAfter):
						logger.Info("Re-watch delay reached, retrying", zap.Duration("delay", retryAfter))
						continue
					case <-ctx.Done():
						logger.Info("Ending: because Context expired", zap.Error(ctx.Err()))
						return
					case <-stopSignal.Recv():
						logger.Info("Ending: because we got a stop signal")
						return
					}
				}

				// err == nil
				store.failing.Store(false)
				config.Metrics.unfailing()
				break
			}
		}
	}()

	return &store, nil
}
// handleEvent applies a single non-error watch event to the store, its indexes and the handlers,
// all while holding the store's lock. It is a helper for Watch; error events are expected to
// already have been handled by the caller.
//
// An error is returned when the event is inconsistent with the store's contents: an add for an
// object that is already present, or a delete / update for one that isn't. Bookmark events are
// accepted and change nothing. Error events and unknown event types cause a panic.
func handleEvent[T any, P ~*T](
	store *Store[T],
	handlers HandlerFuncs[P],
	eventType watch.EventType,
	uid types.UID,
	ptr P,
) error {
	incoming := (*T)(ptr)

	// Not every case below strictly needs the lock, but most events do, so it is simplest to
	// always take it.
	store.mutex.Lock()
	defer store.mutex.Unlock()

	switch eventType {
	case watch.Added:
		if _, exists := store.objects[uid]; exists {
			return errors.New("received add event for object we already have")
		}

		store.objects[uid] = incoming
		for _, idx := range store.indexes {
			idx.Add(incoming)
		}
		handlers.AddFunc(incoming, false)

	case watch.Deleted:
		// The event carries the object's state immediately before deletion, which *may* differ
		// from what is currently stored.
		previous, exists := store.objects[uid]
		if !exists {
			return errors.New("received delete event for object that's not present")
		}

		// So first report the change to that final state ...
		for _, idx := range store.indexes {
			idx.Update(previous, incoming)
		}
		handlers.UpdateFunc(previous, incoming)

		// ... and only then the deletion itself.
		delete(store.objects, uid)
		for _, idx := range store.indexes {
			idx.Delete(incoming)
		}
		handlers.DeleteFunc(incoming, false)

	case watch.Modified:
		previous, exists := store.objects[uid]
		if !exists {
			return errors.New("received update event for object that's not present")
		}

		store.objects[uid] = incoming
		for _, idx := range store.indexes {
			idx.Update(previous, incoming)
		}
		handlers.UpdateFunc(previous, incoming)

	case watch.Bookmark:
		// Nothing to do here: a bookmark only provides a new ResourceVersion, which is the
		// caller's business.

	case watch.Error:
		panic(errors.New("handleEvent unexpectedly called with eventType Error"))

	default:
		panic(errors.New("unknown watch event"))
	}

	return nil
}
// Store provides an interface for getting information about a list of Ts using the event
// listener from a previous call to Watch
type Store[T any] struct {
// objects holds the current set of objects, keyed by UID. Guarded by mutex.
objects map[types.UID]*T
// mutex guards objects, relisted, nextIndexID and indexes.
mutex sync.Mutex
// triggerRelist has capacity=1 and *if* the channel contains an item, then relisting has been
// requested by some call to (*Store[T]).Relist().
triggerRelist chan struct{}
// relisted is replaced and closed whenever relisting happens. Refer to its usage in Watch or
// (*Store[T]).Relist() for more detail.
relisted chan struct{}
// nextIndexID is the ID that will be given to the next index added with NewIndexedStore.
nextIndexID uint64
// indexes holds the currently registered indexes, keyed by their ID.
indexes map[uint64]Index[T]
// stopSignal is used by Stop to tell the watcher goroutine to end.
stopSignal util.SignalSender[struct{}]
// stopped is set by Stop, and reported by Stopped.
stopped atomic.Bool
// failing is set while the watcher is waiting to retry a failed re-list or re-watch, and
// reported by Failing.
failing atomic.Bool
}
// Relist triggers re-listing the Store, returning a channel that will be closed once the
// re-list is complete
func (w *Store[T]) Relist() <-chan struct{} {
	w.mutex.Lock()
	defer w.mutex.Unlock()

	select {
	case w.triggerRelist <- struct{}{}:
		// Request queued.
	default:
		// triggerRelist has capacity=1, so a send that would block means that a relist request is
		// already pending and has not been processed yet. That request serves us just as well.
	}

	// note: w.relisted is replaced immediately before every attempt at the API call for relisting,
	// so that there's a strict happens-before relation that guarantees that *when* w.relisted is
	// closed, the relevant List call *must* have happened after any attempted send on
	// w.triggerRelist.
	done := w.relisted
	return done
}
// Stop signals the watcher goroutine to end and marks the Store as stopped.
func (w *Store[T]) Stop() {
w.stopSignal.Send(struct{}{})
w.stopped.Store(true)
}
// Failing reports whether the watcher's most recent re-list or re-watch request failed and has
// not yet been followed by a successful one.
func (w *Store[T]) Failing() bool {
return w.failing.Load()
}
// Stopped reports whether Stop has been called on the Store, which includes the watcher goroutine
// having ended.
func (w *Store[T]) Stopped() bool {
return w.stopped.Load()
}
// Items returns a snapshot of the objects currently in the Store, in no particular order.
//
// The slice is newly allocated on each call, but its elements are the very pointers held by the
// Store.
func (w *Store[T]) Items() []*T {
	w.mutex.Lock()
	defer w.mutex.Unlock()

	out := make([]*T, 0, len(w.objects))
	for _, obj := range w.objects {
		out = append(out, obj)
	}
	return out
}
// NewIndexedStore creates a new IndexedWatchStore from the WatchStore and the index to use.
//
// Note: the index type is assumed to have reference semantics; i.e. any shallow copy of the value
// will affect any other shallow copy.
//
// For more information, refer to IndexedWatchStore.
func NewIndexedStore[T any, I Index[T]](store *Store[T], index I) IndexedStore[T, I] {
store.mutex.Lock()
defer store.mutex.Unlock()
for _, obj := range store.objects {
index.Add(obj)
}
id := store.nextIndexID
store.nextIndexID += 1
store.indexes[id] = index
collector := &struct{}{}
// when this IndexedWatchStore is GC'd, remove its index from the WatchStore. This should
// provide a reliable way of making sure that indexes always get cleaned up.
stdruntime.SetFinalizer(collector, func(_ any) {
// note: finalizers always run in a separate goroutine, so it's ok to lock here.
store.mutex.Lock()
defer store.mutex.Unlock()
delete(store.indexes, id)
})
return IndexedStore[T, I]{store, index, id, collector}
}
// IndexedStore represents a WatchStore, wrapped with a privileged WatchIndex that can be used
// to efficiently answer queries.
type IndexedStore[T any, I Index[T]] struct {
*Store[T]
index I
// id stores the id of this index in the WatchStore
id uint64
// collector has a destructor attached to it so that the index can be automatically removed from
// the WatchStore when it's no longer in use, without requiring users to manually get rid of it.
collector *struct{}
}
// WithIndex calls a function with the current state of the index, locking the Store around it.
//
// It is almost guaranteed to be an error to indirectly return the index with this function.
func (w IndexedStore[T, I]) WithIndex(f func(I)) {
w.Store.mutex.Lock()
defer w.Store.mutex.Unlock()
f(w.index)
}
// GetIndexed looks up a single object by calling f with the index while the Store is locked,
// returning whatever f returns.
func (w IndexedStore[T, I]) GetIndexed(f func(I) (*T, bool)) (obj *T, ok bool) {
	w.Store.mutex.Lock()
	defer w.Store.mutex.Unlock()

	return f(w.index)
}
// ListIndexed produces a list of objects by calling f with the index while the Store is locked,
// returning whatever f returns.
func (w IndexedStore[T, I]) ListIndexed(f func(I) []*T) (list []*T) {
	w.Store.mutex.Lock()
	defer w.Store.mutex.Unlock()

	return f(w.index)
}
// NewNameIndex creates an empty NameIndex.
//
// Panics if *T does not implement metav1.ObjectMetaAccessor, which the index needs in order to
// read the namespace and name of each object.
func NewNameIndex[T any]() *NameIndex[T] {
	if _, ok := any(new(T)).(metav1.ObjectMetaAccessor); !ok {
		panic("type *T must implement metav1.ObjectMetaAccessor")
	}

	// This doesn't *need* to be a pointer, but the intent is a little more clear this way.
	idx := new(NameIndex[T])
	idx.namespacedNames = make(map[util.NamespacedName]*T)
	return idx
}
// NameIndex is a WatchIndex that provides efficient lookup for a value with a particular
// namespace and name.
type NameIndex[T any] struct {
// namespacedNames holds the indexed objects, keyed by their namespace and name.
namespacedNames map[util.NamespacedName]*T
}
// keyForObj builds the namespace/name key for obj from its object metadata.
//
// note: requires that *T implements metav1.ObjectMetaAccessor; the unchecked type assertion
// panics otherwise.
func keyForObj[T any](obj *T) util.NamespacedName {
	accessor := any(obj).(metav1.ObjectMetaAccessor)
	meta := accessor.GetObjectMeta()
	return util.NamespacedName{
		Namespace: meta.GetNamespace(),
		Name:      meta.GetName(),
	}
}
// Add stores obj under its namespace/name key, overwriting any entry already stored there.
func (i *NameIndex[T]) Add(obj *T) {
	key := keyForObj(obj)
	i.namespacedNames[key] = obj
}
// Update replaces oldObj with newObj: the entry keyed by oldObj is dropped first, and newObj is
// then stored under its own key.
func (i *NameIndex[T]) Update(oldObj, newObj *T) {
	delete(i.namespacedNames, keyForObj(oldObj))
	i.namespacedNames[keyForObj(newObj)] = newObj
}
// Delete removes whatever entry is stored under obj's namespace/name key, if any.
func (i *NameIndex[T]) Delete(obj *T) {
	key := keyForObj(obj)
	delete(i.namespacedNames, key)
}
// Get returns the object stored under the given namespace and name. ok reports whether such an
// entry exists.
func (i *NameIndex[T]) Get(namespace string, name string) (obj *T, ok bool) {
	key := util.NamespacedName{Namespace: namespace, Name: name}
	found, present := i.namespacedNames[key]
	return found, present
}
// NewFlatNameIndex returns an empty FlatNameIndex.
//
// It panics if *T does not implement metav1.ObjectMetaAccessor, because the index derives its
// keys from each object's metadata.
func NewFlatNameIndex[T any]() *FlatNameIndex[T] {
	// Verify up front, using a pointer to a zero value, that *T implements
	// metav1.ObjectMetaAccessor.
	if _, implements := any(new(T)).(metav1.ObjectMetaAccessor); !implements {
		panic("type *T must implement metav1.ObjectMetaAccessor")
	}

	idx := new(FlatNameIndex[T])
	idx.names = make(map[string]*T)
	return idx
}
// FlatNameIndex is an index that looks values up by name alone; the namespace is not part of
// the key.
type FlatNameIndex[T any] struct {
// names holds the indexed objects, keyed by their name only.
names map[string]*T
}
// getName returns the name recorded in obj's object metadata.
//
// note: requires that *T implements metav1.ObjectMetaAccessor; the unchecked type assertion
// panics otherwise.
func getName[T any](obj *T) string {
	accessor := any(obj).(metav1.ObjectMetaAccessor)
	return accessor.GetObjectMeta().GetName()
}
// Add stores obj under its name, overwriting any entry already stored under that name.
func (i *FlatNameIndex[T]) Add(obj *T) {
	name := getName(obj)
	i.names[name] = obj
}
// Update replaces oldObj with newObj: the entry keyed by oldObj's name is dropped first, and
// newObj is then stored under its own name.
func (i *FlatNameIndex[T]) Update(oldObj, newObj *T) {
	delete(i.names, getName(oldObj))
	i.names[getName(newObj)] = newObj
}
// Delete removes whatever entry is stored under obj's name, if any.
func (i *FlatNameIndex[T]) Delete(obj *T) {
	name := getName(obj)
	delete(i.names, name)
}
// Get returns the object stored under name. ok reports whether such an entry exists.
func (i *FlatNameIndex[T]) Get(name string) (obj *T, ok bool) {
	found, present := i.names[name]
	return found, present
}
package xact
// Xact is a single in-memory transaction. It keeps a working copy of a value next to a pointer
// to the original, so that calculations can be separated from their application.
type Xact[T any] struct {
	// tmp is the working copy that callers modify through Value.
	tmp T
	// base is the pointer the transaction was created from; Commit writes tmp back through it.
	base *T
}

// New creates a transaction (an Xact) over the value that ptr points to.
//
// NOTE: The copy taken here is shallow -- if T contains pointers, changes made to the values
// behind those pointers are NOT held back until (*Xact[T]).Commit().
func New[T any](ptr *T) *Xact[T] {
	x := new(Xact[T])
	x.base = ptr
	x.tmp = *ptr
	return x
}

// Value returns a pointer to the transaction's working copy.
//
// The working copy may be modified freely; the original value is left untouched until the
// transaction is committed with Commit().
func (x *Xact[T]) Value() *T {
	working := &x.tmp
	return working
}

// Commit writes the working copy back through the pointer that the Xact was created with.
//
// Committing does not consume the transaction, so it can be committed multiple times if reusing
// it is useful.
func (x *Xact[T]) Commit() {
	target, pending := x.base, &x.tmp
	*target = *pending
}