/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package v1 contains API Schema definitions for the vm v1 API group
// +kubebuilder:object:generate=true
// +groupName=vm.neon.tech
package v1
import (
"sigs.k8s.io/controller-runtime/pkg/scheme"
"k8s.io/apimachinery/pkg/runtime/schema"
)
var (
// SchemeGroupVersion is the group version used to register these objects
SchemeGroupVersion = schema.GroupVersion{Group: "vm.neon.tech", Version: "v1"}
// SchemeBuilder is used to add go types to the GroupVersionKind scheme
SchemeBuilder = &scheme.Builder{GroupVersion: SchemeGroupVersion}
// AddToScheme adds the types in this group-version to the given scheme.
AddToScheme = SchemeBuilder.AddToScheme
)
// Resource takes an unqualified resource and returns a Group qualified GroupResource
func Resource(resource string) schema.GroupResource {
return SchemeGroupVersion.WithResource(resource).GroupResource()
}
package v1
import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// IPPoolSpec defines the desired state of IPPool
type IPPoolSpec struct {
// Range is an RFC 4632/4291-style string that represents an IP address and prefix length in CIDR notation
Range string `json:"range"`
// Allocations is the set of allocated IPs for the given range. Its indices are a direct mapping to the
// IP with the same index/offset for the pool's range.
Allocations map[string]IPAllocation `json:"allocations"`
}
// IPAllocation represents metadata about the pod/container owner of a specific IP
// copied from the Whereabouts CNI plugin, as we use their allocation functions
type IPAllocation struct {
ContainerID string `json:"id"`
PodRef string `json:"podref,omitempty"`
}
//+genclient
//+kubebuilder:object:root=true
//+kubebuilder:resource:singular=ippool
// IPPool is the Schema for the ippools API
type IPPool struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Spec IPPoolSpec `json:"spec,omitempty"`
}
// +kubebuilder:object:root=true
// IPPoolList contains a list of IPPool
type IPPoolList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []IPPool `json:"items"`
}
func init() {
SchemeBuilder.Register(&IPPool{}, &IPPoolList{}) //nolint:exhaustruct // just being used to provide the types
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
"encoding/json"
"errors"
"fmt"
"slices"
"time"
"github.com/samber/lo"
"go.uber.org/zap/zapcore"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
const (
// VirtualMachineNameLabel is the label assigned to each NeonVM Pod, providing the name of the
// VirtualMachine object for the VM running in it
//
// This label can be used both to find which VM is running in a Pod (by getting the value of the
// label) or to find which Pod a VM is running in (by searching for Pods with the label equal to
// the VM's name).
VirtualMachineNameLabel string = "vm.neon.tech/name"
// RunnerPodVersionLabel is the label assigned to each runner Pod, recording the version of the
// runner. It may be missing on older runners.
RunnerPodVersionLabel string = "vm.neon.tech/runner-version"
// VirtualMachineUsageAnnotation is the annotation added to each runner Pod, mirroring
// information about the resource usage of the VM running in the pod.
//
// The value of this annotation is always a JSON-encoded VirtualMachineUsage object.
VirtualMachineUsageAnnotation string = "vm.neon.tech/usage"
// VirtualMachineResourcesAnnotation is the annotation added to each runner Pod, mirroring
// information about the resource allocations of the VM running in the pod.
//
// The value of this annotation is always a JSON-encoded VirtualMachineResources object.
VirtualMachineResourcesAnnotation string = "vm.neon.tech/resources"
)
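// A minimal sketch of using VirtualMachineNameLabel to find the runner Pod(s) for a VM with a
// controller-runtime client, assuming client refers to sigs.k8s.io/controller-runtime/pkg/client;
// kClient, ctx, and vm are assumed to exist in the caller:
//
//	var pods corev1.PodList
//	err := kClient.List(ctx, &pods,
//		client.InNamespace(vm.Namespace),
//		client.MatchingLabels{VirtualMachineNameLabel: vm.Name},
//	)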
// VirtualMachineUsage provides information about a VM's current usage. This is the type of the
// JSON-encoded data in the VirtualMachineUsageAnnotation attached to each runner pod.
type VirtualMachineUsage struct {
CPU *resource.Quantity `json:"cpu"`
Memory *resource.Quantity `json:"memory"`
}
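// A minimal sketch of reading the usage annotation back from a runner Pod; pod is an assumed
// *corev1.Pod and error handling is elided:
//
//	var usage VirtualMachineUsage
//	if raw, ok := pod.Annotations[VirtualMachineUsageAnnotation]; ok {
//		_ = json.Unmarshal([]byte(raw), &usage)
//	}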
// VirtualMachineResources provides information about a VM's resource allocations.
type VirtualMachineResources struct {
CPUs CPUs `json:"cpus"`
MemorySlots MemorySlots `json:"memorySlots"`
MemorySlotSize resource.Quantity `json:"memorySlotSize"`
}
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
// VirtualMachineSpec defines the desired state of VirtualMachine
type VirtualMachineSpec struct {
// +kubebuilder:validation:Minimum=1
// +kubebuilder:validation:Maximum=65535
// +kubebuilder:default:=20183
// +optional
QMP int32 `json:"qmp,omitempty"`
// +kubebuilder:validation:Minimum=1
// +kubebuilder:validation:Maximum=65535
// +kubebuilder:default:=20184
// +optional
QMPManual int32 `json:"qmpManual,omitempty"`
// +kubebuilder:validation:Minimum=1
// +kubebuilder:validation:Maximum=65535
// +kubebuilder:default:=25183
// +optional
RunnerPort int32 `json:"runnerPort,omitempty"`
// +kubebuilder:default:=5
// +optional
TerminationGracePeriodSeconds *int64 `json:"terminationGracePeriodSeconds"`
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
Affinity *corev1.Affinity `json:"affinity,omitempty"`
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
SchedulerName string `json:"schedulerName,omitempty"`
ServiceAccountName string `json:"serviceAccountName,omitempty"`
PodResources corev1.ResourceRequirements `json:"podResources,omitempty"`
// +kubebuilder:default:=Always
// +optional
RestartPolicy RestartPolicy `json:"restartPolicy"`
ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets,omitempty"`
Guest Guest `json:"guest"`
// Running init containers is costly, so the InitScript field should be preferred over ExtraInitContainers
ExtraInitContainers []corev1.Container `json:"extraInitContainers,omitempty"`
// InitScript will be executed in the main container before the VM is started.
// +optional
InitScript string `json:"initScript,omitempty"`
// List of disks that can be mounted by the virtual machine.
// +optional
Disks []Disk `json:"disks,omitempty"`
// Extra network interface attached to a network provided by the Multus CNI.
// +optional
ExtraNetwork *ExtraNetwork `json:"extraNetwork,omitempty"`
// +optional
ServiceLinks *bool `json:"service_links,omitempty"`
// Use KVM acceleration
// +kubebuilder:default:=true
// +optional
EnableAcceleration *bool `json:"enableAcceleration,omitempty"`
// Override for normal neonvm-runner image
// +optional
RunnerImage *string `json:"runnerImage,omitempty"`
// Enable SSH on the VM. This only works if the VM image was built using a VM Builder version
// that has SSH support (TODO: mention VM Builder version).
// +kubebuilder:default:=true
// +optional
EnableSSH *bool `json:"enableSSH,omitempty"`
// TargetRevision is the identifier set by external party to track when changes to the spec
// propagate to the VM.
//
// If a certain value is written into Spec.TargetRevision together with the changes, and
// the same value is observed in Status.CurrentRevision, it means that the changes were
// propagated to the VM.
// +optional
TargetRevision *RevisionWithTime `json:"targetRevision,omitempty"`
}
func (spec *VirtualMachineSpec) Resources() VirtualMachineResources {
return VirtualMachineResources{
CPUs: spec.Guest.CPUs,
MemorySlots: spec.Guest.MemorySlots,
MemorySlotSize: spec.Guest.MemorySlotSize,
}
}
// +kubebuilder:validation:Enum=Always;OnFailure;Never
type RestartPolicy string
const (
RestartPolicyAlways RestartPolicy = "Always"
RestartPolicyOnFailure RestartPolicy = "OnFailure"
RestartPolicyNever RestartPolicy = "Never"
)
type Guest struct {
// +optional
KernelImage *string `json:"kernelImage,omitempty"`
// +optional
AppendKernelCmdline *string `json:"appendKernelCmdline,omitempty"`
// +optional
CPUs CPUs `json:"cpus"`
// +optional
// +kubebuilder:default:="1Gi"
MemorySlotSize resource.Quantity `json:"memorySlotSize"`
// +optional
MemorySlots MemorySlots `json:"memorySlots"`
// +optional
MemoryProvider *MemoryProvider `json:"memoryProvider,omitempty"`
// +optional
RootDisk RootDisk `json:"rootDisk"`
// Docker image Entrypoint array replacement.
// +optional
Command []string `json:"command,omitempty"`
// Arguments to the entrypoint.
// The docker image's cmd is used if this is not provided.
// +optional
Args []string `json:"args,omitempty"`
// List of environment variables to set in the vmstart process.
// +optional
Env []EnvVar `json:"env,omitempty" patchStrategy:"merge" patchMergeKey:"name"`
// List of ports to expose from the container.
// Cannot be updated.
// +optional
Ports []Port `json:"ports,omitempty"`
// Additional settings for the VM.
// Cannot be updated.
// +optional
Settings *GuestSettings `json:"settings,omitempty"`
}
const virtioMemBlockSizeBytes = 8 * 1024 * 1024 // 8 MiB
// ValidateForMemoryProvider returns an error iff the guest memory settings are invalid for the
// MemoryProvider.
//
// This is used in two places. First, to validate VirtualMachine object creation. Second, to handle
// the defaulting behavior for VirtualMachines that would be switching from DIMMSlots to VirtioMem
// on restart. We place more restrictions on VirtioMem because we use 8MiB block sizes, so changing
// to a new default can only happen if the memory slot size is a multiple of 8MiB.
func (g Guest) ValidateForMemoryProvider(p MemoryProvider) error {
if p == MemoryProviderVirtioMem {
if g.MemorySlotSize.Value()%virtioMemBlockSizeBytes != 0 {
return fmt.Errorf("memorySlotSize invalid for memoryProvider VirtioMem: must be a multiple of 8Mi")
}
}
return nil
}
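// For example (values chosen for illustration): a 1Gi slot size is a multiple of 8Mi and so is
// valid for VirtioMem, while a 4Mi slot size is not:
//
//	g := Guest{MemorySlotSize: resource.MustParse("1Gi")}
//	_ = g.ValidateForMemoryProvider(MemoryProviderVirtioMem) // nil
//	g = Guest{MemorySlotSize: resource.MustParse("4Mi")}
//	_ = g.ValidateForMemoryProvider(MemoryProviderVirtioMem) // non-nil error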
// Flag is a bitmask of flags. The meaning is up to the user.
//
// Used in Revision below.
type Flag uint64
func (f *Flag) Set(flag Flag) {
*f |= flag
}
func (f *Flag) Clear(flag Flag) {
*f &= ^flag
}
func (f *Flag) Has(flag Flag) bool {
return *f&flag != 0
}
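// A minimal illustration of Flag as a bitmask; the flag constants here are hypothetical and not
// defined by this package:
//
//	const (
//		flagA Flag = 1 << iota
//		flagB
//	)
//
//	var f Flag
//	f.Set(flagA)
//	_ = f.Has(flagA) // true
//	f.Clear(flagA)
//	_ = f.Has(flagA) // false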
// Revision is an identifier, which can be assigned to a specific configuration of a VM.
// Later it can be used to track the application of the configuration.
type Revision struct {
Value int64 `json:"value"`
Flags Flag `json:"flags"`
}
// ZeroRevision is the default value when revision updates are disabled.
var ZeroRevision = Revision{Value: 0, Flags: 0}
func (r Revision) Min(other Revision) Revision {
if r.Value < other.Value {
return r
}
return other
}
func (r Revision) WithTime(t time.Time) RevisionWithTime {
return RevisionWithTime{
Revision: r,
UpdatedAt: metav1.NewTime(t),
}
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that Revision can be used with zap.Object
func (r *Revision) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddInt64("value", r.Value)
enc.AddUint64("flags", uint64(r.Flags))
return nil
}
// RevisionWithTime contains a Revision and the time it was last updated.
type RevisionWithTime struct {
Revision `json:"revision"`
UpdatedAt metav1.Time `json:"updatedAt"`
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that RevisionWithTime can be used with zap.Object
func (r *RevisionWithTime) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddTime("updatedAt", r.UpdatedAt.Time)
return r.Revision.MarshalLogObject(enc)
}
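// A minimal sketch of stamping a revision before writing it into Spec.TargetRevision; the value 23
// and the use of time.Now are illustrative:
//
//	rev := Revision{Value: 23, Flags: 0}
//	vm.Spec.TargetRevision = lo.ToPtr(rev.WithTime(time.Now()))
//	// Once vm.Status.CurrentRevision reports the same Value, the change has propagated to the VM.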
type GuestSettings struct {
// Individual lines to add to a sysctl.conf file. See sysctl.conf(5) for more details.
// +optional
Sysctl []string `json:"sysctl,omitempty"`
// Swap adds a swap disk with the provided size.
//
// If Swap is provided, SwapInfo MUST NOT be provided, and vice versa.
//
// +optional
Swap *resource.Quantity `json:"swap,omitempty"`
// SwapInfo controls settings for adding a swap disk to the VM.
//
// SwapInfo is a temporary newer version of the Swap field.
//
// Eventually, after all VMs have moved from Swap to SwapInfo, we can change the type of the Swap
// field to SwapInfo, move VMs from SwapInfo back to Swap, and then remove SwapInfo.
//
// More information here: https://neondb.slack.com/archives/C06SW383C79/p1713298689471319
//
// +optional
SwapInfo *SwapInfo `json:"swapInfo,omitempty"`
}
func (s *GuestSettings) WithoutSwapFields() *GuestSettings {
return &GuestSettings{
Sysctl: s.Sysctl,
Swap: nil,
SwapInfo: nil,
}
}
// GetSwapInfo returns information about the swap requested, if there is any.
//
// This is an abstraction over the Swap/SwapInfo fields, providing a unified internal interface.
//
// GetSwapInfo returns an error if both Swap and SwapInfo are provided. Typically the Kubernetes API
// guarantees that is not the case.
func (s *GuestSettings) GetSwapInfo() (*SwapInfo, error) {
if s.Swap != nil && s.SwapInfo != nil {
return nil, errors.New("cannot have both 'swap' and 'swapInfo' enabled")
}
if s.Swap != nil {
return &SwapInfo{
Size: *s.Swap,
SkipSwapon: nil,
}, nil
} else if s.SwapInfo != nil {
return lo.ToPtr(*s.SwapInfo), nil
}
return nil, nil
}
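// For example, both of these settings resolve to the same swap configuration (size 1Gi, swapon not
// skipped); the values here are illustrative:
//
//	a := GuestSettings{Swap: lo.ToPtr(resource.MustParse("1Gi"))}
//	b := GuestSettings{SwapInfo: &SwapInfo{Size: resource.MustParse("1Gi")}}
//	infoA, _ := a.GetSwapInfo() // &SwapInfo{Size: 1Gi, SkipSwapon: nil}
//	infoB, _ := b.GetSwapInfo() // &SwapInfo{Size: 1Gi, SkipSwapon: nil}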
type SwapInfo struct {
// Size sets the size of the swap in the VM. The amount of space used on the host may be
// slightly more (by a few MiBs). The information reported by `cat /proc/meminfo` may show
// slightly less, due to a single page header (typically 4KiB).
Size resource.Quantity `json:"size"`
// SkipSwapon instructs the VM to *not* run swapon for the swap on startup.
//
// This is intended to be used in cases where you will *always* resize the swap post-startup,
// and don't need it available before that resizing.
//
// +optional
SkipSwapon *bool `json:"skipSwapon,omitempty"`
}
type CPUs struct {
Min MilliCPU `json:"min"`
Max MilliCPU `json:"max"`
Use MilliCPU `json:"use"`
}
// MilliCPU is a special type to represent vCPUs * 1000,
// e.g. 2 vCPUs is 2000 and 0.25 vCPUs is 250
//
// +kubebuilder:validation:XIntOrString
// +kubebuilder:validation:Pattern=^[0-9]+((\.[0-9]*)?|m)
type MilliCPU uint32 // note: pattern is more restrictive than resource.Quantity, because we're just using it for CPU
// RoundedUp returns the smallest integer number of CPUs greater than or equal to the effective
// value of m.
func (m MilliCPU) RoundedUp() uint32 {
r := uint32(m) / 1000
if m%1000 != 0 {
r += 1
}
return r
}
// MilliCPUFromResourceQuantity converts resource.Quantity into MilliCPU
func MilliCPUFromResourceQuantity(r resource.Quantity) MilliCPU {
return MilliCPU(r.MilliValue())
}
// ToResourceQuantity converts a MilliCPU to a resource.Quantity.
// This is useful for formatting/serialization.
func (m MilliCPU) ToResourceQuantity() *resource.Quantity {
return resource.NewMilliQuantity(int64(m), resource.BinarySI)
}
// AsFloat64 converts the MilliCPU value into a float64 of CPU
//
// This should be preferred over calling m.ToResourceQuantity().AsApproximateFloat64(), because
// going through the resource.Quantity can produce less accurate floats.
func (m MilliCPU) AsFloat64() float64 {
return float64(m) / 1000
}
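// Illustrative conversions, with values chosen for the example:
//
//	m := MilliCPUFromResourceQuantity(resource.MustParse("250m")) // MilliCPU(250)
//	_ = m.RoundedUp()  // 1
//	_ = m.AsFloat64()  // 0.25
//	_ = MilliCPU(2000).ToResourceQuantity() // quantity equal to 2 full CPUs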
// UnmarshalJSON implements json.Unmarshaler. It is used to parse scheduler config and for
// communication between components; we use resource.Quantity as the underlying transport format for MilliCPU.
func (m *MilliCPU) UnmarshalJSON(data []byte) error {
var quantity resource.Quantity
err := json.Unmarshal(data, &quantity)
if err != nil {
return err
}
*m = MilliCPUFromResourceQuantity(quantity)
return nil
}
func (m MilliCPU) MarshalJSON() ([]byte, error) {
// Marshal as an integer if we can, for backwards-compatibility with components that wouldn't be
// expecting a string here.
if m%1000 == 0 {
return json.Marshal(uint32(m / 1000))
}
return json.Marshal(m.ToResourceQuantity())
}
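// Illustrative JSON round-trips under this scheme:
//
//	b, _ := json.Marshal(MilliCPU(2000)) // b == []byte("2"): whole CPUs marshal as integers
//	b, _ = json.Marshal(MilliCPU(250))   // b == []byte(`"250m"`): fractional CPUs marshal as quantity strings
//	var m MilliCPU
//	_ = json.Unmarshal([]byte(`"1500m"`), &m) // m == MilliCPU(1500)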
func (m MilliCPU) Format(state fmt.State, verb rune) {
switch {
case verb == 'v' && state.Flag('#'):
//nolint:errcheck // can't do anything about the write error
state.Write([]byte(fmt.Sprintf("%v", uint32(m))))
default:
//nolint:errcheck // can't do anything about the write error
state.Write([]byte(fmt.Sprintf("%v", m.AsFloat64())))
}
}
type MemorySlots struct {
// +kubebuilder:validation:Minimum=1
// +kubebuilder:validation:Maximum=128
// +kubebuilder:validation:ExclusiveMaximum=false
Min int32 `json:"min"`
// +kubebuilder:validation:Minimum=1
// +kubebuilder:validation:Maximum=128
// +kubebuilder:validation:ExclusiveMaximum=false
Max int32 `json:"max"`
// +kubebuilder:validation:Minimum=1
// +kubebuilder:validation:Maximum=128
// +kubebuilder:validation:ExclusiveMaximum=false
Use int32 `json:"use"`
}
// +kubebuilder:validation:Enum=DIMMSlots;VirtioMem
type MemoryProvider string
const (
MemoryProviderDIMMSlots MemoryProvider = "DIMMSlots"
MemoryProviderVirtioMem MemoryProvider = "VirtioMem"
)
// FlagFunc is a parsing function to be used with flag.Func
func (p *MemoryProvider) FlagFunc(value string) error {
possibleValues := []string{
string(MemoryProviderDIMMSlots),
string(MemoryProviderVirtioMem),
}
if !slices.Contains(possibleValues, value) {
return fmt.Errorf("Unknown MemoryProvider %q, must be one of %v", value, possibleValues)
}
*p = MemoryProvider(value)
return nil
}
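// A minimal sketch of wiring FlagFunc into the standard library's flag package; the flag name and
// usage string are illustrative:
//
//	var defaultProvider MemoryProvider
//	flag.Func("default-memory-provider", "Default memory provider for new VMs", defaultProvider.FlagFunc)
//	flag.Parse()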
type RootDisk struct {
Image string `json:"image"`
// +optional
Size resource.Quantity `json:"size,omitempty"`
// +optional
// +kubebuilder:default:="IfNotPresent"
ImagePullPolicy corev1.PullPolicy `json:"imagePullPolicy"`
// +optional
Execute []string `json:"execute,omitempty"`
}
type EnvVar struct {
// Name of the environment variable. Must be a C_IDENTIFIER.
Name string `json:"name"`
// +optional
// +kubebuilder:default:=""
Value string `json:"value,omitempty"`
}
type Port struct {
// If specified, this must be an IANA_SVC_NAME and unique within the pod. Each
// named port in a pod must have a unique name. Name for the port that can be
// referred to by services.
Name string `json:"name,omitempty"`
// Number of port to expose on the pod's IP address.
// This must be a valid port number, 0 < x < 65536.
// +kubebuilder:validation:Minimum=1
// +kubebuilder:validation:Maximum=65535
Port int `json:"port"`
// Protocol for port. Must be UDP or TCP.
// Defaults to "TCP".
// +kubebuilder:default:=TCP
Protocol Protocol `json:"protocol,omitempty"`
}
type Protocol string
const (
// ProtocolTCP is the TCP protocol.
ProtocolTCP Protocol = "TCP"
// ProtocolUDP is the UDP protocol.
ProtocolUDP Protocol = "UDP"
)
type Disk struct {
// Disk's name.
// Must be a DNS_LABEL and unique within the virtual machine.
Name string `json:"name"`
// Mounted read-only if true, read-write otherwise (false or unspecified).
// Defaults to false.
// +optional
// +kubebuilder:default:=false
ReadOnly *bool `json:"readOnly,omitempty"`
// Path within the virtual machine at which the disk should be mounted. Must
// not contain ':'.
MountPath string `json:"mountPath"`
// DiskSource represents the location and type of the mounted disk.
DiskSource `json:",inline"`
}
type DiskSource struct {
// EmptyDisk represents a temporary empty qcow2 disk that shares a vm's lifetime.
EmptyDisk *EmptyDiskSource `json:"emptyDisk,omitempty"`
// configMap represents a configMap that should populate this disk
// +optional
ConfigMap *corev1.ConfigMapVolumeSource `json:"configMap,omitempty"`
// Secret represents a secret that should populate this disk.
// +optional
Secret *corev1.SecretVolumeSource `json:"secret,omitempty"`
// TmpfsDisk represents a tmpfs.
// +optional
Tmpfs *TmpfsDiskSource `json:"tmpfs,omitempty"`
}
type EmptyDiskSource struct {
Size resource.Quantity `json:"size"`
// Discard enables the "discard" mount option for the filesystem
Discard bool `json:"discard,omitempty"`
}
type TmpfsDiskSource struct {
Size resource.Quantity `json:"size"`
}
type ExtraNetwork struct {
// Enable extra network interface
// +kubebuilder:default:=false
// +optional
Enable bool `json:"enable"`
// Interface name.
// +kubebuilder:default:=net1
// +optional
Interface string `json:"interface"`
// Multus network name, as specified in the NetworkAttachmentDefinition.
// +optional
MultusNetwork string `json:"multusNetwork,omitempty"`
}
// VirtualMachineStatus defines the observed state of VirtualMachine
type VirtualMachineStatus struct {
// Represents the observations of a VirtualMachine's current state.
// Known VirtualMachine.status.conditions.type values are: "Available", "Progressing", and "Degraded".
// VirtualMachine.status.conditions.status is one of True, False, or Unknown.
// VirtualMachine.status.conditions.reason should be a CamelCase string; producers of specific
// condition types may define expected values and meanings for this field, and whether the values
// are considered a guaranteed API.
// VirtualMachine.status.conditions.message is a human-readable message indicating details about the transition.
// For further information see: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties
Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"`
// The phase of a VM is a simple, high-level summary of where the VM is in its lifecycle.
// +optional
Phase VmPhase `json:"phase,omitempty"`
// Number of times the VM runner pod has been recreated
// +optional
RestartCount int32 `json:"restartCount"`
// +optional
PodName string `json:"podName,omitempty"`
// +optional
PodIP string `json:"podIP,omitempty"`
// +optional
ExtraNetIP string `json:"extraNetIP,omitempty"`
// +optional
ExtraNetMask string `json:"extraNetMask,omitempty"`
// +optional
Node string `json:"node,omitempty"`
// +optional
CPUs *MilliCPU `json:"cpus,omitempty"`
// +optional
MemorySize *resource.Quantity `json:"memorySize,omitempty"`
// +optional
MemoryProvider *MemoryProvider `json:"memoryProvider,omitempty"`
// +optional
SSHSecretName string `json:"sshSecretName,omitempty"`
// CurrentRevision is updated with Spec.TargetRevision's value once
// the changes are propagated to the VM.
// +optional
CurrentRevision *RevisionWithTime `json:"currentRevision,omitempty"`
}
type VmPhase string
const (
// VmPending means the VM has been accepted by the system, but the vm-runner pod
// has not been started. This includes time before being bound to a node, as well as time spent
// pulling images onto the host.
VmPending VmPhase = "Pending"
// VmRunning means the vm-runner pod has been bound to a node and has been started.
VmRunning VmPhase = "Running"
// VmSucceeded means that all containers in the vm-runner pod have voluntarily terminated
// with a container exit code of 0, and the system is not going to restart any of these containers.
VmSucceeded VmPhase = "Succeeded"
// VmFailed means that all containers in the vm-runner pod have terminated, and at least one container has
// terminated in a failure (exited with a non-zero exit code or was stopped by the system).
VmFailed VmPhase = "Failed"
// VmPreMigrating means that the VM is being prepared for migration
VmPreMigrating VmPhase = "PreMigrating"
// VmMigrating means that the VM is being migrated to another node
VmMigrating VmPhase = "Migrating"
// VmScaling means that devices are being plugged into or unplugged from the VM
VmScaling VmPhase = "Scaling"
)
// IsAlive returns whether the guest in the VM is expected to be running
func (p VmPhase) IsAlive() bool {
switch p {
case VmRunning, VmPreMigrating, VmMigrating, VmScaling:
return true
default:
return false
}
}
//+genclient
//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:resource:singular=neonvm
// VirtualMachine is the Schema for the virtualmachines API
// +kubebuilder:printcolumn:name="Cpus",type=string,JSONPath=`.status.cpus`
// +kubebuilder:printcolumn:name="Memory",type=string,JSONPath=`.status.memorySize`
// +kubebuilder:printcolumn:name="Pod",type=string,JSONPath=`.status.podName`
// +kubebuilder:printcolumn:name="ExtraIP",type=string,JSONPath=`.status.extraNetIP`
// +kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.phase`
// +kubebuilder:printcolumn:name="Restarts",type=string,JSONPath=`.status.restarts`
// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
// +kubebuilder:printcolumn:name="Node",type=string,priority=1,JSONPath=`.status.node`
// +kubebuilder:printcolumn:name="Image",type=string,priority=1,JSONPath=`.spec.guest.rootDisk.image`
type VirtualMachine struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Spec VirtualMachineSpec `json:"spec,omitempty"`
Status VirtualMachineStatus `json:"status,omitempty"`
}
func (vm *VirtualMachine) Cleanup() {
vm.Status.PodName = ""
vm.Status.PodIP = ""
vm.Status.Node = ""
vm.Status.CPUs = nil
vm.Status.MemorySize = nil
vm.Status.MemoryProvider = nil
}
func (vm *VirtualMachine) HasRestarted() bool {
return vm.Status.RestartCount > 0
}
//+kubebuilder:object:root=true
// VirtualMachineList contains a list of VirtualMachine
type VirtualMachineList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []VirtualMachine `json:"items"`
}
func init() {
SchemeBuilder.Register(&VirtualMachine{}, &VirtualMachineList{}) //nolint:exhaustruct // just being used to provide the types
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
"errors"
"fmt"
"reflect"
"slices"
"sigs.k8s.io/controller-runtime/pkg/webhook"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
"k8s.io/apimachinery/pkg/runtime"
)
//+kubebuilder:webhook:path=/mutate-vm-neon-tech-v1-virtualmachine,mutating=true,failurePolicy=fail,sideEffects=None,groups=vm.neon.tech,resources=virtualmachines,verbs=create;update,versions=v1,name=mvirtualmachine.kb.io,admissionReviewVersions=v1
var _ webhook.Defaulter = &VirtualMachine{}
// Default implements webhook.Defaulter
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachine) Default() {
// Nothing to do.
}
//+kubebuilder:webhook:path=/validate-vm-neon-tech-v1-virtualmachine,mutating=false,failurePolicy=fail,sideEffects=None,groups=vm.neon.tech,resources=virtualmachines,verbs=create;update,versions=v1,name=vvirtualmachine.kb.io,admissionReviewVersions=v1
var _ webhook.Validator = &VirtualMachine{}
// ValidateCreate implements webhook.Validator
//
// The controller wraps this logic so it can inject extra control.
func (r *VirtualMachine) ValidateCreate() (admission.Warnings, error) {
// validate .spec.guest.cpus.use and .spec.guest.cpus.max
if r.Spec.Guest.CPUs.Use < r.Spec.Guest.CPUs.Min {
return nil, fmt.Errorf(".spec.guest.cpus.use (%v) should be greater than or equal to the .spec.guest.cpus.min (%v)",
r.Spec.Guest.CPUs.Use,
r.Spec.Guest.CPUs.Min)
}
if r.Spec.Guest.CPUs.Use > r.Spec.Guest.CPUs.Max {
return nil, fmt.Errorf(".spec.guest.cpus.use (%v) should be less than or equal to the .spec.guest.cpus.max (%v)",
r.Spec.Guest.CPUs.Use,
r.Spec.Guest.CPUs.Max)
}
// validate .spec.guest.memorySlotSize w.r.t. .spec.guest.memoryProvider
if r.Spec.Guest.MemoryProvider != nil {
if err := r.Spec.Guest.ValidateForMemoryProvider(*r.Spec.Guest.MemoryProvider); err != nil {
return nil, fmt.Errorf(".spec.guest: %w", err)
}
}
// validate .spec.guest.memorySlots.use and .spec.guest.memorySlots.max
if r.Spec.Guest.MemorySlots.Use < r.Spec.Guest.MemorySlots.Min {
return nil, fmt.Errorf(".spec.guest.memorySlots.use (%d) should be greater than or equal to the .spec.guest.memorySlots.min (%d)",
r.Spec.Guest.MemorySlots.Use,
r.Spec.Guest.MemorySlots.Min)
}
if r.Spec.Guest.MemorySlots.Use > r.Spec.Guest.MemorySlots.Max {
return nil, fmt.Errorf(".spec.guest.memorySlots.use (%d) should be less than or equal to the .spec.guest.memorySlots.max (%d)",
r.Spec.Guest.MemorySlots.Use,
r.Spec.Guest.MemorySlots.Max)
}
// validate .spec.disk names
reservedDiskNames := []string{
"virtualmachineimages",
"rootdisk",
"runtime",
"swapdisk",
"sysfscgroup",
"containerdsock",
"ssh-privatekey",
"ssh-publickey",
"ssh-authorized-keys",
}
for _, disk := range r.Spec.Disks {
if slices.Contains(reservedDiskNames, disk.Name) {
return nil, fmt.Errorf("'%s' is reserved for .spec.disks[].name", disk.Name)
}
if len(disk.Name) > 32 {
return nil, fmt.Errorf("disk name '%s' too long, should be less than or equal to 32", disk.Name)
}
}
// validate .spec.guest.ports[].name
for _, port := range r.Spec.Guest.Ports {
if len(port.Name) != 0 && port.Name == "qmp" {
return nil, errors.New("'qmp' is reserved name for .spec.guest.ports[].name")
}
}
// validate that at most one type of swap is provided:
if settings := r.Spec.Guest.Settings; settings != nil {
if settings.Swap != nil && settings.SwapInfo != nil {
return nil, errors.New("cannot have both 'swap' and 'swapInfo' enabled")
}
}
return nil, nil
}
// ValidateUpdate implements webhook.Validator
//
// The controller wraps this logic so it can inject extra control.
func (r *VirtualMachine) ValidateUpdate(old runtime.Object) (admission.Warnings, error) {
// process immutable fields
before, ok := old.(*VirtualMachine)
if !ok {
return nil, fmt.Errorf("expected a VirtualMachine but got a %T", old)
}
immutableFields := []struct {
fieldName string
getter func(*VirtualMachine) any
}{
{".spec.guest.cpus.min", func(v *VirtualMachine) any { return v.Spec.Guest.CPUs.Min }},
{".spec.guest.cpus.max", func(v *VirtualMachine) any { return v.Spec.Guest.CPUs.Max }},
{".spec.guest.memorySlots.min", func(v *VirtualMachine) any { return v.Spec.Guest.MemorySlots.Min }},
{".spec.guest.memorySlots.max", func(v *VirtualMachine) any { return v.Spec.Guest.MemorySlots.Max }},
// nb: we don't check memoryProvider here, so that it's allowed to be mutable as a way of
// getting flexibility to solidify the memory provider or change it across restarts.
// ref https://github.com/neondatabase/autoscaling/pull/970#discussion_r1644225986
{".spec.guest.ports", func(v *VirtualMachine) any { return v.Spec.Guest.Ports }},
{".spec.guest.rootDisk", func(v *VirtualMachine) any { return v.Spec.Guest.RootDisk }},
{".spec.guest.command", func(v *VirtualMachine) any { return v.Spec.Guest.Command }},
{".spec.guest.args", func(v *VirtualMachine) any { return v.Spec.Guest.Args }},
{".spec.guest.env", func(v *VirtualMachine) any { return v.Spec.Guest.Env }},
{".spec.guest.settings", func(v *VirtualMachine) any {
if v.Spec.Guest.Settings == nil {
//nolint:gocritic // linter complains that we could say 'nil' directly. It's typed vs untyped nil.
return v.Spec.Guest.Settings
} else {
// Selectively allow swap fields to change between Swap and SwapInfo. More below.
return v.Spec.Guest.Settings.WithoutSwapFields()
}
}},
{".spec.disks", func(v *VirtualMachine) any { return v.Spec.Disks }},
{".spec.podResources", func(v *VirtualMachine) any { return v.Spec.PodResources }},
{".spec.enableAcceleration", func(v *VirtualMachine) any { return v.Spec.EnableAcceleration }},
{".spec.enableSSH", func(v *VirtualMachine) any { return v.Spec.EnableSSH }},
{".spec.initScript", func(v *VirtualMachine) any { return v.Spec.InitScript }},
}
for _, info := range immutableFields {
if !reflect.DeepEqual(info.getter(r), info.getter(before)) {
return nil, fmt.Errorf("%s is immutable", info.fieldName)
}
}
// validate swap changes by comparing the SwapInfo for each.
//
// If there's an error with the old object, but NOT an error with the new one, we'll allow the
// new one to proceed. This is to prevent any VirtualMachine objects getting stuck during
// rollout of swap changes, in case there's logic bugs in handling the change.
//
// If we didn't have that exception, we could *in theory* end up with an object in a bad state,
// but be unable to fix it because the old state is bad - even if the new one is ok - because
// the webhook would return an error for the old state being invalid, which would disallow the update.
if r.Spec.Guest.Settings != nil /* from above, if new GuestSettings != nil, then old is as well */ {
newSwapInfo, err := r.Spec.Guest.Settings.GetSwapInfo()
if err != nil {
return nil, err
}
oldSwapInfo, err := before.Spec.Guest.Settings.GetSwapInfo()
if err != nil {
// do nothing; we'll allow fixing broken objects.
} else {
if !reflect.DeepEqual(newSwapInfo, oldSwapInfo) {
return nil, errors.New(".spec.guest.settings.{swap,swapInfo} is immutable")
}
}
}
// validate .spec.guest.cpus.use
if r.Spec.Guest.CPUs.Use < r.Spec.Guest.CPUs.Min {
return nil, fmt.Errorf(".cpus.use (%v) should be greater than or equal to the .cpus.min (%v)",
r.Spec.Guest.CPUs.Use,
r.Spec.Guest.CPUs.Min)
}
if r.Spec.Guest.CPUs.Use > r.Spec.Guest.CPUs.Max {
return nil, fmt.Errorf(".cpus.use (%v) should be less than or equal to the .cpus.max (%v)",
r.Spec.Guest.CPUs.Use,
r.Spec.Guest.CPUs.Max)
}
// validate .spec.guest.memorySlots.use
if r.Spec.Guest.MemorySlots.Use < r.Spec.Guest.MemorySlots.Min {
return nil, fmt.Errorf(".memorySlots.use (%d) should be greater than or equal to the .memorySlots.min (%d)",
r.Spec.Guest.MemorySlots.Use,
r.Spec.Guest.MemorySlots.Min)
}
if r.Spec.Guest.MemorySlots.Use > r.Spec.Guest.MemorySlots.Max {
return nil, fmt.Errorf(".memorySlots.use (%d) should be less than or equal to the .memorySlots.max (%d)",
r.Spec.Guest.MemorySlots.Use,
r.Spec.Guest.MemorySlots.Max)
}
return nil, nil
}
// ValidateDelete implements webhook.Validator
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachine) ValidateDelete() (admission.Warnings, error) {
// No deletion validation required currently.
return nil, nil
}
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
const MigrationPort int32 = 20187
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
// VirtualMachineMigrationSpec defines the desired state of VirtualMachineMigration
type VirtualMachineMigrationSpec struct {
VmName string `json:"vmName"`
// TODO: not implemented
// +optional
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
// TODO: not implemented
// +optional
NodeAffinity *corev1.NodeAffinity `json:"nodeAffinity,omitempty"`
// +optional
// +kubebuilder:default:=true
PreventMigrationToSameHost bool `json:"preventMigrationToSameHost"`
// TODO: not implemented
// CompletionTimeout is the timeout for the migration to complete, in seconds. Defaults to 1 hour (3600).
// +optional
// +kubebuilder:default:=3600
CompletionTimeout int32 `json:"completionTimeout"`
// Incremental triggers incremental disk copy for the migration (the default); otherwise a full disk copy is used.
// +optional
// +kubebuilder:default:=true
Incremental bool `json:"incremental"`
// AllowPostCopy allows post-copy migration (disabled by default)
// +optional
// +kubebuilder:default:=false
AllowPostCopy bool `json:"allowPostCopy"`
// Use auto-converge by default
// +optional
// +kubebuilder:default:=true
AutoConverge bool `json:"autoConverge"`
// MaxBandwidth limits the migration bandwidth; defaults to 1Gi per second.
// +optional
// +kubebuilder:default:="1Gi"
MaxBandwidth resource.Quantity `json:"maxBandwidth"`
}
// VirtualMachineMigrationStatus defines the observed state of VirtualMachineMigration
type VirtualMachineMigrationStatus struct {
// Represents the observations of a VirtualMachineMigration's current state.
// Known VirtualMachineMigration.status.conditions.type values are: "Available", "Progressing", and "Degraded".
// VirtualMachineMigration.status.conditions.status is one of True, False, or Unknown.
// VirtualMachineMigration.status.conditions.reason should be a CamelCase string; producers of specific
// condition types may define expected values and meanings for this field, and whether the values
// are considered a guaranteed API.
// VirtualMachineMigration.status.conditions.message is a human-readable message indicating details about the transition.
// For further information see: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties
Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type" protobuf:"bytes,1,rep,name=conditions"`
// The phase of a VM migration is a simple, high-level summary of where the migration is in its lifecycle.
// +optional
Phase VmmPhase `json:"phase,omitempty"`
// +optional
SourcePodName string `json:"sourcePodName,omitempty"`
// +optional
TargetPodName string `json:"targetPodName,omitempty"`
// +optional
SourcePodIP string `json:"sourcePodIP,omitempty"`
// +optional
TargetPodIP string `json:"targetPodIP,omitempty"`
// +optional
SourceNode string `json:"sourceNode,omitempty"`
// +optional
TargetNode string `json:"targetNode,omitempty"`
// +optional
Info MigrationInfo `json:"info,omitempty"`
}
type MigrationInfo struct {
// +optional
Status string `json:"status,omitempty"`
// +optional
TotalTimeMs int64 `json:"totalTimeMs,omitempty"`
// +optional
SetupTimeMs int64 `json:"setupTimeMs,omitempty"`
// +optional
DowntimeMs int64 `json:"downtimeMs,omitempty"`
// +optional
Ram MigrationInfoRam `json:"ram,omitempty"`
// +optional
Compression MigrationInfoCompression `json:"compression,omitempty"`
}
type MigrationInfoRam struct {
// +optional
Transferred int64 `json:"transferred,omitempty"`
// +optional
Remaining int64 `json:"remaining,omitempty"`
// +optional
Total int64 `json:"total,omitempty"`
}
type MigrationInfoCompression struct {
// +optional
CompressedSize int64 `json:"compressedSize,omitempty"`
// +optional
CompressionRate int64 `json:"compressionRate,omitempty"`
}
type VmmPhase string
const (
// VmmPending means the migration has been accepted by the system, but the target vm-runner pod
// has not been started. This includes time before being bound to a node, as well as time spent
// pulling images onto the host.
VmmPending VmmPhase = "Pending"
// VmmRunning means the target vm-runner pod has been bound to a node and has been started.
VmmRunning VmmPhase = "Running"
// VmmSucceeded means that the migration finished successfully
VmmSucceeded VmmPhase = "Succeeded"
// VmmFailed means that the migration failed
VmmFailed VmmPhase = "Failed"
)
//+genclient
//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:resource:singular=neonvmm
// VirtualMachineMigration is the Schema for the virtualmachinemigrations API
// +kubebuilder:printcolumn:name="VM",type=string,JSONPath=`.spec.vmName`
// +kubebuilder:printcolumn:name="Source",type=string,JSONPath=`.status.sourcePodName`
// +kubebuilder:printcolumn:name="SourceIP",type=string,priority=1,JSONPath=`.status.sourcePodIP`
// +kubebuilder:printcolumn:name="Target",type=string,JSONPath=`.status.targetPodName`
// +kubebuilder:printcolumn:name="TargetIP",type=string,priority=1,JSONPath=`.status.targetPodIP`
// +kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.phase`
// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
type VirtualMachineMigration struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Spec VirtualMachineMigrationSpec `json:"spec,omitempty"`
Status VirtualMachineMigrationStatus `json:"status,omitempty"`
}
//+kubebuilder:object:root=true
// VirtualMachineMigrationList contains a list of VirtualMachineMigration
type VirtualMachineMigrationList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []VirtualMachineMigration `json:"items"`
}
func init() {
SchemeBuilder.Register(&VirtualMachineMigration{}, &VirtualMachineMigrationList{}) //nolint:exhaustruct // just being used to provide the types
}
/*
Copyright 2023.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
"sigs.k8s.io/controller-runtime/pkg/webhook"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
"k8s.io/apimachinery/pkg/runtime"
)
//+kubebuilder:webhook:path=/mutate-vm-neon-tech-v1-virtualmachinemigration,mutating=true,failurePolicy=fail,sideEffects=None,groups=vm.neon.tech,resources=virtualmachinemigrations,verbs=create;update,versions=v1,name=mvirtualmachinemigration.kb.io,admissionReviewVersions=v1
var _ webhook.Defaulter = &VirtualMachineMigration{}
// Default implements webhook.Defaulter
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachineMigration) Default() {
// TODO: implement defaults
}
//+kubebuilder:webhook:path=/validate-vm-neon-tech-v1-virtualmachinemigration,mutating=false,failurePolicy=fail,sideEffects=None,groups=vm.neon.tech,resources=virtualmachinemigrations,verbs=create;update,versions=v1,name=vvirtualmachinemigration.kb.io,admissionReviewVersions=v1
var _ webhook.Validator = &VirtualMachineMigration{}
// ValidateCreate implements webhook.Validator
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachineMigration) ValidateCreate() (admission.Warnings, error) {
// TODO: implement creation validation webhook (?)
return nil, nil
}
// ValidateUpdate implements webhook.Validator
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachineMigration) ValidateUpdate(old runtime.Object) (admission.Warnings, error) {
// TODO: implement update validation webhook
return nil, nil
}
// ValidateDelete implements webhook.Validator
//
// The controller wraps this logic so it can inject extra control in the webhook.
func (r *VirtualMachineMigration) ValidateDelete() (admission.Warnings, error) {
// TODO: implement deletion validation webhook (?)
return nil, nil
}
//go:build !ignore_autogenerated
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by controller-gen. DO NOT EDIT.
package v1
import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
)
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *CPUs) DeepCopyInto(out *CPUs) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CPUs.
func (in *CPUs) DeepCopy() *CPUs {
if in == nil {
return nil
}
out := new(CPUs)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Disk) DeepCopyInto(out *Disk) {
*out = *in
if in.ReadOnly != nil {
in, out := &in.ReadOnly, &out.ReadOnly
*out = new(bool)
**out = **in
}
in.DiskSource.DeepCopyInto(&out.DiskSource)
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Disk.
func (in *Disk) DeepCopy() *Disk {
if in == nil {
return nil
}
out := new(Disk)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *DiskSource) DeepCopyInto(out *DiskSource) {
*out = *in
if in.EmptyDisk != nil {
in, out := &in.EmptyDisk, &out.EmptyDisk
*out = new(EmptyDiskSource)
(*in).DeepCopyInto(*out)
}
if in.ConfigMap != nil {
in, out := &in.ConfigMap, &out.ConfigMap
*out = new(corev1.ConfigMapVolumeSource)
(*in).DeepCopyInto(*out)
}
if in.Secret != nil {
in, out := &in.Secret, &out.Secret
*out = new(corev1.SecretVolumeSource)
(*in).DeepCopyInto(*out)
}
if in.Tmpfs != nil {
in, out := &in.Tmpfs, &out.Tmpfs
*out = new(TmpfsDiskSource)
(*in).DeepCopyInto(*out)
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DiskSource.
func (in *DiskSource) DeepCopy() *DiskSource {
if in == nil {
return nil
}
out := new(DiskSource)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *EmptyDiskSource) DeepCopyInto(out *EmptyDiskSource) {
*out = *in
out.Size = in.Size.DeepCopy()
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EmptyDiskSource.
func (in *EmptyDiskSource) DeepCopy() *EmptyDiskSource {
if in == nil {
return nil
}
out := new(EmptyDiskSource)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *EnvVar) DeepCopyInto(out *EnvVar) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvVar.
func (in *EnvVar) DeepCopy() *EnvVar {
if in == nil {
return nil
}
out := new(EnvVar)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ExtraNetwork) DeepCopyInto(out *ExtraNetwork) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtraNetwork.
func (in *ExtraNetwork) DeepCopy() *ExtraNetwork {
if in == nil {
return nil
}
out := new(ExtraNetwork)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Guest) DeepCopyInto(out *Guest) {
*out = *in
if in.KernelImage != nil {
in, out := &in.KernelImage, &out.KernelImage
*out = new(string)
**out = **in
}
if in.AppendKernelCmdline != nil {
in, out := &in.AppendKernelCmdline, &out.AppendKernelCmdline
*out = new(string)
**out = **in
}
out.CPUs = in.CPUs
out.MemorySlotSize = in.MemorySlotSize.DeepCopy()
out.MemorySlots = in.MemorySlots
if in.MemoryProvider != nil {
in, out := &in.MemoryProvider, &out.MemoryProvider
*out = new(MemoryProvider)
**out = **in
}
in.RootDisk.DeepCopyInto(&out.RootDisk)
if in.Command != nil {
in, out := &in.Command, &out.Command
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.Args != nil {
in, out := &in.Args, &out.Args
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.Env != nil {
in, out := &in.Env, &out.Env
*out = make([]EnvVar, len(*in))
copy(*out, *in)
}
if in.Ports != nil {
in, out := &in.Ports, &out.Ports
*out = make([]Port, len(*in))
copy(*out, *in)
}
if in.Settings != nil {
in, out := &in.Settings, &out.Settings
*out = new(GuestSettings)
(*in).DeepCopyInto(*out)
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Guest.
func (in *Guest) DeepCopy() *Guest {
if in == nil {
return nil
}
out := new(Guest)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *GuestSettings) DeepCopyInto(out *GuestSettings) {
*out = *in
if in.Sysctl != nil {
in, out := &in.Sysctl, &out.Sysctl
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.Swap != nil {
in, out := &in.Swap, &out.Swap
x := (*in).DeepCopy()
*out = &x
}
if in.SwapInfo != nil {
in, out := &in.SwapInfo, &out.SwapInfo
*out = new(SwapInfo)
(*in).DeepCopyInto(*out)
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GuestSettings.
func (in *GuestSettings) DeepCopy() *GuestSettings {
if in == nil {
return nil
}
out := new(GuestSettings)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *IPAllocation) DeepCopyInto(out *IPAllocation) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new IPAllocation.
func (in *IPAllocation) DeepCopy() *IPAllocation {
if in == nil {
return nil
}
out := new(IPAllocation)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *IPPool) DeepCopyInto(out *IPPool) {
*out = *in
out.TypeMeta = in.TypeMeta
in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
in.Spec.DeepCopyInto(&out.Spec)
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new IPPool.
func (in *IPPool) DeepCopy() *IPPool {
if in == nil {
return nil
}
out := new(IPPool)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *IPPool) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *IPPoolList) DeepCopyInto(out *IPPoolList) {
*out = *in
out.TypeMeta = in.TypeMeta
in.ListMeta.DeepCopyInto(&out.ListMeta)
if in.Items != nil {
in, out := &in.Items, &out.Items
*out = make([]IPPool, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new IPPoolList.
func (in *IPPoolList) DeepCopy() *IPPoolList {
if in == nil {
return nil
}
out := new(IPPoolList)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *IPPoolList) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *IPPoolSpec) DeepCopyInto(out *IPPoolSpec) {
*out = *in
if in.Allocations != nil {
in, out := &in.Allocations, &out.Allocations
*out = make(map[string]IPAllocation, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new IPPoolSpec.
func (in *IPPoolSpec) DeepCopy() *IPPoolSpec {
if in == nil {
return nil
}
out := new(IPPoolSpec)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *MemorySlots) DeepCopyInto(out *MemorySlots) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MemorySlots.
func (in *MemorySlots) DeepCopy() *MemorySlots {
if in == nil {
return nil
}
out := new(MemorySlots)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *MigrationInfo) DeepCopyInto(out *MigrationInfo) {
*out = *in
out.Ram = in.Ram
out.Compression = in.Compression
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MigrationInfo.
func (in *MigrationInfo) DeepCopy() *MigrationInfo {
if in == nil {
return nil
}
out := new(MigrationInfo)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *MigrationInfoCompression) DeepCopyInto(out *MigrationInfoCompression) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MigrationInfoCompression.
func (in *MigrationInfoCompression) DeepCopy() *MigrationInfoCompression {
if in == nil {
return nil
}
out := new(MigrationInfoCompression)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *MigrationInfoRam) DeepCopyInto(out *MigrationInfoRam) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MigrationInfoRam.
func (in *MigrationInfoRam) DeepCopy() *MigrationInfoRam {
if in == nil {
return nil
}
out := new(MigrationInfoRam)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Port) DeepCopyInto(out *Port) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Port.
func (in *Port) DeepCopy() *Port {
if in == nil {
return nil
}
out := new(Port)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Revision) DeepCopyInto(out *Revision) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Revision.
func (in *Revision) DeepCopy() *Revision {
if in == nil {
return nil
}
out := new(Revision)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *RevisionWithTime) DeepCopyInto(out *RevisionWithTime) {
*out = *in
out.Revision = in.Revision
in.UpdatedAt.DeepCopyInto(&out.UpdatedAt)
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RevisionWithTime.
func (in *RevisionWithTime) DeepCopy() *RevisionWithTime {
if in == nil {
return nil
}
out := new(RevisionWithTime)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *RootDisk) DeepCopyInto(out *RootDisk) {
*out = *in
out.Size = in.Size.DeepCopy()
if in.Execute != nil {
in, out := &in.Execute, &out.Execute
*out = make([]string, len(*in))
copy(*out, *in)
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RootDisk.
func (in *RootDisk) DeepCopy() *RootDisk {
if in == nil {
return nil
}
out := new(RootDisk)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *SwapInfo) DeepCopyInto(out *SwapInfo) {
*out = *in
out.Size = in.Size.DeepCopy()
if in.SkipSwapon != nil {
in, out := &in.SkipSwapon, &out.SkipSwapon
*out = new(bool)
**out = **in
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SwapInfo.
func (in *SwapInfo) DeepCopy() *SwapInfo {
if in == nil {
return nil
}
out := new(SwapInfo)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *TmpfsDiskSource) DeepCopyInto(out *TmpfsDiskSource) {
*out = *in
out.Size = in.Size.DeepCopy()
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TmpfsDiskSource.
func (in *TmpfsDiskSource) DeepCopy() *TmpfsDiskSource {
if in == nil {
return nil
}
out := new(TmpfsDiskSource)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachine) DeepCopyInto(out *VirtualMachine) {
*out = *in
out.TypeMeta = in.TypeMeta
in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
in.Spec.DeepCopyInto(&out.Spec)
in.Status.DeepCopyInto(&out.Status)
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachine.
func (in *VirtualMachine) DeepCopy() *VirtualMachine {
if in == nil {
return nil
}
out := new(VirtualMachine)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *VirtualMachine) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineList) DeepCopyInto(out *VirtualMachineList) {
*out = *in
out.TypeMeta = in.TypeMeta
in.ListMeta.DeepCopyInto(&out.ListMeta)
if in.Items != nil {
in, out := &in.Items, &out.Items
*out = make([]VirtualMachine, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineList.
func (in *VirtualMachineList) DeepCopy() *VirtualMachineList {
if in == nil {
return nil
}
out := new(VirtualMachineList)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *VirtualMachineList) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineMigration) DeepCopyInto(out *VirtualMachineMigration) {
*out = *in
out.TypeMeta = in.TypeMeta
in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
in.Spec.DeepCopyInto(&out.Spec)
in.Status.DeepCopyInto(&out.Status)
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineMigration.
func (in *VirtualMachineMigration) DeepCopy() *VirtualMachineMigration {
if in == nil {
return nil
}
out := new(VirtualMachineMigration)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *VirtualMachineMigration) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineMigrationList) DeepCopyInto(out *VirtualMachineMigrationList) {
*out = *in
out.TypeMeta = in.TypeMeta
in.ListMeta.DeepCopyInto(&out.ListMeta)
if in.Items != nil {
in, out := &in.Items, &out.Items
*out = make([]VirtualMachineMigration, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineMigrationList.
func (in *VirtualMachineMigrationList) DeepCopy() *VirtualMachineMigrationList {
if in == nil {
return nil
}
out := new(VirtualMachineMigrationList)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *VirtualMachineMigrationList) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineMigrationSpec) DeepCopyInto(out *VirtualMachineMigrationSpec) {
*out = *in
if in.NodeSelector != nil {
in, out := &in.NodeSelector, &out.NodeSelector
*out = make(map[string]string, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
if in.NodeAffinity != nil {
in, out := &in.NodeAffinity, &out.NodeAffinity
*out = new(corev1.NodeAffinity)
(*in).DeepCopyInto(*out)
}
out.MaxBandwidth = in.MaxBandwidth.DeepCopy()
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineMigrationSpec.
func (in *VirtualMachineMigrationSpec) DeepCopy() *VirtualMachineMigrationSpec {
if in == nil {
return nil
}
out := new(VirtualMachineMigrationSpec)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineMigrationStatus) DeepCopyInto(out *VirtualMachineMigrationStatus) {
*out = *in
if in.Conditions != nil {
in, out := &in.Conditions, &out.Conditions
*out = make([]metav1.Condition, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
out.Info = in.Info
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineMigrationStatus.
func (in *VirtualMachineMigrationStatus) DeepCopy() *VirtualMachineMigrationStatus {
if in == nil {
return nil
}
out := new(VirtualMachineMigrationStatus)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineResources) DeepCopyInto(out *VirtualMachineResources) {
*out = *in
out.CPUs = in.CPUs
out.MemorySlots = in.MemorySlots
out.MemorySlotSize = in.MemorySlotSize.DeepCopy()
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineResources.
func (in *VirtualMachineResources) DeepCopy() *VirtualMachineResources {
if in == nil {
return nil
}
out := new(VirtualMachineResources)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineSpec) DeepCopyInto(out *VirtualMachineSpec) {
*out = *in
if in.TerminationGracePeriodSeconds != nil {
in, out := &in.TerminationGracePeriodSeconds, &out.TerminationGracePeriodSeconds
*out = new(int64)
**out = **in
}
if in.NodeSelector != nil {
in, out := &in.NodeSelector, &out.NodeSelector
*out = make(map[string]string, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
if in.Affinity != nil {
in, out := &in.Affinity, &out.Affinity
*out = new(corev1.Affinity)
(*in).DeepCopyInto(*out)
}
if in.Tolerations != nil {
in, out := &in.Tolerations, &out.Tolerations
*out = make([]corev1.Toleration, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
in.PodResources.DeepCopyInto(&out.PodResources)
if in.ImagePullSecrets != nil {
in, out := &in.ImagePullSecrets, &out.ImagePullSecrets
*out = make([]corev1.LocalObjectReference, len(*in))
copy(*out, *in)
}
in.Guest.DeepCopyInto(&out.Guest)
if in.ExtraInitContainers != nil {
in, out := &in.ExtraInitContainers, &out.ExtraInitContainers
*out = make([]corev1.Container, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
if in.Disks != nil {
in, out := &in.Disks, &out.Disks
*out = make([]Disk, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
if in.ExtraNetwork != nil {
in, out := &in.ExtraNetwork, &out.ExtraNetwork
*out = new(ExtraNetwork)
**out = **in
}
if in.ServiceLinks != nil {
in, out := &in.ServiceLinks, &out.ServiceLinks
*out = new(bool)
**out = **in
}
if in.EnableAcceleration != nil {
in, out := &in.EnableAcceleration, &out.EnableAcceleration
*out = new(bool)
**out = **in
}
if in.RunnerImage != nil {
in, out := &in.RunnerImage, &out.RunnerImage
*out = new(string)
**out = **in
}
if in.EnableSSH != nil {
in, out := &in.EnableSSH, &out.EnableSSH
*out = new(bool)
**out = **in
}
if in.TargetRevision != nil {
in, out := &in.TargetRevision, &out.TargetRevision
*out = new(RevisionWithTime)
(*in).DeepCopyInto(*out)
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineSpec.
func (in *VirtualMachineSpec) DeepCopy() *VirtualMachineSpec {
if in == nil {
return nil
}
out := new(VirtualMachineSpec)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineStatus) DeepCopyInto(out *VirtualMachineStatus) {
*out = *in
if in.Conditions != nil {
in, out := &in.Conditions, &out.Conditions
*out = make([]metav1.Condition, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
if in.CPUs != nil {
in, out := &in.CPUs, &out.CPUs
*out = new(MilliCPU)
**out = **in
}
if in.MemorySize != nil {
in, out := &in.MemorySize, &out.MemorySize
x := (*in).DeepCopy()
*out = &x
}
if in.MemoryProvider != nil {
in, out := &in.MemoryProvider, &out.MemoryProvider
*out = new(MemoryProvider)
**out = **in
}
if in.CurrentRevision != nil {
in, out := &in.CurrentRevision, &out.CurrentRevision
*out = new(RevisionWithTime)
(*in).DeepCopyInto(*out)
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineStatus.
func (in *VirtualMachineStatus) DeepCopy() *VirtualMachineStatus {
if in == nil {
return nil
}
out := new(VirtualMachineStatus)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VirtualMachineUsage) DeepCopyInto(out *VirtualMachineUsage) {
*out = *in
if in.CPU != nil {
in, out := &in.CPU, &out.CPU
x := (*in).DeepCopy()
*out = &x
}
if in.Memory != nil {
in, out := &in.Memory, &out.Memory
x := (*in).DeepCopy()
*out = &x
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineUsage.
func (in *VirtualMachineUsage) DeepCopy() *VirtualMachineUsage {
if in == nil {
return nil
}
out := new(VirtualMachineUsage)
in.DeepCopyInto(out)
return out
}
package controllers
import (
"context"
"fmt"
"runtime/debug"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)
type catchPanicReconciler struct {
inner reconcile.Reconciler
}
func withCatchPanic(r reconcile.Reconciler) reconcile.Reconciler {
return &catchPanicReconciler{inner: r}
}
func (r *catchPanicReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) {
log := log.FromContext(ctx)
defer func() {
if v := recover(); v != nil {
err = fmt.Errorf("panicked with: %v", v)
log.Error(err, "Reconcile panicked", "stack", string(debug.Stack()))
}
}()
result, err = r.inner.Reconcile(ctx, req)
return
}
package controllers
import (
"time"
"k8s.io/apimachinery/pkg/types"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
// ReconcilerConfig stores shared configuration for VirtualMachineReconciler and
// VirtualMachineMigrationReconciler.
type ReconcilerConfig struct {
// IsK3s is true iff the cluster is running k3s nodes.
//
// This is required because - unlike most other common Kubernetes distributions - k3s
// changes the location of the containerd socket.
// There unfortunately does not appear to be a way to disable this behavior.
IsK3s bool
// UseContainerMgr, if true, enables using container-mgr for new VM runner pods.
//
// This is defined as a config option so we can do a gradual rollout of this change.
UseContainerMgr bool
// DisableRunnerCgroup, if true, disables running QEMU in a cgroup in new VM runner pods.
// Fractional CPU scaling will continue to *pretend* to work, but it will not do anything in
// practice.
//
// Under the hood, this results in passing -skip-cgroup-management and -enable-dummy-cpu-server
// to neonvm-runner.
DisableRunnerCgroup bool
MaxConcurrentReconciles int
// SkipUpdateValidationFor is the set of object names that we should ignore when doing webhook
// update validation.
SkipUpdateValidationFor map[types.NamespacedName]struct{}
// QEMUDiskCacheSettings sets the values of the 'cache.*' settings used for QEMU disks.
//
// This field is passed to neonvm-runner as the `-qemu-disk-cache-settings` arg, and is directly
// used in setting up the VM disks via QEMU's `-drive` flag.
QEMUDiskCacheSettings string
// DefaultMemoryProvider is the memory provider (dimm slots or virtio-mem) that will be used for
// new VMs (or, when old ones restart) if nothing is explicitly set.
DefaultMemoryProvider vmv1.MemoryProvider
// MemhpAutoMovableRatio specifies the value that new neonvm-runners will set as the
// kernel's 'memory_hotplug.auto_movable_ratio', iff the memory provider is virtio-mem.
//
// This value is passed directly to neonvm-runner as the '-memhp-auto-movable-ratio' flag.
// We've confirmed sensible values are from 301 to 801 (i.e. 3.01:1 through 8.01:1).
// The range of sensible values may extend further, but we have not tested that.
MemhpAutoMovableRatio string
// FailurePendingPeriod is the period for the propagation of
// reconciliation failures to the observability instruments
FailurePendingPeriod time.Duration
// FailingRefreshInterval is the interval between consecutive
// updates of metrics and logs, related to failing reconciliations
FailingRefreshInterval time.Duration
}
func (c *ReconcilerConfig) criEndpointSocketPath() string {
if c.IsK3s {
return "/run/k3s/containerd/containerd.sock"
} else {
return "/run/containerd/containerd.sock"
}
}
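// A minimal illustrative ReconcilerConfig (not from the original source; every value below is a
// placeholder, not a recommendation):
//
//	cfg := &ReconcilerConfig{
//		IsK3s:                   false,
//		UseContainerMgr:         false,
//		DisableRunnerCgroup:     false,
//		MaxConcurrentReconciles: 10,
//		SkipUpdateValidationFor: map[types.NamespacedName]struct{}{},
//		QEMUDiskCacheSettings:   "cache.writeback=on,cache.direct=off,cache.no-flush=off",
//		DefaultMemoryProvider:   vmv1.MemoryProviderDIMMSlots,
//		MemhpAutoMovableRatio:   "301",
//		FailurePendingPeriod:    time.Minute,
//		FailingRefreshInterval:  15 * time.Second,
//	}
//	_ = cfg.criEndpointSocketPath() // "/run/containerd/containerd.sock" since IsK3s is false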
package failurelag
import (
"sync"
"time"
)
// Tracker accumulates failure events for a given key and determines if
// the key is degraded. The key becomes degraded if it receives only failures
// over a configurable pending period. Once a success event is received, the key
// is no longer considered degraded, and the pending period is reset.
type Tracker[T comparable] struct {
period time.Duration
pendingSince map[T]time.Time
degraded map[T]struct{}
degradeAt []degradeAt[T]
lock sync.Mutex
Now func() time.Time
}
type degradeAt[T comparable] struct {
ts time.Time
key T
}
func NewTracker[T comparable](period time.Duration) *Tracker[T] {
return &Tracker[T]{
period: period,
pendingSince: make(map[T]time.Time),
degraded: make(map[T]struct{}),
degradeAt: []degradeAt[T]{},
lock: sync.Mutex{},
Now: time.Now,
}
}
// forward processes all the degradeAt events that are now in the past.
func (t *Tracker[T]) forward(now time.Time) {
i := 0
for ; i < len(t.degradeAt); i++ {
event := t.degradeAt[i]
if event.ts.After(now) {
break
}
pendingSince, ok := t.pendingSince[event.key]
if !ok {
// There was a success event in between
continue
}
if event.ts.Sub(pendingSince) < t.period {
// There was a success, and then another failure, in between;
// we will have another degradeAt event for this key in the future.
continue
}
t.degraded[event.key] = struct{}{}
}
t.degradeAt = t.degradeAt[i:]
}
func (t *Tracker[T]) RecordSuccess(key T) {
t.lock.Lock()
defer t.lock.Unlock()
delete(t.degraded, key)
delete(t.pendingSince, key)
t.forward(t.Now())
}
func (t *Tracker[T]) RecordFailure(key T) {
t.lock.Lock()
defer t.lock.Unlock()
now := t.Now()
if _, ok := t.pendingSince[key]; !ok {
t.pendingSince[key] = now
}
t.degradeAt = append(t.degradeAt, degradeAt[T]{
ts: now.Add(t.period),
key: key,
})
t.forward(now)
}
func (t *Tracker[T]) DegradedCount() int {
t.lock.Lock()
defer t.lock.Unlock()
t.forward(t.Now())
return len(t.degraded)
}
func (t *Tracker[T]) Degraded() []T {
t.lock.Lock()
defer t.lock.Unlock()
t.forward(t.Now())
keys := make([]T, 0, len(t.degraded))
for k := range t.degraded {
keys = append(keys, k)
}
return keys
}
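// The sketch below (not part of the original package) illustrates the intended usage from a
// caller's perspective; the key and period values are placeholders:
//
//	t := failurelag.NewTracker[string](5 * time.Minute)
//	t.RecordFailure("default/vm-1") // starts the pending period for this key
//	// ... if 5+ minutes pass with only failures and no RecordSuccess for the key ...
//	_ = t.DegradedCount() // 1: the key is now considered degraded
//	t.RecordSuccess("default/vm-1")
//	_ = t.DegradedCount() // 0: success clears the degraded state and resets the pending period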
package controllers
import (
"context"
"fmt"
"time"
"github.com/go-logr/logr"
"github.com/prometheus/client_golang/prometheus"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/metrics"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"k8s.io/apimachinery/pkg/api/errors"
"github.com/neondatabase/autoscaling/neonvm/controllers/failurelag"
"github.com/neondatabase/autoscaling/pkg/util"
)
type ReconcilerMetrics struct {
failing *prometheus.GaugeVec
vmCreationToRunnerCreationTime prometheus.Histogram
runnerCreationToVMRunningTime prometheus.Histogram
vmCreationToVMRunningTime prometheus.Histogram
vmRestartCounts prometheus.Counter
reconcileDuration prometheus.HistogramVec
}
const OutcomeLabel = "outcome"
func MakeReconcilerMetrics() ReconcilerMetrics {
// Copied bucket values from controller runtime latency metric. We can
// adjust them in the future if needed.
buckets := []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60}
m := ReconcilerMetrics{
failing: util.RegisterMetric(metrics.Registry, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "reconcile_failing_objects",
Help: "Number of objects that are failing to reconcile for each specific controller",
},
[]string{"controller", OutcomeLabel},
)),
vmCreationToRunnerCreationTime: util.RegisterMetric(metrics.Registry, prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "vm_creation_to_runner_creation_duration_seconds",
Help: "Time duration from VirtualMachine.CreationTimestamp to runner Pod.CreationTimestamp",
Buckets: buckets,
},
)),
runnerCreationToVMRunningTime: util.RegisterMetric(metrics.Registry, prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "vm_runner_creation_to_vm_running_duration_seconds",
Help: "Time duration from runner Pod.CreationTimestamp to the moment when VirtualMachine.Status.Phase becomes Running",
Buckets: buckets,
},
)),
vmCreationToVMRunningTime: util.RegisterMetric(metrics.Registry, prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "vm_creation_to_vm_running_duration_seconds",
Help: "Time duration from VirtualMachine.CreationTimeStamp to the moment when VirtualMachine.Status.Phase becomes Running",
Buckets: buckets,
},
)),
vmRestartCounts: util.RegisterMetric(metrics.Registry, prometheus.NewCounter(
prometheus.CounterOpts{
Name: "vm_restarts_count",
Help: "Total number of VM restarts across the cluster captured by VirtualMachine reconciler",
},
)),
reconcileDuration: *util.RegisterMetric(metrics.Registry, prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "reconcile_duration_seconds",
Help: "Time duration of reconciles",
Buckets: buckets,
}, []string{OutcomeLabel},
)),
}
return m
}
type ReconcileOutcome string
const (
SuccessOutcome ReconcileOutcome = "success"
FailureOutcome ReconcileOutcome = "failure"
ConflictOutcome ReconcileOutcome = "conflict"
)
func (m ReconcilerMetrics) ObserveReconcileDuration(
outcome ReconcileOutcome,
duration time.Duration,
) {
m.reconcileDuration.WithLabelValues(string(outcome)).Observe(duration.Seconds())
}
type wrappedReconciler struct {
ControllerName string
Reconciler reconcile.Reconciler
Metrics ReconcilerMetrics
refreshFailingInterval time.Duration
failing *failurelag.Tracker[client.ObjectKey]
conflicting *failurelag.Tracker[client.ObjectKey]
}
// ReconcilerWithMetrics is a Reconciler produced by WithMetrics that can return a snapshot of the
// state backing the metrics.
type ReconcilerWithMetrics interface {
reconcile.Reconciler
Snapshot() ReconcileSnapshot
FailingRefresher() FailingRefresher
}
// ReconcileSnapshot provides a glimpse into the current state of ongoing reconciles
//
// This type is (transitively) returned by the controller's "dump state" HTTP endpoint, and exists
// to allow us to get deeper information on the metrics - we can't expose information for every
// VirtualMachine into the metrics (it'd be too high cardinality), but we *can* make it available
// when requested.
type ReconcileSnapshot struct {
// ControllerName is the name of the controller: virtualmachine or virtualmachinemigration.
ControllerName string `json:"controllerName"`
// Failing is the list of objects currently failing to reconcile
Failing []string `json:"failing"`
// Conflicting is the list of objects currently failing to reconcile
// due to a conflict
Conflicting []string `json:"conflicting"`
}
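// For illustration (not from the original source), a serialized snapshot might look like:
//
//	{
//	  "controllerName": "virtualmachine",
//	  "failing": ["default/example-vm"],
//	  "conflicting": []
//	}
//
// where each entry is a client.ObjectKey rendered as "namespace/name" by its String() method.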
// WithMetrics wraps a given Reconciler with metrics capabilities.
//
// The returned reconciler also provides a way to get a snapshot of the state of ongoing reconciles,
// to see the data backing the metrics.
func WithMetrics(
reconciler reconcile.Reconciler,
rm ReconcilerMetrics,
cntrlName string,
failurePendingPeriod time.Duration,
refreshFailingInterval time.Duration,
) ReconcilerWithMetrics {
return &wrappedReconciler{
Reconciler: reconciler,
Metrics: rm,
ControllerName: cntrlName,
failing: failurelag.NewTracker[client.ObjectKey](failurePendingPeriod),
conflicting: failurelag.NewTracker[client.ObjectKey](failurePendingPeriod),
refreshFailingInterval: refreshFailingInterval,
}
}
func (d *wrappedReconciler) refreshFailing(
log logr.Logger,
outcome ReconcileOutcome,
tracker *failurelag.Tracker[client.ObjectKey],
) {
degraded := tracker.Degraded()
d.Metrics.failing.WithLabelValues(d.ControllerName, string(outcome)).
Set(float64(len(degraded)))
// Log each object on a separate line (even though we could just put them all on the same line)
// so that:
// 1. we avoid super long log lines (which can make log storage / querying unhappy), and
// 2. we can process it with Grafana Loki, which can't handle arrays
for _, obj := range degraded {
log.Info(
fmt.Sprintf("Currently failing to reconcile %v object", d.ControllerName),
"outcome", outcome,
"object", obj,
)
}
}
func (d *wrappedReconciler) runRefreshFailing(ctx context.Context) {
log := log.FromContext(ctx)
for {
select {
case <-ctx.Done():
return
case <-time.After(d.refreshFailingInterval):
d.refreshFailing(log, FailureOutcome, d.failing)
d.refreshFailing(log, ConflictOutcome, d.conflicting)
}
}
}
func (d *wrappedReconciler) FailingRefresher() FailingRefresher {
return FailingRefresher{r: d}
}
// FailingRefresher is a wrapper, which implements manager.Runnable
type FailingRefresher struct {
r *wrappedReconciler
}
func (f FailingRefresher) Start(ctx context.Context) error {
go f.r.runRefreshFailing(ctx)
return nil
}
func (d *wrappedReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log := log.FromContext(ctx)
now := time.Now()
res, err := d.Reconciler.Reconcile(ctx, req)
duration := time.Since(now)
outcome := SuccessOutcome
if err != nil {
if errors.IsConflict(err) {
outcome = ConflictOutcome
d.conflicting.RecordFailure(req.NamespacedName)
} else {
outcome = FailureOutcome
d.failing.RecordFailure(req.NamespacedName)
// If the VM is now getting non-conflict errors, it probably
// means transient conflicts have been resolved.
//
// Notably, the other way around is not true:
// if a VM is getting conflict errors, it doesn't mean
// non-conflict errors are resolved, as they are more
// likely to be persistent.
d.conflicting.RecordSuccess(req.NamespacedName)
}
log.Error(err, "Failed to reconcile VirtualMachine",
"duration", duration.String(), "outcome", outcome)
} else {
d.failing.RecordSuccess(req.NamespacedName)
d.conflicting.RecordSuccess(req.NamespacedName)
log.Info("Successful reconciliation", "duration", duration.String(), "requeueAfter", res.RequeueAfter)
}
d.Metrics.ObserveReconcileDuration(outcome, duration)
d.Metrics.failing.WithLabelValues(d.ControllerName,
string(FailureOutcome)).Set(float64(d.failing.DegradedCount()))
d.Metrics.failing.WithLabelValues(d.ControllerName,
string(ConflictOutcome)).Set(float64(d.conflicting.DegradedCount()))
return res, err
}
func toStringSlice(s []client.ObjectKey) []string {
keys := make([]string, 0, len(s))
for _, k := range s {
keys = append(keys, k.String())
}
return keys
}
func (r *wrappedReconciler) Snapshot() ReconcileSnapshot {
failing := toStringSlice(r.failing.Degraded())
conflicting := toStringSlice(r.conflicting.Degraded())
return ReconcileSnapshot{
ControllerName: r.ControllerName,
Failing: failing,
Conflicting: conflicting,
}
}
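// A minimal composition sketch (not from the original source). "innerReconciler" and "mgr" are
// assumed to exist elsewhere; the controller name and durations are placeholders:
//
//	metrics := MakeReconcilerMetrics()
//	reconciler := WithMetrics(
//		withCatchPanic(innerReconciler), // recover from panics inside the inner reconciler
//		metrics,
//		"virtualmachine",
//		5*time.Minute,  // failurePendingPeriod
//		30*time.Second, // refreshFailingInterval
//	)
//	// Register the refresher so the failing-object gauge and logs are updated periodically:
//	_ = mgr.Add(reconciler.FailingRefresher())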
/*
Copyright 2022.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controllers
import (
"bytes"
"context"
"crypto/ed25519"
"crypto/rand"
"encoding/base64"
"encoding/json"
"encoding/pem"
"fmt"
"io"
"net/http"
"os"
"reflect"
"strconv"
"time"
nadapiv1 "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/apis/k8s.cni.cncf.io/v1"
"github.com/samber/lo"
"golang.org/x/crypto/ssh"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/log"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apiserver/pkg/storage/names"
"k8s.io/client-go/tools/record"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/neonvm/controllers/buildtag"
"github.com/neondatabase/autoscaling/neonvm/pkg/ipam"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util/patch"
)
const (
virtualmachineFinalizer = "vm.neon.tech/finalizer"
)
// Definitions to manage status conditions
const (
// typeAvailableVirtualMachine represents the status of the VirtualMachine reconciliation
typeAvailableVirtualMachine = "Available"
// typeDegradedVirtualMachine represents the status used when the custom resource is deleted and the finalizer operations must occur.
typeDegradedVirtualMachine = "Degraded"
)
const (
minSupportedRunnerVersion api.RunnerProtoVersion = api.RunnerProtoV1
maxSupportedRunnerVersion api.RunnerProtoVersion = api.RunnerProtoV1
)
// VMReconciler reconciles a VirtualMachine object
type VMReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
Config *ReconcilerConfig
Metrics ReconcilerMetrics `exhaustruct:"optional"`
}
// The following markers are used to generate the rules permissions (RBAC) on config/rbac using controller-gen
// when controller-gen (used by 'make generate') is executed.
// To know more about markers see: https://book.kubebuilder.io/reference/markers.html
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachines,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachines/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachines/finalizers,verbs=update
//+kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
//+kubebuilder:rbac:groups=core,resources=nodes,verbs=list
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=pods/status,verbs=get;list;watch
//+kubebuilder:rbac:groups=vm.neon.tech,resources=ippools,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=vm.neon.tech,resources=ippools/finalizers,verbs=update
//+kubebuilder:rbac:groups=k8s.cni.cncf.io,resources=network-attachment-definitions,verbs=get;list;watch
// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
// It is essential for the controller's reconciliation loop to be idempotent. By following the Operator
// pattern you will create Controllers which provide a reconcile function
// responsible for synchronizing resources until the desired state is reached on the cluster.
// Breaking this recommendation goes against the design principles of controller-runtime
// and may lead to unforeseen consequences such as resources becoming stuck and requiring manual intervention.
// For further info:
// - About Operator Pattern: https://kubernetes.io/docs/concepts/extend-kubernetes/operator/
// - About Controllers: https://kubernetes.io/docs/concepts/architecture/controller/
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.13.0/pkg/reconcile
func (r *VMReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log := log.FromContext(ctx)
var vm vmv1.VirtualMachine
if err := r.Get(ctx, req.NamespacedName, &vm); err != nil {
// Error reading the object - requeue the request.
if notfound := client.IgnoreNotFound(err); notfound == nil {
log.Info("virtualmachine resource not found. Ignoring since object must be deleted")
return ctrl.Result{}, nil
}
log.Error(err, "Unable to fetch VirtualMachine")
return ctrl.Result{}, client.IgnoreNotFound(err)
}
// examine DeletionTimestamp to determine if object is under deletion
if vm.ObjectMeta.DeletionTimestamp.IsZero() {
// The object is not being deleted, so if it does not have our finalizer,
// then let's add the finalizer and update the object. This is equivalent
// to registering our finalizer.
if !controllerutil.ContainsFinalizer(&vm, virtualmachineFinalizer) {
log.Info("Adding Finalizer for VirtualMachine")
if ok := controllerutil.AddFinalizer(&vm, virtualmachineFinalizer); !ok {
log.Info("Failed to add finalizer from VirtualMachine")
return ctrl.Result{Requeue: true}, nil
}
if err := r.tryUpdateVM(ctx, &vm); err != nil {
log.Error(err, "Failed to update status about adding finalizer to VirtualMachine")
return ctrl.Result{}, err
}
return ctrl.Result{Requeue: true}, nil
}
} else {
// The object is being deleted
if controllerutil.ContainsFinalizer(&vm, virtualmachineFinalizer) {
// our finalizer is present, so lets handle any external dependency
log.Info("Performing Finalizer Operations for VirtualMachine before delete it")
r.doFinalizerOperationsForVirtualMachine(ctx, &vm)
// remove our finalizer from the list and update it.
log.Info("Removing Finalizer for VirtualMachine after successfully perform the operations")
if ok := controllerutil.RemoveFinalizer(&vm, virtualmachineFinalizer); !ok {
log.Info("Failed to remove finalizer from VirtualMachine")
return ctrl.Result{Requeue: true}, nil
}
if err := r.tryUpdateVM(ctx, &vm); err != nil {
log.Error(err, "Failed to update status about removing finalizer from VirtualMachine")
return ctrl.Result{}, err
}
}
// Stop reconciliation as the item is being deleted
return ctrl.Result{}, nil
}
statusBefore := vm.Status.DeepCopy()
if err := r.doReconcile(ctx, &vm); err != nil {
r.Recorder.Eventf(&vm, corev1.EventTypeWarning, "Failed",
"Failed to reconcile (%s): %s", vm.Name, err)
return ctrl.Result{}, err
}
// If the status changed, try to update the object
if !DeepEqual(statusBefore, vm.Status) {
if err := r.Status().Update(ctx, &vm); err != nil {
log.Error(err, "Failed to update VirtualMachine status after reconcile loop",
"virtualmachine", vm.Name)
return ctrl.Result{}, err
}
}
// Only quickly requeue if we're scaling or migrating. Otherwise, we aren't expecting any
// changes from QEMU, and it's wasteful to repeatedly check.
requeueAfter := time.Second
if vm.Status.Phase == vmv1.VmPending || vm.Status.Phase == vmv1.VmRunning {
requeueAfter = 15 * time.Second
}
return ctrl.Result{RequeueAfter: requeueAfter}, nil
}
// doFinalizerOperationsForVirtualMachine will perform the required operations before deleting the CR.
func (r *VMReconciler) doFinalizerOperationsForVirtualMachine(ctx context.Context, vm *vmv1.VirtualMachine) {
// Note: It is not recommended to use finalizers for the purpose of deleting resources which are
// created and managed during reconciliation. Those resources, such as the Pod created by this
// reconciler, are defined as dependents of the custom resource. Note that we use the method
// ctrl.SetControllerReference to set the ownerRef, which means that the Pod will be deleted by the Kubernetes API.
// More info: https://kubernetes.io/docs/tasks/administer-cluster/use-cascading-deletion/
log := log.FromContext(ctx)
// The following implementation will raise an event
r.Recorder.Event(vm, "Warning", "Deleting",
fmt.Sprintf("Custom Resource %s is being deleted from the namespace %s",
vm.Name,
vm.Namespace))
// Release overlay IP address
if vm.Spec.ExtraNetwork != nil {
// Create IPAM object
nadName, err := nadIpamName()
if err != nil {
// ignore error
log.Error(err, "ignored error")
return
}
nadNamespace, err := nadIpamNamespace()
if err != nil {
// ignore error
log.Error(err, "ignored error")
return
}
ipam, err := ipam.New(ctx, nadName, nadNamespace)
if err != nil {
// ignore error
log.Error(err, "ignored error")
return
}
defer ipam.Close()
ip, err := ipam.ReleaseIP(ctx, vm.Name, vm.Namespace)
if err != nil {
// ignore error
log.Error(err, "fail to release IP, error ignored")
return
}
message := fmt.Sprintf("Released IP %s", ip.String())
log.Info(message)
r.Recorder.Event(vm, "Normal", "OverlayNet", message)
}
}
func getRunnerVersion(pod *corev1.Pod) (api.RunnerProtoVersion, error) {
val, ok := pod.Labels[vmv1.RunnerPodVersionLabel]
if !ok {
return api.RunnerProtoVersion(0), nil
}
uintVal, err := strconv.ParseUint(val, 10, 32)
if err != nil {
return 0, fmt.Errorf("failed to parse label value as integer: %w", err)
}
return api.RunnerProtoVersion(uintVal), nil
}
func runnerVersionIsSupported(version api.RunnerProtoVersion) bool {
return version >= minSupportedRunnerVersion && version <= maxSupportedRunnerVersion
}
func (r *VMReconciler) updateVMStatusCPU(
ctx context.Context,
vm *vmv1.VirtualMachine,
vmRunner *corev1.Pod,
qmpPluggedCPUs uint32,
cgroupUsage *api.VCPUCgroup,
) {
log := log.FromContext(ctx)
// We expect:
// - vm.Status.CPUs == cgroupUsage.VCPUs
// - vm.Status.CPUs.RoundedUp() == qmpPluggedCPUs
// Otherwise, we update the status.
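// For example (illustrative numbers): with cgroupUsage.VCPUs == 2500m and qmpPluggedCPUs == 3,
// we report 2500m; if cgroup data is unavailable, we fall back to the plugged count, i.e. 3000m.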
var currentCPUUsage vmv1.MilliCPU
if cgroupUsage != nil {
if cgroupUsage.VCPUs.RoundedUp() != qmpPluggedCPUs {
// This is not expected but it's fine. We only report the
// mismatch here and will resolve it in the next reconcile
// iterations by comparing these values to the spec's CPU use
// and moving to the scaling phase.
log.Error(nil, "Mismatch in the number of VM's plugged CPUs and runner pod's cgroup vCPUs",
"VirtualMachine", vm.Name,
"Runner Pod", vmRunner.Name,
"plugged CPUs", qmpPluggedCPUs,
"cgroup vCPUs", cgroupUsage.VCPUs)
}
currentCPUUsage = min(cgroupUsage.VCPUs, vmv1.MilliCPU(1000*qmpPluggedCPUs))
} else {
currentCPUUsage = vmv1.MilliCPU(1000 * qmpPluggedCPUs)
}
if vm.Status.CPUs == nil || *vm.Status.CPUs != currentCPUUsage {
vm.Status.CPUs = &currentCPUUsage
r.Recorder.Event(vm, "Normal", "CpuInfo",
fmt.Sprintf("VirtualMachine %s uses %v cpu cores",
vm.Name,
vm.Status.CPUs))
}
}
func (r *VMReconciler) updateVMStatusMemory(
vm *vmv1.VirtualMachine,
qmpMemorySize *resource.Quantity,
) {
if vm.Status.MemorySize == nil || !qmpMemorySize.Equal(*vm.Status.MemorySize) {
vm.Status.MemorySize = qmpMemorySize
r.Recorder.Event(vm, "Normal", "MemoryInfo",
fmt.Sprintf("VirtualMachine %s uses %v memory",
vm.Name,
vm.Status.MemorySize))
}
}
func (r *VMReconciler) doReconcile(ctx context.Context, vm *vmv1.VirtualMachine) error {
log := log.FromContext(ctx)
// Let's check and just set the condition status as Unknown when no conditions are available yet
if vm.Status.Conditions == nil || len(vm.Status.Conditions) == 0 {
// set Unknown condition status for AvailableVirtualMachine
meta.SetStatusCondition(&vm.Status.Conditions, metav1.Condition{Type: typeAvailableVirtualMachine, Status: metav1.ConditionUnknown, Reason: "Reconciling", Message: "Starting reconciliation"})
}
// NB: .Spec.EnableSSH guaranteed non-nil because the k8s API server sets the default for us.
enableSSH := *vm.Spec.EnableSSH
// Generate ssh secret name
if enableSSH && len(vm.Status.SSHSecretName) == 0 {
vm.Status.SSHSecretName = fmt.Sprintf("ssh-neonvm-%s", vm.Name)
}
// Set memory provider for old VMs that don't have it in the Status.
if vm.Status.PodName != "" && vm.Status.MemoryProvider == nil {
oldMemProvider := vmv1.MemoryProviderDIMMSlots
log.Error(nil, "Setting default MemoryProvider for VM", "MemoryProvider", oldMemProvider)
vm.Status.MemoryProvider = lo.ToPtr(oldMemProvider)
}
switch vm.Status.Phase {
case "":
// Acquire overlay IP address
if vm.Spec.ExtraNetwork != nil &&
vm.Spec.ExtraNetwork.Enable &&
len(vm.Status.ExtraNetIP) == 0 {
// Create IPAM object
nadName, err := nadIpamName()
if err != nil {
return err
}
nadNamespace, err := nadIpamNamespace()
if err != nil {
return err
}
ipam, err := ipam.New(ctx, nadName, nadNamespace)
if err != nil {
log.Error(err, "failed to create IPAM")
return err
}
defer ipam.Close()
ip, err := ipam.AcquireIP(ctx, vm.Name, vm.Namespace)
if err != nil {
log.Error(err, "fail to acquire IP")
return err
}
message := fmt.Sprintf("Acquired IP %s for overlay network interface", ip.String())
log.Info(message)
vm.Status.ExtraNetIP = ip.IP.String()
vm.Status.ExtraNetMask = fmt.Sprintf("%d.%d.%d.%d", ip.Mask[0], ip.Mask[1], ip.Mask[2], ip.Mask[3])
r.Recorder.Event(vm, "Normal", "OverlayNet", message)
}
// VirtualMachine just created, change Phase to "Pending"
vm.Status.Phase = vmv1.VmPending
case vmv1.VmPending:
// Generate the runner pod name and set the desired memory provider.
// Together with Status.MemoryProvider being set above when PodName != "",
// it is then guaranteed that Status.MemoryProvider != nil below.
if len(vm.Status.PodName) == 0 {
vm.Status.PodName = names.SimpleNameGenerator.GenerateName(fmt.Sprintf("%s-", vm.Name))
if vm.Status.MemoryProvider == nil {
vm.Status.MemoryProvider = lo.ToPtr(pickMemoryProvider(r.Config, vm))
}
// Update the .Status on API Server to avoid creating multiple pods for a single VM
// See https://github.com/neondatabase/autoscaling/issues/794 for the context
if err := r.Status().Update(ctx, vm); err != nil {
return fmt.Errorf("Failed to update VirtualMachine status: %w", err)
}
}
memoryProvider := *vm.Status.MemoryProvider
// Check if the runner pod already exists, if not create a new one
vmRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: vm.Status.PodName, Namespace: vm.Namespace}, vmRunner)
if err != nil && apierrors.IsNotFound(err) {
var sshSecret *corev1.Secret
if enableSSH {
// Check if the ssh secret already exists, if not create a new one
sshSecret = &corev1.Secret{}
err := r.Get(ctx, types.NamespacedName{
Name: vm.Status.SSHSecretName,
Namespace: vm.Namespace,
}, sshSecret)
if err != nil && apierrors.IsNotFound(err) {
// Define a new ssh secret
sshSecret, err = r.sshSecretForVirtualMachine(vm)
if err != nil {
log.Error(err, "Failed to define new SSH Secret for VirtualMachine")
return err
}
log.Info("Creating a new SSH Secret", "Secret.Namespace", sshSecret.Namespace, "Secret.Name", sshSecret.Name)
if err = r.Create(ctx, sshSecret); err != nil {
log.Error(err, "Failed to create new SSH secret", "Secret.Namespace", sshSecret.Namespace, "Secret.Name", sshSecret.Name)
return err
}
log.Info("SSH Secret was created", "Secret.Namespace", sshSecret.Namespace, "Secret.Name", sshSecret.Name)
} else if err != nil {
log.Error(err, "Failed to get SSH Secret")
return err
}
}
// Define a new pod
pod, err := r.podForVirtualMachine(vm, memoryProvider, sshSecret)
if err != nil {
log.Error(err, "Failed to define new Pod resource for VirtualMachine")
return err
}
log.Info("Creating a new Pod", "Pod.Namespace", pod.Namespace, "Pod.Name", pod.Name)
if err = r.Create(ctx, pod); err != nil {
log.Error(err, "Failed to create new Pod", "Pod.Namespace", pod.Namespace, "Pod.Name", pod.Name)
return err
}
log.Info("Runner Pod was created", "Pod.Namespace", pod.Namespace, "Pod.Name", pod.Name)
msg := fmt.Sprintf("VirtualMachine %s created, Pod %s", vm.Name, pod.Name)
if sshSecret != nil {
msg = fmt.Sprintf("%s, SSH Secret %s", msg, sshSecret.Name)
}
r.Recorder.Event(vm, "Normal", "Created", msg)
if !vm.HasRestarted() {
d := pod.CreationTimestamp.Time.Sub(vm.CreationTimestamp.Time)
r.Metrics.vmCreationToRunnerCreationTime.Observe(d.Seconds())
}
} else if err != nil {
log.Error(err, "Failed to get vm-runner Pod")
return err
}
// runner pod found, check phase
switch runnerStatus(vmRunner) {
case runnerRunning:
vm.Status.PodIP = vmRunner.Status.PodIP
vm.Status.Phase = vmv1.VmRunning
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeAvailableVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) created successfully", vm.Status.PodName, vm.Name)})
{
// Calculating VM startup latency metrics
now := time.Now()
d := now.Sub(vmRunner.CreationTimestamp.Time)
r.Metrics.runnerCreationToVMRunningTime.Observe(d.Seconds())
if !vm.HasRestarted() {
d := now.Sub(vm.CreationTimestamp.Time)
r.Metrics.vmCreationToVMRunningTime.Observe(d.Seconds())
log.Info("VM creation to VM running time", "duration(sec)", d.Seconds())
}
}
case runnerSucceeded:
vm.Status.Phase = vmv1.VmSucceeded
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeAvailableVirtualMachine,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) succeeded", vm.Status.PodName, vm.Name)})
case runnerFailed:
vm.Status.Phase = vmv1.VmFailed
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeDegradedVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) failed", vm.Status.PodName, vm.Name)})
case runnerUnknown:
vm.Status.Phase = vmv1.VmPending
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeAvailableVirtualMachine,
Status: metav1.ConditionUnknown,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) in Unknown phase", vm.Status.PodName, vm.Name)})
default:
// do nothing
}
case vmv1.VmRunning:
// Check if the runner pod exists
vmRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: vm.Status.PodName, Namespace: vm.Namespace}, vmRunner)
if err != nil && apierrors.IsNotFound(err) {
// lost runner pod for running VirtualMachine?
r.Recorder.Event(vm, "Warning", "NotFound",
fmt.Sprintf("runner pod %s not found",
vm.Status.PodName))
vm.Status.Phase = vmv1.VmFailed
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeDegradedVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) not found", vm.Status.PodName, vm.Name)})
} else if err != nil {
log.Error(err, "Failed to get runner Pod")
return err
}
// Update the metadata (including "usage" annotation) before anything else, so that it
// will be correctly set even if the rest of the reconcile operation fails.
if err := updatePodMetadataIfNecessary(ctx, r.Client, vm, vmRunner); err != nil {
log.Error(err, "Failed to sync pod labels and annotations", "VirtualMachine", vm.Name)
}
// runner pod found, check/update phase now
switch runnerStatus(vmRunner) {
case runnerRunning:
// update status by IP of runner pod
vm.Status.PodIP = vmRunner.Status.PodIP
// update phase
vm.Status.Phase = vmv1.VmRunning
// update Node name where runner working
vm.Status.Node = vmRunner.Spec.NodeName
runnerVersion, err := getRunnerVersion(vmRunner)
if err != nil {
log.Error(err, "Failed to get runner version of VM runner pod", "VirtualMachine", vm.Name)
return err
}
if !runnerVersionIsSupported(runnerVersion) {
err := fmt.Errorf("runner version %v is not supported", runnerVersion)
log.Error(err, "VM runner pod has unsupported version", "VirtualMachine", vm.Name)
return err
}
// get CPU details from QEMU
cpuSlotsPlugged, _, err := QmpGetCpus(QmpAddr(vm))
if err != nil {
log.Error(err, "Failed to get CPU details from VirtualMachine", "VirtualMachine", vm.Name)
return err
}
pluggedCPU := uint32(len(cpuSlotsPlugged))
// get cgroups CPU details from runner pod
cgroupUsage, err := getRunnerCgroup(ctx, vm)
if err != nil {
log.Error(err, "Failed to get CPU details from runner", "VirtualMachine", vm.Name)
return err
}
// update status by CPUs used in the VM
r.updateVMStatusCPU(ctx, vm, vmRunner, pluggedCPU, cgroupUsage)
// get Memory details from hypervisor and update VM status
memorySize, err := QmpGetMemorySize(QmpAddr(vm))
if err != nil {
log.Error(err, "Failed to get Memory details from VirtualMachine", "VirtualMachine", vm.Name)
return err
}
// update status by memory sizes used in the VM
r.updateVMStatusMemory(vm, memorySize)
// check if we need to hotplug/unplug CPU or memory
// by comparing the guest spec with the counts actually plugged
specUseCPU := vm.Spec.Guest.CPUs.Use
scaleCgroupCPU := specUseCPU != cgroupUsage.VCPUs
scaleQemuCPU := specUseCPU.RoundedUp() != pluggedCPU
if scaleCgroupCPU || scaleQemuCPU {
log.Info("VM goes into scaling mode, CPU count needs to be changed",
"CPUs on runner pod cgroup", cgroupUsage.VCPUs,
"CPUs on board", pluggedCPU,
"CPUs in spec", vm.Spec.Guest.CPUs.Use)
vm.Status.Phase = vmv1.VmScaling
}
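// The expected size is memorySlots.Use * memorySlotSize, e.g. (illustrative) 4 slots of 1Gi => 4Gi.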
memorySizeFromSpec := resource.NewQuantity(int64(vm.Spec.Guest.MemorySlots.Use)*vm.Spec.Guest.MemorySlotSize.Value(), resource.BinarySI)
if !memorySize.Equal(*memorySizeFromSpec) {
log.Info("VM goes into scale mode, need to resize Memory",
"Memory on board", memorySize,
"Memory in spec", memorySizeFromSpec)
vm.Status.Phase = vmv1.VmScaling
}
case runnerSucceeded:
vm.Status.Phase = vmv1.VmSucceeded
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeAvailableVirtualMachine,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) succeeded", vm.Status.PodName, vm.Name)})
case runnerFailed:
vm.Status.Phase = vmv1.VmFailed
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeDegradedVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) failed", vm.Status.PodName, vm.Name)})
case runnerUnknown:
vm.Status.Phase = vmv1.VmPending
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeAvailableVirtualMachine,
Status: metav1.ConditionUnknown,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) in Unknown phase", vm.Status.PodName, vm.Name)})
default:
// do nothing
}
case vmv1.VmScaling:
// Check that runner pod is still ok
vmRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: vm.Status.PodName, Namespace: vm.Namespace}, vmRunner)
if err != nil && apierrors.IsNotFound(err) {
// lost runner pod for running VirtualMachine?
r.Recorder.Event(vm, "Warning", "NotFound",
fmt.Sprintf("runner pod %s not found",
vm.Status.PodName))
vm.Status.Phase = vmv1.VmFailed
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeDegradedVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) not found", vm.Status.PodName, vm.Name)})
} else if err != nil {
log.Error(err, "Failed to get runner Pod")
return err
}
// Update the metadata (including "usage" annotation) before anything else, so that it
// will be correctly set even if the rest of the reconcile operation fails.
if err := updatePodMetadataIfNecessary(ctx, r.Client, vm, vmRunner); err != nil {
log.Error(err, "Failed to sync pod labels and annotations", "VirtualMachine", vm.Name)
}
// runner pod found, check that it's still up:
switch runnerStatus(vmRunner) {
case runnerSucceeded:
vm.Status.Phase = vmv1.VmSucceeded
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeAvailableVirtualMachine,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) succeeded", vm.Status.PodName, vm.Name)})
return nil
case runnerFailed:
vm.Status.Phase = vmv1.VmFailed
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeDegradedVirtualMachine,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) failed", vm.Status.PodName, vm.Name)})
return nil
case runnerUnknown:
vm.Status.Phase = vmv1.VmPending
meta.SetStatusCondition(&vm.Status.Conditions,
metav1.Condition{Type: typeAvailableVirtualMachine,
Status: metav1.ConditionUnknown,
Reason: "Reconciling",
Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) in Unknown phase", vm.Status.PodName, vm.Name)})
return nil
default:
// do nothing
}
runnerVersion, err := getRunnerVersion(vmRunner)
if err != nil {
log.Error(err, "Failed to get runner version of VM runner pod", "VirtualMachine", vm.Name)
return err
}
if !runnerVersionIsSupported(runnerVersion) {
err := fmt.Errorf("runner version %v is not supported", runnerVersion)
log.Error(err, "VM runner pod has unsupported version", "VirtualMachine", vm.Name)
return err
}
cpuScaled := false
ramScaled := false
// do hotplug/unplug CPU
// first, get the current state from QEMU
cpuSlotsPlugged, _, err := QmpGetCpus(QmpAddr(vm))
if err != nil {
log.Error(err, "Failed to get CPU details from VirtualMachine", "VirtualMachine", vm.Name)
return err
}
specCPU := vm.Spec.Guest.CPUs.Use
pluggedCPU := uint32(len(cpuSlotsPlugged))
cgroupUsage, err := getRunnerCgroup(ctx, vm)
if err != nil {
log.Error(err, "Failed to get CPU details from runner", "VirtualMachine", vm.Name)
return err
}
// compare the guest spec to the plugged CPU count and the runner pod's cgroup
if specCPU.RoundedUp() > pluggedCPU {
// going to plug one CPU
log.Info("Plug one more CPU into VM")
if err := QmpPlugCpu(QmpAddr(vm)); err != nil {
return err
}
r.Recorder.Event(vm, "Normal", "ScaleUp",
fmt.Sprintf("One more CPU was plugged into VM %s",
vm.Name))
} else if specCPU.RoundedUp() < pluggedCPU {
// going to unplug one CPU
log.Info("Unplug one CPU from VM")
if err := QmpUnplugCpu(QmpAddr(vm)); err != nil {
return err
}
r.Recorder.Event(vm, "Normal", "ScaleDown",
fmt.Sprintf("One CPU was unplugged from VM %s",
vm.Name))
} else if specCPU != cgroupUsage.VCPUs {
log.Info("Update runner pod cgroups", "runner", cgroupUsage.VCPUs, "spec", specCPU)
if err := setRunnerCgroup(ctx, vm, specCPU); err != nil {
return err
}
reason := "ScaleDown"
if specCPU > cgroupUsage.VCPUs {
reason = "ScaleUp"
}
r.Recorder.Event(vm, "Normal", reason,
fmt.Sprintf("Runner pod cgroups was updated on VM %s",
vm.Name))
} else {
// seems already plugged correctly
cpuScaled = true
}
// update status by CPUs used in the VM
r.updateVMStatusCPU(ctx, vm, vmRunner, pluggedCPU, cgroupUsage)
// do hotplug/unplug Memory
switch *vm.Status.MemoryProvider {
case vmv1.MemoryProviderVirtioMem:
ramScaled, err = r.doVirtioMemScaling(vm)
if err != nil {
return err
}
case vmv1.MemoryProviderDIMMSlots:
ramScaled, err = r.doDIMMSlotsScaling(ctx, vm)
if err != nil {
return err
}
default:
panic(fmt.Errorf("unexpected vm.status.memoryProvider %q", *vm.Status.MemoryProvider))
}
// set VM phase to running if everything scaled
if cpuScaled && ramScaled {
vm.Status.Phase = vmv1.VmRunning
}
case vmv1.VmSucceeded, vmv1.VmFailed:
// Always delete runner pod. Otherwise, we could end up with one container succeeded/failed
// but the other one still running (meaning that the pod still ends up Running).
vmRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: vm.Status.PodName, Namespace: vm.Namespace}, vmRunner)
if err == nil {
// delete current runner
if err := r.deleteRunnerPodIfEnabled(ctx, vm, vmRunner); err != nil {
return err
}
} else if !apierrors.IsNotFound(err) {
return err
}
// We must keep the VM status the same until we know the neonvm-runner container has been
// terminated, otherwise we could end up starting a new runner pod while the VM in the old
// one is still running.
//
// Note that this is required because 'VmSucceeded' and 'VmFailed' are true if *at least
// one* container inside the runner pod has finished; the VM itself may still be running.
if apierrors.IsNotFound(err) || runnerContainerStopped(vmRunner) {
// NB: Cleanup() leaves status .Phase and .RestartCount (+ some others) but unsets other fields.
vm.Cleanup()
var shouldRestart bool
switch vm.Spec.RestartPolicy {
case vmv1.RestartPolicyAlways:
shouldRestart = true
case vmv1.RestartPolicyOnFailure:
shouldRestart = vm.Status.Phase == vmv1.VmFailed
case vmv1.RestartPolicyNever:
shouldRestart = false
}
if shouldRestart {
log.Info("Restarting VM runner pod", "VM.Phase", vm.Status.Phase, "RestartPolicy", vm.Spec.RestartPolicy)
vm.Status.Phase = vmv1.VmPending // reset to trigger restart
vm.Status.RestartCount += 1 // increment restart count
r.Metrics.vmRestartCounts.Inc()
}
// TODO for RestartPolicyNever: implement TTL or do nothing
}
default:
// do nothing
}
// Propagate TargetRevision to CurrentRevision. This is done only if the VM is fully
// reconciled and running.
if vm.Status.Phase == vmv1.VmRunning {
propagateRevision(vm)
}
return nil
}
func propagateRevision(vm *vmv1.VirtualMachine) {
if vm.Spec.TargetRevision == nil {
return
}
if vm.Status.CurrentRevision != nil &&
vm.Status.CurrentRevision.Revision == vm.Spec.TargetRevision.Revision {
return
}
rev := vm.Spec.TargetRevision.WithTime(time.Now())
vm.Status.CurrentRevision = &rev
}
func pickMemoryProvider(config *ReconcilerConfig, vm *vmv1.VirtualMachine) vmv1.MemoryProvider {
if p := vm.Spec.Guest.MemoryProvider; p != nil {
return *p
}
if p := vm.Status.MemoryProvider; p != nil {
return *p
}
// Not all configurations are valid for virtio-mem. Only use the default memory provider if it
// would be valid for this VM's configuration:
if err := vm.Spec.Guest.ValidateForMemoryProvider(config.DefaultMemoryProvider); err != nil {
return vmv1.MemoryProviderDIMMSlots
}
return config.DefaultMemoryProvider
}
func (r *VMReconciler) doVirtioMemScaling(vm *vmv1.VirtualMachine) (done bool, _ error) {
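// Worked example (illustrative): with memorySlots.Min=2, memorySlots.Use=6, and a 1Gi slot size,
// the virtio-mem target is (6-2)*1Gi = 4Gi while the goal total is 6*1Gi = 6Gi (the remaining
// Min slots are accounted for outside the virtio-mem device).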
targetSlotCount := int(vm.Spec.Guest.MemorySlots.Use - vm.Spec.Guest.MemorySlots.Min)
targetVirtioMemSize := int64(targetSlotCount) * vm.Spec.Guest.MemorySlotSize.Value()
previousTarget, err := QmpSetVirtioMem(vm, targetVirtioMemSize)
if err != nil {
return false, err
}
goalTotalSize := resource.NewQuantity(
int64(vm.Spec.Guest.MemorySlots.Use)*vm.Spec.Guest.MemorySlotSize.Value(),
resource.BinarySI,
)
if previousTarget != targetVirtioMemSize {
// We changed the requested size. Make an event for it.
reason := "ScaleUp"
if targetVirtioMemSize < previousTarget {
reason = "ScaleDown"
}
r.Recorder.Eventf(vm, "Normal", reason, "Set virtio-mem size for %v total memory", goalTotalSize)
}
// Maybe we're already using the amount we want?
// Update the status to reflect the current size - and if it matches goalTotalSize, ram
// scaling is done.
currentTotalSize, err := QmpGetMemorySize(QmpAddr(vm))
if err != nil {
return false, err
}
done = currentTotalSize.Value() == goalTotalSize.Value()
r.updateVMStatusMemory(vm, currentTotalSize)
return done, nil
}
func (r *VMReconciler) doDIMMSlotsScaling(ctx context.Context, vm *vmv1.VirtualMachine) (done bool, _ error) {
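// Worked example (illustrative): with memorySlots.Min=2 and memorySlots.Use=6, we ask QEMU for 4
// hot-plugged DIMM slots; if only 3 can be plugged, .spec.guest.memorySlots.use is lowered to
// Min+3 = 5 so that the spec matches what was actually achieved.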
log := log.FromContext(ctx)
memSlotsMin := vm.Spec.Guest.MemorySlots.Min
targetSlotCount := int(vm.Spec.Guest.MemorySlots.Use - memSlotsMin)
realSlots, err := QmpSetMemorySlots(ctx, vm, targetSlotCount, r.Recorder)
if realSlots < 0 {
return false, err
}
if realSlots != int(targetSlotCount) {
log.Info("Couldn't achieve desired memory slot count, will modify .spec.guest.memorySlots.use instead", "details", err)
// first, re-fetch the VM
if err := r.Get(ctx, types.NamespacedName{Name: vm.Name, Namespace: vm.Namespace}, vm); err != nil {
log.Error(err, "Unable to re-fetch VirtualMachine")
return false, err
}
memorySlotsUseInSpec := vm.Spec.Guest.MemorySlots.Use
memoryPluggedSlots := memSlotsMin + int32(realSlots)
vm.Spec.Guest.MemorySlots.Use = memoryPluggedSlots
if err := r.tryUpdateVM(ctx, vm); err != nil {
log.Error(err, "Failed to update .spec.guest.memorySlots.use",
"old value", memorySlotsUseInSpec,
"new value", memoryPluggedSlots)
return false, err
}
} else {
done = true
}
// get Memory details from hypervisor and update VM status
memorySize, err := QmpGetMemorySize(QmpAddr(vm))
if err != nil {
log.Error(err, "Failed to get Memory details from VirtualMachine", "VirtualMachine", vm.Name)
return false, err
}
// update status by memory sizes used in the VM
r.updateVMStatusMemory(vm, memorySize)
return done, nil
}
type runnerStatusKind string
const (
runnerUnknown runnerStatusKind = "Unknown"
runnerPending runnerStatusKind = "Pending"
runnerRunning runnerStatusKind = "Running"
runnerFailed runnerStatusKind = "Failed"
runnerSucceeded runnerStatusKind = "Succeeded"
)
// runnerStatus returns a description of the status of the VM inside the runner pod.
//
// This is *similar* to the value of pod.Status.Phase, but takes into consideration the statuses of
// the individual containers within the pod. This is because Kubernetes sets the pod phase to Failed
// or Succeeded only if *all* containers have exited, whereas we'd like to consider the VM to be Failed or
// Succeeded if *any* container has exited.
//
// The full set of outputs is:
//
// - runnerUnknown, if pod.Status.Phase is Unknown
// - runnerPending, if pod.Status.Phase is "" or Pending
// - runnerRunning, if pod.Status.Phase is Running, and no containers have exited
// - runnerFailed, if pod.Status.Phase is Failed, or if any container has failed, or if any
// container other than neonvm-runner has exited
// - runnerSucceeded, if pod.Status.Phase is Succeeded, or if neonvm-runner has exited
// successfully
func runnerStatus(pod *corev1.Pod) runnerStatusKind {
switch pod.Status.Phase {
case "", corev1.PodPending:
return runnerPending
case corev1.PodSucceeded:
return runnerSucceeded
case corev1.PodFailed:
return runnerFailed
case corev1.PodUnknown:
return runnerUnknown
// See comment above for context on this logic
case corev1.PodRunning:
nonRunnerContainerSucceeded := false
runnerContainerSucceeded := false
for _, stat := range pod.Status.ContainerStatuses {
if stat.State.Terminated != nil {
failed := stat.State.Terminated.ExitCode != 0
isRunner := stat.Name == "neonvm-runner"
if failed {
// return that the "runner" has failed if any container has.
return runnerFailed
} else /* succeeded */ {
if isRunner {
// neonvm-runner succeeded. We'll return runnerSucceeded if no other
// container has failed.
runnerContainerSucceeded = true
} else {
// Other container has succeeded. We'll return runnerSucceeded if
// neonvm-runner has succeeded, but runnerFailed if this exited while
// neonvm-runner is still going.
nonRunnerContainerSucceeded = true
}
}
}
}
if runnerContainerSucceeded {
return runnerSucceeded
} else if nonRunnerContainerSucceeded {
return runnerFailed
} else {
return runnerRunning
}
default:
panic(fmt.Errorf("unknown pod phase: %q", pod.Status.Phase))
}
}
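// Illustrative sketch, not used by the controller: a pod whose phase is still Running but whose
// sidecar container exited non-zero is reported as runnerFailed - something pod.Status.Phase alone
// would not capture. The "sidecar" container name is an assumption for the example; "neonvm-runner"
// matches the real runner container.
func exampleRunnerStatusSidecarFailure() runnerStatusKind {
	pod := &corev1.Pod{
		Status: corev1.PodStatus{
			Phase: corev1.PodRunning,
			ContainerStatuses: []corev1.ContainerStatus{
				{Name: "neonvm-runner", State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}},
				{Name: "sidecar", State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{ExitCode: 1}}},
			},
		},
	}
	return runnerStatus(pod) // runnerFailed: a non-runner container exited with a failure
}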
// runnerContainerStopped returns true iff the neonvm-runner container has exited.
//
// The guarantee is simple: It is only safe to start a new runner pod for a VM if
// runnerContainerStopped returns true (otherwise, we may end up with >1 instance of the same VM).
func runnerContainerStopped(pod *corev1.Pod) bool {
if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
return true
}
for _, stat := range pod.Status.ContainerStatuses {
if stat.Name == "neonvm-runner" {
return stat.State.Terminated != nil
}
}
return false
}
// deleteRunnerPodIfEnabled deletes the runner pod if buildtag.NeverDeleteRunnerPods is false, and
// then emits an event and log line describing what it did, whether or not it actually deleted the
// runner pod.
func (r *VMReconciler) deleteRunnerPodIfEnabled(
ctx context.Context,
vm *vmv1.VirtualMachine,
runner *corev1.Pod,
) error {
log := log.FromContext(ctx)
var msg, eventReason string
if buildtag.NeverDeleteRunnerPods {
msg = fmt.Sprintf("VM runner pod deletion was skipped due to '%s' build tag", buildtag.TagnameNeverDeleteRunnerPods)
eventReason = "DeleteSkipped"
} else {
// delete current runner
if err := r.Delete(ctx, runner); err != nil {
return err
}
msg = "VM runner pod was deleted"
eventReason = "Deleted"
}
log.Info(msg, "Pod.Namespace", runner.Namespace, "Pod.Name", runner.Name)
r.Recorder.Event(vm, "Normal", eventReason, fmt.Sprintf("%s: %s", msg, runner.Name))
return nil
}
// updates the values of the runner pod's labels and annotations so that they are exactly equal to
// the set of labels/annotations we expect - minus some that are ignored.
//
// The reason we also need to delete unrecognized labels/annotations is so that if a
// label/annotation on the VM itself is deleted, we can accurately reflect that in the pod.
func updatePodMetadataIfNecessary(ctx context.Context, c client.Client, vm *vmv1.VirtualMachine, runnerPod *corev1.Pod) error {
log := log.FromContext(ctx)
var patches []patch.Operation
metaSpecs := []struct {
metaField string
expected map[string]string
actual map[string]string
ignoreExtra map[string]bool // use bool here so `if ignoreExtra[key] { ... }` works
}{
{
metaField: "labels",
expected: labelsForVirtualMachine(vm, nil), // don't include runner version
actual: runnerPod.Labels,
ignoreExtra: map[string]bool{
// Don't override the runner pod version - we need to keep it around without
// changing it; otherwise it's not useful!
vmv1.RunnerPodVersionLabel: true,
},
},
{
metaField: "annotations",
expected: annotationsForVirtualMachine(vm),
actual: runnerPod.Annotations,
ignoreExtra: map[string]bool{
"k8s.v1.cni.cncf.io/networks": true,
"k8s.v1.cni.cncf.io/network-status": true,
"k8s.v1.cni.cncf.io/networks-status": true,
},
},
}
var removedMessageParts []string
for _, spec := range metaSpecs {
// Add/update the entries we're expecting to be there
for k, e := range spec.expected {
if a, ok := spec.actual[k]; !ok || e != a {
patches = append(patches, patch.Operation{
// From RFC 6902 (JSON patch):
//
// > The "add" operation performs one of the following functions, depending upon
// > what the target location references:
// >
// > [ ... ]
// >
// > * If the target location specifies an object member that does not already
// > exist, a new member is added to the object.
// > * If the target location specifies an object member that does exist, that
// > member's value is replaced.
//
// So: if the value is missing we'll add it. And if it's different, we'll replace it.
Op: patch.OpAdd,
Path: fmt.Sprintf("/metadata/%s/%s", spec.metaField, patch.PathEscape(k)),
Value: e,
})
}
}
// Remove the entries we aren't expecting to be there
var removed []string
for k := range spec.actual {
if _, expected := spec.expected[k]; !expected && !spec.ignoreExtra[k] {
removed = append(removed, k)
patches = append(patches, patch.Operation{
Op: patch.OpRemove,
Path: fmt.Sprintf("/metadata/%s/%s", spec.metaField, patch.PathEscape(k)),
})
}
}
if len(removed) != 0 {
// note: formatting with %q for a []string will print the array normally, but escape the
// strings inside. For example:
//
// fmt.Printf("%q\n", []string{"foo", "bar", "escaped\nstring"})
//
// outputs:
//
// ["foo" "bar" "escaped\nstring"]
//
// So the "message part" might look like `labels ["foo" "test-label"]`
removedMessageParts = append(removedMessageParts, fmt.Sprintf("%s %q", spec.metaField, removed))
}
}
if len(patches) == 0 {
return nil
}
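// For illustration only (the "foo" label and "old-annot" annotation are hypothetical): if the
// expected label foo=bar is missing from the pod and a stale annotation old-annot is present,
// `patches` marshals to roughly:
//
//   [{"op":"add","path":"/metadata/labels/foo","value":"bar"},
//    {"op":"remove","path":"/metadata/annotations/old-annot"}]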
patchData, err := json.Marshal(patches)
if err != nil {
panic(fmt.Errorf("error marshalling JSON patch: %w", err))
}
if len(removedMessageParts) != 0 {
var msg string
if len(removedMessageParts) == 1 {
msg = fmt.Sprintf("removing runner pod %s", removedMessageParts[0])
} else /* len = 2 */ {
msg = fmt.Sprintf("removing runner pod %s and %s", removedMessageParts[0], removedMessageParts[1])
}
// We want to log something when labels/annotations are removed, because the ignoreExtra
// values above might be incomplete, and it'd be hard to debug without any logs for the
// change.
log.Info(msg, "VirtualMachine", vm.Name, "Pod", runnerPod.Name)
}
// NOTE: We don't need to update the data in runnerPod ourselves because c.Patch will update it
// with what we get back from the k8s API after the patch completes.
return c.Patch(ctx, runnerPod, client.RawPatch(types.JSONPatchType, patchData))
}
func extractVirtualMachineUsageJSON(spec vmv1.VirtualMachineSpec) string {
cpu := spec.Guest.CPUs.Use
memorySlots := spec.Guest.MemorySlots.Use
usage := vmv1.VirtualMachineUsage{
CPU: cpu.ToResourceQuantity(),
Memory: resource.NewQuantity(spec.Guest.MemorySlotSize.Value()*int64(memorySlots), resource.BinarySI),
}
usageJSON, err := json.Marshal(usage)
if err != nil {
panic(fmt.Errorf("error marshalling JSON: %w", err))
}
return string(usageJSON)
}
func extractVirtualMachineResourcesJSON(spec vmv1.VirtualMachineSpec) string {
resourcesJSON, err := json.Marshal(spec.Resources())
if err != nil {
panic(fmt.Errorf("error marshalling JSON: %w", err))
}
return string(resourcesJSON)
}
// podForVirtualMachine returns the runner Pod object for the given VirtualMachine
func (r *VMReconciler) podForVirtualMachine(
vm *vmv1.VirtualMachine,
memoryProvider vmv1.MemoryProvider,
sshSecret *corev1.Secret,
) (*corev1.Pod, error) {
pod, err := podSpec(vm, memoryProvider, sshSecret, r.Config)
if err != nil {
return nil, err
}
// Set the ownerRef for the Pod
// More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/owners-dependents/
if err := ctrl.SetControllerReference(vm, pod, r.Scheme); err != nil {
return nil, err
}
return pod, nil
}
func (r *VMReconciler) sshSecretForVirtualMachine(vm *vmv1.VirtualMachine) (*corev1.Secret, error) {
secret, err := sshSecretSpec(vm)
if err != nil {
return nil, err
}
// Set the ownerRef for the Secret
// More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/owners-dependents/
if err := ctrl.SetControllerReference(vm, secret, r.Scheme); err != nil {
return nil, err
}
return secret, nil
}
func sshSecretSpec(vm *vmv1.VirtualMachine) (*corev1.Secret, error) {
// generating an ed25519 key pair takes ~16us to finish
publicKey, privateKey, err := sshKeygen()
if err != nil {
return nil, err
}
secret := &corev1.Secret{
ObjectMeta: metav1.ObjectMeta{
Name: vm.Status.SSHSecretName,
Namespace: vm.Namespace,
},
Immutable: lo.ToPtr(true),
Type: corev1.SecretTypeSSHAuth,
Data: map[string][]byte{
"ssh-publickey": publicKey,
"ssh-privatekey": privateKey,
},
}
return secret, nil
}
// labelsForVirtualMachine returns the labels for selecting the resources
// More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/common-labels/
func labelsForVirtualMachine(vm *vmv1.VirtualMachine, runnerVersion *api.RunnerProtoVersion) map[string]string {
l := make(map[string]string, len(vm.Labels)+3)
for k, v := range vm.Labels {
l[k] = v
}
l["app.kubernetes.io/name"] = "NeonVM"
l[vmv1.VirtualMachineNameLabel] = vm.Name
if runnerVersion != nil {
l[vmv1.RunnerPodVersionLabel] = fmt.Sprintf("%d", *runnerVersion)
}
return l
}
func annotationsForVirtualMachine(vm *vmv1.VirtualMachine) map[string]string {
// use bool here so `if ignored[key] { ... }` works
ignored := map[string]bool{
"kubectl.kubernetes.io/last-applied-configuration": true,
}
a := make(map[string]string, len(vm.Annotations)+2)
for k, v := range vm.Annotations {
if !ignored[k] {
a[k] = v
}
}
a["kubectl.kubernetes.io/default-container"] = "neonvm-runner"
a[vmv1.VirtualMachineUsageAnnotation] = extractVirtualMachineUsageJSON(vm.Spec)
a[vmv1.VirtualMachineResourcesAnnotation] = extractVirtualMachineResourcesJSON(vm.Spec)
return a
}
func affinityForVirtualMachine(vm *vmv1.VirtualMachine) *corev1.Affinity {
a := vm.Spec.Affinity
if a == nil {
a = &corev1.Affinity{}
}
if a.NodeAffinity == nil {
a.NodeAffinity = &corev1.NodeAffinity{}
}
if a.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
a.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution = &corev1.NodeSelector{}
}
// if the NodeSelectorTerms list is empty, add default requirements (arch==amd64 and os==linux)
if len(a.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms) == 0 {
a.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms = append(
a.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms,
corev1.NodeSelectorTerm{
MatchExpressions: []corev1.NodeSelectorRequirement{
{
Key: "kubernetes.io/arch",
Operator: "In",
Values: []string{"amd64"},
},
{
Key: "kubernetes.io/os",
Operator: "In",
Values: []string{"linux"},
},
},
})
}
return a
}
func setRunnerCgroup(ctx context.Context, vm *vmv1.VirtualMachine, cpu vmv1.MilliCPU) error {
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
url := fmt.Sprintf("http://%s:%d/cpu_change", vm.Status.PodIP, vm.Spec.RunnerPort)
update := api.VCPUChange{VCPUs: cpu}
data, err := json.Marshal(update)
if err != nil {
return err
}
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(data))
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
return fmt.Errorf("unexpected status %s", resp.Status)
}
return nil
}
func getRunnerCgroup(ctx context.Context, vm *vmv1.VirtualMachine) (*api.VCPUCgroup, error) {
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
url := fmt.Sprintf("http://%s:%d/cpu_current", vm.Status.PodIP, vm.Spec.RunnerPort)
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return nil, err
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
return nil, err
}
if resp.StatusCode != 200 {
return nil, fmt.Errorf("unexpected status %s", resp.Status)
}
body, err := io.ReadAll(resp.Body)
defer resp.Body.Close()
if err != nil {
return nil, err
}
var result api.VCPUCgroup
err = json.Unmarshal(body, &result)
if err != nil {
return nil, err
}
return &result, nil
}
// imageForVmRunner gets the Operand image which is managed by this controller
// from the VM_RUNNER_IMAGE environment variable defined in config/manager/manager.yaml
func imageForVmRunner() (string, error) {
var imageEnvVar = "VM_RUNNER_IMAGE"
image, found := os.LookupEnv(imageEnvVar)
if !found {
return "", fmt.Errorf("unable to find %s environment variable with the image", imageEnvVar)
}
return image, nil
}
func podSpec(
vm *vmv1.VirtualMachine,
memoryProvider vmv1.MemoryProvider,
sshSecret *corev1.Secret,
config *ReconcilerConfig,
) (*corev1.Pod, error) {
runnerVersion := api.RunnerProtoV1
labels := labelsForVirtualMachine(vm, &runnerVersion)
annotations := annotationsForVirtualMachine(vm)
affinity := affinityForVirtualMachine(vm)
// Get the Operand image
image, err := imageForVmRunner()
if err != nil {
return nil, err
}
vmSpecJson, err := json.Marshal(vm.Spec)
if err != nil {
return nil, fmt.Errorf("marshal VM Spec: %w", err)
}
vmStatusJson, err := json.Marshal(vm.Status)
if err != nil {
return nil, fmt.Errorf("marshal VM Status: %w", err)
}
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: vm.Status.PodName,
Namespace: vm.Namespace,
Labels: labels,
Annotations: annotations,
},
Spec: corev1.PodSpec{
EnableServiceLinks: vm.Spec.ServiceLinks,
AutomountServiceAccountToken: lo.ToPtr(false),
RestartPolicy: corev1.RestartPolicyNever,
TerminationGracePeriodSeconds: vm.Spec.TerminationGracePeriodSeconds,
NodeSelector: vm.Spec.NodeSelector,
ImagePullSecrets: vm.Spec.ImagePullSecrets,
Tolerations: vm.Spec.Tolerations,
ServiceAccountName: vm.Spec.ServiceAccountName,
SchedulerName: vm.Spec.SchedulerName,
Affinity: affinity,
InitContainers: []corev1.Container{
{
Image: vm.Spec.Guest.RootDisk.Image,
Name: "init",
ImagePullPolicy: vm.Spec.Guest.RootDisk.ImagePullPolicy,
VolumeMounts: []corev1.VolumeMount{{
Name: "virtualmachineimages",
MountPath: "/vm/images",
}},
Command: []string{
"sh", "-c",
"cp /disk.qcow2 /vm/images/rootdisk.qcow2 && " +
/* uid=36(qemu) gid=34(kvm) groups=34(kvm) */
"chown 36:34 /vm/images/rootdisk.qcow2 && " +
"sysctl -w net.ipv4.ip_forward=1",
},
SecurityContext: &corev1.SecurityContext{
Privileged: lo.ToPtr(true),
},
},
},
// generate the containers via an immediately-invoked function so it can use variables from the surrounding scope
Containers: func() []corev1.Container {
runner := corev1.Container{
Image: image,
Name: "neonvm-runner",
ImagePullPolicy: corev1.PullIfNotPresent,
// Ensure restrictive context for the container
// More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
SecurityContext: &corev1.SecurityContext{
Privileged: lo.ToPtr(false),
Capabilities: &corev1.Capabilities{
Add: []corev1.Capability{
"NET_ADMIN",
"SYS_ADMIN",
"SYS_RESOURCE",
},
},
},
Ports: []corev1.ContainerPort{{
ContainerPort: vm.Spec.QMP,
Name: "qmp",
}, {
ContainerPort: vm.Spec.QMPManual,
Name: "qmp-manual",
}},
Command: func() []string {
cmd := []string{"runner"}
if config.UseContainerMgr || config.DisableRunnerCgroup {
cmd = append(cmd, "-skip-cgroup-management")
}
if config.DisableRunnerCgroup {
// cgroup management disabled, but we still need something to provide
// the server, so the runner will just provide a dummy implementation.
cmd = append(cmd, "-enable-dummy-cpu-server")
}
cmd = append(
cmd,
"-qemu-disk-cache-settings", config.QEMUDiskCacheSettings,
"-memory-provider", string(memoryProvider),
)
if memoryProvider == vmv1.MemoryProviderVirtioMem {
cmd = append(cmd, "-memhp-auto-movable-ratio", config.MemhpAutoMovableRatio)
}
// put these last, so that the earlier args are easier to see (because these
// can get quite large)
cmd = append(
cmd,
"-vmspec", base64.StdEncoding.EncodeToString(vmSpecJson),
"-vmstatus", base64.StdEncoding.EncodeToString(vmStatusJson),
)
return cmd
}(),
Env: []corev1.EnvVar{{
Name: "K8S_POD_NAME",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.name",
},
},
}},
VolumeMounts: func() []corev1.VolumeMount {
images := corev1.VolumeMount{
Name: "virtualmachineimages",
MountPath: "/vm/images",
}
cgroups := corev1.VolumeMount{
Name: "sysfscgroup",
MountPath: "/sys/fs/cgroup",
// MountPropagationNone means that the volume in a container will
// not receive new mounts from the host or other containers, and filesystems
// mounted inside the container won't be propagated to the host or other
// containers.
// Note that this mode corresponds to "private" in Linux terminology.
MountPropagation: lo.ToPtr(corev1.MountPropagationNone),
}
if config.UseContainerMgr || config.DisableRunnerCgroup {
return []corev1.VolumeMount{images}
} else {
// the /sys/fs/cgroup mount is only necessary if neonvm-runner has to
// do its own CPU limiting
return []corev1.VolumeMount{images, cgroups}
}
}(),
Resources: vm.Spec.PodResources,
}
containerMgr := corev1.Container{
Image: image,
Name: "neonvm-container-mgr",
Command: []string{
"container-mgr",
"-port", strconv.Itoa(int(vm.Spec.RunnerPort)),
"-init-milli-cpu", strconv.Itoa(int(vm.Spec.Guest.CPUs.Use)),
},
Env: []corev1.EnvVar{
{
Name: "K8S_POD_UID",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
},
},
{
Name: "CRI_ENDPOINT",
Value: fmt.Sprintf("unix://%s", config.criEndpointSocketPath()),
},
},
LivenessProbe: &corev1.Probe{
InitialDelaySeconds: 10,
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/healthz",
Port: intstr.FromInt(int(vm.Spec.RunnerPort)),
},
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("50m"),
corev1.ResourceMemory: resource.MustParse("50Mi"),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"), // cpu limit > request, because usage is spiky
corev1.ResourceMemory: resource.MustParse("50Mi"),
},
},
// socket for crictl to connect to
VolumeMounts: []corev1.VolumeMount{
{
Name: "containerdsock",
MountPath: config.criEndpointSocketPath(),
},
},
}
if config.UseContainerMgr {
return []corev1.Container{runner, containerMgr}
} else {
// Return only the runner if we aren't supposed to use container-mgr
return []corev1.Container{runner}
}
}(),
Volumes: func() []corev1.Volume {
images := corev1.Volume{
Name: "virtualmachineimages",
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{},
},
}
cgroup := corev1.Volume{
Name: "sysfscgroup",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/sys/fs/cgroup",
Type: lo.ToPtr(corev1.HostPathDirectory),
},
},
}
containerdSock := corev1.Volume{
Name: "containerdsock",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: config.criEndpointSocketPath(),
Type: lo.ToPtr(corev1.HostPathSocket),
},
},
}
if config.UseContainerMgr {
return []corev1.Volume{images, containerdSock}
} else if config.DisableRunnerCgroup {
return []corev1.Volume{images}
} else {
return []corev1.Volume{images, cgroup}
}
}(),
},
}
if sshSecret != nil {
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts,
corev1.VolumeMount{
Name: "ssh-privatekey",
MountPath: "/mnt/ssh",
},
corev1.VolumeMount{
Name: "ssh-publickey",
MountPath: "/vm/ssh",
},
)
pod.Spec.Volumes = append(pod.Spec.Volumes,
corev1.Volume{
Name: "ssh-privatekey",
VolumeSource: corev1.VolumeSource{
Secret: &corev1.SecretVolumeSource{
SecretName: sshSecret.Name,
Items: []corev1.KeyToPath{
{
Key: "ssh-privatekey",
Path: "id_ed25519",
Mode: lo.ToPtr[int32](0600),
},
},
},
},
},
corev1.Volume{
Name: "ssh-publickey",
VolumeSource: corev1.VolumeSource{
Secret: &corev1.SecretVolumeSource{
SecretName: sshSecret.Name,
Items: []corev1.KeyToPath{
{
Key: "ssh-publickey",
Path: "authorized_keys",
Mode: lo.ToPtr[int32](0644),
},
},
},
},
},
)
}
// If a custom neonvm-runner image is requested, use that instead:
if vm.Spec.RunnerImage != nil {
pod.Spec.Containers[0].Image = *vm.Spec.RunnerImage
if config.UseContainerMgr {
pod.Spec.Containers[1].Image = *vm.Spec.RunnerImage
}
}
// If a custom kernel is used, add that image:
if vm.Spec.Guest.KernelImage != nil {
pod.Spec.Containers[0].Args = append(pod.Spec.Containers[0].Args, "-kernelpath=/vm/images/vmlinuz")
pod.Spec.InitContainers = append(pod.Spec.InitContainers, corev1.Container{
Image: *vm.Spec.Guest.KernelImage,
Name: "init-kernel",
ImagePullPolicy: vm.Spec.Guest.RootDisk.ImagePullPolicy,
Args: []string{"cp", "/vmlinuz", "/vm/images/vmlinuz"},
VolumeMounts: []corev1.VolumeMount{{
Name: "virtualmachineimages",
MountPath: "/vm/images",
}},
SecurityContext: &corev1.SecurityContext{
// uid=36(qemu) gid=34(kvm) groups=34(kvm)
RunAsUser: lo.ToPtr[int64](36),
RunAsGroup: lo.ToPtr[int64](34),
},
})
}
if vm.Spec.Guest.AppendKernelCmdline != nil {
pod.Spec.Containers[0].Args = append(pod.Spec.Containers[0].Args, fmt.Sprintf("-appendKernelCmdline=%s", *vm.Spec.Guest.AppendKernelCmdline))
}
// Add any InitContainers that were specified by the spec
pod.Spec.InitContainers = append(pod.Spec.InitContainers, vm.Spec.ExtraInitContainers...)
// allow access to /dev/kvm and /dev/vhost-net devices by generic-device-plugin for kubelet
if pod.Spec.Containers[0].Resources.Limits == nil {
pod.Spec.Containers[0].Resources.Limits = corev1.ResourceList{}
}
pod.Spec.Containers[0].Resources.Limits["neonvm/vhost-net"] = resource.MustParse("1")
// NB: EnableAcceleration guaranteed non-nil because the k8s API server sets the default for us.
if *vm.Spec.EnableAcceleration {
pod.Spec.Containers[0].Resources.Limits["neonvm/kvm"] = resource.MustParse("1")
}
for _, port := range vm.Spec.Guest.Ports {
cPort := corev1.ContainerPort{
ContainerPort: int32(port.Port),
}
if len(port.Name) != 0 {
cPort.Name = port.Name
}
if len(port.Protocol) != 0 {
cPort.Protocol = corev1.Protocol(port.Protocol)
}
pod.Spec.Containers[0].Ports = append(pod.Spec.Containers[0].Ports, cPort)
}
if settings := vm.Spec.Guest.Settings; settings != nil {
swapInfo, err := settings.GetSwapInfo()
if err != nil {
return nil, fmt.Errorf("error getting SwapInfo from VirtualMachine guest settings: %w", err)
}
if swapInfo != nil {
diskName := "swapdisk"
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts, corev1.VolumeMount{
Name: diskName,
MountPath: fmt.Sprintf("/vm/mounts/%s", diskName),
})
pod.Spec.Volumes = append(pod.Spec.Volumes, corev1.Volume{
Name: diskName,
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{
SizeLimit: &swapInfo.Size,
},
},
})
}
}
for _, disk := range vm.Spec.Disks {
mnt := corev1.VolumeMount{
Name: disk.Name,
MountPath: fmt.Sprintf("/vm/mounts%s", disk.MountPath),
}
if disk.ReadOnly != nil {
mnt.ReadOnly = *disk.ReadOnly
}
switch {
case disk.ConfigMap != nil:
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts, mnt)
pod.Spec.Volumes = append(pod.Spec.Volumes, corev1.Volume{
Name: disk.Name,
VolumeSource: corev1.VolumeSource{
ConfigMap: &corev1.ConfigMapVolumeSource{
LocalObjectReference: corev1.LocalObjectReference{
Name: disk.ConfigMap.Name,
},
Items: disk.ConfigMap.Items,
},
},
})
case disk.Secret != nil:
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts, mnt)
pod.Spec.Volumes = append(pod.Spec.Volumes, corev1.Volume{
Name: disk.Name,
VolumeSource: corev1.VolumeSource{
Secret: &corev1.SecretVolumeSource{
SecretName: disk.Secret.SecretName,
Items: disk.Secret.Items,
},
},
})
case disk.EmptyDisk != nil:
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts, mnt)
pod.Spec.Volumes = append(pod.Spec.Volumes, corev1.Volume{
Name: disk.Name,
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{
SizeLimit: &disk.EmptyDisk.Size,
},
},
})
default:
// do nothing
}
}
// use multus network to add extra network interface
if vm.Spec.ExtraNetwork != nil && vm.Spec.ExtraNetwork.Enable {
var nadNetwork string
if len(vm.Spec.ExtraNetwork.MultusNetwork) > 0 { // network specified in spec
nadNetwork = vm.Spec.ExtraNetwork.MultusNetwork
} else { // get network from env variables
nadName, err := nadRunnerName()
if err != nil {
return nil, err
}
nadNamespace, err := nadRunnerNamespace()
if err != nil {
return nil, err
}
nadNetwork = fmt.Sprintf("%s/%s", nadNamespace, nadName)
}
pod.ObjectMeta.Annotations[nadapiv1.NetworkAttachmentAnnot] = fmt.Sprintf("%s@%s", nadNetwork, vm.Spec.ExtraNetwork.Interface)
}
return pod, nil
}
// SetupWithManager sets up the controller with the Manager.
// Note that the runner Pod will also be watched in order to ensure its
// desired state on the cluster
func (r *VMReconciler) SetupWithManager(mgr ctrl.Manager) (ReconcilerWithMetrics, error) {
cntrlName := "virtualmachine"
reconciler := WithMetrics(
withCatchPanic(r),
r.Metrics,
cntrlName,
r.Config.FailurePendingPeriod,
r.Config.FailingRefreshInterval,
)
err := ctrl.NewControllerManagedBy(mgr).
For(&vmv1.VirtualMachine{}).
Owns(&corev1.Pod{}).
WithOptions(controller.Options{MaxConcurrentReconciles: r.Config.MaxConcurrentReconciles}).
Named(cntrlName).
Complete(reconciler)
return reconciler, err
}
func DeepEqual(v1, v2 interface{}) bool {
if reflect.DeepEqual(v1, v2) {
return true
}
var x1 interface{}
bytesA, _ := json.Marshal(v1)
_ = json.Unmarshal(bytesA, &x1)
var x2 interface{}
bytesB, _ := json.Marshal(v2)
_ = json.Unmarshal(bytesB, &x2)
return reflect.DeepEqual(x1, x2)
}
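// Illustrative sketch (the local types are hypothetical): DeepEqual falls back to comparing JSON
// encodings, so values that reflect.DeepEqual rejects purely because their Go types differ (e.g.
// int32 vs int64 fields holding the same numbers) still compare as equal.
func exampleDeepEqualJSONFallback() bool {
	type a struct{ N int32 }
	type b struct{ N int64 }
	return DeepEqual(a{N: 7}, b{N: 7}) // true: both marshal to {"N":7}
}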
// TODO: reimplement to r.Patch()
func (r *VMReconciler) tryUpdateVM(ctx context.Context, vm *vmv1.VirtualMachine) error {
return r.Update(ctx, vm)
}
// return Network Attachment Definition name with IPAM settings
func nadIpamName() (string, error) {
return getEnvVarValue("NAD_IPAM_NAME")
}
// return Network Attachment Definition namespace with IPAM settings
func nadIpamNamespace() (string, error) {
return getEnvVarValue("NAD_IPAM_NAMESPACE")
}
// return Network Attachment Definition name for second interface in Runner
func nadRunnerName() (string, error) {
return getEnvVarValue("NAD_RUNNER_NAME")
}
// return Network Attachment Definition namespace for second interface in Runner
func nadRunnerNamespace() (string, error) {
return getEnvVarValue("NAD_RUNNER_NAMESPACE")
}
// return env variable value
func getEnvVarValue(envVarName string) (string, error) {
value, found := os.LookupEnv(envVarName)
if !found {
return "", fmt.Errorf("unable to find %s environment variable", envVarName)
}
return value, nil
}
// sshKeygen generates a pair of public and private keys using the ed25519
// algorithm. It returns the generated public key and private key as byte
// slices. If an error occurs during key generation or encoding, it returns nil
// for both keys and the error.
func sshKeygen() (publicKeyBytes []byte, privateKeyBytes []byte, err error) {
publicKey, privateKey, err := ed25519.GenerateKey(rand.Reader)
if err != nil {
return nil, nil, err
}
publicKeyBytes, err = encodePublicKey(publicKey)
if err != nil {
return nil, nil, err
}
privateKeyBytes, err = encodePrivateKey(privateKey)
if err != nil {
return nil, nil, err
}
return
}
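// Hedged usage sketch, not called by the controller: the key pair comes back in exactly the
// formats the SSH secret above stores - an OpenSSH authorized_keys line for the public key and a
// PEM-encoded OpenSSH private key.
func exampleSSHKeygen() error {
	pub, priv, err := sshKeygen()
	if err != nil {
		return err
	}
	fmt.Printf("public key: %s", pub)                          // e.g. "ssh-ed25519 AAAA...\n"
	fmt.Printf("private key is %d bytes of PEM\n", len(priv)) // "-----BEGIN OPENSSH PRIVATE KEY-----..."
	return nil
}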
func encodePrivateKey(privateKey ed25519.PrivateKey) ([]byte, error) {
privBlock, err := ssh.MarshalPrivateKey(privateKey, "")
if err != nil {
return nil, err
}
privatePEM := pem.EncodeToMemory(privBlock)
return privatePEM, nil
}
func encodePublicKey(publicKey ed25519.PublicKey) ([]byte, error) {
sshPublicKey, err := ssh.NewPublicKey(publicKey)
if err != nil {
return nil, err
}
pubKeyBytes := ssh.MarshalAuthorizedKey(sshPublicKey)
return pubKeyBytes, nil
}
package controllers
import (
"context"
"encoding/json"
"errors"
"fmt"
"strconv"
"strings"
"time"
"github.com/digitalocean/go-qemu/qmp"
"github.com/go-logr/logr"
"sigs.k8s.io/controller-runtime/pkg/log"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/client-go/tools/record"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
type QmpCpus struct {
Return []struct {
Props struct {
CoreId int32 `json:"core-id"`
ThreadId int32 `json:"thread-id"`
SocketId int32 `json:"socket-id"`
} `json:"props"`
VcpusCount int32 `json:"vcpus-count"`
QomPath *string `json:"qom-path"`
Type string `json:"type"`
} `json:"return"`
}
type QmpMemorySize struct {
Return struct {
BaseMemory int64 `json:"base-memory"`
PluggedMemory int64 `json:"plugged-memory"`
} `json:"return"`
}
type QmpCpuSlot struct {
Core int32 `json:"core"`
QOM string `json:"qom"`
Type string `json:"type"`
}
type QmpMemoryDevices struct {
Return []QmpMemoryDevice `json:"return"`
}
type QmpMemoryDevice struct {
Type string `json:"type"`
Data struct {
Memdev string `json:"memdev"`
Hotplugged bool `json:"hotplugged"`
Addr int64 `json:"addr"`
Hotplugguble bool `json:"hotpluggable"`
Size int64 `json:"size"`
Slot int64 `json:"slot"`
Node int64 `json:"node"`
Id string `json:"id"`
} `json:"data"`
}
type QmpObjects struct {
Return []QmpObject `json:"return"`
}
type QmpObject struct {
Name string `json:"name"`
Type string `json:"type"`
}
type QmpMigrationInfo struct {
Return MigrationInfo `json:"return"`
}
type MigrationInfo struct {
Status string `json:"status"`
TotalTimeMs int64 `json:"total-time"`
SetupTimeMs int64 `json:"setup-time"`
DowntimeMs int64 `json:"downtime"`
Ram struct {
Transferred int64 `json:"transferred"`
Remaining int64 `json:"remaining"`
Total int64 `json:"total"`
Duplicate int64 `json:"duplicate"`
Normal int64 `json:"normal"`
NormalBytes int64 `json:"normal-bytes"`
DirtySyncCount int64 `json:"dirty-sync-count"`
} `json:"ram"`
Compression struct {
CompressedSize int64 `json:"compressed-size"`
CompressionRate float64 `json:"compression-rate"`
} `json:"compression"`
}
func QmpAddr(vm *vmv1.VirtualMachine) (ip string, port int32) {
return vm.Status.PodIP, vm.Spec.QMP
}
func QmpConnect(ip string, port int32) (*qmp.SocketMonitor, error) {
mon, err := qmp.NewSocketMonitor("tcp", fmt.Sprintf("%s:%d", ip, port), 2*time.Second)
if err != nil {
return nil, err
}
if err := mon.Connect(); err != nil {
return nil, err
}
return mon, nil
}
func QmpGetCpus(ip string, port int32) ([]QmpCpuSlot, []QmpCpuSlot, error) {
mon, err := QmpConnect(ip, port)
if err != nil {
return nil, nil, err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
qmpcmd := []byte(`{"execute": "query-hotpluggable-cpus"}`)
raw, err := mon.Run(qmpcmd)
if err != nil {
return nil, nil, err
}
var result QmpCpus
if err := json.Unmarshal(raw, &result); err != nil {
return nil, nil, fmt.Errorf("error unmarshaling json: %w", err)
}
plugged := []QmpCpuSlot{}
empty := []QmpCpuSlot{}
for _, entry := range result.Return {
if entry.QomPath != nil {
plugged = append(plugged, QmpCpuSlot{Core: entry.Props.CoreId, QOM: *entry.QomPath, Type: entry.Type})
} else {
empty = append(empty, QmpCpuSlot{Core: entry.Props.CoreId, QOM: "", Type: entry.Type})
}
}
return plugged, empty, nil
}
func QmpPlugCpu(ip string, port int32) error {
_, empty, err := QmpGetCpus(ip, port)
if err != nil {
return err
}
if len(empty) == 0 {
return errors.New("no empty slots for CPU hotplug")
}
mon, err := QmpConnect(ip, port)
if err != nil {
return err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
// the empty list is reversed: the first CPU slot is at the end of the list and the last slot at the beginning
slot := empty[len(empty)-1]
qmpcmd := []byte(fmt.Sprintf(`{
"execute": "device_add",
"arguments": {
"id": "cpu%d",
"driver": %q,
"core-id": %d,
"socket-id": 0,
"thread-id": 0
}
}`, slot.Core, slot.Type, slot.Core))
_, err = mon.Run(qmpcmd)
if err != nil {
return err
}
return nil
}
func QmpUnplugCpu(ip string, port int32) error {
plugged, _, err := QmpGetCpus(ip, port)
if err != nil {
return err
}
slot := -1
found := false
for i, s := range plugged {
if strings.Contains(s.QOM, "machine/peripheral/cpu") {
found = true
slot = i
break
}
}
if !found {
return errors.New("there are no unpluggable CPUs")
}
mon, err := QmpConnect(ip, port)
if err != nil {
return err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
cmd := []byte(fmt.Sprintf(`{"execute": "device_del", "arguments": {"id": %q}}`, plugged[slot].QOM))
_, err = mon.Run(cmd)
if err != nil {
return err
}
// small pause to let hypervisor do unplug
time.Sleep(500 * time.Millisecond)
return nil
}
func QmpSyncCpuToTarget(vm *vmv1.VirtualMachine, migration *vmv1.VirtualMachineMigration) error {
plugged, _, err := QmpGetCpus(QmpAddr(vm))
if err != nil {
return err
}
pluggedInTarget, _, err := QmpGetCpus(migration.Status.TargetPodIP, vm.Spec.QMP)
if err != nil {
return err
}
if len(plugged) == len(pluggedInTarget) {
// no need plug anything
return nil
}
target, err := QmpConnect(migration.Status.TargetPodIP, vm.Spec.QMP)
if err != nil {
return err
}
defer target.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
searchForEmpty:
for _, slot := range plugged {
// first check whether the slot is already occupied:
// run over the target's CPUs and compare with the source
for _, tslot := range pluggedInTarget {
if slot == tslot {
// this CPU is already present in the target, skip it
continue searchForEmpty
}
}
qmpcmd := []byte(fmt.Sprintf(`{
"execute": "device_add",
"arguments": {
"id": "cpu%d",
"driver": %q,
"core-id": %d,
"socket-id": 0,
"thread-id": 0
}
}`, slot.Core, slot.Type, slot.Core))
_, err = target.Run(qmpcmd)
if err != nil {
return err
}
}
return nil
}
func QmpQueryMemoryDevices(ip string, port int32) ([]QmpMemoryDevice, error) {
mon, err := QmpConnect(ip, port)
if err != nil {
return nil, err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
return QmpMonQueryMemoryDevices(mon)
}
func QmpMonQueryMemoryDevices(mon *qmp.SocketMonitor) ([]QmpMemoryDevice, error) {
cmd := []byte(`{"execute": "query-memory-devices"}`)
raw, err := mon.Run(cmd)
if err != nil {
return nil, err
}
var result QmpMemoryDevices
if err := json.Unmarshal(raw, &result); err != nil {
return nil, fmt.Errorf("error unmarshaling json: %w", err)
}
return result.Return, nil
}
// MemslotIdxFromName takes "/objects/memslot3" or "memslot3" and returns 3
func MemslotIdxFromName(name string) (int, error) {
name = strings.TrimPrefix(name, "/objects/")
idxStr := strings.TrimPrefix(name, "memslot")
idx, err := strconv.Atoi(idxStr)
if err != nil {
// doesn't reference `err`, because we don't know the actual issue
return 0, fmt.Errorf("failed to parse memory device id: %q", name)
}
return idx, nil
}
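// Illustrative usage sketch: both the full QOM path and the bare object id parse to the same index.
func exampleMemslotIdxFromName() {
	idx1, _ := MemslotIdxFromName("/objects/memslot3") // 3
	idx2, _ := MemslotIdxFromName("memslot3")          // 3
	fmt.Println(idx1 == idx2)                          // true
}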
func QmpQueryMemoryBackendIds(mon *qmp.SocketMonitor) (map[int]struct{}, error) {
cmd := []byte(`{"execute": "qom-list", "arguments": {"path": "/objects"}}`)
raw, err := mon.Run(cmd)
if err != nil {
return nil, err
}
var result QmpObjects
if err := json.Unmarshal(raw, &result); err != nil {
return nil, fmt.Errorf("error unmarshaling json: %w", err)
}
backends := map[int]struct{}{}
for _, o := range result.Return {
if o.Name == "pc.ram" { // Non-hotplugged memory
continue
}
if o.Type != "child<memory-backend-ram>" {
continue
}
idx, err := MemslotIdxFromName(o.Name)
if err != nil {
return nil, err
}
backends[idx] = struct{}{}
}
return backends, nil
}
type QMPRunner interface {
Run([]byte) ([]byte, error)
}
// QmpSetVirtioMem updates virtio-mem to the new target size, returning the previous target.
//
// If the new target size is equal to the previous one, this function does nothing but query the
// target.
func QmpSetVirtioMem(vm *vmv1.VirtualMachine, targetVirtioMemSize int64) (previous int64, _ error) {
// Note: The virtio-mem device only exists when max mem != min mem.
// So if min == max, we should just short-cut, skip the queries, and say it's all good.
// Refer to the instantiation in neonvm-runner for more.
if vm.Spec.Guest.MemorySlots.Min == vm.Spec.Guest.MemorySlots.Max {
// if target size is non-zero even though min == max, something went very wrong
if targetVirtioMemSize != 0 {
panic(fmt.Sprintf(
"VM min mem slots == max mem slots, but target virtio-mem size %d != 0",
targetVirtioMemSize,
))
}
// Otherwise, we're all good, just pretend like we talked to the VM.
return 0, nil
}
mon, err := QmpConnect(QmpAddr(vm))
if err != nil {
return 0, err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
// First, fetch current desired virtio-mem size. If it's the same as targetVirtioMemSize, then
// we can report that it was already the same.
cmd := []byte(`{"execute": "qom-get", "arguments": {"path": "vm0", "property": "requested-size"}}`)
raw, err := mon.Run(cmd)
if err != nil {
return 0, err
}
result := struct {
Return int64 `json:"return"`
}{Return: 0}
if err := json.Unmarshal(raw, &result); err != nil {
return 0, fmt.Errorf("error unmarshaling json: %w", err)
}
previous = result.Return
if previous == targetVirtioMemSize {
return previous, nil
}
// The current requested size is not equal to the new desired size. Let's change that.
cmd = []byte(fmt.Sprintf(
`{"execute": "qom-set", "arguments": {"path": "vm0", "property": "requested-size", "value": %d}}`,
targetVirtioMemSize,
))
_, err = mon.Run(cmd)
if err != nil {
return 0, err
}
return previous, nil
}
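// Hedged usage sketch (the 3 GiB target is illustrative): the returned previous target lets the
// caller detect whether anything was actually requested from QEMU, which doVirtioMemScaling above
// uses to decide whether to emit a ScaleUp/ScaleDown event.
func exampleSetVirtioMem(vm *vmv1.VirtualMachine) (changed bool, _ error) {
	const gib = int64(1) << 30
	previous, err := QmpSetVirtioMem(vm, 3*gib)
	if err != nil {
		return false, err
	}
	// If the previous target already matched, only the qom-get query was sent.
	return previous != 3*gib, nil
}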
// QmpAddMemoryBackend adds a single memory slot to the VM with the given size.
//
// The memory slot does nothing until a corresponding "device" is added to the VM for the same memory slot.
// See QmpAddMemoryDevice for more.
// When unplugging, QmpDelMemoryDevice must be called before QmpDelMemoryBackend.
func QmpAddMemoryBackend(mon QMPRunner, idx int, sizeBytes int64) error {
cmd := []byte(fmt.Sprintf(
`{"execute": "object-add",
"arguments": {"id": "memslot%d",
"size": %d,
"qom-type": "memory-backend-ram"}}`, idx, sizeBytes,
))
_, err := mon.Run(cmd)
return err
}
func QmpDelMemoryBackend(mon *qmp.SocketMonitor, idx int) error {
cmd := []byte(fmt.Sprintf(
`{"execute": "object-del",
"arguments": {"id": "memslot%d"}}`, idx,
))
_, err := mon.Run(cmd)
return err
}
func QmpAddMemoryDevice(mon *qmp.SocketMonitor, idx int) error {
cmd := []byte(fmt.Sprintf(
`{"execute": "device_add",
"arguments": {"id": "dimm%d",
"driver": "pc-dimm",
"memdev": "memslot%d"}}`, idx, idx,
))
_, err := mon.Run(cmd)
return err
}
func QmpDelMemoryDevice(mon *qmp.SocketMonitor, idx int) error {
cmd := []byte(fmt.Sprintf(
`{"execute": "device_del",
"arguments": {"id": "dimm%d"}}`, idx,
))
_, err := mon.Run(cmd)
return err
}
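// Hedged sketch of the required ordering (idx is illustrative): plug the backend before the
// device, and unplug the device before the backend. QmpMemorySetter below follows the same rule
// during reconciliation.
func examplePlugUnplugOrder(mon *qmp.SocketMonitor, sizeBytes int64) error {
	const idx = 1
	if err := QmpAddMemoryBackend(mon, idx, sizeBytes); err != nil {
		return err
	}
	if err := QmpAddMemoryDevice(mon, idx); err != nil {
		return err
	}
	// ... later, to unplug the same slot:
	if err := QmpDelMemoryDevice(mon, idx); err != nil {
		return err
	}
	return QmpDelMemoryBackend(mon, idx)
}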
type QmpMemorySetter struct {
vm *vmv1.VirtualMachine
targetCnt int
recorder record.EventRecorder
log logr.Logger
mon *qmp.SocketMonitor
memBackends map[int]bool // idx -> is active
maxBackend int // stores the max idx that was discovered or added.
// Needed to know where to start deletion from.
memDevCount int
errs []error
}
func (r *QmpMemorySetter) buildState() error {
memDevs, err := QmpMonQueryMemoryDevices(r.mon)
if err != nil {
return err
}
r.memDevCount = len(memDevs)
for _, m := range memDevs {
idx, err := MemslotIdxFromName(m.Data.Memdev)
if err == nil {
r.memBackends[idx] = true
}
}
backends, err := QmpQueryMemoryBackendIds(r.mon)
if err != nil {
return err
}
for b := range backends {
if _, ok := r.memBackends[b]; !ok {
r.memBackends[b] = false
}
}
for idx := range r.memBackends {
r.maxBackend = max(r.maxBackend, idx)
}
r.log.Info("QMP memory state", "backends", r.memBackends, "maxBackend", r.maxBackend)
return nil
}
func (r *QmpMemorySetter) Disconnect() {
if r.mon != nil {
err := r.mon.Disconnect()
if err != nil {
r.log.Error(err, "Failed to disconnect QMP")
}
}
}
// attemptsCounter limits the total number of operations in each phase.
// If QMP keeps timing out while the operations silently succeed, we don't
// want to keep issuing QMP actions until we finally observe enough positive
// results.
type attemptsCounter struct {
target int
done int
}
func newAttemptsCounter(target int) *attemptsCounter {
return &attemptsCounter{
target: target,
done: 0,
}
}
// Registers an attempt and returns true if it is allowed to continue
func (t *attemptsCounter) attempt() bool {
if t.done < t.target {
t.done++
return true
}
return false
}
func (t *attemptsCounter) didSomething() bool {
return t.done > 0
}
func (r *QmpMemorySetter) AddBackends() {
if r.targetCnt <= len(r.memBackends) {
return
}
attempts := newAttemptsCounter(r.targetCnt - len(r.memBackends))
for idx := 1; idx <= r.targetCnt; idx++ {
if _, ok := r.memBackends[idx]; ok {
continue
}
if !attempts.attempt() {
break
}
err := QmpAddMemoryBackend(r.mon, idx, r.vm.Spec.Guest.MemorySlotSize.Value())
if err != nil {
r.errs = append(r.errs, err)
r.recorder.Event(r.vm, "Warning", "ScaleUp",
fmt.Sprintf("Failed to add memslot%d: %s",
idx, err.Error()))
continue
}
r.recorder.Event(r.vm, "Normal", "ScaleUp",
fmt.Sprintf("Added memslot%d", idx))
r.memBackends[idx] = false
// The one we just added might be the new max
r.maxBackend = max(r.maxBackend, idx)
}
if attempts.didSomething() {
// might need to wait for QEMU to allocate the memory
time.Sleep(time.Second)
}
}
func (r *QmpMemorySetter) AddDevices() {
if r.targetCnt <= r.memDevCount {
return
}
attempts := newAttemptsCounter(r.targetCnt - r.memDevCount)
for idx := 1; idx <= r.maxBackend; idx++ {
active, ok := r.memBackends[idx]
if !ok || active {
continue
}
// Found unused backend to plug into
if !attempts.attempt() {
break
}
err := QmpAddMemoryDevice(r.mon, idx)
if err != nil {
r.errs = append(r.errs, err)
r.recorder.Event(r.vm, "Warning", "ScaleUp",
fmt.Sprintf("Failed to add dimm%d to VM %s: %s",
idx, r.vm.Name, err.Error()))
continue
}
r.recorder.Event(r.vm, "Normal", "ScaleUp",
fmt.Sprintf("Added dimm%d", idx))
r.memBackends[idx] = true
r.memDevCount++
}
}
func (r *QmpMemorySetter) RemoveDevices() {
if r.memDevCount <= r.targetCnt {
return
}
attempts := newAttemptsCounter(r.memDevCount - r.targetCnt)
// Remove from the end so that the remaining slots stay contiguous (memslot1, memslot2, ...)
for idx := r.maxBackend; idx >= 1; idx-- {
active, ok := r.memBackends[idx]
if !ok || !active {
continue
}
// Found used backend to remove
if !attempts.attempt() {
break
}
err := QmpDelMemoryDevice(r.mon, idx)
if err != nil {
r.errs = append(r.errs, err)
r.recorder.Event(r.vm, "Warning", "ScaleDown",
fmt.Sprintf("Failed to remove dimm%d: %s",
idx, err.Error()))
continue
}
r.recorder.Event(r.vm, "Normal", "ScaleDown",
fmt.Sprintf("Removed dimm%d", idx))
r.memBackends[idx] = false
r.memDevCount--
}
if attempts.didSomething() {
// wait a bit to allow guest kernel remove memory block
time.Sleep(time.Second)
}
}
func (r *QmpMemorySetter) RemoveBackends() {
if len(r.memBackends) <= r.targetCnt {
return
}
attempts := newAttemptsCounter(len(r.memBackends) - r.targetCnt)
for idx := r.maxBackend; idx >= 1; idx-- {
active, ok := r.memBackends[idx]
if !ok || active {
continue
}
if !attempts.attempt() {
break
}
err := QmpDelMemoryBackend(r.mon, idx)
if err != nil {
r.errs = append(r.errs, err)
r.recorder.Event(r.vm, "Warning", "ScaleDown",
fmt.Sprintf("Failed to remove memslot%d: %s",
idx, err.Error()))
continue
}
r.recorder.Event(r.vm, "Normal", "ScaleDown",
fmt.Sprintf("Removed memslot%d", idx))
delete(r.memBackends, idx)
}
}
func (r *QmpMemorySetter) run() (int, error) {
// Usually only the first two or the last two phases do anything.
// If there are leftover slots, phases 2 and 4 might both run.
// If there are errors, the last two phases serve as cleanup.
phases := []func(){
r.AddBackends,
r.AddDevices,
r.RemoveDevices,
r.RemoveBackends,
}
for _, phase := range phases {
phase()
}
return r.memDevCount, errors.Join(r.errs...)
}
// QmpSetMemorySlots attempts to plug/unplug memory slots to match targetCnt.
//
// Returns the number of slots the function managed to plug.
// Ideally, it matches targetCnt, but can be less or more if there are
// errors.
//
// Returns -1 if it failed to gather the current state of memory; otherwise,
// the return value is valid even if there are errors.
//
// In order for the hotplug to occur, we have to do two things:
// 1. Plug memory backend (memslot<n>) - a QEMU object, which physically
// allocates the memory from host
// 2. Plug DIMM device (dimm<n>) - a device which exposes that memory to the
// guest. dimm<n> is always plugged into memslot<n> with the same n.
//
// In order to do hotunplug, we need to make the same actions in the reversed
// order.
func QmpSetMemorySlots(
ctx context.Context,
vm *vmv1.VirtualMachine,
targetCnt int,
recorder record.EventRecorder,
) (int, error) {
log := log.FromContext(ctx)
mon, err := QmpConnect(QmpAddr(vm))
if err != nil {
return -1, err
}
setter := &QmpMemorySetter{
vm: vm,
targetCnt: targetCnt,
recorder: recorder,
log: log,
mon: mon,
memBackends: map[int]bool{},
maxBackend: 0,
memDevCount: 0,
errs: []error{},
}
defer setter.Disconnect()
err = setter.buildState()
if err != nil {
return -1, err
}
return setter.run()
}
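// Hedged usage sketch (the target of 4 slots is illustrative): the returned count is the number of
// slots actually plugged after the call, which remains meaningful even when some QMP operations
// failed; -1 means the current state couldn't be read at all.
func exampleSetMemorySlots(ctx context.Context, vm *vmv1.VirtualMachine, recorder record.EventRecorder) error {
	plugged, err := QmpSetMemorySlots(ctx, vm, 4, recorder)
	if plugged < 0 {
		return err // couldn't even gather the current memory state
	}
	if plugged != 4 {
		// Partial success: a caller like doDIMMSlotsScaling above adjusts
		// .spec.guest.memorySlots.use to match what was actually plugged.
		return err
	}
	return nil
}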
func QmpSyncMemoryToTarget(vm *vmv1.VirtualMachine, migration *vmv1.VirtualMachineMigration) error {
memoryDevices, err := QmpQueryMemoryDevices(QmpAddr(vm))
if err != nil {
return err
}
memoryDevicesInTarget, err := QmpQueryMemoryDevices(migration.Status.TargetPodIP, vm.Spec.QMP)
if err != nil {
return err
}
target, err := QmpConnect(migration.Status.TargetPodIP, vm.Spec.QMP)
if err != nil {
return err
}
defer target.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
for _, m := range memoryDevices {
// first check whether the slot is already occupied:
// run over the target's memory devices and compare device ids
found := false
for _, tm := range memoryDevicesInTarget {
if DeepEqual(m, tm) {
found = true
}
}
if found {
// memory device 'm' is already present in the target, skip it
continue
}
// add memdev object
memdevIdx, err := MemslotIdxFromName(m.Data.Memdev)
if err != nil {
return err
}
err = QmpAddMemoryBackend(target, memdevIdx, m.Data.Size)
if err != nil {
return err
}
// now add pc-dimm device
err = QmpAddMemoryDevice(target, memdevIdx)
if err != nil {
// device_add command failed... so try remove object that we just created
_ = QmpDelMemoryBackend(target, memdevIdx)
return err
}
}
return nil
}
func QmpGetMemorySize(ip string, port int32) (*resource.Quantity, error) {
mon, err := QmpConnect(ip, port)
if err != nil {
return nil, err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
qmpcmd := []byte(`{"execute": "query-memory-size-summary"}`)
raw, err := mon.Run(qmpcmd)
if err != nil {
return nil, err
}
var result QmpMemorySize
if err := json.Unmarshal(raw, &result); err != nil {
return nil, fmt.Errorf("error unmarshaling json: %w", err)
}
return resource.NewQuantity(result.Return.BaseMemory+result.Return.PluggedMemory, resource.BinarySI), nil
}
func QmpStartMigration(virtualmachine *vmv1.VirtualMachine, virtualmachinemigration *vmv1.VirtualMachineMigration) error {
// QMP port
port := virtualmachine.Spec.QMP
// connect to source runner QMP
s_ip := virtualmachinemigration.Status.SourcePodIP
smon, err := qmp.NewSocketMonitor("tcp", fmt.Sprintf("%s:%d", s_ip, port), 2*time.Second)
if err != nil {
return err
}
if err := smon.Connect(); err != nil {
return err
}
defer smon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
// connect to target runner QMP
t_ip := virtualmachinemigration.Status.TargetPodIP
tmon, err := qmp.NewSocketMonitor("tcp", fmt.Sprintf("%s:%d", t_ip, port), 2*time.Second)
if err != nil {
return err
}
if err := tmon.Connect(); err != nil {
return err
}
defer tmon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
cache := resource.MustParse("256Mi")
var qmpcmd []byte
// setup migration on source runner
qmpcmd = []byte(fmt.Sprintf(`{
"execute": "migrate-set-capabilities",
"arguments":
{
"capabilities": [
{"capability": "postcopy-ram", "state": %t},
{"capability": "xbzrle", "state": true},
{"capability": "compress", "state": true},
{"capability": "auto-converge", "state": %t},
{"capability": "zero-blocks", "state": true}
]
}
}`, virtualmachinemigration.Spec.AllowPostCopy, virtualmachinemigration.Spec.AutoConverge))
_, err = smon.Run(qmpcmd)
if err != nil {
return err
}
qmpcmd = []byte(fmt.Sprintf(`{
"execute": "migrate-set-parameters",
"arguments":
{
"xbzrle-cache-size": %d,
"max-bandwidth": %d,
"multifd-compression": "zstd"
}
}`, cache.Value(), virtualmachinemigration.Spec.MaxBandwidth.Value()))
_, err = smon.Run(qmpcmd)
if err != nil {
return err
}
// setup migration on target runner
qmpcmd = []byte(fmt.Sprintf(`{
"execute": "migrate-set-capabilities",
"arguments":
{
"capabilities": [
{"capability": "postcopy-ram", "state": %t},
{"capability": "xbzrle", "state": true},
{"capability": "compress", "state": true},
{"capability": "auto-converge", "state": %t},
{"capability": "zero-blocks", "state": true}
]
}
}`, virtualmachinemigration.Spec.AllowPostCopy, virtualmachinemigration.Spec.AutoConverge))
_, err = tmon.Run(qmpcmd)
if err != nil {
return err
}
qmpcmd = []byte(fmt.Sprintf(`{
"execute": "migrate-set-parameters",
"arguments":
{
"xbzrle-cache-size": %d,
"max-bandwidth": %d,
"multifd-compression": "zstd"
}
}`, cache.Value(), virtualmachinemigration.Spec.MaxBandwidth.Value()))
_, err = tmon.Run(qmpcmd)
if err != nil {
return err
}
// trigger migration
qmpcmd = []byte(fmt.Sprintf(`{
"execute": "migrate",
"arguments":
{
"uri": "tcp:%s:%d",
"inc": %t,
"blk": %t
}
}`, t_ip, vmv1.MigrationPort, virtualmachinemigration.Spec.Incremental, !virtualmachinemigration.Spec.Incremental))
_, err = smon.Run(qmpcmd)
if err != nil {
return err
}
if virtualmachinemigration.Spec.AllowPostCopy {
qmpcmd = []byte(`{"execute": "migrate-start-postcopy"}`)
_, err = smon.Run(qmpcmd)
if err != nil {
return err
}
}
return nil
}
func QmpGetMigrationInfo(ip string, port int32) (*MigrationInfo, error) {
mon, err := QmpConnect(ip, port)
if err != nil {
return nil, err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
qmpcmd := []byte(`{"execute": "query-migrate"}`)
raw, err := mon.Run(qmpcmd)
if err != nil {
return nil, err
}
var result QmpMigrationInfo
if err := json.Unmarshal(raw, &result); err != nil {
return nil, fmt.Errorf("error unmarshaling json: %w", err)
}
return &result.Return, nil
}
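// Illustrative polling sketch, not how the controller consumes this: QEMU's MigrationInfo.Status
// eventually reaches a terminal value such as "completed", "failed" or "cancelled".
func exampleWaitForMigration(ip string, port int32) error {
	for {
		info, err := QmpGetMigrationInfo(ip, port)
		if err != nil {
			return err
		}
		switch info.Status {
		case "completed":
			return nil
		case "failed", "cancelled":
			return fmt.Errorf("migration ended with status %q", info.Status)
		}
		time.Sleep(time.Second)
	}
}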
func QmpCancelMigration(ip string, port int32) error {
mon, err := QmpConnect(ip, port)
if err != nil {
return err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
qmpcmd := []byte(`{"execute": "migrate_cancel"}`)
_, err = mon.Run(qmpcmd)
if err != nil {
return err
}
return nil
}
func QmpQuit(ip string, port int32) error {
mon, err := QmpConnect(ip, port)
if err != nil {
return err
}
defer mon.Disconnect() //nolint:errcheck // nothing to do with error when deferred. TODO: log it?
qmpcmd := []byte(`{"execute": "quit"}`)
_, err = mon.Run(qmpcmd)
if err != nil {
return err
}
return nil
}
/*
Copyright 2023.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controllers
import (
"context"
"errors"
"fmt"
"math"
"time"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/log"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apiserver/pkg/storage/names"
"k8s.io/client-go/tools/record"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/neonvm/controllers/buildtag"
)
const virtualmachinemigrationFinalizer = "vm.neon.tech/finalizer"
// Definitions to manage status conditions
const (
// typeAvailableVirtualMachineMigration represents the status of the VirtualMachineMigration reconciliation
typeAvailableVirtualMachineMigration = "Available"
// typeDegradedVirtualMachineMigration represents the status used when the custom resource is deleted and the finalizer operations must occur.
typeDegradedVirtualMachineMigration = "Degraded"
)
// VirtualMachineMigrationReconciler reconciles a VirtualMachineMigration object
type VirtualMachineMigrationReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
Config *ReconcilerConfig
Metrics ReconcilerMetrics
}
// The following markers are used to generate the rules permissions (RBAC) on config/rbac using controller-gen
// when controller-gen (used by 'make generate') is executed.
// To know more about markers see: https://book.kubebuilder.io/reference/markers.html
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachinemigrations,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachinemigrations/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=vm.neon.tech,resources=virtualmachinemigrations/finalizers,verbs=update
//+kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=pods/status,verbs=get;list;watch
// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
// It is essential for the controller's reconciliation loop to be idempotent. By following the Operator
// pattern you will create Controllers which provide a reconcile function
// responsible for synchronizing resources until the desired state is reached on the cluster.
// Breaking this recommendation goes against the design principles of controller-runtime
// and may lead to unforeseen consequences such as resources becoming stuck and requiring manual intervention.
// For further info:
// - About Operator Pattern: https://kubernetes.io/docs/concepts/extend-kubernetes/operator/
// - About Controllers: https://kubernetes.io/docs/concepts/architecture/controller/
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.13.0/pkg/reconcile
func (r *VirtualMachineMigrationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log := log.FromContext(ctx)
// Fetch the VirtualMachineMigration instance
// The purpose is to check whether the Custom Resource for the Kind VirtualMachineMigration
// is applied on the cluster; if not, we return nil to stop the reconciliation
migration := new(vmv1.VirtualMachineMigration)
if err := r.Get(ctx, req.NamespacedName, migration); err != nil {
// ignore error and stop reconcile loop if object not found (already deleted?)
if apierrors.IsNotFound(err) {
return ctrl.Result{}, nil
}
log.Error(err, "Unable to fetch Migration")
return ctrl.Result{}, err
}
// examine DeletionTimestamp to determine if object is under deletion
if migration.ObjectMeta.DeletionTimestamp.IsZero() {
// The object is not being deleted, so if it does not have our finalizer,
// then let's add the finalizer and update the object. This is equivalent to
// registering our finalizer.
if !controllerutil.ContainsFinalizer(migration, virtualmachinemigrationFinalizer) {
log.Info("Adding Finalizer to Migration")
if !controllerutil.AddFinalizer(migration, virtualmachinemigrationFinalizer) {
return ctrl.Result{}, errors.New("Failed to add finalizer to Migration")
}
if err := r.Update(ctx, migration); err != nil {
return ctrl.Result{}, err
}
// stop this reconciliation cycle; a new one will be triggered since the Migration was updated
return ctrl.Result{}, nil
}
} else {
// The object is being deleted
if controllerutil.ContainsFinalizer(migration, virtualmachinemigrationFinalizer) {
// our finalizer is present, so let's handle any external dependency
log.Info("Performing Finalizer Operations for Migration")
vm := new(vmv1.VirtualMachine)
err := r.Get(ctx, types.NamespacedName{Name: migration.Spec.VmName, Namespace: migration.Namespace}, vm)
if err != nil {
log.Error(err, "Failed to get VM", "VmName", migration.Spec.VmName)
}
if err := r.doFinalizerOperationsForVirtualMachineMigration(ctx, migration, vm); err != nil {
// if fail to delete the external dependency here, return with error
// so that it can be retried
return ctrl.Result{}, err
}
// remove our finalizer from the list and update it.
log.Info("Removing Finalizer from Migration")
if !controllerutil.RemoveFinalizer(migration, virtualmachinemigrationFinalizer) {
return ctrl.Result{}, errors.New("Failed to remove finalizer from Migration")
}
if err := r.Update(ctx, migration); err != nil {
return ctrl.Result{}, err
}
}
// Stop reconciliation as the item is being deleted
return ctrl.Result{}, nil
}
// Fetch the corresponding VirtualMachine instance
vm := new(vmv1.VirtualMachine)
err := r.Get(ctx, types.NamespacedName{Name: migration.Spec.VmName, Namespace: migration.Namespace}, vm)
if err != nil {
log.Error(err, "Failed to get VM", "VmName", migration.Spec.VmName)
if apierrors.IsNotFound(err) {
// stop reconcile loop if vm not found (already deleted?)
message := fmt.Sprintf("VM (%s) not found", migration.Spec.VmName)
r.Recorder.Event(migration, "Warning", "Failed", message)
meta.SetStatusCondition(&migration.Status.Conditions,
metav1.Condition{Type: typeDegradedVirtualMachineMigration,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: message})
migration.Status.Phase = vmv1.VmmFailed
return r.updateMigrationStatus(ctx, migration)
}
// return err and try reconcile again
return ctrl.Result{}, err
}
// Set owner for VM migration object
if !metav1.IsControlledBy(migration, vm) {
log.Info("Set VM as owner for Migration", "vm.Name", vm.Name)
if err := ctrl.SetControllerReference(vm, migration, r.Scheme); err != nil {
return ctrl.Result{}, err
}
if err := r.Update(ctx, migration); err != nil {
log.Info("Failed to add owner to Migration", "error", err)
return ctrl.Result{}, err
}
// stop this reconciliation cycle; a new one will be triggered because the Migration was updated
return ctrl.Result{}, nil
}
// MAIN RECONCILE LOOP START
// Set the condition status to Unknown when no conditions are available yet
if len(migration.Status.Conditions) == 0 {
log.Info("Set initial Unknown condition status")
meta.SetStatusCondition(&migration.Status.Conditions, metav1.Condition{Type: typeAvailableVirtualMachineMigration, Status: metav1.ConditionUnknown, Reason: "Reconciling", Message: "Starting reconciliation"})
return r.updateMigrationStatus(ctx, migration)
}
// target runner pod details - generate name
if len(migration.Status.TargetPodName) == 0 {
targetPodName := names.SimpleNameGenerator.GenerateName(fmt.Sprintf("%s-", vm.Name))
log.Info("Set Target Pod Name", "TargetPod.Name", targetPodName)
migration.Status.TargetPodName = targetPodName
return r.updateMigrationStatus(ctx, migration)
}
switch migration.Status.Phase {
case "":
// need to change the VM status ASAP to prevent the autoscaler from changing CPU/RAM in the VM,
// but only if the VM is running
if vm.Status.Phase == vmv1.VmRunning {
vm.Status.Phase = vmv1.VmPreMigrating
if err := r.Status().Update(ctx, vm); err != nil {
log.Error(err, "Failed to update VM status to PreMigrating", "Status", vm.Status.Phase)
return ctrl.Result{}, err
}
// Migration just created, change Phase to "Pending"
migration.Status.Phase = vmv1.VmmPending
return r.updateMigrationStatus(ctx, migration)
}
// some other VM status (maybe Scaling), requeue after a second
return ctrl.Result{RequeueAfter: time.Second}, nil
case vmv1.VmmPending:
// Check if the target runner pod already exists,
// if not create a new one using source pod as template
targetRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: migration.Status.TargetPodName, Namespace: vm.Namespace}, targetRunner)
if err != nil && apierrors.IsNotFound(err) {
// NB: .Spec.EnableSSH guaranteed non-nil because the k8s API server sets the default for us.
enableSSH := *vm.Spec.EnableSSH
var sshSecret *corev1.Secret
if enableSSH {
// We require the SSH secret to exist because we cannot unmount and
// mount the new secret into the VM after the live migration. If a
// VM's SSH secret is deleted accidentally then live migration is
// not possible.
if len(vm.Status.SSHSecretName) == 0 {
err := errors.New("VM has .Spec.EnableSSH but its .Status.SSHSecretName is empty")
log.Error(err, "Failed to get VM's SSH Secret")
r.Recorder.Event(migration, "Warning", "Failed", err.Error())
return ctrl.Result{}, err
}
sshSecret = &corev1.Secret{}
err := r.Get(ctx, types.NamespacedName{Name: vm.Status.SSHSecretName, Namespace: vm.Namespace}, sshSecret)
if err != nil {
log.Error(err, "Failed to get VM's SSH Secret")
r.Recorder.Event(migration, "Warning", "Failed", fmt.Sprintf("Failed to get VM's SSH Secret: %v", err))
return ctrl.Result{}, err
}
}
// Define a new target pod
tpod, err := r.targetPodForVirtualMachine(vm, migration, sshSecret)
if err != nil {
log.Error(err, "Failed to generate Target Pod spec")
return ctrl.Result{}, err
}
log.Info("Creating a Target Pod", "Pod.Namespace", tpod.Namespace, "Pod.Name", tpod.Name)
if err = r.Create(ctx, tpod); err != nil {
log.Error(err, "Failed to create Target Pod", "Pod.Namespace", tpod.Namespace, "Pod.Name", tpod.Name)
return ctrl.Result{}, err
}
log.Info("Target runner Pod was created", "Pod.Namespace", tpod.Namespace, "Pod.Name", tpod.Name)
// add event with some info
r.Recorder.Event(migration, "Normal", "Created",
fmt.Sprintf("VM (%s) ready migrate to target pod (%s)",
vm.Name, tpod.Name))
// target pod was just created, so requeue reconcile
return ctrl.Result{RequeueAfter: time.Second}, nil
} else if err != nil {
log.Error(err, "Failed to get Target Pod")
return ctrl.Result{}, err
}
// Update the metadata (including "usage" annotation) before anything else, so that it
// will be correctly set even if the rest of the reconcile operation fails.
if err := updatePodMetadataIfNecessary(ctx, r.Client, vm, targetRunner); err != nil {
log.Error(err, "Failed to sync pod labels and annotations", "TargetPod.Name", targetRunner.Name)
}
// If not already, set an additional (non-controller) owner reference for the source pod:
sourceRunner := &corev1.Pod{}
err = r.Get(ctx, types.NamespacedName{Name: vm.Status.PodName, Namespace: vm.Namespace}, sourceRunner)
if err != nil {
log.Error(err, "Failed to get migration source pod")
return ctrl.Result{}, err
}
ownedByMigration := false
for _, ref := range sourceRunner.OwnerReferences {
if ref.UID == migration.UID {
ownedByMigration = true
break
}
}
if !ownedByMigration {
if err = controllerutil.SetOwnerReference(migration, sourceRunner, r.Scheme); err != nil {
log.Error(err, "Failed to set owner reference for source pod")
return ctrl.Result{}, err
}
if err = r.Update(ctx, sourceRunner); err != nil {
log.Error(err, "Failed to update owner of source runner")
// Requeue so that we try again, even though we're not an owner of the source runner
return ctrl.Result{RequeueAfter: time.Second}, err
}
}
// now inspect target pod status and update migration
switch runnerStatus(targetRunner) {
case runnerRunning:
// update migration status
migration.Status.SourcePodName = vm.Status.PodName
migration.Status.SourcePodIP = vm.Status.PodIP
migration.Status.TargetPodIP = targetRunner.Status.PodIP
// do hotplugCPU in targetRunner before migration
log.Info("Syncing CPUs in Target runner", "TargetPod.Name", migration.Status.TargetPodName)
if err := QmpSyncCpuToTarget(vm, migration); err != nil {
return ctrl.Result{}, err
}
log.Info("CPUs in Target runner synced", "TargetPod.Name", migration.Status.TargetPodName)
// do hotplug Memory in targetRunner -- only needed for dimm slots; virtio-mem Just Works™
switch *vm.Status.MemoryProvider {
case vmv1.MemoryProviderVirtioMem:
// ref "Migration works out of the box" - https://lwn.net/Articles/755423/
log.Info(
"No need to sync memory in Target runner because MemoryProvider is VirtioMem",
"TargetPod.Name", migration.Status.TargetPodName,
)
case vmv1.MemoryProviderDIMMSlots:
log.Info("Syncing Memory in Target runner", "TargetPod.Name", migration.Status.TargetPodName)
if err := QmpSyncMemoryToTarget(vm, migration); err != nil {
return ctrl.Result{}, err
}
log.Info("Memory in Target runner synced", "TargetPod.Name", migration.Status.TargetPodName)
default:
panic(fmt.Errorf("unexpected vm.status.memoryProvider %q", *vm.Status.MemoryProvider))
}
// Only migrate running VMs, to a target that already has its devices hot-plugged
if vm.Status.Phase == vmv1.VmPreMigrating {
// update VM status
vm.Status.Phase = vmv1.VmMigrating
if err := r.Status().Update(ctx, vm); err != nil {
log.Error(err, "Failed to update VirtualMachine status to 'Migrating'")
return ctrl.Result{}, err
}
// trigger migration
if err := QmpStartMigration(vm, migration); err != nil {
migration.Status.Phase = vmv1.VmmFailed
return ctrl.Result{}, err
}
message := fmt.Sprintf("Migration was started to target runner (%s)", targetRunner.Name)
log.Info(message)
r.Recorder.Event(migration, "Normal", "Started", message)
meta.SetStatusCondition(&migration.Status.Conditions,
metav1.Condition{Type: typeAvailableVirtualMachineMigration,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: message})
// finally update migration phase to Running
migration.Status.Phase = vmv1.VmmRunning
return r.updateMigrationStatus(ctx, migration)
}
case runnerSucceeded:
// target runner pod finished without error, but it shouldn't have finished at all
message := fmt.Sprintf("Target Pod (%s) completed unexpectedly", targetRunner.Name)
log.Info(message)
r.Recorder.Event(migration, "Warning", "Failed", message)
meta.SetStatusCondition(&migration.Status.Conditions,
metav1.Condition{Type: typeDegradedVirtualMachineMigration,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: message})
migration.Status.Phase = vmv1.VmmFailed
return r.updateMigrationStatus(ctx, migration)
case runnerFailed:
message := fmt.Sprintf("Target Pod (%s) failed", targetRunner.Name)
log.Info(message)
r.Recorder.Event(migration, "Warning", "Failed", message)
meta.SetStatusCondition(&migration.Status.Conditions,
metav1.Condition{Type: typeDegradedVirtualMachineMigration,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: message})
migration.Status.Phase = vmv1.VmmFailed
return r.updateMigrationStatus(ctx, migration)
case runnerUnknown:
message := fmt.Sprintf("Target Pod (%s) in Unknown phase", targetRunner.Name)
log.Info(message)
r.Recorder.Event(migration, "Warning", "Unknown", message)
meta.SetStatusCondition(&migration.Status.Conditions,
metav1.Condition{Type: typeAvailableVirtualMachineMigration,
Status: metav1.ConditionUnknown,
Reason: "Reconciling",
Message: message})
migration.Status.Phase = vmv1.VmmPending
return r.updateMigrationStatus(ctx, migration)
default:
// not sure what to do, so try requeue
return ctrl.Result{RequeueAfter: time.Second}, nil
}
case vmv1.VmmRunning:
// retrieve target pod details
targetRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: migration.Status.TargetPodName, Namespace: migration.Namespace}, targetRunner)
if err != nil && apierrors.IsNotFound(err) {
// lost the target pod for a running Migration?
message := fmt.Sprintf("Target Pod (%s) disappeared", migration.Status.TargetPodName)
r.Recorder.Event(migration, "Warning", "NotFound", message)
meta.SetStatusCondition(&migration.Status.Conditions,
metav1.Condition{Type: typeDegradedVirtualMachineMigration,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: message})
migration.Status.Phase = vmv1.VmmFailed
return r.updateMigrationStatus(ctx, migration)
} else if err != nil {
log.Error(err, "Failed to get target runner Pod")
return ctrl.Result{}, err
}
// Update the metadata (including "usage" annotation) before anything else, so that it
// will be correctly set even if the rest of the reconcile operation fails.
if err := updatePodMetadataIfNecessary(ctx, r.Client, vm, targetRunner); err != nil {
log.Error(err, "Failed to sync pod labels and annotations", "TargetPod.Name", targetRunner.Name)
}
// retrieve migration statistics
migrationInfo, err := QmpGetMigrationInfo(QmpAddr(vm))
if err != nil {
log.Error(err, "Failed to get migration info")
return ctrl.Result{}, err
}
// check if migration done
if migrationInfo.Status == "completed" {
message := fmt.Sprintf("Migration finished with success to target pod (%s)",
targetRunner.Name)
log.Info(message)
r.Recorder.Event(migration, "Normal", "Finished", message)
// re-fetch the vm
err := r.Get(ctx, types.NamespacedName{Name: migration.Spec.VmName, Namespace: migration.Namespace}, vm)
if err != nil {
log.Error(err, "Failed to re-fetch VM", "VmName", migration.Spec.VmName)
return ctrl.Result{}, err
}
// Redefine runner Pod for VM
vm.Status.PodName = migration.Status.TargetPodName
vm.Status.PodIP = migration.Status.TargetPodIP
vm.Status.Phase = vmv1.VmRunning
// update VM status
if err := r.Status().Update(ctx, vm); err != nil {
log.Error(err, "Failed to redefine runner pod in VM")
return ctrl.Result{}, err
}
// Redefine ownerRef for the target Pod
targetRunner.OwnerReferences = []metav1.OwnerReference{}
if err := ctrl.SetControllerReference(vm, targetRunner, r.Scheme); err != nil {
return ctrl.Result{}, err
}
if err := r.Update(ctx, targetRunner); err != nil {
log.Error(err, "Failed to update ownerRef for target runner pod")
return ctrl.Result{}, err
}
// Redefine ownerRef for the source Pod
sourceRunner := &corev1.Pod{}
err = r.Get(ctx, types.NamespacedName{Name: migration.Status.SourcePodName, Namespace: migration.Namespace}, sourceRunner)
if err == nil {
sourceRunner.OwnerReferences = []metav1.OwnerReference{}
if err := ctrl.SetControllerReference(migration, sourceRunner, r.Scheme); err != nil {
return ctrl.Result{}, err
}
if err := r.Update(ctx, sourceRunner); err != nil {
log.Error(err, "Failed to update ownerRef for source runner pod")
return ctrl.Result{}, err
}
} else if !apierrors.IsNotFound(err) {
return ctrl.Result{}, err
}
// try to stop the hypervisor in the source runner if it is still running
if sourceRunner.Status.Phase == corev1.PodRunning {
if err := QmpQuit(migration.Status.SourcePodIP, vm.Spec.QMP); err != nil {
log.Error(err, "Failed stop hypervisor in source runner pod")
} else {
log.Info("Hypervisor in source runner pod stopped")
}
} else {
log.Info("Skip stopping hypervisor in source runner pod", "pod.Status.Phase", sourceRunner.Status.Phase)
}
// finally update migration phase to Succeeded
migration.Status.Phase = vmv1.VmmSucceeded
migration.Status.Info.Status = migrationInfo.Status
return r.updateMigrationStatus(ctx, migration)
}
// check if migration failed
if migrationInfo.Status == "failed" {
// oops, migration failed
message := fmt.Sprintf("Migration to target pod (%s) was failed",
targetRunner.Name)
log.Info(message)
r.Recorder.Event(migration, "Warning", "Failed", message)
// try to stop hypervisor in target runner
if targetRunner.Status.Phase == corev1.PodRunning {
if err := QmpQuit(migration.Status.TargetPodIP, vm.Spec.QMP); err != nil {
log.Error(err, "Failed stop hypervisor in target runner pod")
} else {
log.Info("Hypervisor in target runner pod stopped")
}
} else {
log.Info("Skip stopping hypervisor in target runner pod", "pod.Status.Phase", targetRunner.Status.Phase)
}
// change VM status to Running
vm.Status.Phase = vmv1.VmRunning
if err := r.Status().Update(ctx, vm); err != nil {
log.Error(err, "Failed to update VM status from Migrating back to Running as Migration was failed")
return ctrl.Result{}, err
}
// finally update migration phase to Failed
migration.Status.Phase = vmv1.VmmFailed
migration.Status.Info.Status = migrationInfo.Status
return r.updateMigrationStatus(ctx, migration)
}
// migration seems to still be in progress; just update the status with migration progress once per second
time.Sleep(time.Second)
// re-retrieve migration statistics
migrationInfo, err = QmpGetMigrationInfo(QmpAddr(vm))
if err != nil {
log.Error(err, "Failed to re-get migration info")
return ctrl.Result{}, err
}
// re-fetch the vm
err = r.Get(ctx, types.NamespacedName{Name: migration.Spec.VmName, Namespace: migration.Namespace}, vm)
if err != nil {
log.Error(err, "Failed to re-fetch VM before Mgration progress update", "VmName", migration.Spec.VmName)
return ctrl.Result{}, err
}
migration.Status.Info.Status = migrationInfo.Status
migration.Status.Info.TotalTimeMs = migrationInfo.TotalTimeMs
migration.Status.Info.SetupTimeMs = migrationInfo.SetupTimeMs
migration.Status.Info.DowntimeMs = migrationInfo.DowntimeMs
migration.Status.Info.Ram.Transferred = migrationInfo.Ram.Transferred
migration.Status.Info.Ram.Remaining = migrationInfo.Ram.Remaining
migration.Status.Info.Ram.Total = migrationInfo.Ram.Total
migration.Status.Info.Compression.CompressedSize = migrationInfo.Compression.CompressedSize
migration.Status.Info.Compression.CompressionRate = int64(math.Round(migrationInfo.Compression.CompressionRate))
return r.updateMigrationStatus(ctx, migration)
case vmv1.VmmSucceeded:
// do additional VM status checks
if vm.Status.Phase == vmv1.VmMigrating {
// migration Succeeded and VM should have status Running
vm.Status.Phase = vmv1.VmRunning
// update VM status
if err := r.Status().Update(ctx, vm); err != nil {
log.Error(err, "Failed to update VM status from Migrating to Running as Migration succeeded")
return ctrl.Result{}, err
}
}
if len(migration.Status.SourcePodName) > 0 {
// try to find and remove source runner Pod
sourceRunner := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: migration.Status.SourcePodName, Namespace: migration.Namespace}, sourceRunner)
if err != nil && !apierrors.IsNotFound(err) {
log.Error(err, "Failed to get source runner Pod for deletion")
return ctrl.Result{}, err
}
var msg, eventReason string
if buildtag.NeverDeleteRunnerPods {
msg = fmt.Sprintf("Source runner pod deletion was skipped due to '%s' build tag", buildtag.TagnameNeverDeleteRunnerPods)
eventReason = "DeleteSkipped"
} else {
if err := r.Delete(ctx, sourceRunner); err != nil {
log.Error(err, "Failed to delete source runner Pod")
return ctrl.Result{}, err
}
msg = "Source runner was deleted"
eventReason = "Deleted"
}
log.Info(msg, "Pod.Namespace", sourceRunner.Namespace, "Pod.Name", sourceRunner.Name)
r.Recorder.Event(migration, "Normal", eventReason, fmt.Sprintf("%s: %s", msg, sourceRunner.Name))
migration.Status.SourcePodName = ""
migration.Status.SourcePodIP = ""
return r.updateMigrationStatus(ctx, migration)
}
// all done, stop reconciliation
return ctrl.Result{}, nil
case vmv1.VmmFailed:
// do additional VM status checks
if vm.Status.Phase == vmv1.VmMigrating {
// migration Failed and VM should back to Running state
vm.Status.Phase = vmv1.VmRunning
if err := r.Status().Update(ctx, vm); err != nil {
log.Error(err, "Failed to update VM status from Migrating back to Running as Migration was failed")
return ctrl.Result{}, err
}
}
// all done, stop reconciliation
return ctrl.Result{}, nil
default:
// not sure what to do, so try requeue
log.Info("Requeuing current request")
return ctrl.Result{RequeueAfter: time.Second}, nil
}
// MAIN RECONCILE LOOP END
return ctrl.Result{}, nil
}
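// For orientation, Reconcile above effectively implements a small state machine over
// migration.Status.Phase. A descriptive sketch of the main transitions in the code above
// (error paths omitted):
//
//	""           -> VmmPending   // VM switched to PreMigrating first, to freeze autoscaling
//	VmmPending   -> VmmRunning   // target pod running, CPUs/memory synced, QMP migration started
//	VmmRunning   -> VmmSucceeded // QMP reports "completed"; VM re-pointed at the target pod
//	VmmRunning   -> VmmFailed    // QMP reports "failed", or the target pod disappears/fails
//	VmmSucceeded / VmmFailed     // terminal: only source-pod cleanup and VM status fixups remain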
func (r *VirtualMachineMigrationReconciler) updateMigrationStatus(ctx context.Context, migration *vmv1.VirtualMachineMigration) (ctrl.Result, error) {
log := log.FromContext(ctx)
if err := r.Status().Update(ctx, migration); err != nil {
log.Error(err, "Failed update Migration status")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
// doFinalizerOperationsForVirtualMachineMigration performs the required operations before deleting the CR.
func (r *VirtualMachineMigrationReconciler) doFinalizerOperationsForVirtualMachineMigration(ctx context.Context, migration *vmv1.VirtualMachineMigration, vm *vmv1.VirtualMachine) error {
log := log.FromContext(ctx)
if migration.Status.Phase == vmv1.VmmRunning || vm.Status.Phase == vmv1.VmPreMigrating {
message := fmt.Sprintf("Running Migration (%s) is being deleted", migration.Name)
log.Info(message)
r.Recorder.Event(migration, "Warning", "Deleting", message)
// try to cancel migration
log.Info("Canceling migration")
if err := QmpCancelMigration(QmpAddr(vm)); err != nil {
// log the error but do not return it, to avoid getting stuck in the reconciliation cycle
log.Error(err, "Migration canceling failed")
}
if vm.Status.Phase == vmv1.VmMigrating || vm.Status.Phase == vmv1.VmPreMigrating {
// the migration is being deleted, so the VM should go back to the Running status
vm.Status.Phase = vmv1.VmRunning
// update VM status
if err := r.Status().Update(ctx, vm); err != nil {
log.Error(err, "Failed to update VM status from Migrating to Running on Migration deletion")
return err
}
}
// try to remove target runner pod
if len(migration.Status.TargetPodName) > 0 {
pod := &corev1.Pod{}
err := r.Get(ctx, types.NamespacedName{Name: migration.Status.TargetPodName, Namespace: migration.Namespace}, pod)
if err != nil && !apierrors.IsNotFound(err) {
log.Error(err, "Failed to get target runner Pod for deletion")
return err
}
if apierrors.IsNotFound(err) {
// pod already deleted ?
return nil
}
// NB: here, we ignore buildtag.NeverDeleteRunnerPods because we delete runner pods on
// VM object deletion with the tag anyways, so it's more consistent to keep the same
// behavior for VMMs.
if err := r.Delete(ctx, pod); err != nil {
log.Error(err, "Failed to delete target runner Pod")
return err
}
message := fmt.Sprintf("Target runner (%s) was deleted", pod.Name)
log.Info(message)
r.Recorder.Event(migration, "Normal", "Deleted", message)
}
}
return nil
}
// SetupWithManager sets up the controller with the Manager.
// Note that Pods are also watched, in order to ensure their
// desired state on the cluster.
func (r *VirtualMachineMigrationReconciler) SetupWithManager(mgr ctrl.Manager) (ReconcilerWithMetrics, error) {
cntrlName := "virtualmachinemigration"
reconciler := WithMetrics(
withCatchPanic(r),
r.Metrics,
cntrlName,
r.Config.FailurePendingPeriod,
r.Config.FailingRefreshInterval,
)
err := ctrl.NewControllerManagedBy(mgr).
For(&vmv1.VirtualMachineMigration{}).
Owns(&corev1.Pod{}).
WithOptions(controller.Options{MaxConcurrentReconciles: r.Config.MaxConcurrentReconciles}).
Named(cntrlName).
Complete(reconciler)
return reconciler, err
}
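// A minimal sketch of how this reconciler might be wired up from an entrypoint, assuming an
// existing ctrl.Manager; the variable names here are illustrative, not taken from the real main():
//
//	migrationReconciler := &VirtualMachineMigrationReconciler{
//		// Client, Scheme, Recorder, Config, Metrics, ... (fields elided)
//	}
//	reconcilerWithMetrics, err := migrationReconciler.SetupWithManager(mgr)
//	if err != nil {
//		// handle setup error
//	}
//	_ = reconcilerWithMetrics // returned so the caller can also use the metrics-wrapped reconciler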
// targetPodForVirtualMachine returns a VirtualMachine Pod object
func (r *VirtualMachineMigrationReconciler) targetPodForVirtualMachine(
vm *vmv1.VirtualMachine,
migration *vmv1.VirtualMachineMigration,
sshSecret *corev1.Secret,
) (*corev1.Pod, error) {
if vm.Status.MemoryProvider == nil {
return nil, errors.New("cannot create target pod because vm.status.memoryProvider is not set")
}
// TODO: this is technically racy because target pod creation happens before we set the
// migration source pod, so in between reading this and starting the migration, it's
// *technically* possible that we create a target pod with a different memory provider than a
// newer source pod.
// Given that this requires (a) restart *during* initial live migration, and (b) that restart to
// change the memory provider, this is low enough risk that it's ok to leave to a follow-up.
memoryProvider := *vm.Status.MemoryProvider
pod, err := podSpec(vm, memoryProvider, sshSecret, r.Config)
if err != nil {
return nil, err
}
// override pod name
pod.Name = migration.Status.TargetPodName
// add env variable to turn on migration receiver
pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].Env, corev1.EnvVar{Name: "RECEIVE_MIGRATION", Value: "true"})
// add podAntiAffinity to schedule target pod to another k8s node
if migration.Spec.PreventMigrationToSameHost {
if pod.Spec.Affinity == nil {
pod.Spec.Affinity = &corev1.Affinity{}
}
if pod.Spec.Affinity.PodAntiAffinity == nil {
pod.Spec.Affinity.PodAntiAffinity = &corev1.PodAntiAffinity{}
}
if pod.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
pod.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution = []corev1.PodAffinityTerm{}
}
pod.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution = append(pod.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution, corev1.PodAffinityTerm{
LabelSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{
vmv1.VirtualMachineNameLabel: migration.Spec.VmName,
},
},
TopologyKey: "kubernetes.io/hostname",
})
}
// Set the ownerRef for the Pod
if err := ctrl.SetControllerReference(migration, pod, r.Scheme); err != nil {
return nil, err
}
return pod, nil
}
package controllers
// Wrapper around the default VirtualMachine/VirtualMachineMigration webhook interfaces so that the
// controller has a bit more control over them, without needing to actually implement that control
// inside of the apis package.
import (
"context"
"fmt"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/webhook"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/util/stack"
)
func validateUpdate(
ctx context.Context,
cfg *ReconcilerConfig,
recorder record.EventRecorder,
oldObj runtime.Object,
newObj interface {
webhook.Validator
metav1.Object
},
) (admission.Warnings, error) {
log := log.FromContext(ctx)
namespacedName := client.ObjectKeyFromObject(newObj)
_, skipValidation := cfg.SkipUpdateValidationFor[namespacedName]
warnings, err := func() (w admission.Warnings, e error) {
// if we plan to skip validation, catch any panics so that they can be ignored.
if skipValidation {
defer func() {
if err := recover(); err != nil {
e = fmt.Errorf("validation panicked with: %v", err)
st := stack.GetStackTrace(nil, 1).String()
log.Error(e, "webhook update validation panicked", "stack", st)
}
}()
}
return newObj.ValidateUpdate(oldObj)
}()
if err != nil && skipValidation {
recorder.Event(
newObj,
"Warning",
"SkippedValidation",
"Ignoring failed webhook validation because of controller's '--skip-update-validation-for' flag",
)
log.Error(err, "Ignoring failed webhook validation")
return warnings, nil
}
return warnings, err
}
type VMWebhook struct {
Recorder record.EventRecorder
Config *ReconcilerConfig
}
func (w *VMWebhook) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewWebhookManagedBy(mgr).
For(&vmv1.VirtualMachine{}).
WithDefaulter(w).
WithValidator(w).
Complete()
}
var _ webhook.CustomDefaulter = (*VMWebhook)(nil)
// Default implements webhook.CustomDefaulter
func (w *VMWebhook) Default(ctx context.Context, obj runtime.Object) error {
vm := obj.(*vmv1.VirtualMachine)
vm.Default()
return nil
}
var _ webhook.CustomValidator = (*VMWebhook)(nil)
// ValidateCreate implements webhook.CustomValidator
func (w *VMWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
vm := obj.(*vmv1.VirtualMachine)
return vm.ValidateCreate()
}
// ValidateUpdate implements webhook.CustomValidator
func (w *VMWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
newVM := newObj.(*vmv1.VirtualMachine)
return validateUpdate(ctx, w.Config, w.Recorder, oldObj, newVM)
}
// ValidateDelete implements webhook.CustomValidator
func (w *VMWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
vm := obj.(*vmv1.VirtualMachine)
return vm.ValidateDelete()
}
type VMMigrationWebhook struct {
Recorder record.EventRecorder
Config *ReconcilerConfig
}
func (w *VMMigrationWebhook) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewWebhookManagedBy(mgr).
For(&vmv1.VirtualMachineMigration{}).
WithDefaulter(w).
WithValidator(w).
Complete()
}
var _ webhook.CustomDefaulter = (*VMMigrationWebhook)(nil)
// Default implements webhook.CustomDefaulter
func (w *VMMigrationWebhook) Default(ctx context.Context, obj runtime.Object) error {
vmm := obj.(*vmv1.VirtualMachineMigration)
vmm.Default()
return nil
}
var _ webhook.CustomValidator = (*VMMigrationWebhook)(nil)
// ValidateCreate implements webhook.CustomValidator
func (w *VMMigrationWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
vmm := obj.(*vmv1.VirtualMachineMigration)
return vmm.ValidateCreate()
}
// ValidateUpdate implements webhook.CustomValidator
func (w *VMMigrationWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
newVMM := newObj.(*vmv1.VirtualMachineMigration)
return validateUpdate(ctx, w.Config, w.Recorder, oldObj, newVMM)
}
// ValidateDelete implements webhook.CustomValidator
func (w *VMMigrationWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
vmm := obj.(*vmv1.VirtualMachineMigration)
return vmm.ValidateDelete()
}
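// A minimal sketch of registering both webhooks with a manager; the mgr, recorder, and cfg
// variables are illustrative, not taken from the real entrypoint:
//
//	vmWebhook := &VMWebhook{Recorder: recorder, Config: cfg}
//	if err := vmWebhook.SetupWithManager(mgr); err != nil {
//		// handle setup error
//	}
//	vmmWebhook := &VMMigrationWebhook{Recorder: recorder, Config: cfg}
//	if err := vmmWebhook.SetupWithManager(mgr); err != nil {
//		// handle setup error
//	}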
package core
import (
"time"
"go.uber.org/zap/zapcore"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
)
type ActionSet struct {
Wait *ActionWait `json:"wait,omitempty"`
PluginRequest *ActionPluginRequest `json:"pluginRequest,omitempty"`
NeonVMRequest *ActionNeonVMRequest `json:"neonvmRequest,omitempty"`
MonitorDownscale *ActionMonitorDownscale `json:"monitorDownscale,omitempty"`
MonitorUpscale *ActionMonitorUpscale `json:"monitorUpscale,omitempty"`
}
type ActionWait struct {
Duration time.Duration `json:"duration"`
}
type ActionPluginRequest struct {
LastPermit *api.Resources `json:"current"`
Target api.Resources `json:"target"`
Metrics *api.Metrics `json:"metrics"`
TargetRevision vmv1.RevisionWithTime `json:"targetRevision"`
}
type ActionNeonVMRequest struct {
Current api.Resources `json:"current"`
Target api.Resources `json:"target"`
TargetRevision vmv1.RevisionWithTime `json:"targetRevision"`
}
type ActionMonitorDownscale struct {
Current api.Resources `json:"current"`
Target api.Resources `json:"target"`
TargetRevision vmv1.RevisionWithTime `json:"targetRevision"`
}
type ActionMonitorUpscale struct {
Current api.Resources `json:"current"`
Target api.Resources `json:"target"`
TargetRevision vmv1.RevisionWithTime `json:"targetRevision"`
}
func addObjectPtr[T zapcore.ObjectMarshaler](enc zapcore.ObjectEncoder, key string, value *T) error {
if value != nil {
return enc.AddObject(key, *value)
} else {
// nil ObjectMarshaler is not sound, but nil reflected is, and it shortcuts reflection
return enc.AddReflected(key, nil)
}
}
func (s ActionSet) MarshalLogObject(enc zapcore.ObjectEncoder) error {
_ = addObjectPtr(enc, "wait", s.Wait)
_ = addObjectPtr(enc, "pluginRequest", s.PluginRequest)
_ = addObjectPtr(enc, "neonvmRequest", s.NeonVMRequest)
_ = addObjectPtr(enc, "monitorDownscale", s.MonitorDownscale)
_ = addObjectPtr(enc, "monitorUpscale", s.MonitorUpscale)
return nil
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that ActionWait can be used with zap.Object
func (a ActionWait) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddDuration("duration", a.Duration)
return nil
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that ActionPluginRequest can be used with zap.Object
func (a ActionPluginRequest) MarshalLogObject(enc zapcore.ObjectEncoder) error {
_ = addObjectPtr(enc, "lastPermit", a.LastPermit)
_ = enc.AddObject("target", a.Target)
_ = enc.AddReflected("metrics", a.Metrics)
return nil
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that ActionNeonVMRequest can be used with zap.Object
func (a ActionNeonVMRequest) MarshalLogObject(enc zapcore.ObjectEncoder) error {
_ = enc.AddObject("current", a.Current)
_ = enc.AddObject("target", a.Target)
return nil
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that ActionMonitorDownscale can be used with zap.Object
func (a ActionMonitorDownscale) MarshalLogObject(enc zapcore.ObjectEncoder) error {
_ = enc.AddObject("current", a.Current)
_ = enc.AddObject("target", a.Target)
return nil
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that ActionMonitorUpscale can be used with zap.Object
func (a ActionMonitorUpscale) MarshalLogObject(enc zapcore.ObjectEncoder) error {
_ = enc.AddObject("current", a.Current)
_ = enc.AddObject("target", a.Target)
return nil
}
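// Because ActionSet and each Action type implement zapcore.ObjectMarshaler, an ActionSet can be
// logged structurally with zap. A minimal sketch (the logger and field values are illustrative):
//
//	logger := zap.NewExample()
//	actions := ActionSet{
//		Wait: &ActionWait{Duration: 5 * time.Second},
//	}
//	logger.Info("next actions", zap.Object("actions", actions))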
package core
// Implementation of (*State).Dump()
import (
"encoding/json"
"time"
"github.com/neondatabase/autoscaling/pkg/api"
)
func shallowCopy[T any](ptr *T) *T {
if ptr == nil {
return nil
} else {
x := *ptr
return &x
}
}
// StateDump provides introspection into the current values of the fields of State
//
// It implements json.Marshaler.
type StateDump struct {
internal state
}
func (d StateDump) MarshalJSON() ([]byte, error) {
return json.Marshal(d.internal)
}
// Dump produces a JSON-serializable copy of the State
func (s *State) Dump() StateDump {
return StateDump{
internal: state{
Debug: s.internal.Debug,
Config: s.internal.Config,
VM: s.internal.VM,
Plugin: s.internal.Plugin.deepCopy(),
Monitor: s.internal.Monitor.deepCopy(),
NeonVM: s.internal.NeonVM.deepCopy(),
Metrics: shallowCopy[SystemMetrics](s.internal.Metrics),
LFCMetrics: shallowCopy[LFCMetrics](s.internal.LFCMetrics),
TargetRevision: s.internal.TargetRevision,
LastDesiredResources: s.internal.LastDesiredResources,
},
}
}
func (s *pluginState) deepCopy() pluginState {
return pluginState{
OngoingRequest: s.OngoingRequest,
LastRequest: shallowCopy[pluginRequested](s.LastRequest),
LastFailureAt: shallowCopy[time.Time](s.LastFailureAt),
Permit: shallowCopy[api.Resources](s.Permit),
CurrentRevision: s.CurrentRevision,
}
}
func (s *monitorState) deepCopy() monitorState {
return monitorState{
OngoingRequest: shallowCopy[ongoingMonitorRequest](s.OngoingRequest),
RequestedUpscale: shallowCopy[requestedUpscale](s.RequestedUpscale),
DeniedDownscale: shallowCopy[deniedDownscale](s.DeniedDownscale),
Approved: shallowCopy[api.Resources](s.Approved),
DownscaleFailureAt: shallowCopy[time.Time](s.DownscaleFailureAt),
UpscaleFailureAt: shallowCopy[time.Time](s.UpscaleFailureAt),
CurrentRevision: s.CurrentRevision,
}
}
func (s *neonvmState) deepCopy() neonvmState {
return neonvmState{
LastSuccess: shallowCopy[api.Resources](s.LastSuccess),
OngoingRequested: shallowCopy[api.Resources](s.OngoingRequested),
RequestFailedAt: shallowCopy[time.Time](s.RequestFailedAt),
TargetRevision: s.TargetRevision,
CurrentRevision: s.CurrentRevision,
}
}
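// A minimal usage sketch for Dump (variable names illustrative). Because the pointer-typed fields
// are copied above, a StateDump taken at one point in time is not affected by later changes to the
// State it was taken from:
//
//	dump := s.Dump()
//	data, err := json.Marshal(dump) // uses StateDump.MarshalJSON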
package core
// Definition of the Metrics type, plus reading it from vector.dev's prometheus format host metrics
import (
"cmp"
"fmt"
"io"
"slices"
"strconv"
"time"
promtypes "github.com/prometheus/client_model/go"
promfmt "github.com/prometheus/common/expfmt"
"github.com/tychoish/fun/erc"
"github.com/neondatabase/autoscaling/pkg/api"
)
type SystemMetrics struct {
LoadAverage1Min float64
MemoryUsageBytes float64
}
func (m SystemMetrics) ToAPI() api.Metrics {
return api.Metrics{
LoadAverage1Min: float32(m.LoadAverage1Min),
LoadAverage5Min: nil,
MemoryUsageBytes: nil,
}
}
type LFCMetrics struct {
CacheHitsTotal float64
CacheMissesTotal float64
CacheWritesTotal float64
// lfc_approximate_working_set_size_windows; currently requires that the values are spaced
// exactly one minute apart
ApproximateworkingSetSizeBuckets []float64
}
// FromPrometheus represents metric types that can be parsed from prometheus output.
type FromPrometheus interface {
fromPrometheus(map[string]*promtypes.MetricFamily) error
}
// ParseMetrics reads the prometheus text-format content, parses it, and uses M's implementation of
// FromPrometheus to populate it before returning.
func ParseMetrics(content io.Reader, metrics FromPrometheus) error {
var parser promfmt.TextParser
mfs, err := parser.TextToMetricFamilies(content)
if err != nil {
return fmt.Errorf("failed to parse content as prometheus text format: %w", err)
}
if err := metrics.fromPrometheus(mfs); err != nil {
return fmt.Errorf("failed to extract metrics: %w", err)
}
return nil
}
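// A minimal usage sketch for ParseMetrics, feeding it hand-written prometheus text. The metric
// values are made up for illustration; the metric names match what SystemMetrics.fromPrometheus
// expects below:
//
//	text := strings.NewReader(strings.Join([]string{
//		"# TYPE host_load1 gauge",
//		"host_load1 0.25",
//		"# TYPE host_memory_total_bytes gauge",
//		"host_memory_total_bytes 2147483648",
//		"# TYPE host_memory_available_bytes gauge",
//		"host_memory_available_bytes 1073741824",
//		"",
//	}, "\n"))
//	var m SystemMetrics
//	if err := ParseMetrics(text, &m); err != nil {
//		// handle parse error
//	}
//	// m.LoadAverage1Min == 0.25
//	// m.MemoryUsageBytes == total - available + 100 MiB (see fromPrometheus below)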
func extractFloatGauge(mf *promtypes.MetricFamily) (float64, error) {
if mf.GetType() != promtypes.MetricType_GAUGE {
return 0, fmt.Errorf("wrong metric type: expected %s but got %s", promtypes.MetricType_GAUGE, mf.GetType())
} else if len(mf.Metric) != 1 {
return 0, fmt.Errorf("expected 1 metric, found %d", len(mf.Metric))
}
return mf.Metric[0].GetGauge().GetValue(), nil
}
// Helper function to return an error for a missing metric
func missingMetric(name string) error {
return fmt.Errorf("missing expected metric %s", name)
}
// fromPrometheus implements FromPrometheus, so SystemMetrics can be used with ParseMetrics.
func (m *SystemMetrics) fromPrometheus(mfs map[string]*promtypes.MetricFamily) error {
ec := &erc.Collector{}
getFloat := func(metricName string) float64 {
if mf := mfs[metricName]; mf != nil {
f, err := extractFloatGauge(mf)
ec.Add(err) // does nothing if err == nil
return f
} else {
ec.Add(missingMetric(metricName))
return 0
}
}
load1 := getFloat("host_load1")
memTotal := getFloat("host_memory_total_bytes")
memAvailable := getFloat("host_memory_available_bytes")
tmp := SystemMetrics{
LoadAverage1Min: load1,
// Add an extra 100 MiB to account for kernel memory usage
MemoryUsageBytes: memTotal - memAvailable + 100*(1<<20),
}
if err := ec.Resolve(); err != nil {
return err
}
*m = tmp
return nil
}
// fromPrometheus implements FromPrometheus, so LFCMetrics can be used with ParseMetrics.
func (m *LFCMetrics) fromPrometheus(mfs map[string]*promtypes.MetricFamily) error {
ec := &erc.Collector{}
getFloat := func(metricName string) float64 {
if mf := mfs[metricName]; mf != nil {
f, err := extractFloatGauge(mf)
ec.Add(err) // does nothing if err == nil
return f
} else {
ec.Add(missingMetric(metricName))
return 0
}
}
wssBuckets, err := extractWorkingSetSizeWindows(mfs)
ec.Add(err)
tmp := LFCMetrics{
CacheHitsTotal: getFloat("lfc_hits"),
CacheMissesTotal: getFloat("lfc_misses"),
CacheWritesTotal: getFloat("lfc_writes"),
ApproximateworkingSetSizeBuckets: wssBuckets,
}
if err := ec.Resolve(); err != nil {
return err
}
*m = tmp
return nil
}
func extractWorkingSetSizeWindows(mfs map[string]*promtypes.MetricFamily) ([]float64, error) {
metricName := "lfc_approximate_working_set_size_windows"
mf := mfs[metricName]
if mf == nil {
return nil, missingMetric(metricName)
}
if mf.GetType() != promtypes.MetricType_GAUGE {
return nil, fmt.Errorf("wrong metric type: expected %s, but got %s", promtypes.MetricType_GAUGE, mf.GetType())
} else if len(mf.Metric) < 1 {
return nil, fmt.Errorf("expected >= metric, found %d", len(mf.Metric))
}
type pair struct {
duration time.Duration
value float64
}
var pairs []pair
for _, m := range mf.Metric {
// Find the duration label
durationLabel := "duration_seconds"
durationIndex := slices.IndexFunc(m.Label, func(l *promtypes.LabelPair) bool {
return l.GetName() == durationLabel
})
if durationIndex == -1 {
return nil, fmt.Errorf("metric missing label %q", durationLabel)
}
durationSeconds, err := strconv.Atoi(m.Label[durationIndex].GetValue())
if err != nil {
return nil, fmt.Errorf("couldn't parse metric's %q label as int: %w", durationLabel, err)
}
pairs = append(pairs, pair{
duration: time.Second * time.Duration(durationSeconds),
value: m.GetGauge().GetValue(),
})
}
slices.SortFunc(pairs, func(x, y pair) int {
return cmp.Compare(x.duration, y.duration)
})
// Check that the values are as expected: they should all be 1 minute apart, starting
// at 1 minute.
// NOTE: this assumption is relied on elsewhere for scaling on ApproximateworkingSetSizeBuckets.
// Please search for usages before changing this behavior.
if pairs[0].duration != time.Minute {
return nil, fmt.Errorf("expected smallest duration to be %v, got %v", time.Minute, pairs[0].duration)
}
for i := range pairs {
expected := time.Minute * time.Duration(i+1)
if pairs[i].duration != expected {
return nil, fmt.Errorf(
"expected duration values to be exactly 1m apart, got unexpected value %v instead of %v",
pairs[i].duration,
expected,
)
}
}
var values []float64
for _, p := range pairs {
values = append(values, p.value)
}
return values, nil
}
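// For illustration: if the metric family contains gauges labeled duration_seconds="60", "120" and
// "180" with values 100, 150 and 175 (made-up numbers), extractWorkingSetSizeWindows returns
// []float64{100, 150, 175} -- the values sorted by duration, which must be exactly 1m, 2m, 3m, ...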
package revsource
import (
"errors"
"time"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
const (
Upscale vmv1.Flag = 1 << iota
Downscale
)
// MaxRevisions is the maximum number of revisions that can be stored in the RevisionSource.
// This is to prevent memory leaks.
// Upon reaching it, the oldest revisions are discarded.
const MaxRevisions = 100
// RevisionSource can generate and observe revisions.
// Each Revision is a value and a set of flags (for meta-information).
// Once RevisionSource observes a previously generated Revision, it measures (and reports via the
// callback) the time it took since that Revision was generated.
type RevisionSource struct {
cb ObserveCallback
// The in-flight revisions are stored in-order.
// After the revision is observed, it is removed from the measurements, and the offset is increased.
measurements []time.Time
offset int64
}
func NewRevisionSource(initialRevision int64, cb ObserveCallback) *RevisionSource {
return &RevisionSource{
cb: cb,
measurements: nil,
offset: initialRevision + 1, // Will start from the next one
}
}
func (c *RevisionSource) nextValue() int64 {
return c.offset + int64(len(c.measurements))
}
func (c *RevisionSource) Next(now time.Time, flags vmv1.Flag) vmv1.Revision {
ret := vmv1.Revision{
Value: c.nextValue(),
Flags: flags,
}
c.measurements = append(c.measurements, now)
if len(c.measurements) > MaxRevisions {
c.measurements = c.measurements[1:]
c.offset++
}
return ret
}
func (c *RevisionSource) Observe(moment time.Time, rev vmv1.Revision) error {
if rev.Value < c.offset {
// Already observed
return nil
}
idx := rev.Value - c.offset
if idx >= int64(len(c.measurements)) {
return errors.New("revision is in the future")
}
diff := moment.Sub(c.measurements[idx])
if c.cb != nil {
c.cb(diff, rev.Flags)
}
// Forget the measurement, and all the measurements before it.
c.offset = rev.Value + 1
c.measurements = c.measurements[idx+1:]
return nil
}
type ObserveCallback func(dur time.Duration, flags vmv1.Flag)
// Propagate sets the target revision to be current, optionally measuring the time it took
// for propagation.
func Propagate(
now time.Time,
target vmv1.RevisionWithTime,
currentSlot *vmv1.Revision,
cb ObserveCallback,
) {
if currentSlot == nil {
return
}
if currentSlot.Value >= target.Value {
return
}
if cb != nil {
diff := now.Sub(target.UpdatedAt.Time)
cb(diff, target.Flags)
}
*currentSlot = target.Revision
}
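// A minimal usage sketch for RevisionSource (the times and the callback body are illustrative):
//
//	src := NewRevisionSource(0, func(d time.Duration, flags vmv1.Flag) {
//		// e.g. record d in a latency histogram, partitioned by the Upscale/Downscale flags
//	})
//	t0 := time.Now()
//	rev := src.Next(t0, Upscale)                   // rev.Value == 1
//	err := src.Observe(t0.Add(time.Second), rev)   // callback invoked with d == 1s
//	_ = err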
package core
// The core scaling logic at the heart of the autoscaler-agent. This file implements everything with
// mostly pure-ish functions, so that all the making & receiving requests can be done elsewhere.
//
// Broadly our strategy is to mimic the kind of eventual consistency that is itself used in
// Kubernetes. The scaling logic wasn't always implemented like this, but because the
// autoscaler-agent *fundamentally* exists in an eventual consistency world, we have to either:
// (a) make assumptions that we know are false; or
// (b) design our system so it assumes less.
// We used to solve this by (a). We ran into¹ issues² going that way, because sometimes those false
// assumptions come back to haunt you.
//
// That said, there's still some tricky semantics we want to maintain. Internally, the
// autoscaler-agent must be designed around eventual consistency, but the API we expose to the
// vm-monitor is strictly synchronous. As such, there's some subtle logic to make sure that we're
// not violating our own guarantees unless required to.
//
// ---
// ¹ https://github.com/neondatabase/autoscaling/issues/23
// ² https://github.com/neondatabase/autoscaling/issues/350
import (
"errors"
"fmt"
"math"
"strings"
"time"
"github.com/samber/lo"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/agent/core/revsource"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
type ObservabilityCallbacks struct {
PluginLatency revsource.ObserveCallback
MonitorLatency revsource.ObserveCallback
NeonVMLatency revsource.ObserveCallback
}
type RevisionSource interface {
Next(ts time.Time, flags vmv1.Flag) vmv1.Revision
Observe(moment time.Time, rev vmv1.Revision) error
}
// Config represents some of the static configuration underlying the decision-making of State
type Config struct {
// ComputeUnit is the desired ratio between CPU and memory, copied from the global
// autoscaler-agent config.
ComputeUnit api.Resources
// DefaultScalingConfig is just copied from the global autoscaler-agent config.
// If the VM's ScalingConfig is nil, we use this field instead.
DefaultScalingConfig api.ScalingConfig
// NeonVMRetryWait gives the amount of time to wait to retry after a failed request
NeonVMRetryWait time.Duration
// PluginRequestTick gives the period at which we should be making requests to the scheduler
// plugin, even if nothing's changed.
PluginRequestTick time.Duration
// PluginRetryWait gives the amount of time to wait to retry after a failed request
PluginRetryWait time.Duration
// PluginDeniedRetryWait gives the amount of time we must wait before re-requesting resources
// that were not fully granted.
PluginDeniedRetryWait time.Duration
// MonitorDeniedDownscaleCooldown gives the time we must wait between making duplicate
// downscale requests to the vm-monitor where the previous failed.
MonitorDeniedDownscaleCooldown time.Duration
// MonitorRequestedUpscaleValidPeriod gives the duration for which requested upscaling from the
// vm-monitor must be respected.
MonitorRequestedUpscaleValidPeriod time.Duration
// MonitorRetryWait gives the amount of time to wait to retry after a *failed* request.
MonitorRetryWait time.Duration
// Log provides an outlet for (*State).NextActions() to give informative messages or warnings
// about conditions that are impeding its ability to execute.
Log LogConfig `json:"-"`
// RevisionSource is the source of revisions to track the progress during scaling.
RevisionSource RevisionSource `json:"-"`
// ObservabilityCallbacks are the callbacks to submit datapoints for observability.
ObservabilityCallbacks ObservabilityCallbacks `json:"-"`
}
type LogConfig struct {
// Info, if not nil, will be called to provide information during normal functioning.
// For example, we log the calculated desired resources on every call to NextActions.
Info func(string, ...zap.Field)
// Warn, if not nil, will be called to log conditions that are impeding the ability to move the
// current resources to what's desired.
// A typical warning may be something like "wanted to do X but couldn't because of Y".
Warn func(string, ...zap.Field)
}
// State holds all of the necessary internal state for a VM in order to make scaling
// decisions
type State struct {
internal state
}
// one level of indirection below State so that the fields can be public, and JSON-serializable
type state struct {
Config Config
// unused. Exists to make it easier to add print debugging (via .config.Warn) for a single call
// to NextActions.
Debug bool
// VM gives the current state of the VM - or at least, the state of the fields we care about.
//
// NB: any contents behind pointers in VM are immutable. Any time the field is updated, we
// replace it with a fresh object.
VM api.VmInfo
// Plugin records all state relevant to communications with the scheduler plugin
Plugin pluginState
// Monitor records all state relevant to communications with the vm-monitor
Monitor monitorState
// NeonVM records all state relevant to the NeonVM k8s API
NeonVM neonvmState
Metrics *SystemMetrics
LFCMetrics *LFCMetrics
// TargetRevision is the revision the agent works towards.
TargetRevision vmv1.Revision
// LastDesiredResources is the last target the agent wanted to scale to.
LastDesiredResources *api.Resources
}
type pluginState struct {
// OngoingRequest is true iff there is currently an ongoing request to *this* scheduler plugin.
OngoingRequest bool
// LastRequest, if not nil, gives information about the most recently started request to the
// plugin (maybe unfinished!)
LastRequest *pluginRequested
// LastFailureAt, if not nil, gives the time of the most recent request failure
LastFailureAt *time.Time
// Permit, if not nil, stores the Permit in the most recent PluginResponse. This field will be
// nil if we have not been able to contact *any* scheduler.
Permit *api.Resources
// CurrentRevision is the most recent revision the plugin has acknowledged.
CurrentRevision vmv1.Revision
}
type pluginRequested struct {
At time.Time
Resources api.Resources
}
type monitorState struct {
OngoingRequest *ongoingMonitorRequest
// RequestedUpscale, if not nil, stores the most recent *unresolved* upscaling requested by the
// vm-monitor, along with the time at which it occurred.
RequestedUpscale *requestedUpscale
// DeniedDownscale, if not nil, stores the result of the latest denied /downscale request.
DeniedDownscale *deniedDownscale
// Approved stores the most recent Resources associated with either (a) an accepted downscale
// request, or (b) a successful upscale notification.
Approved *api.Resources
// DownscaleFailureAt, if not nil, stores the time at which a downscale request most recently
// failed (where "failed" means that some unexpected error occurred, not that it was merely
// denied).
DownscaleFailureAt *time.Time
// UpscaleFailureAt, if not nil, stores the time at which an upscale request most recently
// failed
UpscaleFailureAt *time.Time
// CurrentRevision is the most recent revision the monitor has acknowledged.
CurrentRevision vmv1.Revision
}
func (ms *monitorState) active() bool {
return ms.Approved != nil
}
type ongoingMonitorRequest struct {
Kind monitorRequestKind
Requested api.Resources
}
type monitorRequestKind string
const (
monitorRequestKindDownscale monitorRequestKind = "downscale"
monitorRequestKindUpscale monitorRequestKind = "upscale"
)
type requestedUpscale struct {
At time.Time
Base api.Resources
Requested api.MoreResources
}
type deniedDownscale struct {
At time.Time
Current api.Resources
Requested api.Resources
}
type neonvmState struct {
LastSuccess *api.Resources
// OngoingRequested, if not nil, gives the resources requested
OngoingRequested *api.Resources
RequestFailedAt *time.Time
// TargetRevision is the revision the agent works towards. Unlike for the monitor/plugin, we
// store it not only in the action, but also here. This is needed because, for NeonVM, propagation
// happens after the changes are actually applied, when the action object is long gone.
TargetRevision vmv1.RevisionWithTime
CurrentRevision vmv1.Revision
}
func (ns *neonvmState) ongoingRequest() bool {
return ns.OngoingRequested != nil
}
func NewState(vm api.VmInfo, config Config) *State {
return &State{
internal: state{
Config: config,
Debug: false,
VM: vm,
Plugin: pluginState{
OngoingRequest: false,
LastRequest: nil,
LastFailureAt: nil,
Permit: nil,
CurrentRevision: vmv1.ZeroRevision,
},
Monitor: monitorState{
OngoingRequest: nil,
RequestedUpscale: nil,
DeniedDownscale: nil,
Approved: nil,
DownscaleFailureAt: nil,
UpscaleFailureAt: nil,
CurrentRevision: vmv1.ZeroRevision,
},
NeonVM: neonvmState{
LastSuccess: nil,
OngoingRequested: nil,
RequestFailedAt: nil,
TargetRevision: vmv1.ZeroRevision.WithTime(time.Time{}),
CurrentRevision: vmv1.ZeroRevision,
},
Metrics: nil,
LFCMetrics: nil,
LastDesiredResources: nil,
TargetRevision: vmv1.ZeroRevision,
},
}
}
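// A minimal sketch of the intended call pattern for State (the vmInfo and config values are
// illustrative; in practice the autoscaler-agent builds them from its global configuration and the
// VM object):
//
//	st := NewState(vmInfo, config)
//	actions := st.NextActions(time.Now())
//	// ... execute actions.PluginRequest / NeonVMRequest / MonitorUpscale / MonitorDownscale
//	// elsewhere, then report the outcomes back to the State so later NextActions calls see them.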
func (s *state) info(msg string, fields ...zap.Field) {
if s.Config.Log.Info != nil {
s.Config.Log.Info(msg, fields...)
}
}
func (s *state) warn(msg string /* , fields ...zap.Field */) {
if s.Config.Log.Warn != nil {
s.Config.Log.Warn(msg /* , fields... */)
}
}
func (s *state) warnf(msg string, args ...any) {
s.warn(fmt.Sprintf(msg, args...))
}
// NextActions is used to implement the state machine. It's a pure function that *just* indicates
// what the executor should do.
func (s *State) NextActions(now time.Time) ActionSet {
return s.internal.nextActions(now)
}
func (s *state) nextActions(now time.Time) ActionSet {
var actions ActionSet
desiredResources, calcDesiredResourcesWait := s.desiredResourcesFromMetricsOrRequestedUpscaling(now)
if calcDesiredResourcesWait == nil {
// our handling later on is easier if we can assume it's non-nil
calcDesiredResourcesWait = func(ActionSet) *time.Duration { return nil }
}
// ----
// Requests to the scheduler plugin:
var pluginRequiredWait *time.Duration
actions.PluginRequest, pluginRequiredWait = s.calculatePluginAction(now, desiredResources)
// ----
// Requests to NeonVM:
var pluginRequested *api.Resources
var pluginRequestedPhase string = "<this string should not appear>"
if s.Plugin.OngoingRequest {
pluginRequested = &s.Plugin.LastRequest.Resources
pluginRequestedPhase = "ongoing"
} else if actions.PluginRequest != nil {
pluginRequested = &actions.PluginRequest.Target
pluginRequestedPhase = "planned"
}
var neonvmRequiredWait *time.Duration
actions.NeonVMRequest, neonvmRequiredWait = s.calculateNeonVMAction(now, desiredResources, pluginRequested, pluginRequestedPhase)
// ----
// Requests to vm-monitor (upscaling)
//
// NB: upscaling takes priority over downscaling requests, because otherwise we'd potentially
// forego notifying the vm-monitor of increased resources because we were busy asking if it
// could downscale.
var monitorUpscaleRequiredWait *time.Duration
actions.MonitorUpscale, monitorUpscaleRequiredWait = s.calculateMonitorUpscaleAction(now, desiredResources)
// ----
// Requests to vm-monitor (downscaling)
plannedUpscale := actions.MonitorUpscale != nil
var monitorDownscaleRequiredWait *time.Duration
actions.MonitorDownscale, monitorDownscaleRequiredWait = s.calculateMonitorDownscaleAction(now, desiredResources, plannedUpscale)
// --- and that's all the request types! ---
// If there's anything waiting, we should also note how long we should wait for.
// There are a few things we could be waiting on: the scheduler plugin, the NeonVM API, and the vm-monitor.
maximumDuration := time.Duration(math.MaxInt64)
requiredWait := maximumDuration
requiredWaits := []*time.Duration{
calcDesiredResourcesWait(actions),
pluginRequiredWait,
neonvmRequiredWait,
monitorUpscaleRequiredWait,
monitorDownscaleRequiredWait,
}
for _, w := range requiredWaits {
if w != nil {
requiredWait = util.Min(requiredWait, *w)
}
}
// If we're waiting on anything, add it as an action
if requiredWait != maximumDuration {
actions.Wait = &ActionWait{Duration: requiredWait}
}
return actions
}
func (s *state) calculatePluginAction(
now time.Time,
desiredResources api.Resources,
) (*ActionPluginRequest, *time.Duration) {
logFailureReason := func(reason string) {
s.warnf("Wanted to make a request to the scheduler plugin, but %s", reason)
}
// additional resources we want to request OR previous downscaling we need to inform the plugin of
// NOTE: only valid if s.plugin.permit != nil AND there's no ongoing NeonVM request.
requestResources := s.clampResources(
s.VM.Using(),
desiredResources,
ptr(s.VM.Using()), // don't decrease below VM using (decrease happens *before* telling the plugin)
nil, // but any increase is ok
)
// resources if we're just informing the plugin of current resource usage.
currentResources := s.VM.Using()
if s.NeonVM.OngoingRequested != nil {
// include any ongoing NeonVM request, because we're already using that.
currentResources = currentResources.Max(*s.NeonVM.OngoingRequested)
}
// We want to make a request to the scheduler plugin if:
// 1. it's been long enough since the previous request (so we're obligated by PluginRequestTick); or
// 2.a. we want to request resources / inform it of downscale;
// b. there isn't any ongoing, conflicting request; and
// c. we haven't recently been denied these resources
var timeUntilNextRequestTick time.Duration
if s.Plugin.LastRequest != nil {
timeUntilNextRequestTick = s.Config.PluginRequestTick - now.Sub(s.Plugin.LastRequest.At)
}
timeForRequest := timeUntilNextRequestTick <= 0
var timeUntilRetryBackoffExpires time.Duration
requestPreviouslyDenied := !s.Plugin.OngoingRequest &&
s.Plugin.LastRequest != nil &&
s.Plugin.Permit != nil &&
s.Plugin.LastRequest.Resources.HasFieldGreaterThan(*s.Plugin.Permit)
if requestPreviouslyDenied {
timeUntilRetryBackoffExpires = s.Plugin.LastRequest.At.Add(s.Config.PluginDeniedRetryWait).Sub(now)
}
waitingOnRetryBackoff := timeUntilRetryBackoffExpires > 0
// changing the resources we're requesting from the plugin
wantToRequestNewResources := s.Plugin.LastRequest != nil && s.Plugin.Permit != nil &&
requestResources != *s.Plugin.Permit
// ... and this isn't a duplicate (or, at least it's been long enough)
shouldRequestNewResources := wantToRequestNewResources && !waitingOnRetryBackoff
permittedRequestResources := requestResources
if !shouldRequestNewResources {
permittedRequestResources = currentResources
}
// Can't make a duplicate request
if s.Plugin.OngoingRequest {
// ... but if the desired request is different from what we would be making,
// then it's worth logging
if s.Plugin.LastRequest.Resources != permittedRequestResources {
logFailureReason("there's already an ongoing request for different resources")
}
return nil, nil
}
// Can't make a request if we failed too recently
if s.Plugin.LastFailureAt != nil {
timeUntilFailureBackoffExpires := s.Plugin.LastFailureAt.Add(s.Config.PluginRetryWait).Sub(now)
if timeUntilFailureBackoffExpires > 0 {
logFailureReason("previous request failed too recently")
return nil, &timeUntilFailureBackoffExpires
}
}
// At this point, all that's left is either making the request, or saying to wait.
// The rest of the complication is just around accurate logging.
if timeForRequest || shouldRequestNewResources {
return &ActionPluginRequest{
LastPermit: s.Plugin.Permit,
Target: permittedRequestResources,
// convert maybe-nil '*SystemMetrics' to maybe-nil '*api.Metrics'
Metrics: func() *api.Metrics {
if s.Metrics != nil {
return lo.ToPtr(s.Metrics.ToAPI())
} else {
return nil
}
}(),
TargetRevision: s.TargetRevision.WithTime(now),
}, nil
} else {
if wantToRequestNewResources && waitingOnRetryBackoff {
logFailureReason("previous request for more resources was denied too recently")
}
waitTime := timeUntilNextRequestTick
if waitingOnRetryBackoff {
waitTime = util.Min(waitTime, timeUntilRetryBackoffExpires)
}
return nil, &waitTime
}
}
func ptr[T any](t T) *T { return &t }
func (s *state) calculateNeonVMAction(
now time.Time,
desiredResources api.Resources,
pluginRequested *api.Resources,
pluginRequestedPhase string,
) (*ActionNeonVMRequest, *time.Duration) {
targetRevision := s.TargetRevision
if desiredResources.HasFieldLessThan(s.VM.Using()) && s.Monitor.CurrentRevision.Value > 0 {
// We are downscaling, so we needed a permit from the monitor
targetRevision = targetRevision.Min(s.Monitor.CurrentRevision)
}
if desiredResources.HasFieldGreaterThan(s.VM.Using()) && s.Plugin.CurrentRevision.Value > 0 {
// We are upscaling, so we needed a permit from the plugin
targetRevision = targetRevision.Min(s.Plugin.CurrentRevision)
}
// clamp desiredResources to what we're allowed to make a request for
desiredResources = s.clampResources(
s.VM.Using(), // current: what we're using already
desiredResources, // target: desired resources
ptr(s.monitorApprovedLowerBound()), // lower bound: downscaling that the monitor has approved
ptr(s.pluginApprovedUpperBound()), // upper bound: upscaling that the plugin has approved
)
// If we're already using the desired resources, then no need to make a request
if s.VM.Using() == desiredResources {
return nil, nil
}
conflictingPluginRequest := pluginRequested != nil && pluginRequested.HasFieldLessThan(desiredResources)
if !s.NeonVM.ongoingRequest() && !conflictingPluginRequest {
// We *should* be all clear to make a request; not allowed to make one if we failed too
// recently
if s.NeonVM.RequestFailedAt != nil {
timeUntilFailureBackoffExpires := s.NeonVM.RequestFailedAt.Add(s.Config.NeonVMRetryWait).Sub(now)
if timeUntilFailureBackoffExpires > 0 {
s.warn("Wanted to make a request to NeonVM API, but recent request failed too recently")
return nil, &timeUntilFailureBackoffExpires
}
}
s.NeonVM.TargetRevision = targetRevision.WithTime(now)
return &ActionNeonVMRequest{
Current: s.VM.Using(),
Target: desiredResources,
TargetRevision: s.NeonVM.TargetRevision,
}, nil
} else {
var reqs []string
if s.Plugin.OngoingRequest {
reqs = append(reqs, fmt.Sprintf("plugin request %s", pluginRequestedPhase))
}
if s.NeonVM.ongoingRequest() && *s.NeonVM.OngoingRequested != desiredResources {
reqs = append(reqs, "NeonVM request (for different resources) ongoing")
}
if len(reqs) != 0 {
s.warnf("Wanted to make a request to NeonVM API, but there's already %s", strings.Join(reqs, " and "))
}
return nil, nil
}
}
func (s *state) calculateMonitorUpscaleAction(
now time.Time,
desiredResources api.Resources,
) (*ActionMonitorUpscale, *time.Duration) {
// can't do anything if we don't have an active connection to the vm-monitor
if !s.Monitor.active() {
return nil, nil
}
requestResources := s.clampResources(
*s.Monitor.Approved, // current: last resources we got the OK from the monitor on
s.VM.Using(), // target: what the VM is currently using
ptr(*s.Monitor.Approved), // don't decrease below what the monitor is currently set to (this is an *upscale* request)
ptr(desiredResources.Max(*s.Monitor.Approved)), // don't increase above desired resources
)
// Clamp the request resources so we're not increasing by more than 1 CU:
requestResources = s.clampResources(
*s.Monitor.Approved,
requestResources,
nil, // no lower bound
ptr(requestResources.Add(s.Config.ComputeUnit)), // upper bound: must not increase by >1 CU
)
// Check validity of the request that we would send, before sending it
if requestResources.HasFieldLessThan(*s.Monitor.Approved) {
panic(fmt.Errorf(
"resources for vm-monitor upscaling are less than what was last approved: %+v has field less than %+v",
requestResources,
*s.Monitor.Approved,
))
}
wantToDoRequest := requestResources != *s.Monitor.Approved
if !wantToDoRequest {
return nil, nil
}
// Can't make another request if there's already one ongoing
if s.Monitor.OngoingRequest != nil {
var requestDescription string
if s.Monitor.OngoingRequest.Kind == monitorRequestKindUpscale && s.Monitor.OngoingRequest.Requested != requestResources {
requestDescription = "upscale request (for different resources)"
} else if s.Monitor.OngoingRequest.Kind == monitorRequestKindDownscale {
requestDescription = "downscale request"
}
if requestDescription != "" {
s.warnf("Wanted to send vm-monitor upscale request, but waiting on ongoing %s", requestDescription)
}
return nil, nil
}
// Can't make another request if we failed too recently:
if s.Monitor.UpscaleFailureAt != nil {
timeUntilFailureBackoffExpires := s.Monitor.UpscaleFailureAt.Add(s.Config.MonitorRetryWait).Sub(now)
if timeUntilFailureBackoffExpires > 0 {
s.warn("Wanted to send vm-monitor upscale request, but failed too recently")
return nil, &timeUntilFailureBackoffExpires
}
}
// Otherwise, we can make the request:
return &ActionMonitorUpscale{
Current: *s.Monitor.Approved,
Target: requestResources,
TargetRevision: s.TargetRevision.WithTime(now),
}, nil
}
func (s *state) calculateMonitorDownscaleAction(
now time.Time,
desiredResources api.Resources,
plannedUpscaleRequest bool,
) (*ActionMonitorDownscale, *time.Duration) {
// can't do anything if we don't have an active connection to the vm-monitor
if !s.Monitor.active() {
if desiredResources.HasFieldLessThan(s.VM.Using()) {
s.warn("Wanted to send vm-monitor downscale request, but there's no active connection")
}
return nil, nil
}
requestResources := s.clampResources(
*s.Monitor.Approved, // current: what the monitor is already aware of
desiredResources, // target: what we'd like the VM to be using
nil, // lower bound: any decrease is fine
ptr(*s.Monitor.Approved), // upper bound: don't increase (this is only downscaling!)
)
// Clamp the request resources so we're not decreasing by more than 1 CU:
requestResources = s.clampResources(
*s.Monitor.Approved,
requestResources,
ptr(s.Monitor.Approved.SaturatingSub(s.Config.ComputeUnit)), // Must not decrease by >1 CU
nil, // no upper bound
)
// Check validity of the request that we would send, before sending it
if requestResources.HasFieldGreaterThan(*s.Monitor.Approved) {
panic(fmt.Errorf(
"resources for vm-monitor downscaling are greater than what was last approved: %+v has field greater than %+v",
requestResources,
*s.Monitor.Approved,
))
}
wantToDoRequest := requestResources != *s.Monitor.Approved
if !wantToDoRequest {
return nil, nil
}
// Can't make another request if there's already one ongoing (or if an upscaling request is
// planned)
if plannedUpscaleRequest {
s.warn("Wanted to send vm-monitor downscale request, but waiting on other planned upscale request")
return nil, nil
} else if s.Monitor.OngoingRequest != nil {
var requestDescription string
if s.Monitor.OngoingRequest.Kind == monitorRequestKindDownscale && s.Monitor.OngoingRequest.Requested != requestResources {
requestDescription = "downscale request (for different resources)"
} else if s.Monitor.OngoingRequest.Kind == monitorRequestKindUpscale {
requestDescription = "upscale request"
}
if requestDescription != "" {
s.warnf("Wanted to send vm-monitor downscale request, but waiting on other ongoing %s", requestDescription)
}
return nil, nil
}
// Can't make another request if we failed too recently:
if s.Monitor.DownscaleFailureAt != nil {
timeUntilFailureBackoffExpires := s.Monitor.DownscaleFailureAt.Add(s.Config.MonitorRetryWait).Sub(now)
if timeUntilFailureBackoffExpires > 0 {
s.warn("Wanted to send vm-monitor downscale request but failed too recently")
return nil, &timeUntilFailureBackoffExpires
}
}
// Can't make another request if a recent request for resources less than or equal to the
// proposed request was denied. In general though, this should be handled by
// DesiredResourcesFromMetricsOrRequestedUpscaling, so we're better off panicking here.
if s.timeUntilDeniedDownscaleExpired(now) > 0 && !s.Monitor.DeniedDownscale.Requested.HasFieldLessThan(requestResources) {
panic(errors.New(
"Wanted to send vm-monitor downscale request, but too soon after previously denied downscaling that should have been handled earlier",
))
}
// Nothing else to check, we're good to make the request
return &ActionMonitorDownscale{
Current: *s.Monitor.Approved,
Target: requestResources,
TargetRevision: s.TargetRevision.WithTime(now),
}, nil
}
func (s *state) scalingConfig() api.ScalingConfig {
// nb: WithOverrides allows its arg to be nil, in which case it does nothing.
return s.Config.DefaultScalingConfig.WithOverrides(s.VM.Config.ScalingConfig)
}
// public version, for testing.
func (s *State) DesiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) (api.Resources, func(ActionSet) *time.Duration) {
return s.internal.desiredResourcesFromMetricsOrRequestedUpscaling(now)
}
func (s *state) desiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) (api.Resources, func(ActionSet) *time.Duration) {
// There's some annoying edge cases that this function has to be able to handle properly. For
// the sake of completeness, they are:
//
// 1. s.vm.Using() is not a multiple of s.computeUnit
// 2. s.vm.Max() is less than s.computeUnit (or: has at least one resource that is)
// 3. s.vm.Using() is a fractional multiple of s.computeUnit, but !allowDecrease and rounding up
// is greater than s.vm.Max()
// 4. s.vm.Using() is much larger than s.vm.Min() and not a multiple of s.computeUnit, but load
// is low so we should just decrease *anyways*.
//
// ---
//
// Broadly, the implementation works like this:
// For CPU:
// Based on load average, calculate the "goal" number of CPUs (and therefore compute units)
//
// For Memory:
// Based on memory usage, calculate the VM's desired memory allocation and extrapolate a
// goal number of CUs from that.
//
// 1. Take the maximum of these two goal CUs to create a unified goal CU
// 2. Cap the goal CU by min/max, etc
// 3. that's it!
// Record whether we have all the metrics we'll need.
// If not, we'll later prevent downscaling, so that (for example) an autoscaler-agent restart that
// has SystemMetrics but not LFCMetrics yet doesn't cause us to flush the VM's cache.
hasAllMetrics := s.Metrics != nil && (!*s.scalingConfig().EnableLFCMetrics || s.LFCMetrics != nil)
if !hasAllMetrics {
s.warn("Making scaling decision without all required metrics available")
}
var goalCU uint32
if s.Metrics != nil {
// For CPU:
// Goal compute unit is at the point where (CPUs) × (LoadAverageFractionTarget) == (load
// average),
// which we can get by dividing LA by LAFT, and then dividing by the number of CPUs per CU
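// For example (hypothetical numbers): with a 1-minute load average of 4.5 and
// LoadAverageFractionTarget = 0.9, goalCPUs = 5; if a compute unit has 0.25 vCPU,
// cpuGoalCU = round(5 / 0.25) = 20.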
goalCPUs := s.Metrics.LoadAverage1Min / *s.scalingConfig().LoadAverageFractionTarget
cpuGoalCU := uint32(math.Round(goalCPUs / s.Config.ComputeUnit.VCPU.AsFloat64()))
// For Mem:
// Goal compute unit is at the point where (Mem) * (MemoryUsageFractionTarget) == (Mem Usage)
// We can get the desired memory allocation in bytes by dividing MU by MUFT, and then convert
// that to CUs
//
// NOTE: use uint64 for calculations on bytes as uint32 can overflow
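// For example (hypothetical numbers): with 3 GiB of memory in use and
// MemoryUsageFractionTarget = 0.75, memGoalBytes = 4 GiB; if a compute unit has 1 GiB of memory,
// memGoalCU = 4.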
memGoalBytes := api.Bytes(math.Round(s.Metrics.MemoryUsageBytes / *s.scalingConfig().MemoryUsageFractionTarget))
memGoalCU := uint32(memGoalBytes / s.Config.ComputeUnit.Mem)
goalCU = util.Max(cpuGoalCU, memGoalCU)
}
// For LFC metrics, if enabled:
var lfcLogFields func(zapcore.ObjectEncoder) error
if s.LFCMetrics != nil {
cfg := s.scalingConfig()
wssValues := s.LFCMetrics.ApproximateworkingSetSizeBuckets
// At this point, we can assume that the values are equally spaced at 1 minute apart,
// starting at 1 minute.
offsetIndex := *cfg.LFCMinWaitBeforeDownscaleMinutes - 1 // -1 because values start at 1m
windowSize := *cfg.LFCWindowSizeMinutes
// Handle invalid metrics:
if len(wssValues) < offsetIndex+windowSize {
s.warn("not enough working set size values to make scaling determination")
} else {
estimateWss := EstimateTrueWorkingSetSize(wssValues, WssEstimatorConfig{
MaxAllowedIncreaseFactor: 3.0, // hard-code this for now.
InitialOffset: offsetIndex,
WindowSize: windowSize,
})
projectSliceEnd := offsetIndex // start at offsetIndex to avoid panics if not monotonically non-decreasing
for ; projectSliceEnd < len(wssValues) && wssValues[projectSliceEnd] <= estimateWss; projectSliceEnd++ {
}
projectLen := 0.5 // hard-code this for now.
predictedHighestNextMinute := ProjectNextHighest(wssValues[:projectSliceEnd], projectLen)
// predictedHighestNextMinute is still in units of 8KiB pages. Let's convert that
// into GiB, then convert that into CU, and then invert the discount from only some
// of the memory going towards LFC to get the actual CU required to fit the
// predicted working set size.
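// For example (hypothetical numbers): a predicted working set of 262144 pages is 2 GiB; with a
// 1 GiB compute unit and LFCToMemoryRatio = 0.75, requiredCU = 2 / 0.75 ≈ 2.67, so lfcGoalCU = 3.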
requiredCU := predictedHighestNextMinute * 8192 / s.Config.ComputeUnit.Mem.AsFloat64() / *cfg.LFCToMemoryRatio
lfcGoalCU := uint32(math.Ceil(requiredCU))
goalCU = util.Max(goalCU, lfcGoalCU)
lfcLogFields = func(obj zapcore.ObjectEncoder) error {
obj.AddFloat64("estimateWssPages", estimateWss)
obj.AddFloat64("predictedNextWssPages", predictedHighestNextMinute)
obj.AddFloat64("requiredCU", requiredCU)
return nil
}
}
}
// Copy the initial value of the goal CU so that we can accurately track whether either
// requested upscaling or denied downscaling affected the outcome.
// Otherwise, updating goalCU from requested upscaling first could incorrectly hide the fact that
// denied downscaling would have had the same effect.
initialGoalCU := goalCU
var requestedUpscalingAffectedResult bool
// Update goalCU based on any explicitly requested upscaling
timeUntilRequestedUpscalingExpired := s.timeUntilRequestedUpscalingExpired(now)
requestedUpscalingInEffect := timeUntilRequestedUpscalingExpired > 0
if requestedUpscalingInEffect {
reqCU := s.requiredCUForRequestedUpscaling(s.Config.ComputeUnit, *s.Monitor.RequestedUpscale)
if reqCU > initialGoalCU {
// FIXME: this isn't quite correct, because if initialGoalCU is already equal to the
// maximum goal CU we *could* have, this won't actually have an effect.
requestedUpscalingAffectedResult = true
goalCU = util.Max(goalCU, reqCU)
}
}
var deniedDownscaleAffectedResult bool
// Update goalCU based on any previously denied downscaling
timeUntilDeniedDownscaleExpired := s.timeUntilDeniedDownscaleExpired(now)
deniedDownscaleInEffect := timeUntilDeniedDownscaleExpired > 0
if deniedDownscaleInEffect {
reqCU := s.requiredCUForDeniedDownscale(s.Config.ComputeUnit, s.Monitor.DeniedDownscale.Requested)
if reqCU > initialGoalCU {
deniedDownscaleAffectedResult = true
goalCU = util.Max(goalCU, reqCU)
}
}
// resources for the desired "goal" compute units
goalResources := s.Config.ComputeUnit.Mul(uint16(goalCU))
// If we don't have all the metrics we need to make a proper decision, make sure that we aren't
// going to scale down below the current resources.
// Otherwise, we can make an under-informed decision that has undesirable impacts (e.g., scaling
// down because we don't have LFC metrics and flushing the cache because of it).
if !hasAllMetrics {
goalResources = goalResources.Max(s.VM.Using())
}
// bound goalResources by the minimum and maximum resource amounts for the VM
result := goalResources.Min(s.VM.Max()).Max(s.VM.Min())
// ... but if we aren't allowed to downscale, then we *must* make sure that the VM's usage value
// won't decrease to the previously denied amount, even if it's greater than the maximum.
//
// We can run into situations like this when VM scale-down on bounds change fails, so we end up
// with a usage value greater than the maximum.
//
// It's not a great situation to be in, but it's easier to make the policy "give the users a
// little extra if we mess up" than "oops we OOM-killed your DB, hope you weren't doing anything".
if deniedDownscaleInEffect {
// roughly equivalent to "result >= s.monitor.deniedDownscale.requested"
if !result.HasFieldGreaterThan(s.Monitor.DeniedDownscale.Requested) {
// This can only happen if s.vm.Max() is less than goalResources, because otherwise this
// would have been factored into goalCU, affecting goalResources. Hence, the warning.
s.warn("Can't decrease desired resources to within VM maximum because of vm-monitor previously denied downscale request")
}
preMaxResult := result
result = result.Max(s.minRequiredResourcesForDeniedDownscale(s.Config.ComputeUnit, *s.Monitor.DeniedDownscale))
if result != preMaxResult {
deniedDownscaleAffectedResult = true
}
}
// Check that the result is sound.
//
// With the current (naive) implementation, this is trivially ok. In future versions, it might
// not be so simple, so it's good to have this integrity check here.
if !deniedDownscaleAffectedResult && result.HasFieldGreaterThan(s.VM.Max()) {
panic(fmt.Errorf(
"produced invalid desired state: result has field greater than max. this = %+v", *s,
))
} else if result.HasFieldLessThan(s.VM.Min()) {
panic(fmt.Errorf(
"produced invalid desired state: result has field less than min. this = %+v", *s,
))
}
calculateWaitTime := func(actions ActionSet) *time.Duration {
var waiting bool
waitTime := time.Duration(int64(1<<63 - 1)) // time.Duration is an int64. As an "unset" value, use the maximum.
if deniedDownscaleAffectedResult && actions.MonitorDownscale == nil && s.Monitor.OngoingRequest == nil {
waitTime = util.Min(waitTime, timeUntilDeniedDownscaleExpired)
waiting = true
}
if requestedUpscalingAffectedResult {
waitTime = util.Min(waitTime, timeUntilRequestedUpscalingExpired)
waiting = true
}
if waiting {
return &waitTime
} else {
return nil
}
}
s.updateTargetRevision(now, result, s.VM.Using())
// TODO: we are both saving the result into LastDesiredResources and returning it. This is
// redundant, and we should remove one of the two.
s.LastDesiredResources = &result
logFields := []zap.Field{
zap.Object("current", s.VM.Using()),
zap.Object("target", result),
zap.Object("targetRevision", &s.TargetRevision),
}
if lfcLogFields != nil {
logFields = append(logFields, zap.Object("lfc", zapcore.ObjectMarshalerFunc(lfcLogFields)))
}
s.info("Calculated desired resources", logFields...)
return result, calculateWaitTime
}
func (s *state) updateTargetRevision(now time.Time, desired api.Resources, current api.Resources) {
if s.LastDesiredResources == nil {
s.LastDesiredResources = &current
}
if *s.LastDesiredResources == desired {
// Nothing changed, so no need to update the target revision
return
}
var flags vmv1.Flag
if desired.HasFieldGreaterThan(*s.LastDesiredResources) {
flags.Set(revsource.Upscale)
}
if desired.HasFieldLessThan(*s.LastDesiredResources) {
flags.Set(revsource.Downscale)
}
s.TargetRevision = s.Config.RevisionSource.Next(now, flags)
}
func (s *state) updateNeonVMCurrentRevision(currentRevision vmv1.RevisionWithTime) {
revsource.Propagate(currentRevision.UpdatedAt.Time,
s.NeonVM.TargetRevision,
&s.NeonVM.CurrentRevision,
s.Config.ObservabilityCallbacks.NeonVMLatency,
)
err := s.Config.RevisionSource.Observe(currentRevision.UpdatedAt.Time, currentRevision.Revision)
if err != nil {
s.warnf("Failed to observe clock source: %v", err)
}
// We also zero out LastDesiredResources, because we are now starting from
// a new current resources.
s.LastDesiredResources = nil
}
func (s *state) timeUntilRequestedUpscalingExpired(now time.Time) time.Duration {
if s.Monitor.RequestedUpscale != nil {
return s.Monitor.RequestedUpscale.At.Add(s.Config.MonitorRequestedUpscaleValidPeriod).Sub(now)
} else {
return 0
}
}
// NB: we could just use s.plugin.computeUnit or s.monitor.requestedUpscale from inside the
// function, but those are sometimes nil. This way, it's clear that it's the caller's responsibility
// to ensure that the values are non-nil.
func (s *state) requiredCUForRequestedUpscaling(computeUnit api.Resources, requestedUpscale requestedUpscale) uint32 {
var required uint32
requested := requestedUpscale.Requested
base := requestedUpscale.Base
// note: 1 + floor(x / M) gives the minimum integer value greater than x / M.
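// For example (hypothetical values): upscaling from base.VCPU = 1000m with a 250m compute unit
// gives 1 + floor(1000/250) = 5 CUs, i.e. strictly more than the base amount.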
if requested.Cpu {
required = util.Max(required, 1+uint32(base.VCPU/computeUnit.VCPU))
}
if requested.Memory {
required = util.Max(required, 1+uint32(base.Mem/computeUnit.Mem))
}
return required
}
func (s *state) timeUntilDeniedDownscaleExpired(now time.Time) time.Duration {
if s.Monitor.DeniedDownscale != nil {
return s.Monitor.DeniedDownscale.At.Add(s.Config.MonitorDeniedDownscaleCooldown).Sub(now)
} else {
return 0
}
}
// NB: like requiredCUForRequestedUpscaling, we make the caller provide the values so that it's
// more clear that it's the caller's responsibility to ensure the values are non-nil.
func (s *state) requiredCUForDeniedDownscale(computeUnit, deniedResources api.Resources) uint32 {
// note: floor(x / M) + 1 gives the minimum integer value greater than x / M.
requiredFromCPU := 1 + uint32(deniedResources.VCPU/computeUnit.VCPU)
requiredFromMem := 1 + uint32(deniedResources.Mem/computeUnit.Mem)
return util.Max(requiredFromCPU, requiredFromMem)
}
func (s *state) minRequiredResourcesForDeniedDownscale(computeUnit api.Resources, denied deniedDownscale) api.Resources {
// for each resource, increase the value by one CU's worth, but not greater than the value we
// were at while attempting to downscale.
//
// phrasing it like this cleanly handles some subtle edge cases when denied.current isn't a
// multiple of the compute unit.
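// For example (hypothetical values): if the denied downscale requested 2.5 GiB with a 1 GiB
// compute unit, the memory bound is min(denied.Current.Mem, 3 GiB): one CU above the requested
// amount, but never above what we were using when the downscale was denied.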
return api.Resources{
VCPU: util.Min(denied.Current.VCPU, computeUnit.VCPU*(1+denied.Requested.VCPU/computeUnit.VCPU)),
Mem: util.Min(denied.Current.Mem, computeUnit.Mem*(1+denied.Requested.Mem/computeUnit.Mem)),
}
}
// clampResources uses the directionality of the difference between s.vm.Using() and desired to
// clamp the desired resources with the upper *or* lower bound
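// For example (hypothetical values): if desired.VCPU is below current.VCPU, only lowerBound.VCPU
// applies (desired 1000m, current 4000m, lowerBound 2000m clamps to 2000m); if it is above
// current.VCPU, only upperBound.VCPU applies. Memory is clamped the same way, independently.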
func (s *state) clampResources(
current api.Resources,
desired api.Resources,
lowerBound *api.Resources,
upperBound *api.Resources,
) api.Resources {
// Internal validity checks:
if lowerBound != nil && lowerBound.HasFieldGreaterThan(current) {
panic(fmt.Errorf(
"clampResources called with invalid arguments: lowerBound=%+v has field greater than current=%+v",
lowerBound,
current,
))
} else if upperBound != nil && upperBound.HasFieldLessThan(current) {
panic(fmt.Errorf(
"clampResources called with invalid arguments: upperBound=%+v has field less than current=%+v",
upperBound,
current,
))
}
cpu := desired.VCPU
if desired.VCPU < current.VCPU && lowerBound != nil {
cpu = util.Max(desired.VCPU, lowerBound.VCPU)
} else if desired.VCPU > current.VCPU && upperBound != nil {
cpu = util.Min(desired.VCPU, upperBound.VCPU)
}
mem := desired.Mem
if desired.Mem < current.Mem && lowerBound != nil {
mem = util.Max(desired.Mem, lowerBound.Mem)
} else if desired.Mem > current.Mem && upperBound != nil {
mem = util.Min(desired.Mem, upperBound.Mem)
}
return api.Resources{VCPU: cpu, Mem: mem}
}
func (s *state) monitorApprovedLowerBound() api.Resources {
if s.Monitor.Approved != nil {
return *s.Monitor.Approved
} else {
return s.VM.Using()
}
}
func (s *state) pluginApprovedUpperBound() api.Resources {
if s.Plugin.Permit != nil {
return *s.Plugin.Permit
} else {
return s.VM.Using()
}
}
//////////////////////////////////////////
// PUBLIC FUNCTIONS TO UPDATE THE STATE //
//////////////////////////////////////////
// Debug sets s.debug = enabled. This method is exclusively meant to be used in tests, to make it
// easier to enable print debugging only for a single call to NextActions, via s.warn() or otherwise.
func (s *State) Debug(enabled bool) {
s.internal.Debug = enabled
}
func (s *State) UpdatedVM(vm api.VmInfo) {
// FIXME: overriding this is required right now because we trust that a successful request to
// NeonVM means the VM was already updated, which... isn't true, and otherwise we could run into
// sync issues.
// A first-pass solution is possible by reading the values of VirtualMachine.Spec, but the
// "proper" solution would read from VirtualMachine.Status, which (at time of writing) isn't
// sound. For more, see:
// - https://github.com/neondatabase/autoscaling/pull/371#issuecomment-1752110131
// - https://github.com/neondatabase/autoscaling/issues/462
vm.SetUsing(s.internal.VM.Using())
s.internal.VM = vm
if vm.CurrentRevision != nil {
s.internal.updateNeonVMCurrentRevision(*vm.CurrentRevision)
}
// Make sure that if LFC metrics are disabled & later enabled, we don't make decisions based on
// stale data.
if !*s.internal.scalingConfig().EnableLFCMetrics {
s.internal.LFCMetrics = nil
}
}
func (s *State) UpdateSystemMetrics(metrics SystemMetrics) {
s.internal.Metrics = &metrics
}
func (s *State) UpdateLFCMetrics(metrics LFCMetrics) {
s.internal.LFCMetrics = &metrics
}
// PluginHandle provides write access to the scheduler plugin pieces of an UpdateState
type PluginHandle struct {
s *state
}
func (s *State) Plugin() PluginHandle {
return PluginHandle{&s.internal}
}
func (h PluginHandle) StartingRequest(now time.Time, resources api.Resources) {
h.s.Plugin.LastRequest = &pluginRequested{
At: now,
Resources: resources,
}
h.s.Plugin.OngoingRequest = true
}
func (h PluginHandle) RequestFailed(now time.Time) {
h.s.Plugin.OngoingRequest = false
h.s.Plugin.LastFailureAt = &now
}
func (h PluginHandle) RequestSuccessful(
now time.Time,
targetRevision vmv1.RevisionWithTime,
resp api.PluginResponse,
) (_err error) {
h.s.Plugin.OngoingRequest = false
defer func() {
if _err != nil {
h.s.Plugin.LastFailureAt = &now
}
}()
if err := resp.Permit.ValidateNonZero(); err != nil {
return fmt.Errorf("Invalid permit: %w", err)
}
// Errors from resp in connection with the prior request
if resp.Permit.HasFieldGreaterThan(h.s.Plugin.LastRequest.Resources) {
return fmt.Errorf(
"Permit has resources greater than request (%+v vs. %+v)",
resp.Permit, h.s.Plugin.LastRequest.Resources,
)
}
// Errors from resp in connection with the prior request AND the VM state
if vmUsing := h.s.VM.Using(); resp.Permit.HasFieldLessThan(vmUsing) {
return fmt.Errorf("Permit has resources less than VM (%+v vs %+v)", resp.Permit, vmUsing)
}
// All good - set everything.
// NOTE: We don't set the compute unit, even though the plugin response contains it. We're in
// the process of moving the source of truth for ComputeUnit from the scheduler plugin to the
// autoscaler-agent.
h.s.Plugin.Permit = &resp.Permit
revsource.Propagate(now,
targetRevision,
&h.s.Plugin.CurrentRevision,
h.s.Config.ObservabilityCallbacks.PluginLatency,
)
return nil
}
// MonitorHandle provides write access to the vm-monitor pieces of an UpdateState
type MonitorHandle struct {
s *state
}
func (s *State) Monitor() MonitorHandle {
return MonitorHandle{&s.internal}
}
func (h MonitorHandle) Reset() {
h.s.Monitor = monitorState{
OngoingRequest: nil,
RequestedUpscale: nil,
DeniedDownscale: nil,
Approved: nil,
DownscaleFailureAt: nil,
UpscaleFailureAt: nil,
CurrentRevision: vmv1.ZeroRevision,
}
}
func (h MonitorHandle) Active(active bool) {
if active {
approved := h.s.VM.Using()
h.s.Monitor.Approved = &approved // TODO: this is racy
} else {
h.s.Monitor.Approved = nil
}
}
func (h MonitorHandle) UpscaleRequested(now time.Time, resources api.MoreResources) {
h.s.Monitor.RequestedUpscale = &requestedUpscale{
At: now,
Base: *h.s.Monitor.Approved,
Requested: resources,
}
}
func (h MonitorHandle) StartingUpscaleRequest(now time.Time, resources api.Resources) {
h.s.Monitor.OngoingRequest = &ongoingMonitorRequest{
Kind: monitorRequestKindUpscale,
Requested: resources,
}
h.s.Monitor.UpscaleFailureAt = nil
}
func (h MonitorHandle) UpscaleRequestSuccessful(now time.Time) {
h.s.Monitor.Approved = &h.s.Monitor.OngoingRequest.Requested
h.s.Monitor.OngoingRequest = nil
}
func (h MonitorHandle) UpscaleRequestFailed(now time.Time) {
h.s.Monitor.OngoingRequest = nil
h.s.Monitor.UpscaleFailureAt = &now
}
func (h MonitorHandle) StartingDownscaleRequest(now time.Time, resources api.Resources) {
h.s.Monitor.OngoingRequest = &ongoingMonitorRequest{
Kind: monitorRequestKindDownscale,
Requested: resources,
}
h.s.Monitor.DownscaleFailureAt = nil
}
func (h MonitorHandle) DownscaleRequestAllowed(now time.Time, rev vmv1.RevisionWithTime) {
h.s.Monitor.Approved = &h.s.Monitor.OngoingRequest.Requested
h.s.Monitor.OngoingRequest = nil
revsource.Propagate(now,
rev,
&h.s.Monitor.CurrentRevision,
h.s.Config.ObservabilityCallbacks.MonitorLatency,
)
}
// Downscale request was successful but the monitor denied our request.
func (h MonitorHandle) DownscaleRequestDenied(now time.Time, targetRevision vmv1.RevisionWithTime) {
h.s.Monitor.DeniedDownscale = &deniedDownscale{
At: now,
Current: *h.s.Monitor.Approved,
Requested: h.s.Monitor.OngoingRequest.Requested,
}
h.s.Monitor.OngoingRequest = nil
revsource.Propagate(now,
targetRevision,
&h.s.Monitor.CurrentRevision,
h.s.Config.ObservabilityCallbacks.MonitorLatency,
)
}
func (h MonitorHandle) DownscaleRequestFailed(now time.Time) {
h.s.Monitor.OngoingRequest = nil
h.s.Monitor.DownscaleFailureAt = &now
}
type NeonVMHandle struct {
s *state
}
func (s *State) NeonVM() NeonVMHandle {
return NeonVMHandle{&s.internal}
}
func (h NeonVMHandle) StartingRequest(now time.Time, resources api.Resources) {
// FIXME: add time to ongoing request info (or maybe only in RequestFailed?)
h.s.NeonVM.OngoingRequested = &resources
}
func (h NeonVMHandle) RequestSuccessful(now time.Time) {
if h.s.NeonVM.OngoingRequested == nil {
panic("received NeonVM().RequestSuccessful() update without ongoing request")
}
resources := *h.s.NeonVM.OngoingRequested
// FIXME: This is actually incorrect; we shouldn't trust that the VM has already been updated
// just because the request completed. It takes longer for the reconcile cycle(s) to make the
// necessary changes.
// See the comments in (*State).UpdatedVM() for more info.
h.s.VM.SetUsing(resources)
h.s.NeonVM.OngoingRequested = nil
}
func (h NeonVMHandle) RequestFailed(now time.Time) {
h.s.NeonVM.OngoingRequested = nil
h.s.NeonVM.RequestFailedAt = &now
}
package core
// Working set size estimation
// For more, see: https://www.notion.so/neondatabase/874ef1cc942a4e6592434dbe9e609350
import (
"fmt"
)
type WssEstimatorConfig struct {
// MaxAllowedIncreaseFactor is the maximum tolerable increase in slope between windows.
// If the slope increases by more than this factor, we cut off the working set size estimate at
// the border between the two windows.
MaxAllowedIncreaseFactor float64
// InitialOffset is the index of the minimum working set size we must consider.
//
// In practice, this is taken from the scaling config's LFCMinWaitBeforeDownscaleMinutes, with
// the expectation that datapoints are all one minute apart, starting at 1m. So a value of 15m
// translates to an InitialOffset of 14 (-1 because indexes start at zero, but the first
// datapoint is 1m).
InitialOffset int
// WindowSize sets the offset for datapoints used in the calculation of the slope before & after
// a point. For window size W, we calculate the slope at point P as value[P]-value[P-(W-1)].
// This value must be >= 2.
//
// In practice, this value is taken from the scaling config's LFCWindowSizeMinutes, with the
// expectation that datapoints are all one minute apart. So, a value of 5 minutes translates to
// a WindowSize of 5.
WindowSize int
}
// EstimateTrueWorkingSetSize returns an estimate of the "true" current working set size, given a
// series of datapoints for the observed working set size over increasing time intervals.
//
// In practice, the 'series' is e.g., values of 'neon.lfc_approximate_working_set_size_seconds(d)'
// for equidistant values of 'd' from 1 minute to 60 minutes.
//
// This function panics if:
// * cfg.WindowSize < 2
// * cfg.InitialOffset < cfg.WindowSize - 1
func EstimateTrueWorkingSetSize(
series []float64,
cfg WssEstimatorConfig,
) float64 {
if cfg.WindowSize < 2 {
panic(fmt.Errorf("cfg.WindowSize must be >= 2 (got %v)", cfg.WindowSize))
} else if cfg.InitialOffset < cfg.WindowSize-1 {
panic(fmt.Errorf("cfg.InitialOffset must be >= cfg.WindowSize - 1 (got %v < %v - 1)", cfg.InitialOffset, cfg.WindowSize))
}
// For a window size of e.g. 5 points, we're looking back from series[t] to series[t-4], because
// series[t] is already included. (and similarly for looking forward to series[t+4]).
// 'w' is a shorthand for that -1 to make the code in the loop below cleaner.
w := cfg.WindowSize - 1
for t := cfg.InitialOffset; t < len(series)-w; t += 1 {
// In theory the HLL estimator will guarantee that - at any instant - increasing the
// duration for the working set will not decrease the value.
// However in practice, the individual values are not calculated at the same time, so we
// must still account for the possibility that series[t] < series[t-w], or similarly for
// series[t+w] and series[t].
// Hence, max(0.0, ...)
d0 := max(0.0, series[t]-series[t-w])
d1 := max(0.0, series[t+w]-series[t])
if d1 > d0*cfg.MaxAllowedIncreaseFactor {
return series[t]
}
}
return series[len(series)-1]
}
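// exampleWssCutoff is an illustrative sketch, with hypothetical datapoints and config values, of
// how EstimateTrueWorkingSetSize cuts off the estimate when the slope jumps.
func exampleWssCutoff() float64 {
	// Hypothetical working set size values (in 8KiB pages), one per minute for 1m..9m.
	series := []float64{100, 180, 220, 240, 250, 600, 1200, 1210, 1215}
	// With WindowSize = 5 (so w = 4) and InitialOffset = 4, only t = 4 is considered:
	// the backward slope is 250-100 = 150 and the forward slope is 1215-250 = 965 > 3.0*150,
	// so the estimate is cut off at series[4], i.e. the function returns 250.
	return EstimateTrueWorkingSetSize(series, WssEstimatorConfig{
		MaxAllowedIncreaseFactor: 3.0, // same hard-coded value used by the caller above
		InitialOffset:            4,   // as if LFCMinWaitBeforeDownscaleMinutes = 5
		WindowSize:               5,   // as if LFCWindowSizeMinutes = 5
	})
}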
// ProjectNextHighest looks at the rate of change between points in 'series', returning the maximum
// value if any of these slopes were to continue for 'projectLen' additional datapoints.
//
// For example, given the series '0, 1, 3, 4, 5' and a projectLen of 3, ProjectNextHighest will
// return 9, because the 1 → 3 increase would reach 9 if it continued for another 3 datapoints
// (→ 5 → 7 → 9).
//
// Internally, ProjectNextHighest is used to allow preemptive scale-up when we can see that the
// observed working set size is increasing, but we don't know how big it'll get.
// In short, this function helps answer: "How much should we scale-up to accommodate expected
// increases in demand?".
func ProjectNextHighest(series []float64, projectLen float64) float64 {
if len(series) < 2 {
panic(fmt.Errorf("Cannot ProjectNextHighest with series of length %d (must be >= 2)", len(series)))
}
highest := series[0]
for i := 1; i < len(series); i += 1 {
x0 := series[i-1]
x1 := max(x0, series[i]) // ignore decreases
predicted := x1 + (x1-x0)*projectLen
highest = max(highest, predicted)
}
return highest
}
package billing
import (
"context"
"fmt"
"github.com/Azure/azure-sdk-for-go/sdk/azcore"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/policy"
"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
)
type AzureAuthSharedKey struct {
AccountName string `json:"accountName"`
AccountKey string `json:"accountKey"`
}
type AzureBlobStorageClientConfig struct {
// In Azure, a Container is roughly equivalent to a bucket in AWS S3
Container string `json:"container"`
// Files will be created with names starting with PrefixInContainer
PrefixInContainer string `json:"prefixInContainer"`
// Example Endpoint: "https://MYSTORAGEACCOUNT.blob.core.windows.net/"
Endpoint string `json:"endpoint"`
//
// Unexported attributes follow this comment.
//
// Use generateKey for tests.
// Otherwise, keep empty.
generateKey func() string
// Use getClient for tests.
// Otherwise keep empty.
getClient func() (*azblob.Client, error)
}
type AzureError struct {
Err error
}
func (e AzureError) Error() string {
return fmt.Sprintf("Azure Blob error: %s", e.Err.Error())
}
func (e AzureError) Unwrap() error {
return e.Err
}
type AzureClient struct {
cfg AzureBlobStorageClientConfig
c *azblob.Client
}
func (c AzureClient) LogFields() zap.Field {
return zap.Inline(zapcore.ObjectMarshalerFunc(func(enc zapcore.ObjectEncoder) error {
enc.AddString("container", c.cfg.Container)
enc.AddString("prefixInContainer", c.cfg.PrefixInContainer)
enc.AddString("endpoint", c.cfg.Endpoint)
return nil
}))
}
func (c AzureClient) generateKey() string {
return c.cfg.generateKey()
}
func (c AzureClient) send(ctx context.Context, payload []byte, _ TraceID) error {
payload, err := compress(payload)
if err != nil {
return err
}
_, err = c.c.UploadBuffer(ctx, c.cfg.Container, c.generateKey(), payload,
&azblob.UploadBufferOptions{}, //nolint:exhaustruct // It's part of Azure SDK
)
return handleAzureError(err)
}
func defaultGenerateKey(cfg AzureBlobStorageClientConfig) func() string {
return func() string {
return keyTemplate(cfg.PrefixInContainer)
}
}
func defaultGetClient(cfg AzureBlobStorageClientConfig) func() (*azblob.Client, error) {
return func() (*azblob.Client, error) {
//nolint:exhaustruct // It's part of Azure SDK
clientOptions := &azblob.ClientOptions{
ClientOptions: azcore.ClientOptions{
Telemetry: policy.TelemetryOptions{ApplicationID: "neon-autoscaler"},
},
}
credential, err := azidentity.NewDefaultAzureCredential(nil)
if err != nil {
return nil, err
}
client, err := azblob.NewClient(cfg.Endpoint, credential, clientOptions)
if err != nil {
return nil, &AzureError{err}
}
return client, nil
}
}
func NewAzureBlobStorageClient(cfg AzureBlobStorageClientConfig) (*AzureClient, error) {
var client *azblob.Client
if cfg.generateKey == nil {
cfg.generateKey = defaultGenerateKey(cfg)
}
if cfg.getClient == nil {
cfg.getClient = defaultGetClient(cfg)
}
client, err := cfg.getClient()
if err != nil {
return nil, err
}
return &AzureClient{
cfg: cfg,
c: client,
}, nil
}
func handleAzureError(err error) error {
if err == nil {
return nil
}
return AzureError{err}
}
package billing
import (
"bytes"
"compress/gzip"
"context"
"encoding/json"
"fmt"
"math/rand"
"net/http"
"os"
"time"
awsconfig "github.com/aws/aws-sdk-go-v2/config"
"github.com/aws/aws-sdk-go-v2/service/s3"
"github.com/lithammer/shortuuid"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
)
var hostname string
func init() {
var err error
hostname, err = os.Hostname()
if err != nil {
hostname = fmt.Sprintf("unknown-%d", rand.Intn(1000))
}
}
// GetHostname returns the hostname to be used for enriching billing events (see Enrich())
//
// This function MUST NOT be run before init has finished.
func GetHostname() string {
return hostname
}
type Client interface {
LogFields() zap.Field
send(ctx context.Context, payload []byte, traceID TraceID) error
}
type TraceID string
func GenerateTraceID() TraceID {
return TraceID(shortuuid.New())
}
type HTTPClient struct {
URL string
httpc *http.Client
}
func NewHTTPClient(url string, c *http.Client) HTTPClient {
return HTTPClient{URL: fmt.Sprintf("%s/usage_events", url), httpc: c}
}
func (c HTTPClient) send(ctx context.Context, payload []byte, traceID TraceID) error {
r, err := http.NewRequestWithContext(ctx, http.MethodPost, c.URL, bytes.NewReader(payload))
if err != nil {
return RequestError{Err: err}
}
r.Header.Set("content-type", "application/json")
r.Header.Set("x-trace-id", string(traceID))
resp, err := c.httpc.Do(r)
if err != nil {
return RequestError{Err: err}
}
defer resp.Body.Close()
// In theory, if retries are wanted/needed, we should use an HTTP client that handles them
// itself, to avoid writing that logic here.
if resp.StatusCode != http.StatusOK {
return UnexpectedStatusCodeError{StatusCode: resp.StatusCode}
}
return nil
}
func (c HTTPClient) LogFields() zap.Field {
return zap.String("url", c.URL)
}
type S3ClientConfig struct {
Bucket string `json:"bucket"`
Region string `json:"region"`
PrefixInBucket string `json:"prefixInBucket"`
Endpoint string `json:"endpoint"`
}
type S3Client struct {
cfg S3ClientConfig
client *s3.Client
}
type S3Error struct {
Err error
}
func (e S3Error) Error() string {
return fmt.Sprintf("S3 error: %s", e.Err.Error())
}
func (e S3Error) Unwrap() error {
return e.Err
}
func NewS3Client(ctx context.Context, cfg S3ClientConfig) (*S3Client, error) {
// Timeout in case we have hidden IO inside config creation
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
s3Config, err := awsconfig.LoadDefaultConfig(ctx, awsconfig.WithRegion(cfg.Region))
if err != nil {
return nil, S3Error{Err: err}
}
client := s3.NewFromConfig(s3Config, func(o *s3.Options) {
if cfg.Endpoint != "" {
o.BaseEndpoint = &cfg.Endpoint
}
o.UsePathStyle = true // required for minio
})
return &S3Client{
cfg: cfg,
client: client,
}, nil
}
// Example: prefixInContainer/year=2021/month=01/day=26/hh:mm:ssZ_{uuid}.ndjson.gz
func keyTemplate(prefix string) string {
now := time.Now()
id := shortuuid.New()
return fmt.Sprintf("%s/year=%d/month=%02d/day=%02d/%s_%s.ndjson.gz",
prefix,
now.Year(), now.Month(), now.Day(),
now.Format("15:04:05Z"),
id,
)
}
func (c S3Client) generateKey() string {
return keyTemplate(c.cfg.PrefixInBucket)
}
func (c S3Client) LogFields() zap.Field {
return zap.Inline(zapcore.ObjectMarshalerFunc(func(enc zapcore.ObjectEncoder) error {
enc.AddString("bucket", c.cfg.Bucket)
enc.AddString("prefixInBucket", c.cfg.PrefixInBucket)
enc.AddString("region", c.cfg.Region)
enc.AddString("endpoint", c.cfg.Endpoint)
return nil
}))
}
func compress(payload []byte) ([]byte, error) {
buf := bytes.Buffer{}
gzW := gzip.NewWriter(&buf)
_, err := gzW.Write(payload)
if err != nil {
return nil, err
}
err = gzW.Close() // Have to close it before reading the buffer
if err != nil {
return nil, err
}
return buf.Bytes(), nil
}
func (c S3Client) send(ctx context.Context, payload []byte, _ TraceID) error {
// Source of truth for the storage format:
// https://github.com/neondatabase/cloud/issues/11199#issuecomment-1992549672
key := c.generateKey()
payload, err := compress(payload)
if err != nil {
return S3Error{Err: err}
}
r := bytes.NewReader(payload)
_, err = c.client.PutObject(ctx, &s3.PutObjectInput{ //nolint:exhaustruct // AWS SDK
Bucket: &c.cfg.Bucket,
Key: &key,
Body: r,
})
if err != nil {
return S3Error{Err: err}
}
return nil
}
// Enrich sets the event's Type and IdempotencyKey fields, so that users of this API don't need to
// manually set them
func Enrich[E Event](now time.Time, hostname string, countInBatch, batchSize int, event E) E {
event.setType()
// RFC3339 with microsecond precision. Milliseconds would risk collisions; nanoseconds are more than we need.
// And everything's in UTC, so there's no sense including the offset.
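// For example (hypothetical values), a generated key looks like
// "2024-03-01T12:34:56.789012Z-some-hostname-0/5" (time, hostname, index in batch / batch size).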
formattedTime := now.In(time.UTC).Format("2006-01-02T15:04:05.999999Z")
key := event.getIdempotencyKey()
if *key == "" {
*key = fmt.Sprintf("%s-%s-%d/%d", formattedTime, hostname, countInBatch, batchSize)
}
return event
}
// Send attempts to push the events to the remote endpoint.
//
// On failure, the error is guaranteed to be one of: JSONError, RequestError, or
// UnexpectedStatusCodeError.
func Send[E Event](ctx context.Context, client Client, traceID TraceID, events []E) error {
if len(events) == 0 {
return nil
}
payload, err := json.Marshal(struct {
Events []E `json:"events"`
}{Events: events})
if err != nil {
return JSONError{Err: err}
}
return client.send(ctx, payload, traceID)
}
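// sendEnrichedBatch is an illustrative sketch, with hypothetical caller-provided values, of how
// Enrich and Send are intended to compose for a batch of events.
func sendEnrichedBatch(ctx context.Context, client Client, events []*IncrementalEvent) error {
	now := time.Now()
	for i := range events {
		// Enrich sets Type and, if empty, IdempotencyKey; countInBatch/batchSize keep the
		// generated keys unique within this batch.
		events[i] = Enrich(now, GetHostname(), i, len(events), events[i])
	}
	return Send(ctx, client, GenerateTraceID(), events)
}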
type JSONError struct {
Err error
}
func (e JSONError) Error() string {
return fmt.Sprintf("Error marshaling events: %s", e.Err.Error())
}
func (e JSONError) Unwrap() error {
return e.Err
}
type RequestError struct {
Err error
}
func (e RequestError) Error() string {
return fmt.Sprintf("Error making request: %s", e.Err.Error())
}
func (e RequestError) Unwrap() error {
return e.Err
}
type UnexpectedStatusCodeError struct {
StatusCode int
}
func (e UnexpectedStatusCodeError) Error() string {
return fmt.Sprintf("Unexpected HTTP status code %d", e.StatusCode)
}
package billing
import (
"time"
)
type Event interface {
*AbsoluteEvent | *IncrementalEvent
// eventMethods must be separate from Event so that we can assert that *AbsoluteEvent and
// *IncrementalEvent both implement it - Go does not allow converting to a value of type Event
// because it contains "*AbsoluteEvent | *IncrementalEvent", and such constraints can only be
// used inside of generics.
eventMethods
}
// eventMethods is a requirement for Event, but exists separately so that we can assert that the
// event types implement it.
//
// The reason this interface even exists in the first place is because we're not allowed to assume
// that a type E implementing Event actually has the common fields from AbsoluteEvent and
// IncrementalEvent, even though it's constrained to either of those types.
type eventMethods interface {
setType()
getIdempotencyKey() *string
}
var (
_ eventMethods = (*AbsoluteEvent)(nil)
_ eventMethods = (*IncrementalEvent)(nil)
)
type AbsoluteEvent struct {
IdempotencyKey string `json:"idempotency_key"`
MetricName string `json:"metric"`
Type string `json:"type"`
TenantID string `json:"tenant_id"`
TimelineID string `json:"timeline_id"`
Time time.Time `json:"time"`
Value int `json:"value"`
}
// setType implements eventMethods
func (e *AbsoluteEvent) setType() {
e.Type = "absolute"
}
// getIdempotencyKey implements eventMethods
func (e *AbsoluteEvent) getIdempotencyKey() *string {
return &e.IdempotencyKey
}
type IncrementalEvent struct {
IdempotencyKey string `json:"idempotency_key"`
MetricName string `json:"metric"`
Type string `json:"type"`
EndpointID string `json:"endpoint_id"`
StartTime time.Time `json:"start_time"`
StopTime time.Time `json:"stop_time"`
Value int `json:"value"`
}
// setType implements eventMethods
func (e *IncrementalEvent) setType() {
e.Type = "incremental"
}
// getIdempotencyKey implements eventMethods
func (e *IncrementalEvent) getIdempotencyKey() *string {
return &e.IdempotencyKey
}
package plugin
import (
"encoding/json"
"errors"
"fmt"
"os"
"golang.org/x/exp/slices"
"k8s.io/apimachinery/pkg/api/resource"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
)
//////////////////
// CONFIG TYPES //
//////////////////
type Config struct {
// NodeConfig defines our policies around node resources and scoring
NodeConfig nodeConfig `json:"nodeConfig"`
// SchedulerName informs the scheduler of its name, so that it can identify pods that a previous
// version handled.
SchedulerName string `json:"schedulerName"`
// EventQueueWorkers sets the number of worker threads responsible for handling items from the
// event queue.
EventQueueWorkers int `json:"eventQueueWorkers"`
// StartupEventHandlingTimeoutSeconds gives the maximum duration, in seconds, that we are
// allowed to wait to finish handling all of the initial events generated by reading the cluster
// state on startup.
//
// If event processing takes longer than this time, then plugin creation will fail, and the
// scheduler pod will retry.
StartupEventHandlingTimeoutSeconds int `json:"startupEventHandlingTimeoutSeconds"`
// RandomizeScores, if true, will cause the scheduler to score a node with a random number in
// the range [minScore + 1, trueScore], instead of the trueScore
RandomizeScores bool `json:"randomizeScores"`
// MigrationDeletionRetrySeconds gives the duration, in seconds, we should wait between retrying
// a failed attempt to delete a VirtualMachineMigration that's finished.
MigrationDeletionRetrySeconds uint `json:"migrationDeletionRetrySeconds"`
// DoMigration, if provided, allows VM migration to be disabled
//
// This flag is intended to be temporary, just until NeonVM supports migrations and we can
// re-enable it.
DoMigration *bool `json:"doMigration"`
// K8sNodeGroupLabel, if provided, gives the label to use when recording k8s node groups in the
// metrics (like for autoscaling_plugin_node_{cpu,mem}_resources_current)
K8sNodeGroupLabel string `json:"k8sNodeGroupLabel"`
// K8sAvailabilityZoneLabel, if provided, gives the label to use when recording nodes'
// availability zones in the metrics (like for autoscaling_plugin_node_{cpu,mem}_resources_current)
K8sAvailabilityZoneLabel string `json:"k8sAvailabilityZoneLabel"`
// IgnoreNamespaces, if provided, gives a list of namespaces that the plugin should completely
// ignore, as if pods from those namespaces do not exist.
//
// This is specifically designed for our "overprovisioning" namespace, which creates paused pods
// to trigger cluster-autoscaler.
//
// The only exception to this rule is during Filter method calls, where we do still count the
// resources from such pods. The reason to do that is so that these overprovisioning pods can be
// evicted, which will allow cluster-autoscaler to trigger scale-up.
IgnoreNamespaces []string `json:"ignoreNamespaces"`
// DumpState, if provided, enables a server to dump internal state
DumpState *dumpStateConfig `json:"dumpState"`
// JSONString is the JSON string that was used to generate this config struct
JSONString string `json:"-"`
}
type nodeConfig struct {
Cpu resourceConfig `json:"cpu"`
Memory resourceConfig `json:"memory"`
// Details about node scoring:
// See also: https://www.desmos.com/calculator/wg8s0yn63s
// In the desmos, the value f(x,s) gives the score (from 0 to 1) of a node that's x amount full
// (where x is a fraction from 0 to 1), with a total size that is equal to the maximum size node
// times s (i.e. s (or: "scale") gives the ratio between this node's size and the biggest one).
// MinUsageScore gives the ratio of the score at the minimum usage (i.e. 0) relative to the
// score at the midpoint, which will have the maximum.
//
// This corresponds to y₀ in the desmos link above.
MinUsageScore float64 `json:"minUsageScore"`
// MaxUsageScore gives the ratio of the score at the maximum usage (i.e. full) relative to the
// score at the midpoint, which will have the maximum.
//
// This corresponds to y₁ in the desmos link above.
MaxUsageScore float64 `json:"maxUsageScore"`
// ScorePeak gives the fraction at which the "target" or highest score should be, with the score
// sloping down on either side towards MinUsageScore at 0 and MaxUsageScore at 1.
//
// This corresponds to xₚ in the desmos link.
ScorePeak float64 `json:"scorePeak"`
}
// resourceConfig configures the amount of a particular resource we're willing to allocate to VMs,
// both the soft limit (Watermark) and the hard limit (via System)
type resourceConfig struct {
// Watermark is the fraction of non-system resource allocation above which we should be
// migrating VMs away to reduce usage
//
// If empty, the watermark is set as equal to the "hard" limit from system resources.
//
// The word "watermark" was originally used by @zoete as a temporary stand-in term during a
// meeting, and so it has intentionally been made permanent to spite the concept of "temporary" 😛
Watermark float32 `json:"watermark,omitempty"`
}
func (c *Config) migrationEnabled() bool {
return c.DoMigration == nil || *c.DoMigration
}
///////////////////////
// CONFIG VALIDATION //
///////////////////////
// if the returned error is not nil, the string is a JSON path to the invalid value
func (c *Config) validate() (string, error) {
if path, err := c.NodeConfig.validate(); err != nil {
return fmt.Sprintf("nodeConfig.%s", path), err
}
if c.SchedulerName == "" {
return "schedulerName", errors.New("string cannot be empty")
}
if c.EventQueueWorkers <= 0 {
return "eventQueueWorkers", errors.New("value must be > 0")
}
if c.StartupEventHandlingTimeoutSeconds <= 0 {
return "startupEventHandlingTimeoutSeconds", errors.New("value must be > 0")
}
if c.DumpState != nil {
if path, err := c.DumpState.validate(); err != nil {
return fmt.Sprintf("dumpState.%s", path), err
}
}
if c.MigrationDeletionRetrySeconds == 0 {
return "migrationDeletionRetrySeconds", errors.New("value must be > 0")
}
return "", nil
}
func (c *nodeConfig) validate() (string, error) {
if path, err := c.Cpu.validate(); err != nil {
return fmt.Sprintf("cpu.%s", path), err
}
if path, err := c.Memory.validate(); err != nil {
return fmt.Sprintf("memory.%s", path), err
}
if c.MinUsageScore < 0 || c.MinUsageScore > 1 {
return "minUsageScore", errors.New("value must be between 0 and 1, inclusive")
} else if c.MaxUsageScore < 0 || c.MaxUsageScore > 1 {
return "maxUsageScore", errors.New("value must be between 0 and 1, inclusive")
} else if c.ScorePeak < 0 || c.ScorePeak > 1 {
return "scorePeak", errors.New("value must be between 0 and 1, inclusive")
}
return "", nil
}
func (c *resourceConfig) validate() (string, error) {
if c.Watermark <= 0.0 {
return "watermark", errors.New("value must be > 0")
} else if c.Watermark > 1.0 {
return "watermark", errors.New("value must be <= 1")
}
return "", nil
}
////////////////////
// CONFIG READING //
////////////////////
const DefaultConfigPath = "/etc/scheduler-plugin-config/autoscale-enforcer-config.json"
func ReadConfig(path string) (*Config, error) {
file, err := os.Open(path)
if err != nil {
return nil, fmt.Errorf("Error opening config file %q: %w", path, err)
}
defer file.Close()
var config Config
jsonDecoder := json.NewDecoder(file)
jsonDecoder.DisallowUnknownFields()
if err = jsonDecoder.Decode(&config); err != nil {
return nil, fmt.Errorf("Error decoding JSON config in %q: %w", path, err)
}
if path, err = config.validate(); err != nil {
return nil, fmt.Errorf("Invalid config at %s: %w", path, err)
}
return &config, nil
}
//////////////////////////////////////
// HELPER METHODS FOR USING CONFIGS //
//////////////////////////////////////
// ignoredNamespace returns whether items in the namespace should be treated as if they don't exist
func (c *Config) ignoredNamespace(namespace string) bool {
return slices.Contains(c.IgnoreNamespaces, namespace)
}
func (c *nodeConfig) vCpuLimits(total *resource.Quantity) nodeResourceState[vmapi.MilliCPU] {
totalMilli := total.MilliValue()
return nodeResourceState[vmapi.MilliCPU]{
Total: vmapi.MilliCPU(totalMilli),
Watermark: vmapi.MilliCPU(c.Cpu.Watermark * float32(totalMilli)),
Reserved: 0,
Buffer: 0,
CapacityPressure: 0,
PressureAccountedFor: 0,
}
}
func (c *nodeConfig) memoryLimits(total *resource.Quantity) nodeResourceState[api.Bytes] {
totalBytes := total.Value()
return nodeResourceState[api.Bytes]{
Total: api.Bytes(totalBytes),
Watermark: api.Bytes(c.Memory.Watermark * float32(totalBytes)),
Reserved: 0,
Buffer: 0,
CapacityPressure: 0,
PressureAccountedFor: 0,
}
}
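// Illustrative only (hypothetical values): with a node exposing 16 CPUs (total = 16000m) and
// cpu.watermark = 0.8, vCpuLimits yields Total = 16000m and Watermark = 12800m; memoryLimits
// applies the same watermark fraction to the node's total bytes.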
package plugin
import (
"context"
"errors"
"fmt"
"net"
"net/http"
"time"
"github.com/samber/lo"
"go.uber.org/zap"
"golang.org/x/exp/slices"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
)
type dumpStateConfig struct {
Port uint16 `json:"port"`
TimeoutSeconds uint `json:"timeoutSeconds"`
}
func (c *dumpStateConfig) validate() (string, error) {
if c.Port == 0 {
return "port", errors.New("value must be > 0")
} else if c.TimeoutSeconds == 0 {
return "timeoutSeconds", errors.New("value must be > 0")
}
return "", nil
}
type stateDump struct {
Stopped bool `json:"stopped"`
BuildInfo util.BuildInfo `json:"buildInfo"`
State pluginStateDump `json:"state"`
}
func (p *AutoscaleEnforcer) startDumpStateServer(shutdownCtx context.Context, logger *zap.Logger) error {
// Manually start the TCP listener so we can minimize errors in the background thread.
addr := net.TCPAddr{IP: net.IPv4zero, Port: int(p.state.conf.DumpState.Port)}
listener, err := net.ListenTCP("tcp", &addr)
if err != nil {
return fmt.Errorf("Error binding to %v", addr)
}
go func() {
mux := http.NewServeMux()
util.AddHandler(logger, mux, "/", http.MethodGet, "<empty>", func(ctx context.Context, _ *zap.Logger, body *struct{}) (*stateDump, int, error) {
timeout := time.Duration(p.state.conf.DumpState.TimeoutSeconds) * time.Second
startTime := time.Now()
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
state, err := p.dumpState(ctx, shutdownCtx.Err() != nil)
if err != nil {
if ctx.Err() != nil && errors.Is(ctx.Err(), context.DeadlineExceeded) {
totalDuration := time.Since(startTime)
return nil, 500, fmt.Errorf("timed out after %s while getting state", totalDuration)
} else {
// some other type of cancel; 400 is a little weird, but there isn't a great
// option here.
return nil, 400, fmt.Errorf("error while getting state: %w", err)
}
}
return state, 200, nil
})
// note: we don't shut down this server. It should be possible to continue fetching the
// internal state after shutdown has started.
server := &http.Server{Handler: mux}
if err := server.Serve(listener); err != nil {
logger.Error("dump-state server exited", zap.Error(err))
}
}()
return nil
}
func (p *AutoscaleEnforcer) dumpState(ctx context.Context, stopped bool) (*stateDump, error) {
state, err := p.state.dump(ctx)
if err != nil {
return nil, err
}
return &stateDump{
Stopped: stopped,
BuildInfo: util.GetBuildInfo(),
State: *state,
}, nil
}
type keyed[K any, V any] struct {
Key K `json:"key"`
Value V `json:"value"`
}
type pluginStateDump struct {
OngoingMigrationDeletions []keyed[util.NamespacedName, int] `json:"ongoingMigrationDeletions"`
Nodes []keyed[string, nodeStateDump] `json:"nodes"`
Pods []podNameAndPointer `json:"pods"`
MaxTotalReservableCPU vmapi.MilliCPU `json:"maxTotalReservableCPU"`
MaxTotalReservableMem api.Bytes `json:"maxTotalReservableMem"`
Conf Config `json:"config"`
}
type podNameAndPointer struct {
Obj pointerString `json:"obj"`
PodName util.NamespacedName `json:"podName"`
}
type pointerString string
type nodeStateDump struct {
Obj pointerString `json:"obj"`
Name string `json:"name"`
NodeGroup string `json:"nodeGroup"`
AvailabilityZone string `json:"availabilityZone"`
CPU nodeResourceState[vmapi.MilliCPU] `json:"cpu"`
Mem nodeResourceState[api.Bytes] `json:"mem"`
Pods []keyed[util.NamespacedName, podStateDump] `json:"pods"`
Mq []*podNameAndPointer `json:"mq"`
}
type podStateDump struct {
Obj pointerString `json:"obj"`
Name util.NamespacedName `json:"name"`
Node pointerString `json:"node"`
CPU podResourceState[vmapi.MilliCPU] `json:"cpu"`
Mem podResourceState[api.Bytes] `json:"mem"`
VM *vmPodState `json:"vm"`
}
func makePointerString[T any](t *T) pointerString {
return pointerString(fmt.Sprintf("%p", t))
}
func sortSliceByPodName[T any](slice []T, name func(T) util.NamespacedName) {
slices.SortFunc(slice, func(a, b T) (less bool) {
aName := name(a)
bName := name(b)
// sort by namespace, then by name
if aName.Namespace != bName.Namespace {
return aName.Namespace < bName.Namespace
}
return aName.Name < bName.Name
})
}
func (s *pluginState) dump(ctx context.Context) (*pluginStateDump, error) {
if err := s.lock.TryLock(ctx); err != nil {
return nil, err
}
defer s.lock.Unlock()
pods := make([]podNameAndPointer, 0, len(s.pods))
for _, p := range s.pods {
pods = append(pods, podNameAndPointer{Obj: makePointerString(p), PodName: p.name})
}
sortSliceByPodName(pods, func(p podNameAndPointer) util.NamespacedName { return p.PodName })
nodes := make([]keyed[string, nodeStateDump], 0, len(s.nodes))
for k, n := range s.nodes {
nodes = append(nodes, keyed[string, nodeStateDump]{Key: k, Value: n.dump()})
}
slices.SortFunc(nodes, func(kvx, kvy keyed[string, nodeStateDump]) (less bool) {
return kvx.Key < kvy.Key
})
ongoingMigrationDeletions := make([]keyed[util.NamespacedName, int], 0, len(s.ongoingMigrationDeletions))
for k, count := range s.ongoingMigrationDeletions {
ongoingMigrationDeletions = append(ongoingMigrationDeletions, keyed[util.NamespacedName, int]{Key: k, Value: count})
}
sortSliceByPodName(ongoingMigrationDeletions, func(kv keyed[util.NamespacedName, int]) util.NamespacedName { return kv.Key })
return &pluginStateDump{
OngoingMigrationDeletions: ongoingMigrationDeletions,
Nodes: nodes,
Pods: pods,
MaxTotalReservableCPU: s.maxTotalReservableCPU,
MaxTotalReservableMem: s.maxTotalReservableMem,
Conf: *s.conf,
}, nil
}
func (s *nodeState) dump() nodeStateDump {
pods := make([]keyed[util.NamespacedName, podStateDump], 0, len(s.pods))
for k, p := range s.pods {
pods = append(pods, keyed[util.NamespacedName, podStateDump]{Key: k, Value: p.dump()})
}
sortSliceByPodName(pods, func(kv keyed[util.NamespacedName, podStateDump]) util.NamespacedName { return kv.Key })
mq := make([]*podNameAndPointer, 0, len(s.mq))
for _, p := range s.mq {
if p == nil {
mq = append(mq, nil)
} else {
v := podNameAndPointer{Obj: makePointerString(p), PodName: p.Name}
mq = append(mq, &v)
}
}
return nodeStateDump{
Obj: makePointerString(s),
Name: s.name,
NodeGroup: s.nodeGroup,
AvailabilityZone: s.availabilityZone,
CPU: s.cpu,
Mem: s.mem,
Pods: pods,
Mq: mq,
}
}
func (s *podState) dump() podStateDump {
var vm *vmPodState
if s.vm != nil {
vm = lo.ToPtr(s.vm.dump())
}
return podStateDump{
Obj: makePointerString(s),
Name: s.name,
Node: makePointerString(s.node),
CPU: s.cpu,
Mem: s.mem,
VM: vm,
}
}
func (s *vmPodState) dump() vmPodState {
// Copy some of the "may be nil" pointer fields
var metrics *api.Metrics
if s.Metrics != nil {
metrics = lo.ToPtr(*s.Metrics)
}
var migrationState *podMigrationState
if s.MigrationState != nil {
migrationState = &podMigrationState{
Name: s.MigrationState.Name,
}
}
return vmPodState{
Name: s.Name,
MemSlotSize: s.MemSlotSize,
Config: s.Config,
Metrics: metrics,
MqIndex: s.MqIndex,
MigrationState: migrationState,
}
}
package plugin
import (
"context"
"hash/fnv"
"time"
"github.com/tychoish/fun/pubsub"
)
type queueItem[T any] struct {
item T
addTime time.Time
}
type eventQueueSet[T any] struct {
queues []*pubsub.Queue[queueItem[T]]
metrics PromMetrics
}
func newEventQueueSet[T any](size int, metrics PromMetrics) eventQueueSet[T] {
queues := make([]*pubsub.Queue[queueItem[T]], size)
for i := 0; i < size; i += 1 {
queues[i] = pubsub.NewUnlimitedQueue[queueItem[T]]()
}
return eventQueueSet[T]{
queues: queues,
metrics: metrics,
}
}
func (s eventQueueSet[T]) enqueue(key string, item T) error {
hasher := fnv.New64()
// nb: Hash guarantees that Write never returns an error
_, _ = hasher.Write([]byte(key))
hash := hasher.Sum64()
idx := int(hash % uint64(len(s.queues)))
s.metrics.eventQueueDepth.Inc()
s.metrics.eventQueueAddsTotal.Inc()
queueItem := queueItem[T]{
item: item,
addTime: time.Now(),
}
return s.queues[idx].Add(queueItem)
}
func (s eventQueueSet[T]) wait(ctx context.Context, idx int) (T, error) {
queueItem, err := s.queues[idx].Wait(ctx)
if err == nil {
s.metrics.eventQueueDepth.Dec()
s.metrics.eventQueueLatency.Observe(time.Since(queueItem.addTime).Seconds())
}
return queueItem.item, err
}
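// The following is an illustrative, standalone sketch (not part of the original source). It
// shows the property that eventQueueSet.enqueue relies on: hashing the key with FNV-1
// (hash/fnv's New64) always maps the same key to the same queue index, so events for a single
// Pod/VM are processed in order even with multiple workers. Names here are hypothetical.
package main

import (
	"fmt"
	"hash/fnv"
)

// queueIndex mirrors the index computation in eventQueueSet.enqueue: FNV-1 hash of the key,
// modulo the number of queues.
func queueIndex(key string, numQueues int) int {
	h := fnv.New64()
	_, _ = h.Write([]byte(key)) // hash.Hash guarantees Write never fails
	return int(h.Sum64() % uint64(numQueues))
}

func main() {
	// The same key always lands in the same queue:
	fmt.Println(queueIndex("default/pod-a", 4) == queueIndex("default/pod-a", 4)) // true
	// Different keys are spread across queues (which one is an implementation detail):
	fmt.Println(queueIndex("default/pod-b", 4))
}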
package plugin
import (
"context"
"fmt"
"math/rand"
"sync/atomic"
"time"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
scheme "k8s.io/client-go/kubernetes/scheme"
rest "k8s.io/client-go/rest"
"k8s.io/kubernetes/pkg/scheduler/framework"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
vmclient "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/watch"
)
const Name = "AutoscaleEnforcer"
const LabelPluginCreatedMigration = "autoscaling.neon.tech/created-by-scheduler"
// AutoscaleEnforcer is the scheduler plugin to coordinate autoscaling
type AutoscaleEnforcer struct {
logger *zap.Logger
handle framework.Handle
vmClient *vmclient.Clientset
state pluginState
metrics PromMetrics
// nodeStore provides access to the current-ish state of Nodes in the cluster. If something's
// missing, it can be updated with Relist().
nodeStore IndexedNodeStore
}
// abbreviations, because these types are pretty verbose
type IndexedVMStore = watch.IndexedStore[vmapi.VirtualMachine, *watch.NameIndex[vmapi.VirtualMachine]]
type IndexedNodeStore = watch.IndexedStore[corev1.Node, *watch.FlatNameIndex[corev1.Node]]
// Compile-time checks that AutoscaleEnforcer actually implements the interfaces we want it to
var _ framework.Plugin = (*AutoscaleEnforcer)(nil)
var _ framework.PreFilterPlugin = (*AutoscaleEnforcer)(nil)
var _ framework.PostFilterPlugin = (*AutoscaleEnforcer)(nil)
var _ framework.FilterPlugin = (*AutoscaleEnforcer)(nil)
var _ framework.ScorePlugin = (*AutoscaleEnforcer)(nil)
var _ framework.ReservePlugin = (*AutoscaleEnforcer)(nil)
func NewAutoscaleEnforcerPlugin(ctx context.Context, logger *zap.Logger, config *Config) func(runtime.Object, framework.Handle) (framework.Plugin, error) {
return func(obj runtime.Object, h framework.Handle) (framework.Plugin, error) {
return makeAutoscaleEnforcerPlugin(ctx, logger, obj, h, config)
}
}
// makeAutoscaleEnforcerPlugin produces the initial AutoscaleEnforcer plugin to be used by the
// scheduler
func makeAutoscaleEnforcerPlugin(
ctx context.Context,
logger *zap.Logger,
_obj runtime.Object,
h framework.Handle,
config *Config,
) (framework.Plugin, error) {
// obj can be used for taking in configuration. it's a bit tricky to figure out, and we don't
// quite need it yet.
logger.Info("Initializing plugin")
// create the NeonVM client
if err := vmapi.AddToScheme(scheme.Scheme); err != nil {
return nil, err
}
vmConfig := rest.CopyConfig(h.KubeConfig())
// The handler's ContentType is not the default "application/json" (it's protobuf), so we need
// to set it back to JSON because NeonVM doesn't support protobuf.
vmConfig.ContentType = "application/json"
vmClient, err := vmclient.NewForConfig(vmConfig)
if err != nil {
return nil, fmt.Errorf("Error creating NeonVM client: %w", err)
}
p := AutoscaleEnforcer{
logger: logger.Named("plugin"),
handle: h,
vmClient: vmClient,
state: pluginState{
lock: util.NewChanMutex(),
ongoingMigrationDeletions: make(map[util.NamespacedName]int),
pods: make(map[util.NamespacedName]*podState),
nodes: make(map[string]*nodeState),
maxTotalReservableCPU: 0, // set during event handling
maxTotalReservableMem: 0, // set during event handling
conf: config,
},
metrics: PromMetrics{}, //nolint:exhaustruct // set by makePrometheusRegistry
nodeStore: IndexedNodeStore{}, //nolint:exhaustruct // set below
}
if p.state.conf.DumpState != nil {
logger.Info("Starting 'dump state' server")
if err := p.startDumpStateServer(ctx, logger.Named("dump-state")); err != nil {
return nil, fmt.Errorf("Error starting 'dump state' server: %w", err)
}
}
// makePrometheusRegistry sets p.metrics, which we need to do before calling
// newEventQueueSet or handling events, because we set metrics in eventQueueSet and for each
// node as watch events get handled.
promReg := p.makePrometheusRegistry()
// Start watching Pod/VM events, adding them to a shared queue to process them in order
queueSet := newEventQueueSet[func()](config.EventQueueWorkers, p.metrics)
pushToQueue := func(logger *zap.Logger, key string, f func()) {
if err := queueSet.enqueue(key, f); err != nil {
logger.Warn("Error adding to pod/VM event queue", zap.Error(err))
}
}
// A note about event handling:
//
// Before returning from this function, we want to make sure that we're caught up to the watch
// events generated by initially reading the cluster state (i.e. the initial List()).
//
// Doing this is non-trivial, so we accomplish it in pieces:
//
// 1. Using watch.WatchModeSync to force queueing events *before* returning from creating the
// watcher (note: and therefore, before any start to be handled); and
// 2. For each event created from the initial List(), increment a counter to track the number of
// these events, and decrement it as events are handled.
//
// The initial state building is complete when the counter reaches zero, at which point we close
// the channel that this function will wait on.
var initEventsCount atomic.Int32
var initEvents *eventCounter
incEventCount := func() { initEventsCount.Add(1) }
hlogger := logger.Named("handlers")
nwc := nodeWatchCallbacks{
submitNodeDeletion: func(logger *zap.Logger, nodeName string) {
pushToQueue(logger, nodeName, func() { p.handleNodeDeletion(hlogger, nodeName) })
},
}
pwc := podWatchCallbacks{
submitStarted: func(logger *zap.Logger, pod *corev1.Pod, preexisting bool) {
if preexisting {
incEventCount()
}
pushToQueue(logger, pod.Name, func() {
p.handleStarted(hlogger, pod, preexisting)
if preexisting {
initEvents.dec()
}
})
},
submitDeletion: func(logger *zap.Logger, name util.NamespacedName) {
// NOTE: It's important that the name we use here is the same as the one we use for
// submitStarted - otherwise we can end up with out of order handling for start/stop
// events.
pushToQueue(logger, name.Name, func() { p.handleDeletion(hlogger, name) })
},
submitStartMigration: func(logger *zap.Logger, podName, migrationName util.NamespacedName, source bool) {
pushToQueue(logger, migrationName.Name, func() { p.handlePodStartMigration(logger, podName, migrationName, source) })
},
submitEndMigration: func(logger *zap.Logger, podName, migrationName util.NamespacedName) {
pushToQueue(logger, migrationName.Name, func() { p.handlePodEndMigration(logger, podName, migrationName) })
},
}
vwc := vmWatchCallbacks{
submitConfigUpdated: func(logger *zap.Logger, pod util.NamespacedName, newCfg api.VmConfig) {
pushToQueue(logger, pod.Name, func() { p.handleVMConfigUpdated(hlogger, pod, newCfg) })
},
submitBoundsChanged: func(logger *zap.Logger, vm *api.VmInfo, podName string) {
pushToQueue(logger, vm.Name, func() { p.handleUpdatedScalingBounds(hlogger, vm, podName) })
},
submitNonAutoscalingVmUsageChanged: func(logger *zap.Logger, vm *api.VmInfo, podName string) {
pushToQueue(logger, vm.Name, func() { p.handleNonAutoscalingUsageChange(hlogger, vm, podName) })
},
}
mwc := migrationWatchCallbacks{
submitMigrationFinished: func(vmm *vmapi.VirtualMachineMigration) {
// When cleaning up migrations, we don't want to process those events synchronously.
// So instead, we'll spawn a goroutine to delete the completed migration.
go p.cleanupMigration(hlogger, vmm)
},
}
watchMetrics := watch.NewMetrics("autoscaling_plugin_watchers")
logger.Info("Starting node watcher")
nodeStore, err := p.watchNodeEvents(ctx, logger, watchMetrics, nwc)
if err != nil {
return nil, fmt.Errorf("Error starting node watcher: %w", err)
}
p.nodeStore = watch.NewIndexedStore(nodeStore, watch.NewFlatNameIndex[corev1.Node]())
logger.Info("Starting pod watcher")
podStore, err := p.watchPodEvents(ctx, logger, watchMetrics, pwc)
if err != nil {
return nil, fmt.Errorf("Error starting pod watcher: %w", err)
}
podIndex := watch.NewIndexedStore(podStore, watch.NewNameIndex[corev1.Pod]())
logger.Info("Starting VM watcher")
_, err = p.watchVMEvents(ctx, logger, watchMetrics, vwc, podIndex)
if err != nil {
return nil, fmt.Errorf("Error starting VM watcher: %w", err)
}
logger.Info("Starting VM Migration watcher")
if _, err := p.watchMigrationEvents(ctx, logger, watchMetrics, mwc); err != nil {
return nil, fmt.Errorf("Error starting VM Migration watcher: %w", err)
}
watchMetrics.MustRegister(promReg)
// Set up tracking the initial events, now that we know the count:
totalQueued := initEventsCount.Load()
initEvents = newEventCounter(totalQueued)
// Start handling the queued events. Handling the initial events will gradually decrement
// initEvents, and once it reaches zero, its done() channel is closed to mark initial event
// handling as complete.
for i := 0; i < config.EventQueueWorkers; i += 1 {
// copy the loop variable to avoid it escaping pre Go 1.22
go func(ctx context.Context, idx int) {
for {
callback, err := queueSet.wait(ctx, idx) // NB: wait pulls from the front of the queue
if err != nil {
logger.Info("Stopped waiting on pod/VM queue", zap.Error(err))
break
}
callback()
}
}(ctx, i)
}
if err := util.StartPrometheusMetricsServer(ctx, logger.Named("prometheus"), 9100, promReg); err != nil {
return nil, fmt.Errorf("Error starting prometheus server: %w", err)
}
// Wait for all the initial events to be handled.
logger.Info("Waiting on initial events processing to be done", zap.Int32("count", totalQueued))
initEventsTimeout := time.Second * time.Duration(p.state.conf.StartupEventHandlingTimeoutSeconds)
select {
case <-initEvents.done():
// Done
case <-time.After(initEventsTimeout):
return nil, fmt.Errorf(
"Timed out waiting on initial events processing to complete after %s (%d remaining)",
initEventsTimeout,
initEvents.getRemaining(),
)
}
logger.Info("Initial events processing complete")
if err := p.startPermitHandler(ctx, logger.Named("agent-handler")); err != nil {
return nil, fmt.Errorf("permit handler: %w", err)
}
// Periodically check that we're not deadlocked
go func() {
defer func() {
if err := recover(); err != nil {
logger.Panic("deadlock checker for AutoscaleEnforcer.state.lock panicked", zap.String("error", fmt.Sprint(err)))
}
}()
p.state.lock.DeadlockChecker(time.Second, 5*time.Second)(ctx)
}()
logger.Info("Plugin initialization complete")
return &p, nil
}
// monotonically decreasing event counter that closes a channel once all events have been completed
// with dec().
//
// Used to make sure we've processed all the initial events before returning from
// makeAutoscaleEnforcerPlugin().
type eventCounter struct {
remaining atomic.Int32
signalDone chan struct{}
}
func newEventCounter(remaining int32) *eventCounter {
c := &eventCounter{
remaining: atomic.Int32{},
signalDone: make(chan struct{}),
}
c.remaining.Store(remaining)
return c
}
func (c *eventCounter) dec() {
r := c.remaining.Add(-1)
if r == 0 {
close(c.signalDone)
}
}
func (c *eventCounter) getRemaining() int32 {
return c.remaining.Load()
}
func (c *eventCounter) done() <-chan struct{} {
return c.signalDone
}
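// exampleEventCounterUsage is an illustrative sketch (not called anywhere, and not part of the
// original source) of how eventCounter is meant to be used: initialize it with the number of
// outstanding events, call dec() as each one is handled, and wait on done() with a timeout.
// The timeout value here is hypothetical.
func exampleEventCounterUsage() error {
	const totalEvents = 3
	counter := newEventCounter(totalEvents)

	for i := 0; i < totalEvents; i++ {
		go func() {
			// ... handle one initial event ...
			counter.dec()
		}()
	}

	select {
	case <-counter.done():
		return nil // all initial events handled
	case <-time.After(5 * time.Second):
		return fmt.Errorf("timed out with %d events remaining", counter.getRemaining())
	}
}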
// Name returns the name of the AutoscaleEnforcer plugin
//
// Required for framework.Plugin
func (e *AutoscaleEnforcer) Name() string {
return Name
}
// getVmInfo is a helper for the plugin-related functions
//
// This function returns nil, nil if the pod is not associated with a NeonVM virtual machine.
func (e *AutoscaleEnforcer) getVmInfo(logger *zap.Logger, pod *corev1.Pod, action string) (*api.VmInfo, error) {
vmName := util.TryPodOwnerVirtualMachine(pod)
if vmName == nil {
return nil, nil
}
vmInfo, err := api.ExtractVmInfoFromPod(logger, pod)
if err != nil {
e.handle.EventRecorder().Eventf(
pod, // regarding
nil, // related
"Warning", // eventtype
"ExtractVmInfo", // reason
action, // action
"Failed to extract autoscaling info about VM: %s", // node
err,
)
return nil, fmt.Errorf("Error extracting VM info: %w", err)
}
return vmInfo, nil
}
// checkSchedulerName asserts that the SchedulerName field of a Pod matches what we're expecting,
// otherwise returns a non-nil framework.Status to return (and also logs the error)
func (e *AutoscaleEnforcer) checkSchedulerName(logger *zap.Logger, pod *corev1.Pod) *framework.Status {
if e.state.conf.SchedulerName != pod.Spec.SchedulerName {
err := fmt.Errorf(
"Mismatched SchedulerName for pod: our config has %q, but the pod has %q",
e.state.conf.SchedulerName, pod.Spec.SchedulerName,
)
logger.Error("Pod failed scheduler name check", zap.Error(err))
return framework.NewStatus(framework.Error, err.Error())
}
return nil
}
// PreFilter is called at the start of any Pod's filter cycle. We use it in combination with
// PostFilter (which is only called on failure) to provide metrics for pods that are rejected by
// this process.
func (e *AutoscaleEnforcer) PreFilter(
ctx context.Context,
state *framework.CycleState,
pod *corev1.Pod,
) (_ *framework.PreFilterResult, status *framework.Status) {
ignored := e.state.conf.ignoredNamespace(pod.Namespace)
e.metrics.IncMethodCall("PreFilter", pod, ignored)
defer func() {
e.metrics.IncFailIfNotSuccess("PreFilter", pod, ignored, status)
}()
return nil, nil
}
// PreFilterExtensions is required for framework.PreFilterPlugin, and can return nil if it's not used
func (e *AutoscaleEnforcer) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}
// PostFilter is used by us for metrics on filter cycles that reject a Pod by filtering out all
// applicable nodes.
//
// Quoting the docs for PostFilter:
//
// > These plugins are called after Filter phase, but only when no feasible nodes were found for the
// > pod.
//
// Required for framework.PostFilterPlugin
func (e *AutoscaleEnforcer) PostFilter(
ctx context.Context,
state *framework.CycleState,
pod *corev1.Pod,
filteredNodeStatusMap framework.NodeToStatusMap,
) (_ *framework.PostFilterResult, status *framework.Status) {
ignored := e.state.conf.ignoredNamespace(pod.Namespace)
e.metrics.IncMethodCall("PostFilter", pod, ignored)
defer func() {
e.metrics.IncFailIfNotSuccess("PostFilter", pod, ignored, status)
}()
logger := e.logger.With(zap.String("method", "Filter"), util.PodNameFields(pod))
logger.Error("Pod rejected by all Filter method calls")
return nil, nil // PostFilterResult is optional, nil Status is success.
}
// Filter gives our plugin a chance to signal that a pod shouldn't be put onto a particular node
//
// Required for framework.FilterPlugin
func (e *AutoscaleEnforcer) Filter(
ctx context.Context,
state *framework.CycleState,
pod *corev1.Pod,
nodeInfo *framework.NodeInfo,
) (status *framework.Status) {
ignored := e.state.conf.ignoredNamespace(pod.Namespace)
e.metrics.IncMethodCall("Filter", pod, ignored)
defer func() {
e.metrics.IncFailIfNotSuccess("Filter", pod, ignored, status)
}()
nodeName := nodeInfo.Node().Name // TODO: nodes also have namespaces? are they used at all?
logger := e.logger.With(zap.String("method", "Filter"), zap.String("node", nodeName), util.PodNameFields(pod))
logger.Info("Handling Filter request")
if ignored {
logger.Warn("Received Filter request for pod in ignored namespace, continuing anyways.")
}
vmInfo, err := e.getVmInfo(logger, pod, "Filter")
if err != nil {
logger.Error("Error getting VM info for Pod", zap.Error(err))
return framework.NewStatus(
framework.UnschedulableAndUnresolvable,
fmt.Sprintf("Error getting pod vmInfo: %s", err),
)
}
var podResources api.Resources
if vmInfo != nil {
podResources = vmInfo.Using()
} else {
podResources = extractPodResources(pod)
}
// Check that the SchedulerName matches what we're expecting
if status := e.checkSchedulerName(logger, pod); status != nil {
return status
}
e.state.lock.Lock()
defer e.state.lock.Unlock()
node, err := e.state.getOrFetchNodeState(ctx, logger, e.metrics, e.nodeStore, nodeName)
if err != nil {
logger.Error("Error getting node state", zap.Error(err))
return framework.NewStatus(
framework.Error,
fmt.Sprintf("Error getting node state: %s", err),
)
}
// The pod will get resources according to vmInfo.{Cpu,Mem}.Use reserved for it when it does get
// scheduled. Now we can check whether this node has capacity for the pod.
//
// Technically speaking, the VM pods in nodeInfo might not match what we have recorded for the
// node -- simply because during preemption, the scheduler tries to see whether it could
// schedule the pod if other stuff was preempted, and gives us what the state WOULD be after
// preemption.
//
// So we have to actually count up the resource usage of all pods in nodeInfo:
var nodeTotal api.Resources
// As we process all pods, we should record all the pods that aren't present in both nodeInfo
// and e.state's maps, so that we can log any inconsistencies instead of silently using
// *potentially* bad data. Some differences are expected, but on the whole this extra
// information should be helpful.
missedPods := make(map[util.NamespacedName]struct{})
for name := range node.pods {
missedPods[name] = struct{}{}
}
var includedIgnoredPods []util.NamespacedName
for _, podInfo := range nodeInfo.Pods {
pn := util.NamespacedName{Name: podInfo.Pod.Name, Namespace: podInfo.Pod.Namespace}
if podState, ok := e.state.pods[pn]; ok {
nodeTotal.VCPU += podState.cpu.Reserved
nodeTotal.Mem += podState.mem.Reserved
delete(missedPods, pn)
} else {
name := util.GetNamespacedName(podInfo.Pod)
if util.PodCompleted(podInfo.Pod) {
logger.Warn(
"Skipping completed Pod in Filter node's pods",
zap.Object("pod", name),
zap.String("phase", string(podInfo.Pod.Status.Phase)),
)
continue
}
if !e.state.conf.ignoredNamespace(podInfo.Pod.Namespace) {
// FIXME: this gets us duplicated "pod" fields. Not great. But we're using
// logger.With pretty pervasively, and it's hard to avoid this while using that.
// For now, we can get around this by including the pod name in an error.
logger.Error(
"Unknown-but-not-ignored Pod in Filter node's pods",
zap.Object("pod", name),
zap.Error(fmt.Errorf("Pod %v is unknown but not ignored", name)),
)
} else {
includedIgnoredPods = append(includedIgnoredPods, name)
}
// We *also* need to count pods in ignored namespaces
resources := extractPodResources(podInfo.Pod)
nodeTotal.VCPU += resources.VCPU
nodeTotal.Mem += resources.Mem
}
}
if len(missedPods) != 0 {
var missedPodsList []util.NamespacedName
for name := range missedPods {
missedPodsList = append(missedPodsList, name)
}
logger.Warn("Some known Pods weren't included in Filter NodeInfo", zap.Objects("missedPods", missedPodsList))
}
var kind string
if vmInfo != nil {
kind = "VM"
} else {
kind = "non-VM"
}
makeMsg := func(resource, compareOp string, nodeUse, podUse, nodeMax any) string {
return fmt.Sprintf(
"node %s usage %v + %s pod %s %v %s node max %v",
resource, nodeUse, kind, resource, podUse, compareOp, nodeMax,
)
}
allowing := true
var cpuCompare string
if nodeTotal.VCPU+podResources.VCPU > node.cpu.Total {
cpuCompare = ">"
allowing = false
} else {
cpuCompare = "<="
}
cpuMsg := makeMsg("vCPU", cpuCompare, nodeTotal.VCPU, podResources.VCPU, node.cpu.Total)
var memCompare string
if nodeTotal.Mem+podResources.Mem > node.mem.Total {
memCompare = ">"
allowing = false
} else {
memCompare = "<="
}
memMsg := makeMsg("vCPU", memCompare, nodeTotal.Mem, podResources.Mem, node.mem.Total)
var message string
var logFunc func(string, ...zap.Field)
if allowing {
message = "Allowing Pod"
logFunc = logger.Info
} else {
message = "Rejecting Pod"
logFunc = logger.Warn
}
logFunc(
message,
zap.Objects("includedIgnoredPods", includedIgnoredPods),
zap.Object("verdict", verdictSet{
cpu: cpuMsg,
mem: memMsg,
}),
)
if !allowing {
return framework.NewStatus(framework.Unschedulable, "Not enough resources for pod")
} else {
return nil
}
}
// Score allows our plugin to express which nodes should be preferred for scheduling new pods onto
//
// Even though this function is given (pod, node) pairs, our scoring is only really dependent on
// values of the node. However, we have special handling for when the pod no longer fits in the node
// (even though it might have during the Filter plugin) - we can't return a failure, because that
// would cause *all* scheduling of the pod to fail, so we instead return the minimum score.
//
// The scores might not be consistent with each other, due to ongoing changes in the node. That's
// ok, because nothing relies on strict correctness here, and they should be approximately correct
// anyways.
//
// Required for framework.ScorePlugin
func (e *AutoscaleEnforcer) Score(
ctx context.Context,
state *framework.CycleState,
pod *corev1.Pod,
nodeName string,
) (_ int64, status *framework.Status) {
ignored := e.state.conf.ignoredNamespace(pod.Namespace)
e.metrics.IncMethodCall("Score", pod, ignored)
defer func() {
e.metrics.IncFailIfNotSuccess("Score", pod, ignored, status)
}()
logger := e.logger.With(zap.String("method", "Score"), zap.String("node", nodeName), util.PodNameFields(pod))
logger.Info("Handling Score request")
scoreLen := framework.MaxNodeScore - framework.MinNodeScore
// Double-check that the SchedulerName matches what we're expecting
if status := e.checkSchedulerName(logger, pod); status != nil {
return framework.MinNodeScore, status
}
vmInfo, err := e.getVmInfo(logger, pod, "Score")
if err != nil {
logger.Error("Error getting VM info for Pod", zap.Error(err))
return 0, framework.NewStatus(framework.Error, "Error getting info for pod")
}
// note: vmInfo may be nil here if the pod does not correspond to a NeonVM virtual machine
e.state.lock.Lock()
defer e.state.lock.Unlock()
// Score by total resources available:
node, err := e.state.getOrFetchNodeState(ctx, logger, e.metrics, e.nodeStore, nodeName)
if err != nil {
logger.Error("Error getting node state", zap.Error(err))
return 0, framework.NewStatus(framework.Error, "Error fetching state for node")
}
// Special case: return minimum score if we don't have room
overbudget, verdict := e.speculativeReserve(node, vmInfo, pod, false, func(_ verdictSet, _ bool) bool {
return false // never actually accept the pod; we're just doing this to ask if it's over-budget.
})
if overbudget {
score := framework.MinNodeScore
logger.Warn(
"No room on node, giving minimum score (typically handled by Filter method)",
zap.Int64("score", score),
zap.Object("verdict", verdict),
)
return score, nil
}
cpuRemaining := node.remainingReservableCPU()
cpuTotal := node.cpu.Total
memRemaining := node.remainingReservableMem()
memTotal := node.mem.Total
cpuFraction := 1 - cpuRemaining.AsFloat64()/cpuTotal.AsFloat64()
memFraction := 1 - memRemaining.AsFloat64()/memTotal.AsFloat64()
cpuScale := node.cpu.Total.AsFloat64() / e.state.maxTotalReservableCPU.AsFloat64()
memScale := node.mem.Total.AsFloat64() / e.state.maxTotalReservableMem.AsFloat64()
nodeConf := e.state.conf.NodeConfig
// Refer to the comments in nodeConfig for more. Also, see: https://www.desmos.com/calculator/wg8s0yn63s
calculateScore := func(fraction, scale float64) (float64, int64) {
y0 := nodeConf.MinUsageScore
y1 := nodeConf.MaxUsageScore
xp := nodeConf.ScorePeak
score := float64(1) // if fraction == nodeConf.ScorePeak
if fraction < nodeConf.ScorePeak {
score = y0 + (1-y0)/xp*fraction
} else if fraction > nodeConf.ScorePeak {
score = y1 + (1-y1)/(1-xp)*(1-fraction)
}
score *= scale
return score, framework.MinNodeScore + int64(float64(scoreLen)*score)
}
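// Worked example (hypothetical config values, not from the original source): suppose
// MinUsageScore = 0.5, MaxUsageScore = 0, ScorePeak = 0.8, and scale = 1 (i.e. this node is as
// large as the largest node in the cluster). With framework.MinNodeScore = 0 and
// framework.MaxNodeScore = 100, calculateScore gives:
//
//	fraction = 0.4 (below the peak): 0.5 + (1-0.5)/0.8*0.4 = 0.75       -> 75
//	fraction = 0.8 (at the peak):    1.0                                -> 100
//	fraction = 0.9 (above the peak): 0.0 + (1-0.0)/0.2*(1-0.9) = 0.5    -> 50
//
// i.e. nodes are most attractive near the configured peak usage, and smaller nodes (scale < 1)
// have their scores reduced proportionally.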
cpuFScore, cpuIScore := calculateScore(cpuFraction, cpuScale)
memFScore, memIScore := calculateScore(memFraction, memScale)
score := util.Min(cpuIScore, memIScore)
logger.Info(
"Scored pod placement for node",
zap.Int64("score", score),
zap.Object("verdict", verdictSet{
cpu: fmt.Sprintf(
"%d remaining reservable of %d total => fraction=%g, scale=%g => score=(%g :: %d)",
cpuRemaining, cpuTotal, cpuFraction, cpuScale, cpuFScore, cpuIScore,
),
mem: fmt.Sprintf(
"%d remaining reservable of %d total => fraction=%g, scale=%g => score=(%g :: %d)",
memRemaining, memTotal, memFraction, memScale, memFScore, memIScore,
),
}),
)
return score, nil
}
// NormalizeScore replaces each node's score with one chosen uniformly at random from the range
// [minScore, trueScore], where minScore is framework.MinNodeScore + 1.
func (e *AutoscaleEnforcer) NormalizeScore(
ctx context.Context,
state *framework.CycleState,
pod *corev1.Pod,
scores framework.NodeScoreList,
) (status *framework.Status) {
ignored := e.state.conf.ignoredNamespace(pod.Namespace)
e.metrics.IncMethodCall("NormalizeScore", pod, ignored)
defer func() {
e.metrics.IncFailIfNotSuccess("NormalizeScore", pod, ignored, status)
}()
logger := e.logger.With(zap.String("method", "NormalizeScore"), util.PodNameFields(pod))
logger.Info("Handling NormalizeScore request")
for _, node := range scores {
nodeScore := node.Score
nodeName := node.Name
// rand.Intn will panic if we pass in 0
if nodeScore == 0 {
logger.Info("Ignoring node as it was assigned a score of 0", zap.String("node", nodeName))
continue
}
// This is different from framework.MinNodeScore. We use framework.MinNodeScore
// to indicate that a pod should not be placed on a node. The lowest
// actual score we assign a node is thus framework.MinNodeScore + 1
minScore := framework.MinNodeScore + 1
// We want to pick a score in the range [minScore, score], so use
// score _+ 1_ - minScore, as rand.Intn picks a number in the _half open_
// range [0, n)
newScore := int64(rand.Intn(int(nodeScore+1-minScore))) + minScore
logger.Info(
"Randomly choosing newScore from range [minScore, trueScore]",
zap.String("node", nodeName),
zap.Int64("newScore", newScore),
zap.Int64("minScore", minScore),
zap.Int64("trueScore", nodeScore),
)
node.Score = newScore
}
return nil
}
// ScoreExtensions is required for framework.ScorePlugin, and can return nil if it's not used.
// However, we do use it, to randomize scores.
func (e *AutoscaleEnforcer) ScoreExtensions() framework.ScoreExtensions {
if e.state.conf.RandomizeScores {
return e
} else {
return nil
}
}
// Reserve signals to our plugin that a particular pod will (probably) be bound to a node, giving us
// a chance to both (a) reserve the resources it needs within the node and (b) reject the pod if
// there aren't enough.
//
// Required for framework.ReservePlugin
func (e *AutoscaleEnforcer) Reserve(
ctx context.Context,
state *framework.CycleState,
pod *corev1.Pod,
nodeName string,
) (status *framework.Status) {
ignored := e.state.conf.ignoredNamespace(pod.Namespace)
e.metrics.IncMethodCall("Reserve", pod, ignored)
defer func() {
e.metrics.IncFailIfNotSuccess("Reserve", pod, ignored, status)
}()
logger := e.logger.With(zap.String("method", "Reserve"), zap.String("node", nodeName), util.PodNameFields(pod))
if migrationName := util.TryPodOwnerVirtualMachineMigration(pod); migrationName != nil {
logger = logger.With(zap.Object("virtualmachinemigration", *migrationName))
}
logger.Info("Handling Reserve request")
if ignored {
// Generally, we shouldn't be getting plugin requests for resources that are ignored.
logger.Warn("Ignoring Reserve request for pod in ignored namespace")
return nil // success; allow the Pod onto the node.
}
// Double-check that the SchedulerName matches what we're expecting
if status := e.checkSchedulerName(logger, pod); status != nil {
return status
}
ok, verdict, err := e.reserveResources(ctx, logger, pod, "Reserve", reserveOptions{
// we *could* deny, but that's ultimately less reliable.
// For more, see https://github.com/neondatabase/autoscaling/issues/869
allowDeny: false,
// don't include buffer because we know that future changes by the autoscaler-agent must go
// through us.
includeBuffer: false,
preexisting: false,
})
if err != nil {
return framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
}
if ok {
logger.Info("Allowing reserve Pod", zap.Object("verdict", verdict))
return nil // nil is success
} else {
logger.Error("Rejecting reserve Pod (not enough resources)", zap.Object("verdict", verdict))
return framework.NewStatus(framework.Unschedulable, "Not enough resources to reserve Pod")
}
}
// Unreserve marks a pod as no longer on-track to being bound to a node, so we can release the
// resources we previously reserved for it.
//
// Required for framework.ReservePlugin.
//
// Note: the documentation for ReservePlugin indicates that Unreserve both (a) must be idempotent
// and (b) may be called without a previous call to Reserve for the same pod.
func (e *AutoscaleEnforcer) Unreserve(
ctx context.Context,
state *framework.CycleState,
pod *corev1.Pod,
nodeName string,
) {
ignored := e.state.conf.ignoredNamespace(pod.Namespace)
e.metrics.IncMethodCall("Unreserve", pod, ignored)
podName := util.GetNamespacedName(pod)
logger := e.logger.With(zap.String("method", "Unreserve"), zap.String("node", nodeName), util.PodNameFields(pod))
logger.Info("Handling Unreserve request")
if ignored {
// Generally, we shouldn't be getting plugin requests for resources that are ignored.
logger.Warn("Ignoring Unreserve request for pod in ignored namespace")
return
}
logFields, kind, migrating, verdict := e.unreserveResources(logger, podName)
logger.With(logFields...).Info(
fmt.Sprintf("Unreserved %s Pod", kind),
zap.Bool("migrating", migrating),
zap.Object("verdict", verdict),
)
}
package plugin
// defines prometheus metrics and the registry built by (*AutoscaleEnforcer).makePrometheusRegistry()
import (
"strconv"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
corev1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/scheduler/framework"
"github.com/neondatabase/autoscaling/pkg/util"
)
type PromMetrics struct {
pluginCalls *prometheus.CounterVec
pluginCallFails *prometheus.CounterVec
resourceRequests *prometheus.CounterVec
validResourceRequests *prometheus.CounterVec
nodeCPUResources *prometheus.GaugeVec
nodeMemResources *prometheus.GaugeVec
migrationCreations prometheus.Counter
migrationDeletions *prometheus.CounterVec
migrationCreateFails prometheus.Counter
migrationDeleteFails *prometheus.CounterVec
reserveShouldDeny *prometheus.CounterVec
eventQueueDepth prometheus.Gauge
eventQueueAddsTotal prometheus.Counter
eventQueueLatency prometheus.Histogram
}
func (p *AutoscaleEnforcer) makePrometheusRegistry() *prometheus.Registry {
reg := prometheus.NewRegistry()
// register stock collectors directly:
// (even though MustRegister is variadic, the function calls
// are cheap and calling it more than once means that when
// it panics, we know exactly which metric caused the error.)
reg.MustRegister(collectors.NewGoCollector())
reg.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
p.metrics = PromMetrics{
// the util.RegisterMetric() function registers the collector and returns
// it so we can set it directly on the output structure.
pluginCalls: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_plugin_extension_calls_total",
Help: "Number of calls to scheduler plugin extension points",
},
[]string{"method", "desired_availability_zone", "ignored_namespace"},
)),
pluginCallFails: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_plugin_extension_call_fails_total",
Help: "Number of unsuccessful calls to scheduler plugin extension points",
},
[]string{"method", "desired_availability_zone", "ignored_namespace", "status"},
)),
resourceRequests: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_plugin_resource_requests_total",
Help: "Number of resource requests received by the scheduler plugin",
},
[]string{"code"},
)),
validResourceRequests: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_plugin_resource_requests_results_total",
Help: "Number of resource requests to the scheduler plugin with various results",
},
[]string{"code", "node", "has_metrics"},
)),
nodeCPUResources: util.RegisterMetric(reg, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "autoscaling_plugin_node_cpu_resources_current",
Help: "Current amount of CPU for 'nodeResourceState' fields",
},
[]string{"node", "node_group", "availability_zone", "field"},
)),
nodeMemResources: util.RegisterMetric(reg, prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "autoscaling_plugin_node_mem_resources_current",
Help: "Current amount of memory (in bytes) for 'nodeResourceState' fields",
},
[]string{"node", "node_group", "availability_zone", "field"},
)),
migrationCreations: util.RegisterMetric(reg, prometheus.NewCounter(
prometheus.CounterOpts{
Name: "autoscaling_plugin_migrations_created_total",
Help: "Number of successful VirtualMachineMigration Create requests by the plugin",
},
)),
migrationDeletions: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_plugin_migrations_deleted_total",
Help: "Number of successful VirtualMachineMigration Delete requests by the plugin",
},
[]string{"phase"},
)),
migrationCreateFails: util.RegisterMetric(reg, prometheus.NewCounter(
prometheus.CounterOpts{
Name: "autoscaling_plugin_migration_create_fails_total",
Help: "Number of failed VirtualMachineMigration Create requests by the plugin",
},
)),
migrationDeleteFails: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_plugin_migration_delete_fails_total",
Help: "Number of failed VirtualMachineMigration Delete requests by the plugin",
},
[]string{"phase"},
)),
reserveShouldDeny: util.RegisterMetric(reg, prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "autoscaling_plugin_reserve_should_deny_total",
Help: "Number of times the plugin should deny a reservation",
},
[]string{"availability_zone", "node", "node_group"},
)),
eventQueueDepth: util.RegisterMetric(reg, prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "autoscaling_plugin_eventqueue_depth",
Help: "Current sum depth of all event queues",
},
)),
eventQueueAddsTotal: util.RegisterMetric(reg, prometheus.NewCounter(
prometheus.CounterOpts{
Name: "autoscaling_plugin_eventqueue_adds_total",
Help: "Total number of events added to event queues",
},
)),
eventQueueLatency: util.RegisterMetric(reg, prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "autoscaling_plugin_eventqueue_duration_seconds",
Help: "How long in seconds an item stays in an event queue before being processed",
Buckets: prometheus.ExponentialBuckets(10e-9, 10, 12),
},
)),
}
return reg
}
func (m *PromMetrics) IncMethodCall(method string, pod *corev1.Pod, ignored bool) {
m.pluginCalls.WithLabelValues(method, util.PodPreferredAZIfPresent(pod), strconv.FormatBool(ignored)).Inc()
}
func (m *PromMetrics) IncFailIfNotSuccess(method string, pod *corev1.Pod, ignored bool, status *framework.Status) {
if status.IsSuccess() {
return
}
m.pluginCallFails.WithLabelValues(method, util.PodPreferredAZIfPresent(pod), strconv.FormatBool(ignored), status.Code().String()).Inc()
}
func (m *PromMetrics) IncReserveShouldDeny(pod *corev1.Pod, node *nodeState) {
m.reserveShouldDeny.WithLabelValues(util.PodPreferredAZIfPresent(pod), node.name, node.nodeGroup).Inc()
}
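// Illustrative PromQL (not from the original source), using the metrics registered above to
// observe how often each scheduler extension point is called and how often it fails:
//
//	sum by (method) (rate(autoscaling_plugin_extension_calls_total[5m]))
//	sum by (method, status) (rate(autoscaling_plugin_extension_call_fails_total[5m]))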
package plugin
// Implementation of a metrics-based migration priority queue over vmPodStates
import (
"container/heap"
)
type migrationQueue []*vmPodState
///////////////////////
// package-local API //
///////////////////////
func (mq *migrationQueue) addOrUpdate(vm *vmPodState) {
if vm.MqIndex == -1 {
heap.Push(mq, vm)
} else {
heap.Fix(mq, vm.MqIndex)
}
}
func (mq migrationQueue) isNextInQueue(vm *vmPodState) bool {
// the documentation for heap.Pop says that it's equivalent to heap.Remove(h, 0). Therefore,
// checking whether something's the next pop target can just be done by checking if its index is
// zero.
return vm.MqIndex == 0
}
func (mq *migrationQueue) removeIfPresent(vm *vmPodState) {
if vm.MqIndex != -1 {
_ = heap.Remove(mq, vm.MqIndex)
vm.MqIndex = -1
}
}
//////////////////////////////////////
// container/heap.Interface methods //
//////////////////////////////////////
func (mq migrationQueue) Len() int { return len(mq) }
func (mq migrationQueue) Less(i, j int) bool {
return mq[i].isBetterMigrationTarget(mq[j])
}
func (mq migrationQueue) Swap(i, j int) {
mq[i], mq[j] = mq[j], mq[i]
mq[i].MqIndex = i
mq[j].MqIndex = j
}
func (mq *migrationQueue) Push(v any) {
n := len(*mq)
vm := v.(*vmPodState)
vm.MqIndex = n
*mq = append(*mq, vm)
}
func (mq *migrationQueue) Pop() any {
// Function body + comments taken from the example at https://pkg.go.dev/container/heap
old := *mq
n := len(old)
vm := old[n-1]
old[n-1] = nil // avoid memory leak
vm.MqIndex = -1 // for safety
*mq = old[0 : n-1]
return vm
}
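// The following is an illustrative, standalone sketch (not part of the original source) of the
// same container/heap pattern used by migrationQueue above: Swap/Push/Pop keep each element's
// index field in sync so that entries can later be re-prioritized with heap.Fix or removed with
// heap.Remove in O(log n). The item type and priorities here are hypothetical.
package main

import (
	"container/heap"
	"fmt"
)

type item struct {
	priority int
	index    int // maintained by the heap methods; -1 when not in the queue
}

type queue []*item

func (q queue) Len() int           { return len(q) }
func (q queue) Less(i, j int) bool { return q[i].priority > q[j].priority } // max-heap on priority

func (q queue) Swap(i, j int) {
	q[i], q[j] = q[j], q[i]
	q[i].index = i
	q[j].index = j
}

func (q *queue) Push(v any) {
	it := v.(*item)
	it.index = len(*q)
	*q = append(*q, it)
}

func (q *queue) Pop() any {
	old := *q
	n := len(old)
	it := old[n-1]
	old[n-1] = nil // avoid memory leak
	it.index = -1  // for safety
	*q = old[:n-1]
	return it
}

func main() {
	q := queue{}
	a := &item{priority: 1, index: -1}
	b := &item{priority: 5, index: -1}
	heap.Push(&q, a)
	heap.Push(&q, b)

	a.priority = 10
	heap.Fix(&q, a.index) // re-establish heap order after changing a's priority in place

	fmt.Println(heap.Pop(&q).(*item).priority) // 10
}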
package plugin
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"strconv"
"time"
"github.com/tychoish/fun/srv"
"go.uber.org/zap"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
)
const (
MaxHTTPBodySize int64 = 1 << 10 // 1 KiB
ContentTypeJSON string = "application/json"
ContentTypeError string = "text/plain"
)
// The scheduler plugin currently supports v3.0 to v5.0 of the agent<->scheduler plugin protocol.
//
// If you update either of these values, make sure to also update VERSIONING.md.
const (
MinPluginProtocolVersion api.PluginProtoVersion = api.PluginProtoV3_0
MaxPluginProtocolVersion api.PluginProtoVersion = api.PluginProtoV5_0
)
// startPermitHandler runs the server for handling each resourceRequest from a pod
func (e *AutoscaleEnforcer) startPermitHandler(ctx context.Context, logger *zap.Logger) error {
mux := http.NewServeMux()
mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
logger := logger // copy locally, so that we can add fields and refer to it in defers
var finalStatus int
defer func() {
e.metrics.resourceRequests.WithLabelValues(strconv.Itoa(finalStatus)).Inc()
}()
// Catch any potential panics and report them as 500s
defer func() {
if err := recover(); err != nil {
msg := "request handler panicked"
logger.Error(msg, zap.String("error", fmt.Sprint(err)))
finalStatus = 500
w.WriteHeader(finalStatus)
_, _ = w.Write([]byte(msg))
}
}()
if r.Method != "POST" {
finalStatus = 400
w.WriteHeader(400)
_, _ = w.Write([]byte("must be POST"))
return
}
defer r.Body.Close()
var req api.AgentRequest
jsonDecoder := json.NewDecoder(io.LimitReader(r.Body, MaxHTTPBodySize))
if err := jsonDecoder.Decode(&req); err != nil {
logger.Warn("Received bad JSON in request", zap.Error(err))
w.Header().Add("Content-Type", ContentTypeError)
finalStatus = 400
w.WriteHeader(400)
_, _ = w.Write([]byte("bad JSON"))
return
}
logger = logger.With(
zap.Object("pod", req.Pod),
zap.String("client", r.RemoteAddr),
zap.Any("request", req),
)
resp, statusCode, err := e.handleAgentRequest(logger, req)
finalStatus = statusCode
if err != nil {
logFunc := logger.Warn
if 500 <= statusCode && statusCode < 600 {
logFunc = logger.Error
}
logFunc(
"Responding to autoscaler-agent request with error",
zap.Int("status", statusCode),
zap.Error(err),
)
w.Header().Add("Content-Type", ContentTypeError)
w.WriteHeader(statusCode)
_, _ = w.Write([]byte(err.Error()))
return
}
responseBody, err := json.Marshal(&resp)
if err != nil {
logger.Panic("Failed to encode response JSON", zap.Error(err))
}
w.Header().Add("Content-Type", ContentTypeJSON)
w.WriteHeader(statusCode)
_, _ = w.Write(responseBody)
})
orca := srv.GetOrchestrator(ctx)
logger.Info("Starting resource request server")
hs := srv.HTTP("resource-request", 5*time.Second, &http.Server{Addr: "0.0.0.0:10299", Handler: mux})
if err := hs.Start(ctx); err != nil {
return fmt.Errorf("Error starting resource request server: %w", err)
}
if err := orca.Add(hs); err != nil {
return fmt.Errorf("Error adding resource request server to orchestrator: %w", err)
}
return nil
}
// Returns body (if successful), status code, error (if unsuccessful)
func (e *AutoscaleEnforcer) handleAgentRequest(
logger *zap.Logger,
req api.AgentRequest,
) (_ *api.PluginResponse, status int, _ error) {
nodeName := "<none>" // override this later if we have a node name
defer func() {
hasMetrics := req.Metrics != nil
e.metrics.validResourceRequests.
WithLabelValues(strconv.Itoa(status), nodeName, strconv.FormatBool(hasMetrics)).
Inc()
}()
// Before doing anything, check that the version is within the range we're expecting.
expectedProtoRange := api.VersionRange[api.PluginProtoVersion]{
Min: MinPluginProtocolVersion,
Max: MaxPluginProtocolVersion,
}
if !req.ProtoVersion.IsValid() {
return nil, 400, fmt.Errorf("Invalid protocol version %v", req.ProtoVersion)
}
reqProtoRange := req.ProtocolRange()
if _, ok := expectedProtoRange.LatestSharedVersion(reqProtoRange); !ok {
return nil, 400, fmt.Errorf(
"Protocol version mismatch: Need %v but got %v", expectedProtoRange, reqProtoRange,
)
}
// if req.Metrics is nil, check that the protocol version allows that.
if req.Metrics == nil && !req.ProtoVersion.AllowsNilMetrics() {
return nil, 400, fmt.Errorf("nil metrics not supported for protocol version %v", req.ProtoVersion)
}
// check that req.ComputeUnit has no zeros
if err := req.ComputeUnit.ValidateNonZero(); err != nil {
return nil, 400, fmt.Errorf("computeUnit fields must be non-zero: %w", err)
}
// check that nil-ness of req.Metrics.{LoadAverage5Min,MemoryUsageBytes} match what's expected
// for the protocol version.
if req.Metrics != nil {
if (req.Metrics.LoadAverage5Min != nil) != (req.Metrics.MemoryUsageBytes != nil) {
return nil, 400, fmt.Errorf("presence of metrics.loadAvg5M must match presence of metrics.memoryUsageBytes")
} else if req.Metrics.LoadAverage5Min == nil && req.ProtoVersion.IncludesExtendedMetrics() {
return nil, 400, fmt.Errorf("nil metrics.{loadAvg5M,memoryUsageBytes} not supported for protocol version %v", req.ProtoVersion)
} else if req.Metrics.LoadAverage5Min != nil && !req.ProtoVersion.IncludesExtendedMetrics() {
return nil, 400, fmt.Errorf("non-nil metrics.{loadAvg5M,memoryUsageBytes} not supported for protocol version %v", req.ProtoVersion)
}
}
e.state.lock.Lock()
defer e.state.lock.Unlock()
pod, ok := e.state.pods[req.Pod]
if !ok {
logger.Warn("Received request for Pod we don't know") // pod already in the logger's context
return nil, 404, errors.New("pod not found")
}
if pod.vm == nil {
logger.Error("Received request for non-VM Pod")
return nil, 400, errors.New("pod is not associated with a VM")
}
// Check that req.ComputeUnit.Mem is divisible by the VM's memory slot size
if req.ComputeUnit.Mem%pod.vm.MemSlotSize != 0 {
return nil, 400, fmt.Errorf(
"computeUnit is not divisible by VM memory slot size: %v not divisible by %v",
req.ComputeUnit,
pod.vm.MemSlotSize,
)
}
// If the request was actually sending a quantity of *memory slots*, rather than bytes, then
// multiply the memory resources by the slot size so that they're measured in bytes.
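// (For example, with a hypothetical 1 GiB MemSlotSize, a request expressed as 4 memory slots
// becomes 4 GiB = 4294967296 bytes after the conversion below.)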
if !req.ProtoVersion.RepresentsMemoryAsBytes() {
req.Resources.Mem *= pod.vm.MemSlotSize
}
node := pod.node
nodeName = node.name // set nodeName for deferred metrics
// Also, now that we know which VM this refers to (and which node it's on), add that to the logger for later.
logger = logger.With(zap.Object("virtualmachine", pod.vm.Name), zap.String("node", nodeName))
mustMigrate := pod.vm.MigrationState == nil &&
// Check whether the pod *will* migrate, then update its resources, and THEN start its
// migration, using the possibly-changed resources.
e.updateMetricsAndCheckMustMigrate(logger, pod.vm, node, req.Metrics)
supportsFractionalCPU := req.ProtoVersion.SupportsFractionalCPU()
verdict, permit, status, err := e.handleResources(
pod,
node,
req.ComputeUnit,
req.Resources,
req.LastPermit,
mustMigrate,
supportsFractionalCPU,
)
if err != nil {
return nil, status, err
}
var migrateDecision *api.MigrateResponse
if mustMigrate {
created, err := e.startMigration(context.Background(), logger, pod)
if err != nil {
return nil, 500, fmt.Errorf("Error starting migration for pod %v: %w", pod.name, err)
}
// We should only signal to the autoscaler-agent that we've started migrating if we actually
// *created* the migration. We're not *supposed* to receive requests for a VM that's already
// migrating, so receiving one means that *something*'s gone wrong. If that's on us, we
// should try to avoid compounding the problem.
if created {
migrateDecision = &api.MigrateResponse{}
}
}
status = 200
resp := api.PluginResponse{
Permit: permit,
Migrate: migrateDecision,
}
logger.Info(
"Handled agent request",
zap.Object("verdict", verdict),
zap.Int("status", status),
zap.Any("response", resp),
)
return &resp, status, nil
}
func (e *AutoscaleEnforcer) handleResources(
pod *podState,
node *nodeState,
cu api.Resources,
req api.Resources,
lastPermit *api.Resources,
startingMigration bool,
supportsFractionalCPU bool,
) (verdictSet, api.Resources, int, error) {
if !supportsFractionalCPU && req.VCPU%1000 != 0 {
err := errors.New("agent requested fractional CPU with protocol version that does not support it")
return verdictSet{}, api.Resources{}, 400, err
}
// Check that we aren't being asked to do something during migration:
if pod.vm.currentlyMigrating() {
// The agent shouldn't have asked for a change after already receiving notice that it's
// migrating.
if req.VCPU != pod.cpu.Reserved || req.Mem != pod.mem.Reserved {
err := errors.New("cannot change resources: agent has already been informed that pod is migrating")
return verdictSet{}, api.Resources{}, 400, err
}
message := "No change because pod is migrating"
verdict := verdictSet{cpu: message, mem: message}
return verdict, api.Resources{VCPU: pod.cpu.Reserved, Mem: pod.mem.Reserved}, 200, nil
}
cpuFactor := cu.VCPU
if !supportsFractionalCPU {
cpuFactor = 1000
}
memFactor := cu.Mem
var lastCPUPermit *vmapi.MilliCPU
var lastMemPermit *api.Bytes
if lastPermit != nil {
lastCPUPermit = &lastPermit.VCPU
lastMemPermit = &lastPermit.Mem
}
cpuVerdict := makeResourceTransitioner(&node.cpu, &pod.cpu).
handleRequested(req.VCPU, lastCPUPermit, startingMigration, cpuFactor)
memVerdict := makeResourceTransitioner(&node.mem, &pod.mem).
handleRequested(req.Mem, lastMemPermit, startingMigration, memFactor)
verdict := verdictSet{cpu: cpuVerdict, mem: memVerdict}
permit := api.Resources{VCPU: pod.cpu.Reserved, Mem: pod.mem.Reserved}
return verdict, permit, 200, nil
}
func (e *AutoscaleEnforcer) updateMetricsAndCheckMustMigrate(
logger *zap.Logger,
vm *vmPodState,
node *nodeState,
metrics *api.Metrics,
) bool {
// This pod should migrate if (a) it's allowed to migrate, (b) node resource usage is high
// enough that we should migrate *something*, and (c) it's next up in the priority queue.
// We will give it a chance later to veto if the metrics have changed too much.
//
// Alternatively, "the pod is marked to always migrate" causes it to migrate even if none of
// the above conditions are met, so long as it has *previously* provided metrics.
canMigrate := vm.Config.AutoMigrationEnabled && e.state.conf.migrationEnabled()
shouldMigrate := node.mq.isNextInQueue(vm) && node.tooMuchPressure(logger)
forcedMigrate := vm.Config.AlwaysMigrate && vm.Metrics != nil
logger.Info("Updating pod metrics", zap.Any("metrics", metrics))
oldMetrics := vm.Metrics
vm.Metrics = metrics
if vm.currentlyMigrating() {
return false // don't do anything else; it's already migrating.
}
node.mq.addOrUpdate(vm)
// nb: forcedMigrate takes priority over canMigrate
if (!canMigrate || !shouldMigrate) && !forcedMigrate {
return false
}
// Give the pod a chance to veto migration if its metrics have significantly changed...
var veto error
if oldMetrics != nil && !forcedMigrate {
veto = vm.checkOkToMigrate(*oldMetrics)
}
// ... but override the veto if it's still the best candidate anyways.
stillFirst := node.mq.isNextInQueue(vm)
if forcedMigrate || stillFirst || veto == nil {
if veto != nil {
logger.Info("Pod attempted veto of self migration, still highest priority", zap.NamedError("veto", veto))
}
return true
} else {
logger.Warn("Pod vetoed self migration", zap.NamedError("veto", veto))
return false
}
}
package plugin
// Definitions and helper functions for managing plugin state
import (
"context"
"errors"
"fmt"
"time"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/watch"
"github.com/neondatabase/autoscaling/pkg/util/xact"
)
// pluginState stores the private state for the plugin, used both within and outside of the
// predefined scheduler plugin points
//
// Accessing the individual fields MUST be done while holding the lock, with some exceptions.
type pluginState struct {
lock util.ChanMutex
ongoingMigrationDeletions map[util.NamespacedName]int
pods map[util.NamespacedName]*podState
nodes map[string]*nodeState
// maxTotalReservableCPU stores the maximum value of any node's totalReservableCPU(), so that we
// can appropriately scale our scoring
maxTotalReservableCPU vmapi.MilliCPU
// maxTotalReservableMem is the same as maxTotalReservableCPU, but for bytes of memory instead
// of CPU
maxTotalReservableMem api.Bytes
// conf stores the current configuration, and is nil if the configuration has not yet been set
//
// Proper initialization of the plugin guarantees conf is not nil.
//
// conf MAY be accessed without holding the lock; it MUST not be modified.
conf *Config
}
// nodeState is the information that we track for a particular node
type nodeState struct {
// name is the name of the node, guaranteed by kubernetes to be unique
name string
// nodeGroup, if present, gives the node group that this node belongs to.
nodeGroup string
// availabilityZone, if present, gives the availability zone that this node is in.
availabilityZone string
// cpu tracks the state of vCPU resources -- what's available and how
cpu nodeResourceState[vmapi.MilliCPU]
// mem tracks the state of bytes of memory -- what's available and how
mem nodeResourceState[api.Bytes]
// pods tracks all the VM pods assigned to this node
//
// This includes both bound pods (i.e., pods fully committed to the node) and reserved pods
// (still may be unreserved)
pods map[util.NamespacedName]*podState
// mq is the priority queue tracking which pods should be chosen first for migration
mq migrationQueue
}
type nodeResourceStateField[T any] struct {
valueName string
value T
}
func (s *nodeResourceState[T]) fields() []nodeResourceStateField[T] {
return []nodeResourceStateField[T]{
{"Total", s.Total},
{"Watermark", s.Watermark},
{"Reserved", s.Reserved},
{"Buffer", s.Buffer},
{"CapacityPressure", s.CapacityPressure},
{"PressureAccountedFor", s.PressureAccountedFor},
}
}
func (s *nodeState) updateMetrics(metrics PromMetrics) {
s.cpu.updateMetrics(metrics.nodeCPUResources, s.name, s.nodeGroup, s.availabilityZone, vmapi.MilliCPU.AsFloat64)
s.mem.updateMetrics(metrics.nodeMemResources, s.name, s.nodeGroup, s.availabilityZone, api.Bytes.AsFloat64)
}
func (s *nodeResourceState[T]) updateMetrics(
metric *prometheus.GaugeVec,
nodeName string,
nodeGroup string,
availabilityZone string,
convert func(T) float64,
) {
for _, f := range s.fields() {
metric.WithLabelValues(nodeName, nodeGroup, availabilityZone, f.valueName).Set(convert(f.value))
}
}
func (s *nodeState) removeMetrics(metrics PromMetrics) {
gauges := []*prometheus.GaugeVec{metrics.nodeCPUResources, metrics.nodeMemResources}
fields := s.cpu.fields() // CPU chosen arbitrarily; we just want the valueNames, which are the same for CPU and memory
for _, g := range gauges {
for _, f := range fields {
g.DeleteLabelValues(s.name, s.nodeGroup, s.availabilityZone, f.valueName)
}
}
}
// nodeResourceState describes the state of a resource allocated to a node
type nodeResourceState[T any] struct {
// Total is the Total amount of T available on the node. This value does not change.
Total T `json:"total"`
// Watermark is the amount of T reserved to pods above which we attempt to reduce usage via
// migration.
Watermark T `json:"watermark"`
// Reserved is the current amount of T reserved to pods. It SHOULD be less than or equal to
// Total, and we take active measures to reduce it once it is above Watermark.
//
// Reserved MAY be greater than Total on scheduler restart (because of buffering with VM scaling
// maximums), but (Reserved - Buffer) MUST be less than Total. In general, (Reserved - Buffer)
// SHOULD be less than or equal to Total, but this can be temporarily violated after restart.
//
// For more information, refer to the ARCHITECTURE.md file in this directory.
//
// Reserved is always exactly equal to the sum of all of this node's pods' Reserved T.
Reserved T `json:"reserved"`
// Buffer *mostly* matters during startup. It tracks the total amount of T that we don't
// *expect* is currently in use, but is still reserved to the pods because we can't prevent the
// autoscaler-agents from making use of it.
//
// Buffer is always exactly equal to the sum of all this node's pods' Buffer for T.
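//
// For example (illustrative numbers): a VM pod whose scaling maximum is 4 CPU but which we
// believe is only using 2 CPU contributes Reserved = 4 and Buffer = 2 to the node until the
// autoscaler-agent's first message, at which point its Buffer drops to 0.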
Buffer T `json:"buffer"`
// CapacityPressure is -- roughly speaking -- the amount of T that we're currently denying to
// pods in this node when they request it, due to not having space in remainingReservableCPU().
// This value is exactly equal to the sum of each pod's CapacityPressure.
//
// This value is used alongside the "logical pressure" (equal to Reserved - Watermark, if
// nonzero) in tooMuchPressure() to determine if more pods should be migrated off the node to
// free up pressure.
CapacityPressure T `json:"capacityPressure"`
// PressureAccountedFor gives the total pressure expected to be relieved by ongoing migrations.
// This is equal to the sum of Reserved + CapacityPressure for all pods currently migrating.
//
// The value may be larger than CapacityPressure.
PressureAccountedFor T `json:"pressureAccountedFor"`
}
// podState is the information we track for an individual pod, which may or may not be associated
// with a VM
type podState struct {
// name is the namespace'd name of the pod
//
// name will not change after initialization, so it can be accessed without holding a lock.
name util.NamespacedName
// node provides information about the node that this pod is bound to or reserved onto.
node *nodeState
// cpu is the current state of this pod's vCPU utilization and pressure
cpu podResourceState[vmapi.MilliCPU]
// memBytes is the current state of this pod's memory utilization and pressure
mem podResourceState[api.Bytes]
// vm stores the extra information associated with VMs
vm *vmPodState
}
type vmPodState struct {
// Name is the name of the VM, as given by the owner reference for the VM or VM migration that
// owns this pod
Name util.NamespacedName
// MemSlotSize stores the value of the VM's .Spec.Guest.MemorySlotSize, for compatibility with
// earlier versions of the agent<->plugin protocol.
MemSlotSize api.Bytes
// Config stores the values of per-VM settings for this VM
Config api.VmConfig
// Metrics is the most recent Metrics update we received for this pod. A nil pointer means that
// we have not yet received Metrics.
Metrics *api.Metrics
// MqIndex stores this pod's index in the migrationQueue. This value is -1 iff Metrics is nil or
// the VM is currently migrating.
MqIndex int
// MigrationState gives current information about an ongoing migration, if this pod is currently
// migrating.
MigrationState *podMigrationState
}
// podMigrationState tracks the information about an ongoing VM pod's migration
type podMigrationState struct {
// Name gives the name of the VirtualMachineMigration that this pod is involved in
Name util.NamespacedName
}
type podResourceState[T any] struct {
// Reserved is the amount of T that this pod has reserved. It is guaranteed that the pod is
// using AT MOST Reserved T.
Reserved T `json:"reserved"`
// Buffer is the amount of T that we've included in Reserved to account for the possibility of
// unilateral increases by the autoscaler-agent
//
// This value is only nonzero during startup (between initial state load and first communication
// from the autoscaler-agent), and MUST be less than or equal to reserved.
//
// After the first communication from the autoscaler-agent, we update Reserved to match its
// value, and set Buffer to zero.
Buffer T `json:"buffer"`
// CapacityPressure is this pod's contribution to its node's CapacityPressure for this
// resource
CapacityPressure T `json:"capacityPressure"`
// Min and Max give the minimum and maximum values of this resource that the VM may use.
Min T `json:"min"`
Max T `json:"max"`
}
func (p *podState) kind() string {
if p.vm != nil {
return "VM"
} else {
return "non-VM"
}
}
func (p *podState) logFields() []zap.Field {
podName := zap.Object("pod", p.name)
if p.vm != nil {
vmName := zap.Object("virtualmachine", p.vm.Name)
return []zap.Field{podName, vmName}
} else {
return []zap.Field{podName}
}
}
// remainingReservableCPU returns the remaining CPU that can be allocated to VM pods
func (s *nodeState) remainingReservableCPU() vmapi.MilliCPU {
return util.SaturatingSub(s.cpu.Total, s.cpu.Reserved)
}
// remainingReservableMem returns the remaining number of bytes of memory that can be allocated to
// VM pods
func (s *nodeState) remainingReservableMem() api.Bytes {
return util.SaturatingSub(s.mem.Total, s.mem.Reserved)
}
// tooMuchPressure is used to signal whether the node should start migrating pods out in order to
// relieve some of the pressure
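//
// Roughly, a node has too much pressure when
// (Reserved - Watermark) + CapacityPressure > PressureAccountedFor + slack,
// where the slack (Buffer plus any headroom below the watermark) counteracts capacityPressure
// that hasn't been updated yet. For example (illustrative numbers, Buffer = 0): with
// Watermark = 8, Reserved = 10, CapacityPressure = 3, and PressureAccountedFor = 2, the logical
// pressure is 2, so 2 + 3 > 2 + 0 and we report that there's too much pressure.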
func (s *nodeState) tooMuchPressure(logger *zap.Logger) bool {
if s.cpu.Reserved <= s.cpu.Watermark && s.mem.Reserved < s.mem.Watermark {
type okPair[T any] struct {
Reserved T
Watermark T
}
logger.Debug(
"tooMuchPressure = false (clearly)",
zap.Any("cpu", okPair[vmapi.MilliCPU]{Reserved: s.cpu.Reserved, Watermark: s.cpu.Watermark}),
zap.Any("mem", okPair[api.Bytes]{Reserved: s.mem.Reserved, Watermark: s.mem.Watermark}),
)
return false
}
type info[T any] struct {
LogicalPressure T
LogicalSlack T
Capacity T
AccountedFor T
TooMuch bool
}
var cpu info[vmapi.MilliCPU]
var mem info[api.Bytes]
cpu.LogicalPressure = util.SaturatingSub(s.cpu.Reserved, s.cpu.Watermark)
mem.LogicalPressure = util.SaturatingSub(s.mem.Reserved, s.mem.Watermark)
// Account for existing slack in the system, to counteract capacityPressure that hasn't been
// updated yet
cpu.LogicalSlack = s.cpu.Buffer + util.SaturatingSub(s.cpu.Watermark, s.cpu.Reserved)
mem.LogicalSlack = s.mem.Buffer + util.SaturatingSub(s.mem.Watermark, s.mem.Reserved)
cpu.TooMuch = cpu.LogicalPressure+s.cpu.CapacityPressure > s.cpu.PressureAccountedFor+cpu.LogicalSlack
mem.TooMuch = mem.LogicalPressure+s.mem.CapacityPressure > s.mem.PressureAccountedFor+mem.LogicalSlack
result := cpu.TooMuch || mem.TooMuch
logger.Debug(
fmt.Sprintf("tooMuchPressure = %v", result),
zap.Any("cpu", cpu),
zap.Any("mem", mem),
)
return result
}
// checkOkToMigrate allows us to check that it's still ok to start migrating a pod, after it was
// previously selected for migration
//
// A returned error indicates that the pod's resource usage has changed enough that we should try to
// migrate something else first. The error provides justification for this.
func (s *vmPodState) checkOkToMigrate(oldMetrics api.Metrics) error {
// TODO. Note: s.Metrics may be nil.
return nil
}
func (s *vmPodState) currentlyMigrating() bool {
return s.MigrationState != nil
}
// this method can only be called while holding a lock. If we don't have the necessary information
// locally, then the lock is released temporarily while we query the API server
//
// A lock will ALWAYS be held on return from this function.
func (s *pluginState) getOrFetchNodeState(
ctx context.Context,
logger *zap.Logger,
metrics PromMetrics,
store IndexedNodeStore,
nodeName string,
) (*nodeState, error) {
logger = logger.With(zap.String("node", nodeName))
if n, ok := s.nodes[nodeName]; ok {
logger.Debug("Using stored information for node")
return n, nil
}
logger.Info("Node has not yet been processed, fetching from store")
accessor := func(index *watch.FlatNameIndex[corev1.Node]) (*corev1.Node, bool) {
return index.Get(nodeName)
}
// Before unlocking, try to get the node from the store.
node, ok := store.GetIndexed(accessor)
if !ok {
logger.Warn("Node is missing from local store. Relisting to try getting it from API server")
s.lock.Unlock() // Unlock to let other goroutines progress while we get the data we need
var locked bool // whether we've already re-locked; ensures the deferred Lock() below doesn't double-lock, while still guaranteeing a lock is held on return
defer func() {
if !locked {
s.lock.Lock()
}
}()
// Use a reasonable timeout on the relist request, so that if the store is broken, we won't
// block forever.
//
// FIXME: make this configurable
timeout := 5 * time.Second
timer := time.NewTimer(timeout)
defer timer.Stop()
select {
case <-store.Relist():
case <-timer.C:
message := "Timed out waiting on Node store relist"
logger.Error(message, zap.Duration("timeout", timeout))
return nil, errors.New(message)
case <-ctx.Done():
err := ctx.Err()
message := "Context expired while waiting on Node store relist"
logger.Error(message, zap.Error(err))
return nil, errors.New(message)
}
node, ok = store.GetIndexed(accessor)
if !ok {
// Either the node is already gone, or there's a deeper problem.
message := "Could not find Node, even after relist"
logger.Error(message)
return nil, errors.New(message)
}
logger.Info("Found node after relisting")
// Re-lock and process API result
locked = true
s.lock.Lock()
// It's possible that the node was already added. Don't double-process nodes if we don't have
// to.
if n, ok := s.nodes[nodeName]; ok {
logger.Warn("Local information for node became available while waiting on relist, using it instead")
return n, nil
}
}
n, err := buildInitialNodeState(logger, node, s.conf)
if err != nil {
return nil, err
}
// update maxTotalReservableCPU and maxTotalReservableMem if there are new maxima
if n.cpu.Total > s.maxTotalReservableCPU {
s.maxTotalReservableCPU = n.cpu.Total
}
if n.mem.Total > s.maxTotalReservableMem {
s.maxTotalReservableMem = n.mem.Total
}
n.updateMetrics(metrics)
s.nodes[nodeName] = n
return n, nil
}
// this method must only be called while holding s.lock. It will not be released during this
// function.
//
// Note: buildInitialNodeState does not take any of the pods or VMs on the node into account; it
// only examines the total resources available to the node.
func buildInitialNodeState(logger *zap.Logger, node *corev1.Node, conf *Config) (*nodeState, error) {
// cpuQ = "cpu, as a K8s resource.Quantity"
// -A for allocatable, -C for capacity
var cpuQ *resource.Quantity
cpuQA := node.Status.Allocatable.Cpu()
cpuQC := node.Status.Capacity.Cpu()
if cpuQA != nil {
// Use Allocatable by default ...
cpuQ = cpuQA
} else if cpuQC != nil {
// ... but use Capacity if Allocatable is not available
cpuQ = cpuQC
} else {
return nil, errors.New("Node has no Allocatable or Capacity CPU limits")
}
cpu := conf.NodeConfig.vCpuLimits(cpuQ)
// memQ = "mem, as a K8s resource.Quantity"
// -A for allocatable, -C for capacity
var memQ *resource.Quantity
memQA := node.Status.Allocatable.Memory()
memQC := node.Status.Capacity.Memory()
if memQA != nil {
memQ = memQA
} else if memQC != nil {
memQ = memQC
} else {
return nil, errors.New("Node has no Allocatable or Capacity Memory limits")
}
mem := conf.NodeConfig.memoryLimits(memQ)
var nodeGroup string
if conf.K8sNodeGroupLabel != "" {
var ok bool
nodeGroup, ok = node.Labels[conf.K8sNodeGroupLabel]
if !ok {
logger.Warn("Node does not have node group label", zap.String("label", conf.K8sNodeGroupLabel))
}
}
var availabilityZone string
if conf.K8sAvailabilityZoneLabel != "" {
var ok bool
availabilityZone, ok = node.Labels[conf.K8sAvailabilityZoneLabel]
if !ok {
logger.Warn("Node does not have availability zone label", zap.String("label", conf.K8sAvailabilityZoneLabel))
}
}
n := &nodeState{
name: node.Name,
nodeGroup: nodeGroup,
availabilityZone: availabilityZone,
cpu: cpu,
mem: mem,
pods: make(map[util.NamespacedName]*podState),
mq: migrationQueue{},
}
type resourceInfo[T any] struct {
Total T
Watermark T
}
logger.Info(
"Built initial node state",
zap.Any("cpu", resourceInfo[vmapi.MilliCPU]{
Total: n.cpu.Total,
Watermark: n.cpu.Watermark,
}),
zap.Any("memSlots", resourceInfo[api.Bytes]{
Total: n.mem.Total,
Watermark: n.mem.Watermark,
}),
)
return n, nil
}
func extractPodResources(pod *corev1.Pod) api.Resources {
var cpu vmapi.MilliCPU
var mem api.Bytes
for _, container := range pod.Spec.Containers {
// For each resource, add the requests, if they're provided. We use this because it matches
// what cluster-autoscaler uses.
//
// NB: .Cpu() returns a pointer to a value equal to zero if the resource is not present. So
// we can just add it either way.
cpu += vmapi.MilliCPUFromResourceQuantity(*container.Resources.Requests.Cpu())
mem += api.BytesFromResourceQuantity(*container.Resources.Requests.Memory())
}
return api.Resources{VCPU: cpu, Mem: mem}
}
func (e *AutoscaleEnforcer) handleNodeDeletion(logger *zap.Logger, nodeName string) {
logger = logger.With(
zap.String("action", "Node deletion"),
zap.String("node", nodeName),
)
logger.Info("Handling deletion of Node")
e.state.lock.Lock()
defer e.state.lock.Unlock()
node, ok := e.state.nodes[nodeName]
if !ok {
logger.Warn("Cannot find node in nodeMap")
return // nothing to clean up, and the code below would dereference a nil *nodeState
}
if logger.Core().Enabled(zapcore.DebugLevel) {
logger.Debug("Dump final node state", zap.Any("state", node.dump()))
}
// For any pods still on the node, remove them from the global state:
for name, pod := range node.pods {
logger.Warn(
fmt.Sprintf("Found %s pod still on node at time of deletion", pod.kind()),
pod.logFields()...,
)
delete(e.state.pods, name)
}
node.removeMetrics(e.metrics)
delete(e.state.nodes, nodeName)
logger.Info("Deleted node")
}
// handleStarted updates the state according to a pod that's already started, but may or may not
// have been scheduled via the plugin.
//
// We need to handle this so that we maintain an accurate view of the resource usage in the cluster;
// otherwise, we might (a) ignore resources from pods that weren't scheduled here, or (b) fail to
// include pods that *were* scheduled here, but had spurious Unreserves.
// (for more, see: https://github.com/neondatabase/autoscaling/pull/435)
func (e *AutoscaleEnforcer) handleStarted(logger *zap.Logger, pod *corev1.Pod, preexisting bool) {
nodeName := pod.Spec.NodeName
logger = logger.With(
zap.String("action", "Pod started"),
zap.String("node", nodeName),
util.PodNameFields(pod),
)
if migrationName := util.TryPodOwnerVirtualMachineMigration(pod); migrationName != nil {
logger = logger.With(zap.Object("virtualmachinemigration", *migrationName))
}
logger.Info("Handling Pod start event")
_, _, _ = e.reserveResources(context.TODO(), logger, pod, "Pod started", reserveOptions{
// pod already started, out of our control - we don't have a mechanism to deny it
allowDeny: false,
// this may be a preexisting VM. If so, we should include it in "buffer" as long as it's
// supposed to be handled by us (otherwise, the "buffer" will never be resolved)
includeBuffer: pod.Spec.SchedulerName == e.state.conf.SchedulerName,
preexisting: preexisting,
})
}
type reserveOptions struct {
allowDeny bool
includeBuffer bool
preexisting bool
}
// reserveResources attempts to set aside resources on the node for the pod.
//
// If allowDeny is false, reserveResources is not "allowed" to reject the pod if there isn't enough
// room - it must instead set aside resources that don't exist.
//
// If an unexpected error occurs, the first two return values are unspecified, and the error will be
// non-nil. Otherwise, 'ok' will indicate whether the pod was accepted and the verdictSet will
// provide messages describing the result, suitable for being logged.
func (e *AutoscaleEnforcer) reserveResources(
ctx context.Context,
logger *zap.Logger,
pod *corev1.Pod,
action string,
opts reserveOptions,
) (ok bool, _ *verdictSet, _ error) {
nodeName := pod.Spec.NodeName
if e.state.conf.ignoredNamespace(pod.Namespace) {
panic(fmt.Errorf("reserveResources called with ignored pod %v", util.GetNamespacedName(pod)))
}
vmInfo, err := e.getVmInfo(logger, pod, action)
if err != nil {
msg := "Error getting VM info for Pod"
logger.Error(msg, zap.Error(err))
return false, nil, fmt.Errorf("%s: %w", msg, err)
}
e.state.lock.Lock()
defer e.state.lock.Unlock()
podName := util.GetNamespacedName(pod)
// If the pod already exists, nothing to do
_, isPodInState := e.state.pods[podName]
if isPodInState {
logger.Info("Pod already exists in global state")
return true, &verdictSet{cpu: "", mem: ""}, nil
}
// If all of the following conditions are met, the pod has bypassed the neon scheduler, which
// might be a sign of a bug or misbehavior:
// - the pod is assigned to the autoscaler scheduler
// - the pod is not in the state
// - the pod is not a preexisting pod
// - the pod already has a node name set
if !isPodInState && !opts.preexisting && pod.Spec.SchedulerName == e.state.conf.SchedulerName && pod.Spec.NodeName != "" {
logger.Warn("Pod has bypassed neon scheduler")
}
// Get information about the node
node, err := e.state.getOrFetchNodeState(ctx, logger, e.metrics, e.nodeStore, nodeName)
if err != nil {
msg := "Failed to get state for node"
logger.Error(msg, zap.Error(err))
return false, nil, fmt.Errorf("%s: %w", msg, err)
}
accept := func(verdict verdictSet, overBudget bool) bool {
shouldDeny := overBudget
if shouldDeny {
e.metrics.IncReserveShouldDeny(pod, node)
}
if shouldDeny && opts.allowDeny {
logger.Error(
"Can't reserve resources for Pod (not enough available)",
zap.Object("verdict", verdict),
)
return false
}
if opts.allowDeny {
logger.Info("Allowing reserve resources for Pod", zap.Object("verdict", verdict))
} else if shouldDeny /* want to deny, but can't */ {
logger.Warn("Reserved resources for Pod above totals", zap.Object("verdict", verdict))
} else /* don't want to deny, but also couldn't if we wanted to */ {
logger.Info("Reserved resources for Pod", zap.Object("verdict", verdict))
}
return true
}
ok, verdict := e.speculativeReserve(node, vmInfo, pod, opts.includeBuffer, accept)
return ok, &verdict, nil
}
// speculativeReserve reserves the pod, and then calls accept() to see whether the pod should
// actually be added.
//
// If accept() returns false, no changes to the state will be made.
func (e *AutoscaleEnforcer) speculativeReserve(
node *nodeState,
vmInfo *api.VmInfo,
pod *corev1.Pod,
includeBuffer bool,
accept func(verdict verdictSet, overBudget bool) bool,
) (ok bool, _ verdictSet) {
// Construct the speculative state of the pod
//
// We'll pass this into (resourceTransitioner).handleReserve(), but only commit the changes if
// the caller allows us to.
var cpuState podResourceState[vmapi.MilliCPU]
var memState podResourceState[api.Bytes]
var vmState *vmPodState
if vmInfo != nil {
vmState = &vmPodState{
Name: vmInfo.NamespacedName(),
MemSlotSize: vmInfo.Mem.SlotSize,
Config: vmInfo.Config,
Metrics: nil,
MqIndex: -1,
MigrationState: nil,
}
// initially build the resource states assuming that we're including buffer, and then update
// later to remove it if that turns out not to be right.
cpuState = podResourceState[vmapi.MilliCPU]{
Reserved: vmInfo.Max().VCPU,
Buffer: util.SaturatingSub(vmInfo.Max().VCPU, vmInfo.Using().VCPU),
CapacityPressure: 0,
Min: vmInfo.Min().VCPU,
Max: vmInfo.Max().VCPU,
}
memState = podResourceState[api.Bytes]{
Reserved: vmInfo.Max().Mem,
Buffer: util.SaturatingSub(vmInfo.Max().Mem, vmInfo.Using().Mem),
CapacityPressure: 0,
Min: vmInfo.Min().Mem,
Max: vmInfo.Max().Mem,
}
// If scaling isn't enabled *or* the pod is involved in an ongoing migration *or* the caller
// has opted out of setting Buffer, then we can be more precise about usage.
//
// Buffer exists to handle scaling that may happen due to a prior scheduler's approval.
// If scaling is disabled, we don't have to worry about this, and if there's an ongoing
// migration, scaling is forbidden.
migrating := util.TryPodOwnerVirtualMachineMigration(pod) != nil
if !vmInfo.Config.ScalingEnabled || migrating || !includeBuffer {
cpuState.Buffer = 0
cpuState.Reserved = vmInfo.Using().VCPU
memState.Buffer = 0
memState.Reserved = vmInfo.Using().Mem
}
} else {
res := extractPodResources(pod)
cpuState = podResourceState[vmapi.MilliCPU]{
Reserved: res.VCPU,
Buffer: 0,
CapacityPressure: 0,
Min: res.VCPU,
Max: res.VCPU,
}
memState = podResourceState[api.Bytes]{
Reserved: res.Mem,
Buffer: 0,
CapacityPressure: 0,
Min: res.Mem,
Max: res.Mem,
}
}
podName := util.GetNamespacedName(pod)
ps := &podState{
name: podName,
node: node,
cpu: cpuState,
mem: memState,
vm: vmState,
}
// Speculatively try reserving the pod.
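// Each xact wraps a working copy of the node's resource state, so the transitions below mutate
// only that copy; the changes only take effect on the real node state if accept() approves and
// we call Commit().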
nodeXactCPU := xact.New(&node.cpu)
nodeXactMem := xact.New(&node.mem)
cpuOverBudget, cpuVerdict := makeResourceTransitioner(nodeXactCPU.Value(), &ps.cpu).handleReserve()
memOverBudget, memVerdict := makeResourceTransitioner(nodeXactMem.Value(), &ps.mem).handleReserve()
overBudget := cpuOverBudget || memOverBudget
verdict := verdictSet{
cpu: cpuVerdict,
mem: memVerdict,
}
const verdictNotEnough = "NOT ENOUGH"
const verdictOk = "OK"
if overBudget {
cpuShortVerdict := verdictNotEnough
if !cpuOverBudget {
cpuShortVerdict = verdictOk
}
verdict.cpu = fmt.Sprintf("%s: %s", cpuShortVerdict, verdict.cpu)
memShortVerdict := verdictNotEnough
if !memOverBudget {
memShortVerdict = verdictOk
}
verdict.mem = fmt.Sprintf("%s: %s", memShortVerdict, verdict.mem)
}
if !accept(verdict, overBudget) {
return false, verdict
}
nodeXactCPU.Commit()
nodeXactMem.Commit()
node.pods[podName] = ps
e.state.pods[podName] = ps
node.updateMetrics(e.metrics)
return true, verdict
}
// This method is /basically/ the same as e.Unreserve, but the API is different and it has different
// logs, so IMO it's worthwhile to have this separate.
func (e *AutoscaleEnforcer) handleDeletion(logger *zap.Logger, podName util.NamespacedName) {
logger = logger.With(
zap.String("action", "VM deletion"),
zap.Object("pod", podName),
)
logger.Info("Handling deletion of VM pod")
logFields, kind, migrating, verdict := e.unreserveResources(logger, podName)
logger.With(logFields...).Info(
fmt.Sprintf("Deleted %s Pod", kind),
zap.Bool("migrating", migrating),
zap.Object("verdict", verdict),
)
}
// unreserveResources is *essentially* the inverse of reserveResources, but with two main
// differences:
//
// 1. unreserveResources cannot "deny" unreserving, whereas reserveResources may choose whether to
// accept the additional reservation.
// 2. unreserveResources returns additional information for logging.
//
// Also note that because unreserveResources is expected to be called by the plugin's Unreserve()
// method, it may be called for pods that no longer exist.
func (e *AutoscaleEnforcer) unreserveResources(
logger *zap.Logger,
podName util.NamespacedName,
) (_ []zap.Field, kind string, migrating bool, _ verdictSet) {
e.state.lock.Lock()
defer e.state.lock.Unlock()
ps, ok := e.state.pods[podName]
if !ok {
logger.Warn("Cannot find Pod in global pods map")
return
}
logFields := []zap.Field{zap.String("node", ps.node.name)}
if ps.vm != nil {
logFields = append(logFields, zap.Object("virtualmachine", ps.vm.Name))
}
// Mark the resources as no longer reserved
currentlyMigrating := ps.vm != nil && ps.vm.currentlyMigrating()
cpuVerdict := makeResourceTransitioner(&ps.node.cpu, &ps.cpu).
handleDeleted(currentlyMigrating)
memVerdict := makeResourceTransitioner(&ps.node.mem, &ps.mem).
handleDeleted(currentlyMigrating)
// Delete our record of the pod
delete(e.state.pods, podName)
delete(ps.node.pods, podName)
if ps.vm != nil {
ps.node.mq.removeIfPresent(ps.vm)
}
ps.node.updateMetrics(e.metrics)
return logFields, ps.kind(), currentlyMigrating, verdictSet{cpu: cpuVerdict, mem: memVerdict}
}
func (e *AutoscaleEnforcer) handleVMConfigUpdated(logger *zap.Logger, podName util.NamespacedName, newCfg api.VmConfig) {
logger = logger.With(
zap.String("action", "VM config updated"),
zap.Object("pod", podName),
)
logger.Info("Handling updated config for VM pod")
e.state.lock.Lock()
defer e.state.lock.Unlock()
ps, ok := e.state.pods[podName]
if !ok {
logger.Error("Cannot find Pod in global pods map")
return
}
logger = logger.With(zap.String("node", ps.node.name))
if ps.vm == nil {
logger.Error("handleVMConfigUpdated called for non-VM Pod")
return
}
logger = logger.With(zap.Object("virtualmachine", ps.vm.Name))
// Broadly, we want to update the value of the vmPodState.Config field.
// But *also*, if autoscaling is newly disabled, we should update the pod/node state.
// And if auto-migration is disabled, we should remove the VM from the migration queue.
oldCfg := ps.vm.Config
ps.vm.Config = newCfg
// worth logging all of this in case we hit issues.
logger.Info("Config updated for VM", zap.Any("oldCfg", newCfg), zap.Any("newCfg", newCfg))
if oldCfg.AutoMigrationEnabled && !newCfg.AutoMigrationEnabled {
ps.node.mq.removeIfPresent(ps.vm)
}
if oldCfg.ScalingEnabled && !newCfg.ScalingEnabled {
cpuVerdict := makeResourceTransitioner(&ps.node.cpu, &ps.cpu).
handleAutoscalingDisabled()
memVerdict := makeResourceTransitioner(&ps.node.mem, &ps.mem).
handleAutoscalingDisabled()
ps.node.updateMetrics(e.metrics)
logger.Info(
"Disabled autoscaling for VM pod",
zap.Object("verdict", verdictSet{
cpu: cpuVerdict,
mem: memVerdict,
}),
)
}
}
func (e *AutoscaleEnforcer) handlePodStartMigration(logger *zap.Logger, podName, migrationName util.NamespacedName, source bool) {
logger = logger.With(
zap.String("action", "VM pod start migration"),
zap.Object("pod", podName),
zap.Object("virtualmachinemigration", migrationName),
)
logger.Info("Handling VM pod migration start")
e.state.lock.Lock()
defer e.state.lock.Unlock()
ps, ok := e.state.pods[podName]
if !ok {
logger.Error("Cannot find Pod in global pods map")
return
}
logger = logger.With(zap.String("node", ps.node.name))
if ps.vm == nil {
logger.Error("handlePodStartMigration called for non-VM Pod")
return
}
logger = logger.With(zap.Object("virtualmachine", ps.vm.Name))
// Reset buffer to zero, remove from migration queue (if in it), and set pod's migrationState
cpuVerdict := makeResourceTransitioner(&ps.node.cpu, &ps.cpu).
handleStartMigration(source)
memVerdict := makeResourceTransitioner(&ps.node.mem, &ps.mem).
handleStartMigration(source)
ps.node.mq.removeIfPresent(ps.vm)
ps.vm.MigrationState = &podMigrationState{Name: migrationName}
ps.node.updateMetrics(e.metrics)
logger.Info(
"Handled start of migration involving pod",
zap.Object("verdict", verdictSet{
cpu: cpuVerdict,
mem: memVerdict,
}),
)
}
func (e *AutoscaleEnforcer) handlePodEndMigration(logger *zap.Logger, podName, migrationName util.NamespacedName) {
logger = logger.With(
zap.String("action", "VM pod end migration"),
zap.Object("pod", podName),
zap.Object("virtualmachinemigration", migrationName),
)
logger.Info("Handling VM pod migration end")
e.state.lock.Lock()
defer e.state.lock.Unlock()
ps, ok := e.state.pods[podName]
if !ok {
logger.Error("Cannot find Pod in global pods map")
return
}
logger = logger.With(zap.String("node", ps.node.name))
if ps.vm == nil {
logger.Error("handlePodEndMigration called for non-VM Pod")
return
}
logger = logger.With(zap.Object("virtualmachine", ps.vm.Name))
ps.vm.MigrationState = nil
//nolint:gocritic // NOTE: not *currently* needed, but this should be kept here as a reminder, in case that changes.
// ps.node.updateMetrics(e.metrics)
logger.Info("Recorded end of migration for VM pod")
}
func (e *AutoscaleEnforcer) handleUpdatedScalingBounds(logger *zap.Logger, vm *api.VmInfo, unqualifiedPodName string) {
podName := util.NamespacedName{Namespace: vm.Namespace, Name: unqualifiedPodName}
logger = logger.With(
zap.String("action", "VM updated scaling bounds"),
zap.Object("pod", podName),
zap.Object("virtualmachine", vm.NamespacedName()),
)
logger.Info("Handling updated scaling bounds for VM")
e.state.lock.Lock()
defer e.state.lock.Unlock()
ps, ok := e.state.pods[podName]
if !ok {
logger.Error("Cannot find Pod in global pods map")
return
}
logger = logger.With(zap.String("node", ps.node.name))
if ps.vm == nil {
logger.Error("handleUpdatedScalingBounds called for non-VM Pod")
return
}
cpuVerdict := handleUpdatedLimits(&ps.node.cpu, &ps.cpu, vm.Cpu.Min, vm.Cpu.Max)
memVerdict := handleUpdatedLimits(&ps.node.mem, &ps.mem, vm.Min().Mem, vm.Max().Mem)
ps.node.updateMetrics(e.metrics)
logger.Info(
"Updated scaling bounds for VM pod",
zap.Object("verdict", verdictSet{
cpu: cpuVerdict,
mem: memVerdict,
}),
)
}
func (e *AutoscaleEnforcer) handleNonAutoscalingUsageChange(logger *zap.Logger, vm *api.VmInfo, unqualifiedPodName string) {
e.state.lock.Lock()
defer e.state.lock.Unlock()
podName := util.NamespacedName{Namespace: vm.Namespace, Name: unqualifiedPodName}
logger = logger.With(
zap.String("action", "non-autoscaling VM usage change"),
zap.Object("pod", podName),
zap.Object("virtualmachine", vm.NamespacedName()),
)
ps, ok := e.state.pods[podName]
if !ok {
logger.Error("Cannot find Pod in global pods map")
return
}
cpuVerdict := makeResourceTransitioner(&ps.node.cpu, &ps.cpu).
handleNonAutoscalingUsageChange(vm.Using().VCPU)
memVerdict := makeResourceTransitioner(&ps.node.mem, &ps.mem).
handleNonAutoscalingUsageChange(vm.Using().Mem)
ps.node.updateMetrics(e.metrics)
logger.Info(
"Updated non-autoscaling VM usage",
zap.Object("verdict", verdictSet{
cpu: cpuVerdict,
mem: memVerdict,
}),
)
}
// NB: expected to be run in its own thread.
func (e *AutoscaleEnforcer) cleanupMigration(logger *zap.Logger, vmm *vmapi.VirtualMachineMigration) {
vmmName := util.GetNamespacedName(vmm)
logger = logger.With(
// note: use the "virtualmachinemigration" key here for just the name, because it mirrors
// what we log in startMigration.
zap.Object("virtualmachinemigration", vmmName),
// also include the VM, for better association.
zap.Object("virtualmachine", util.NamespacedName{
Name: vmm.Spec.VmName,
Namespace: vmm.Namespace,
}),
)
// Failed migrations should be noisy. Everything to do with cleaning up a failed migration
// should be logged at "Warn" or higher.
var logInfo func(string, ...zap.Field)
if vmm.Status.Phase == vmapi.VmmSucceeded {
logInfo = logger.Info
} else {
logInfo = logger.Warn
}
logInfo(
"Going to delete VirtualMachineMigration",
// Explicitly include "phase" here because we have metrics for it.
zap.String("phase", string(vmm.Status.Phase)),
// ... and then log the rest of the information about the migration:
zap.Any("spec", vmm.Spec),
zap.Any("status", vmm.Status),
)
// mark the operation as ongoing
func() {
e.state.lock.Lock()
defer e.state.lock.Unlock()
newCount := e.state.ongoingMigrationDeletions[vmmName] + 1
if newCount != 1 {
// context included by logger
logger.Error(
"More than one ongoing deletion for VirtualMachineMigration",
zap.Int("count", newCount),
)
}
e.state.ongoingMigrationDeletions[vmmName] = newCount
}()
// ... and remember to clean up when we're done:
defer func() {
e.state.lock.Lock()
defer e.state.lock.Unlock()
newCount := e.state.ongoingMigrationDeletions[vmmName] - 1
if newCount == 0 {
delete(e.state.ongoingMigrationDeletions, vmmName)
} else {
// context included by logger
logger.Error(
"More than one ongoing deletion for VirtualMachineMigration",
zap.Int("count", newCount),
)
e.state.ongoingMigrationDeletions[vmmName] = newCount
}
}()
// Continually retry the operation, until we're successful (or the VM doesn't exist anymore)
retryWait := time.Second * time.Duration(e.state.conf.MigrationDeletionRetrySeconds)
for {
logInfo("Attempting to delete VirtualMachineMigration")
err := e.vmClient.NeonvmV1().
VirtualMachineMigrations(vmmName.Namespace).
Delete(context.TODO(), vmmName.Name, metav1.DeleteOptions{})
if err == nil /* NB! This condition is inverted! */ {
logInfo("Successfully deleted VirtualMachineMigration")
e.metrics.migrationDeletions.WithLabelValues(string(vmm.Status.Phase)).Inc()
return
} else if apierrors.IsNotFound(err) {
logger.Warn("Deletion was handled for us; VirtualMachineMigration no longer exists")
return
}
logger.Error(
"Failed to delete VirtualMachineMigration, will try again after delay",
zap.Duration("delay", retryWait),
zap.Error(err),
)
e.metrics.migrationDeleteFails.WithLabelValues(string(vmm.Status.Phase)).Inc()
// retry after a delay
time.Sleep(retryWait)
continue
}
}
func (s *vmPodState) isBetterMigrationTarget(other *vmPodState) bool {
// TODO: this deprioritizes VMs whose metrics we can't collect. Maybe we don't want that?
if s.Metrics == nil || other.Metrics == nil {
return s.Metrics != nil && other.Metrics == nil
}
// TODO - this is just a first-pass approximation. Maybe it's ok for now? Maybe it's not. Idk.
return s.Metrics.LoadAverage1Min < other.Metrics.LoadAverage1Min
}
// this method can only be called while holding a lock. It will be released temporarily while we
// send requests to the API server
//
// A lock will ALWAYS be held on return from this function.
func (e *AutoscaleEnforcer) startMigration(ctx context.Context, logger *zap.Logger, pod *podState) (created bool, _ error) {
if pod.vm.currentlyMigrating() {
return false, fmt.Errorf("Pod is already migrating")
}
// Unlock to make the API request(s), then make sure we're locked on return.
e.state.lock.Unlock()
defer e.state.lock.Lock()
vmmName := util.NamespacedName{
Name: fmt.Sprintf("schedplugin-%s", pod.vm.Name.Name),
Namespace: pod.name.Namespace,
}
logger = logger.With(zap.Object("virtualmachinemigration", vmmName))
logger.Info("Starting VirtualMachineMigration for VM")
// Check that the migration doesn't already exist. If it does, then there's no need to recreate
// it.
//
// We technically don't *need* this additional request here (because we can check the return
// from the Create request with apierrors.IsAlreadyExists). However: the benefit we get from
// this is that the logs are significantly clearer.
_, err := e.vmClient.NeonvmV1().
VirtualMachineMigrations(pod.name.Namespace).
Get(ctx, vmmName.Name, metav1.GetOptions{})
if err == nil {
logger.Warn("VirtualMachineMigration already exists, nothing to do")
return false, nil
} else if !apierrors.IsNotFound(err) {
// We're *expecting* to get IsNotFound = true; if err != nil and isn't NotFound, then
// there's some unexpected error.
logger.Error("Unexpected error doing Get request to check if migration already exists", zap.Error(err))
return false, fmt.Errorf("Error checking if migration exists: %w", err)
}
gitVersion := util.GetBuildInfo().GitInfo
// FIXME: make this not depend on GetBuildInfo() internals.
if gitVersion == "<unknown>" {
gitVersion = "unknown"
}
vmm := &vmapi.VirtualMachineMigration{
ObjectMeta: metav1.ObjectMeta{
// TODO: it's maybe possible for this to run into name length limits? Unclear what we
// should do if that happens.
Name: vmmName.Name,
Namespace: pod.name.Namespace,
Labels: map[string]string{
// NB: There are requirements on what constitutes a valid label value. Thankfully, the
// output of `git describe` always satisfies them.
//
// See also:
// https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#syntax-and-character-set
LabelPluginCreatedMigration: gitVersion,
},
},
Spec: vmapi.VirtualMachineMigrationSpec{
VmName: pod.vm.Name.Name,
// FIXME: NeonVM's VirtualMachineMigrationSpec has a bunch of boolean fields that aren't
// pointers, which means we need to explicitly set them when using the Go API.
PreventMigrationToSameHost: true,
CompletionTimeout: 3600,
Incremental: true,
AutoConverge: true,
MaxBandwidth: resource.MustParse("1Gi"),
AllowPostCopy: false,
},
}
logger.Info("Migration doesn't already exist, creating one for VM", zap.Any("spec", vmm.Spec))
_, err = e.vmClient.NeonvmV1().VirtualMachineMigrations(pod.name.Namespace).Create(ctx, vmm, metav1.CreateOptions{})
if err != nil {
e.metrics.migrationCreateFails.Inc()
// log here, while the logger's fields are in scope
logger.Error("Unexpected error doing Create request for new migration", zap.Error(err))
return false, fmt.Errorf("Error creating migration: %w", err)
}
e.metrics.migrationCreations.Inc()
logger.Info("VM migration request successful")
return true, nil
}
package plugin
// this file primarily contains the type resourceTransitioner[T], for handling a number of
// operations on resources, and pretty-formatting summaries of the operations. There are also
// other, standalone helpers that perform similar functionality.
//
// resourceTransitioners are created with the makeResourceTransitioner function.
//
// Handling requested resources from the autoscaler-agent is done with the handleRequested method,
// and changes from VM deletion are handled by handleDeleted.
import (
"errors"
"fmt"
"go.uber.org/zap/zapcore"
"golang.org/x/exp/constraints"
"github.com/neondatabase/autoscaling/pkg/util"
)
// resourceTransitioner maintains the current state of its resource and handles the transition
// into a new state. A resource is associated with a pod, and the pod is associated with a node.
type resourceTransitioner[T constraints.Unsigned] struct {
// node represents the current resource state of the node
node *nodeResourceState[T]
// pod represents the current resource state of the pod.
// pod belongs to the node.
pod *podResourceState[T]
}
func makeResourceTransitioner[T constraints.Unsigned](
node *nodeResourceState[T], pod *podResourceState[T],
) resourceTransitioner[T] {
return resourceTransitioner[T]{
node: node,
pod: pod,
}
}
// resourceState represents a resource state in its pod and its node. This is not necessarily the
// current state. It represents the resource state at a point in time.
type resourceState[T constraints.Unsigned] struct {
node nodeResourceState[T]
pod podResourceState[T]
}
// snapshotState snapshots the current state of the resource transitioner by making a copy of
// its state.
func (r resourceTransitioner[T]) snapshotState() resourceState[T] {
return resourceState[T]{*r.node, *r.pod}
}
// verdictSet represents a set of verdicts from some operation, for ease of logging
type verdictSet struct {
cpu string
mem string
}
// MarshalLogObject implements zapcore.ObjectMarshaler
func (s verdictSet) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddString("cpu", s.cpu)
enc.AddString("mem", s.mem)
return nil
}
// handleReserve adds the resources from the pod to the node, reporting if the node was over-budget
//
// Unlike handleRequested, this method should be called to add a NEW pod to the node.
//
// This is used in combination with Xact to speculatively *try* reserving a pod, and then revert if
// it would result in being over-budget.
func (r resourceTransitioner[T]) handleReserve() (overbudget bool, verdict string) {
callback := func(oldState, newState resourceState[T]) string {
if oldState.pod.Buffer != 0 {
return fmt.Sprintf(
"node reserved %v [buffer %v] + %v [buffer %v] -> %v [buffer %v] of total %v",
// node reserved %v [buffer %v] + %v [buffer %v] ->
oldState.node.Reserved, oldState.node.Buffer, newState.pod.Reserved, newState.pod.Buffer,
// -> %v [buffer %v] of total %v
newState.node.Reserved, newState.node.Buffer, oldState.node.Total,
)
} else {
return fmt.Sprintf(
"node reserved %v + %v -> %v of total %v",
oldState.node.Reserved, newState.pod.Reserved, newState.node.Reserved, oldState.node.Total,
)
}
}
callbackUnexpected := func(message string) verdictCallback[T] {
return func(_, _ resourceState[T]) string {
panic(errors.New(message))
}
}
// Currently, the caller provides the requested value via the Pod's Reserved field.
// In order to convert this to work with handleRequestedGeneric, we need to explicitly represent
// the increase from zero to pod.Reserved, so we do that by setting the Pod's value to zero and
// passing in the requested amount separately.
requested := r.pod.Reserved
r.pod.Reserved = 0
verdict = r.handleRequestedGeneric(
requested,
requestedOptions[T]{
// by setting factor and forceApprovalMinimum to the requested amount, we force that
// handleRequestedGeneric MUST reserve exactly that amount.
// Then, we leave it up to the caller to accept/reject by returning whether the node was
// overbudget, at the very end.
factor: requested,
forceApprovalMinimum: requested,
// only used for migrations
convertIncreaseIntoPressure: false,
// Yes, add buffer, because this is for reserving a pod for the first time. If the pod
// was already known, it's the caller's responsibility to set buffer appropriately.
addBuffer: true,
callbackNoChange: callback,
callbackDecreaseAutoApproved: callbackUnexpected("got 'decrease approved' from logic to reserve new pod"),
callbackIncreaseTurnedToPressure: callback,
callbackIncreaseRejected: callbackUnexpected("got 'increase rejected' from logic to reserve new pod, but it is infallible"),
callbackIncreasePartiallyApproved: callbackUnexpected("got 'partially approved' from logic to reserve new pod, but it is infallible"),
callbackIncreaseFullyApproved: callback,
},
)
overbudget = r.node.Reserved > r.node.Total
return overbudget, verdict
}
// handleRequested updates r.pod and r.node with changes to match the requested resources, within
// what's possible given the remaining resources.
//
// Any permitted increases are required to be a multiple of factor.
//
// Unlike handleReserve, this method should be called to update the resources for a preexisting pod
// on the node.
//
// A pretty-formatted summary of the outcome is returned as the verdict, for logging.
func (r resourceTransitioner[T]) handleRequested(
requested T,
lastPermit *T,
startingMigration bool,
factor T,
) (verdict string) {
normalVerdictCallback := func(oldState, newState resourceState[T]) string {
fmtString := "Register %d%s -> %d%s (pressure %d -> %d); " +
"node reserved %d%s -> %d%s (of %d), " +
"node capacityPressure %d -> %d (%d -> %d spoken for)"
var oldPodBuffer string
var oldNodeBuffer string
var newNodeBuffer string
if oldState.pod.Buffer != 0 {
oldPodBuffer = fmt.Sprintf(" [buffer %d]", oldState.pod.Buffer)
oldNodeBuffer = fmt.Sprintf(" [buffer %d]", oldState.node.Buffer)
newNodeBuffer = fmt.Sprintf(" [buffer %d]", newState.node.Buffer)
}
var wanted string
if newState.pod.Reserved != requested {
wanted = fmt.Sprintf(" (wanted %d)", requested)
}
return fmt.Sprintf(
fmtString,
// Register %d%s -> %d%s (pressure %d -> %d)
oldState.pod.Reserved, oldPodBuffer, newState.pod.Reserved, wanted, oldState.pod.CapacityPressure, newState.pod.CapacityPressure,
// node reserved %d%s -> %d%s (of %d)
oldState.node.Reserved, oldNodeBuffer, newState.node.Reserved, newNodeBuffer, oldState.node.Total,
// node capacityPressure %d -> %d (%d -> %d spoken for)
oldState.node.CapacityPressure, newState.node.CapacityPressure, oldState.node.PressureAccountedFor, newState.node.PressureAccountedFor,
)
}
migrationVerdictCallback := func(oldState, newState resourceState[T]) string {
fmtString := "Denying increase %d -> %d because the pod is starting migration; " +
"node capacityPressure %d -> %d (%d -> %d spoken for)"
return fmt.Sprintf(
fmtString,
// Denying increase %d -> %d because ...
oldState.pod.Reserved, requested,
// node capacityPressure %d -> %d (%d -> %d spoken for)
oldState.node.CapacityPressure, newState.node.CapacityPressure, oldState.node.PressureAccountedFor, newState.node.PressureAccountedFor,
)
}
var forceApprovalMinimum T
if lastPermit != nil {
forceApprovalMinimum = *lastPermit
}
return r.handleRequestedGeneric(
requested,
requestedOptions[T]{
factor: factor,
forceApprovalMinimum: forceApprovalMinimum,
// Can't increase during migrations.
//
// But we _will_ add the pod's request to the node's pressure, noting that its migration
// will resolve it.
convertIncreaseIntoPressure: startingMigration,
// don't add buffer to the node; autoscaler-agent requests should reset it.
addBuffer: false,
callbackNoChange: normalVerdictCallback,
callbackDecreaseAutoApproved: normalVerdictCallback,
callbackIncreaseTurnedToPressure: migrationVerdictCallback,
callbackIncreaseRejected: normalVerdictCallback,
callbackIncreasePartiallyApproved: normalVerdictCallback,
callbackIncreaseFullyApproved: normalVerdictCallback,
},
)
}
type requestedOptions[T constraints.Unsigned] struct {
// factor provides a multiple binding the result of any increases from handleRequestedGeneric()
//
// For handling autoscaler-agent requests, this is the value of a compute unit's worth of that
// resource (e.g. 0.25 CPU or 1 GiB memory).
// For initially reserving a Pod, factor is set equal to the total additional resources, which
// turns handleRequestedGeneric() into a binary function that either grants the entire request,
// or none of it.
factor T
// forceApprovalMinimum sets the threshold above which handleRequestedGeneric() is allowed to
// reject the request - i.e. if the request is less than or equal to forceApprovalMinimum, it
// must be approved.
//
// This is typically set to a non-zero value when reserving resources for a Pod that has already
// been scheduled (so there's nothing we can do about it), or when handling an autoscaler-agent
// request that provides what a previous scheduler approved (via lastPermit).
forceApprovalMinimum T
// convertIncreaseIntoPressure causes handleRequestedGeneric() to reject any requested increases
// in reserved resources, and instead add the amount of the increase to the CapacityPressure of
// the Pod and Node.
convertIncreaseIntoPressure bool
// addBuffer causes handleRequestedGeneric() to additionally add the pod's Buffer field to the
// node, under the assumption that the Buffer is completely new.
//
// Note that if addBuffer is true, buffer will be added *even if the reservation is rejected*.
addBuffer bool
callbackNoChange verdictCallback[T]
callbackDecreaseAutoApproved verdictCallback[T]
callbackIncreaseTurnedToPressure verdictCallback[T]
callbackIncreaseRejected verdictCallback[T]
callbackIncreasePartiallyApproved verdictCallback[T]
callbackIncreaseFullyApproved verdictCallback[T]
}
type verdictCallback[T constraints.Unsigned] func(oldState, newState resourceState[T]) string
func (r resourceTransitioner[T]) handleRequestedGeneric(
requested T,
opts requestedOptions[T],
) (verdict string) {
oldState := r.snapshotState()
var verdictGenerator verdictCallback[T]
if requested <= r.pod.Reserved {
// Decrease "requests" are actually just notifications it's already happened
r.node.Reserved -= r.pod.Reserved - requested
r.pod.Reserved = requested
// pressure is now zero, because the pod no longer wants to increase resources.
r.pod.CapacityPressure = 0
r.node.CapacityPressure -= oldState.pod.CapacityPressure
if requested == oldState.pod.Reserved {
verdictGenerator = opts.callbackNoChange
} else /* requested < oldState.pod.Reserved */ {
verdictGenerator = opts.callbackDecreaseAutoApproved
}
} else if opts.convertIncreaseIntoPressure /* implied: requested > pod.Reserved */ {
r.pod.CapacityPressure = requested - r.pod.Reserved
r.node.CapacityPressure = r.node.CapacityPressure + r.pod.CapacityPressure - oldState.pod.CapacityPressure
verdictGenerator = opts.callbackIncreaseTurnedToPressure
} else /* implied: requested > pod.Reserved && !opts.convertIncreaseIntoPressure */ {
// The following comment was made 2022-11-28 (updated 2023-04-06, 2024-05-DD): (TODO: set date)
//
// Note: this function as currently written will actively cause the autoscaler-agent to use
// resources that are uneven w.r.t. the number of compute units they represent.
//
// For example, we might have a request to go from 3CPU/3Gi -> 4CPU/4Gi but we only allow
// 4CPU/3Gi, which would instead be 4 compute units of CPU but 3 compute units of memory.
// When the autoscaler-agent receives the permit, it naively carries it out, giving itself a
// resource allocation that isn't a multiple of compute units.
//
// This obviously isn't great. However, this *is* the most resilient solution, and it is
// significantly simpler to implement, so it is the one I went with. As it currently stands,
// the autoscaler-agent is still expected to submit requests that are multiples of compute
// units, so the system should *eventually* stabilize (provided that the autoscaler-agent is
// not violating its own guarantees). This allows us to gracefully handle many kinds of
// stressors. Handling the resources separately *from the scheduler's point of view* makes
// it much, much easier to deal with.
//
// Please think carefully before changing this.
// note: it's entirely possible to have Reserved > Total, under a variety of
// undesirable-but-impossible-to-prevent circumstances.
remainingReservable := util.SaturatingSub(r.node.Total, r.node.Reserved)
increase := requested - r.pod.Reserved
// Increases are bounded by what's left in the node, rounded down to the nearest multiple of
// the factor.
maxIncrease := (remainingReservable / opts.factor) * opts.factor
// ... but we must allow at least opts.forceApprovalMinimum
increaseFromForceApproval := util.SaturatingSub(opts.forceApprovalMinimum, r.pod.Reserved)
maxIncrease = util.Max(maxIncrease, increaseFromForceApproval)
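// For example (illustrative numbers): with remainingReservable = 900m, factor = 250m, and a
// requested increase of 1000m, maxIncrease starts at 750m; if a previous scheduler had already
// approved enough that increaseFromForceApproval = 1000m, maxIncrease is raised to 1000m and
// the full increase is granted below.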
if increase > maxIncrease /* increases are bound by what's left in the node */ {
r.pod.CapacityPressure = increase - maxIncrease
// adjust node pressure accordingly. We can have old < new or old > new, so we shouldn't
// directly += or -= (implicitly relying on overflow).
r.node.CapacityPressure = r.node.CapacityPressure - oldState.pod.CapacityPressure + r.pod.CapacityPressure
increase = maxIncrease // cap at maxIncrease.
if increase == 0 {
// None of the increase could be granted, so treat the request as rejected.
verdictGenerator = opts.callbackIncreaseRejected
} else {
verdictGenerator = opts.callbackIncreasePartiallyApproved
}
} else {
// If we're not capped by maxIncrease, relieve pressure coming from this pod
r.node.CapacityPressure -= r.pod.CapacityPressure
r.pod.CapacityPressure = 0
verdictGenerator = opts.callbackIncreaseFullyApproved
}
r.pod.Reserved += increase
r.node.Reserved += increase
}
if r.pod.Buffer != 0 {
if opts.addBuffer {
r.node.Buffer += r.pod.Buffer
} else /* !opts.addBuffer - buffer is only needed until the first request, so we can reset it */ {
r.node.Buffer -= r.pod.Buffer
r.pod.Buffer = 0
}
}
newState := r.snapshotState()
return verdictGenerator(oldState, newState)
}
// handleDeleted updates r.node with changes to match the removal of r.pod
//
// A pretty-formatted summary of the changes is returned as the verdict, for logging.
func (r resourceTransitioner[T]) handleDeleted(currentlyMigrating bool) (verdict string) {
oldState := r.snapshotState()
r.node.Reserved -= r.pod.Reserved
r.node.CapacityPressure -= r.pod.CapacityPressure
if currentlyMigrating {
r.node.PressureAccountedFor -= r.pod.Reserved + r.pod.CapacityPressure
}
var podBuffer string
var oldNodeBuffer string
var newNodeBuffer string
if r.pod.Buffer != 0 {
r.node.Buffer -= r.pod.Buffer
podBuffer = fmt.Sprintf(" [buffer %d]", r.pod.Buffer)
oldNodeBuffer = fmt.Sprintf(" [buffer %d]", oldState.node.Buffer)
newNodeBuffer = fmt.Sprintf(" [buffer %d]", r.node.Buffer)
}
fmtString := "pod had %d%s; node reserved %d%s -> %d%s, " +
"node capacityPressure %d -> %d (%d -> %d spoken for)"
verdict = fmt.Sprintf(
fmtString,
// pod had %d%s; node reserved %d%s -> %d%s
r.pod.Reserved, podBuffer, oldState.node.Reserved, oldNodeBuffer, r.node.Reserved, newNodeBuffer,
// node capacityPressure %d -> %d (%d -> %d spoken for)
oldState.node.CapacityPressure, r.node.CapacityPressure, oldState.node.PressureAccountedFor, r.node.PressureAccountedFor,
)
return verdict
}
func (r resourceTransitioner[T]) handleNonAutoscalingUsageChange(newUsage T) (verdict string) {
oldState := r.snapshotState()
diff := newUsage - r.pod.Reserved
r.pod.Reserved = newUsage
r.node.Reserved += diff
verdict = fmt.Sprintf(
"pod reserved (%v -> %v), node reserved (%v -> %v)",
oldState.pod.Reserved, r.pod.Reserved, oldState.node.Reserved, r.node.Reserved,
)
return verdict
}
// handleAutoscalingDisabled updates r.node with changes to clear any buffer and capacityPressure
// from r.pod
//
// A pretty-formatted summary of the changes is returned as the verdict, for logging.
func (r resourceTransitioner[T]) handleAutoscalingDisabled() (verdict string) {
oldState := r.snapshotState()
// buffer is included in reserved, so we reduce everything by buffer.
buffer := r.pod.Buffer
valuesToReduce := []*T{&r.node.Reserved, &r.node.Buffer, &r.pod.Reserved, &r.pod.Buffer}
for _, v := range valuesToReduce {
*v -= buffer
}
r.node.CapacityPressure -= r.pod.CapacityPressure
r.pod.CapacityPressure = 0
var nodeBufferChange string
if oldState.pod.Buffer != 0 {
nodeBufferChange = fmt.Sprintf(" [buffer %d -> %d]", oldState.node.Buffer, r.node.Buffer)
}
fmtString := "pod had buffer %d, capacityPressure %d; " +
"node reserved %d -> %d%s, capacityPressure %d -> %d"
verdict = fmt.Sprintf(
fmtString,
// pod had buffer %d, capacityPressure %d;
oldState.pod.Buffer, oldState.pod.CapacityPressure,
// node reserved %d -> %d%s, capacityPressure %d -> %d
oldState.node.Reserved, r.node.Reserved, nodeBufferChange, oldState.node.CapacityPressure, r.node.CapacityPressure,
)
return verdict
}
// handleStartMigration updates r.node with changes to clear any buffer and capacityPressure from
// r.pod.
//
// If the pod is the migration source, this method *also* increases the node's PressureAccountedFor
// to match the pod's resource usage.
//
//nolint:unparam // linter complains about 'source'. FIXME: needs more work to figure this out.
func (r resourceTransitioner[T]) handleStartMigration(source bool) (verdict string) {
// This method is basically the same as handleAutoscalingDisabled, except we also update the
// node's PressureAccountedFor because any pressure generated by the pod will be resolved once
// the migration completes and the pod gets deleted.
oldState := r.snapshotState()
buffer := r.pod.Buffer
valuesToReduce := []*T{&r.node.Reserved, &r.node.Buffer, &r.pod.Reserved, &r.pod.Buffer}
for _, v := range valuesToReduce {
*v -= buffer
}
r.node.CapacityPressure -= r.pod.CapacityPressure
r.pod.CapacityPressure = 0
r.node.PressureAccountedFor += r.pod.Reserved
fmtString := "pod had buffer %d, capacityPressure %d; " +
"node reserved %d -> %d, capacityPressure %d -> %d, pressureAccountedFor %d -> %d"
verdict = fmt.Sprintf(
fmtString,
// pod had buffer %d, capacityPressure %d;
oldState.pod.Buffer, oldState.pod.CapacityPressure,
// node reserved %d -> %d, capacityPressure %d -> %d, pressureAccountedFor %d -> %d
oldState.node.Reserved, r.node.Reserved, oldState.node.CapacityPressure, r.node.CapacityPressure, oldState.node.PressureAccountedFor, r.node.PressureAccountedFor,
)
return verdict
}
func handleUpdatedLimits[T constraints.Unsigned](
node *nodeResourceState[T],
pod *podResourceState[T],
newMin T,
newMax T,
) (verdict string) {
if newMin == pod.Min && newMax == pod.Max {
return fmt.Sprintf("limits unchanged (min = %d, max = %d)", newMin, newMax)
}
// If the maximum bound has changed, then we should update {node,pod}.Buffer based on it so that we
// can make a best-effort attempt to avoid overcommitting. This solution can't be perfect
// (because we're intentionally not using the "hard" limits provided by NeonVM, which would be
// overly conservative).
// However. This solution should be *good enough* - the cases it protects against are already
// exceptionally rare, and the imperfections even more so.
//
// To be clear, the cases we're worried about are things like the following sequence of events:
//
// 1. VM is at 4 CPU (of max 4)
// 2. Scheduler dies, autoscaler-agent loses contact
// 3. autoscaler-agent downscales to 2 CPU
// 4. VM Cpu.Max gets set to 2 (autoscaler-agent misses this)
// 5. Scheduler appears, observes Cpu.Max = 2
// 6. VM Cpu.Max gets set to 4
// 7. autoscaler-agent observes Cpu.Max is still 4
// 8. autoscaler-agent scales VM up to 4 CPU, which it is able to do because a previous
// scheduler approved 4 CPU.
// <-- INCONSISTENT STATE -->
// 9. autoscaler-agent reconnects with scheduler, informing it that it's using 4 CPU
//
// Again: we can't handle this perfectly with the current system. However, a good best-effort
// attempt to prevent this is worthwhile here. (realistically, the things we can't prevent would
// require a "perfect storm" of other failures in order to be relevant - which is good!)
bufferVerdict := ""
updateBuffer := pod.Max != newMax
if updateBuffer {
oldPodBuffer := pod.Buffer
oldNodeBuffer := node.Buffer
oldPodReserved := pod.Reserved
oldNodeReserved := node.Reserved
// Recalculate Reserved and Buffer from scratch because it's easier than doing the math
// directly.
//
// Note that we don't want to reserve *below* what we think the VM is using if the bounds
// decrease; it may be that the autoscaler-agent has not yet reacted to that.
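//
// For example (illustrative numbers): if the pod currently has Reserved = 4 and Buffer = 1 (so
// we assume it's using 3) and newMax drops to 2, we keep Reserved = 3 and set Buffer = 0 rather
// than reserving below the assumed usage.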
using := pod.Reserved - pod.Buffer
pod.Reserved = util.Max(newMax, using)
pod.Buffer = pod.Reserved - using
node.Reserved = node.Reserved + pod.Reserved - oldPodReserved
node.Buffer = node.Buffer + pod.Buffer - oldPodBuffer
bufferVerdict = fmt.Sprintf(
". no contact yet: pod reserved %d -> %d (buffer %d -> %d), node reserved %d -> %d (buffer %d -> %d)",
oldPodReserved, pod.Reserved, oldPodBuffer, pod.Buffer,
oldNodeReserved, node.Reserved, oldNodeBuffer, node.Buffer,
)
}
oldMin := pod.Min
oldMax := pod.Max
pod.Min = newMin
pod.Max = newMax
return fmt.Sprintf("updated min %d -> %d, max %d -> %d%s", oldMin, newMin, oldMax, newMax, bufferVerdict)
}
package plugin
// Implementation of watching for Pod deletions and changes to a VM's scaling settings (either
// whether it's disabled, or the scaling bounds themselves).
import (
"context"
"reflect"
"time"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/watch"
)
type nodeWatchCallbacks struct {
submitNodeDeletion func(*zap.Logger, string)
}
// watchNodeEvents watches for any deleted Nodes, so that we can clean up the resources that were
// associated with them.
func (e *AutoscaleEnforcer) watchNodeEvents(
ctx context.Context,
parentLogger *zap.Logger,
metrics watch.Metrics,
callbacks nodeWatchCallbacks,
) (*watch.Store[corev1.Node], error) {
logger := parentLogger.Named("node-watch")
return watch.Watch(
ctx,
logger.Named("watch"),
e.handle.ClientSet().CoreV1().Nodes(),
watch.Config{
ObjectNameLogField: "node",
Metrics: watch.MetricsConfig{
Metrics: metrics,
Instance: "Nodes",
},
// FIXME: make these configurable.
RetryRelistAfter: util.NewTimeRange(time.Second, 3, 5),
RetryWatchAfter: util.NewTimeRange(time.Second, 3, 5),
},
watch.Accessors[*corev1.NodeList, corev1.Node]{
Items: func(list *corev1.NodeList) []corev1.Node { return list.Items },
},
watch.InitModeSync, // Doesn't matter because AddFunc is nil and node store is only used for events.
metav1.ListOptions{},
watch.HandlerFuncs[*corev1.Node]{
DeleteFunc: func(node *corev1.Node, mayBeStale bool) {
logger.Info("Received delete event for node", zap.String("node", node.Name))
callbacks.submitNodeDeletion(logger, node.Name)
},
},
)
}
type podWatchCallbacks struct {
submitStarted func(_ *zap.Logger, _ *corev1.Pod, preexisting bool)
submitDeletion func(*zap.Logger, util.NamespacedName)
submitStartMigration func(_ *zap.Logger, podName, migrationName util.NamespacedName, source bool)
submitEndMigration func(_ *zap.Logger, podName, migrationName util.NamespacedName)
}
// watchPodEvents continuously tracks a handful of Pod-related events that we care about. These
// events are pod start, deletion, or completion for VM and non-VM pods, plus the start and end of
// any VM migration that a pod is part of.
//
// This method starts its own goroutine, and guarantees that we have started listening for FUTURE
// events once it returns (unless it returns an error).
//
// Events occurring before this method is called will not be sent.
func (e *AutoscaleEnforcer) watchPodEvents(
ctx context.Context,
parentLogger *zap.Logger,
metrics watch.Metrics,
callbacks podWatchCallbacks,
) (*watch.Store[corev1.Pod], error) {
logger := parentLogger.Named("pod-watch")
return watch.Watch(
ctx,
logger.Named("watch"),
e.handle.ClientSet().CoreV1().Pods(corev1.NamespaceAll),
watch.Config{
ObjectNameLogField: "pod",
Metrics: watch.MetricsConfig{
Metrics: metrics,
Instance: "Pods",
},
// We want to be up-to-date in tracking deletions, so that our reservations are correct.
//
// FIXME: make these configurable.
RetryRelistAfter: util.NewTimeRange(time.Millisecond, 250, 750),
RetryWatchAfter: util.NewTimeRange(time.Millisecond, 250, 750),
},
watch.Accessors[*corev1.PodList, corev1.Pod]{
Items: func(list *corev1.PodList) []corev1.Pod { return list.Items },
},
watch.InitModeSync, // required so that events are queued before watchPodEvents() returns
metav1.ListOptions{},
watch.HandlerFuncs[*corev1.Pod]{
AddFunc: func(pod *corev1.Pod, preexisting bool) {
name := util.GetNamespacedName(pod)
if e.state.conf.ignoredNamespace(pod.Namespace) {
logger.Info("Received add event for ignored Pod", zap.Object("pod", name))
return
}
// Generate events for pods when they become running
if pod.Status.Phase == corev1.PodRunning {
if !preexisting {
// Generally pods shouldn't be immediately running, so we log this as a
// warning. If it was preexisting, then it'll be handled on the initial
// cluster read already (but we generate the events anyways so that we
// definitely don't miss anything).
logger.Warn("Received add event for new Pod already running", zap.Object("pod", name))
}
callbacks.submitStarted(logger, pod, preexisting)
}
},
UpdateFunc: func(oldPod *corev1.Pod, newPod *corev1.Pod) {
name := util.GetNamespacedName(newPod)
if e.state.conf.ignoredNamespace(newPod.Namespace) {
logger.Info("Received update event for ignored Pod", zap.Object("pod", name))
return
}
// Check if a pod is now running.
if oldPod.Status.Phase == corev1.PodPending && newPod.Status.Phase == corev1.PodRunning {
logger.Info("Received update event for Pod now running", zap.Object("pod", name))
callbacks.submitStarted(logger, newPod, false)
}
// Check if pod is "completed" - handle that the same as deletion.
if !util.PodCompleted(oldPod) && util.PodCompleted(newPod) {
logger.Info("Received update event for completion of Pod", zap.Object("pod", name))
callbacks.submitDeletion(logger, name)
return // no other handling worthwhile if the pod's done.
}
// Check if the pod is part of a new migration, or if a migration it *was* part of
// has now ended.
oldMigration := util.TryPodOwnerVirtualMachineMigration(oldPod)
newMigration := util.TryPodOwnerVirtualMachineMigration(newPod)
if oldMigration == nil && newMigration != nil {
isSource := util.TryPodOwnerVirtualMachine(newPod) == nil
callbacks.submitStartMigration(logger, name, *newMigration, isSource)
} else if oldMigration != nil && newMigration == nil {
callbacks.submitEndMigration(logger, name, *oldMigration)
}
},
DeleteFunc: func(pod *corev1.Pod, mayBeStale bool) {
name := util.GetNamespacedName(pod)
if e.state.conf.ignoredNamespace(pod.Namespace) {
logger.Info("Received delete event for ignored Pod", zap.Object("pod", name))
return
}
if util.PodCompleted(pod) {
logger.Info("Received delete event for completed Pod", zap.Object("pod", name))
} else {
logger.Info("Received delete event for Pod", zap.Object("pod", name))
callbacks.submitDeletion(logger, name)
}
},
},
)
}
type vmWatchCallbacks struct {
submitConfigUpdated func(_ *zap.Logger, podName util.NamespacedName, newCfg api.VmConfig)
submitBoundsChanged func(_ *zap.Logger, _ *api.VmInfo, podName string)
submitNonAutoscalingVmUsageChanged func(_ *zap.Logger, _ *api.VmInfo, podName string)
}
// watchVMEvents watches for changes in VMs: signaling when scaling becomes disabled and updating
// stored information when scaling bounds change.
//
// The reason we care about when scaling is disabled is that if we don't, we can run into the
// following race condition:
//
// 1. VM created with autoscaling enabled
// 2. Scheduler restarts and reads the state of the cluster. It records the difference between the
// VM's current and maximum usage as "buffer"
// 3. Before the autoscaler-agent runner for the VM connects to the scheduler, the VM's label to
// enable autoscaling is removed, and the autoscaler-agent's runner exits.
// 4. final state: The scheduler retains buffer for a VM that can't scale.
//
// To avoid (4) occurring, we track events where autoscaling is disabled for a VM and remove its
// "buffer" when that happens. There are still some other possibilities for race conditions (FIXME),
// but those are a little harder to handle - in particular:
//
// 1. Scheduler exits
// 2. autoscaler-agent runner downscales
// 3. Scheduler starts, reads cluster state
// 4. VM gets autoscaling disabled
// 5. Scheduler removes the VM's buffer
// 6. Before noticing that event, the autoscaler-agent upscales the VM and informs the scheduler of
// its current allocation (which it can do, because it was approved by a previous scheduler).
// 7. The scheduler denies what it sees as upscaling.
//
// This one requires a very unlikely sequence of events to occur, which should be appropriately
// handled by cancelled contexts in *almost all* cases.
func (e *AutoscaleEnforcer) watchVMEvents(
ctx context.Context,
parentLogger *zap.Logger,
metrics watch.Metrics,
callbacks vmWatchCallbacks,
podIndex watch.IndexedStore[corev1.Pod, *watch.NameIndex[corev1.Pod]],
) (*watch.Store[vmapi.VirtualMachine], error) {
logger := parentLogger.Named("vm-watch")
return watch.Watch(
ctx,
logger.Named("watch"),
e.vmClient.NeonvmV1().VirtualMachines(corev1.NamespaceAll),
watch.Config{
ObjectNameLogField: "virtualmachine",
Metrics: watch.MetricsConfig{
Metrics: metrics,
Instance: "VirtualMachines",
},
// FIXME: make these durations configurable.
RetryRelistAfter: util.NewTimeRange(time.Millisecond, 250, 750),
RetryWatchAfter: util.NewTimeRange(time.Millisecond, 250, 750),
},
watch.Accessors[*vmapi.VirtualMachineList, vmapi.VirtualMachine]{
Items: func(list *vmapi.VirtualMachineList) []vmapi.VirtualMachine { return list.Items },
},
watch.InitModeSync, // Doesn't matter because AddFunc is nil, and vmStore is only used for events.
metav1.ListOptions{},
watch.HandlerFuncs[*vmapi.VirtualMachine]{
UpdateFunc: func(oldVM, newVM *vmapi.VirtualMachine) {
if e.state.conf.ignoredNamespace(newVM.Namespace) {
logger.Info("Received update event for ignored VM", util.VMNameFields(newVM))
return
}
newInfo, err := api.ExtractVmInfo(logger, newVM)
if err != nil {
// Try to get the runner pod associated with the VM, if we can, but don't worry
// about it if we can't.
var runnerPod runtime.Object
if podName := newVM.Status.PodName; podName != "" {
// NB: index.Get returns nil if not found, so we only have a non-nil
// runnerPod if it's currently known.
rp, _ := podIndex.GetIndexed(func(index *watch.NameIndex[corev1.Pod]) (*corev1.Pod, bool) {
return index.Get(newVM.Namespace, podName)
})
// avoid typed nils by only assigning if non-nil
// See <https://github.com/neondatabase/autoscaling/issues/689> for more.
if rp != nil {
runnerPod = rp
}
}
logger.Error("Failed to extract VM info in update for new VM", util.VMNameFields(newVM), zap.Error(err))
e.handle.EventRecorder().Eventf(
newVM, // regarding
runnerPod, // related
"Warning", // eventtype
"ExtractVmInfo", // reason
"HandleVmUpdate", // action
"Failed to extract autoscaling info about VM: %s", // note
err,
)
return
}
oldInfo, err := api.ExtractVmInfo(logger, oldVM)
if err != nil {
logger.Error("Failed to extract VM info in update for old VM", util.VMNameFields(oldVM), zap.Error(err))
return
}
if newVM.Status.PodName == "" {
logger.Info("Skipping update for VM because .status.podName is empty", util.VMNameFields(newVM))
return
}
if !reflect.DeepEqual(oldInfo.Config, newInfo.Config) {
logger.Info("Received config update for VM", util.VMNameFields(newVM))
name := util.NamespacedName{Namespace: newInfo.Namespace, Name: newVM.Status.PodName}
callbacks.submitConfigUpdated(logger, name, newInfo.Config)
}
if (!oldInfo.Config.ScalingEnabled || !newInfo.Config.ScalingEnabled) && oldInfo.Using() != newInfo.Using() {
podName := util.NamespacedName{Namespace: newInfo.Namespace, Name: newVM.Status.PodName}
logger.Info("Received update changing usage for VM", zap.Object("old", oldInfo.Using()), zap.Object("new", newInfo.Using()))
callbacks.submitNonAutoscalingVmUsageChanged(logger, newInfo, podName.Name)
}
// If the pod changed, then we're going to handle a deletion event for the old pod,
// plus creation event for the new pod. Don't worry about it - because all VM
// information comes from this watch.Store anyways, there's no possibility of missing
// an update.
if oldVM.Status.PodName != newVM.Status.PodName {
return
}
// If bounds didn't change, then no need to update
if oldInfo.EqualScalingBounds(*newInfo) {
return
}
callbacks.submitBoundsChanged(logger, newInfo, newVM.Status.PodName)
},
},
)
}
type migrationWatchCallbacks struct {
submitMigrationFinished func(*vmapi.VirtualMachineMigration)
}
// watchMigrationEvents *only* looks at migrations that were created by the scheduler plugin (or a
// previous version of it).
//
// We use this to trigger cleaning up migrations once they're finished, because they don't
// auto-delete, and our deterministic naming means that we won't be able to create a new
// migration for the same VM until the old one is gone.
//
// Tracking whether a migration was created by the scheduler plugin is done by adding the label
// 'autoscaling.neon.tech/created-by-scheduler' to every migration we create.
func (e *AutoscaleEnforcer) watchMigrationEvents(
ctx context.Context,
parentLogger *zap.Logger,
metrics watch.Metrics,
callbacks migrationWatchCallbacks,
) (*watch.Store[vmapi.VirtualMachineMigration], error) {
logger := parentLogger.Named("vmm-watch")
return watch.Watch(
ctx,
logger.Named("watch"),
e.vmClient.NeonvmV1().VirtualMachineMigrations(corev1.NamespaceAll),
watch.Config{
ObjectNameLogField: "virtualmachinemigration",
Metrics: watch.MetricsConfig{
Metrics: metrics,
Instance: "VirtualMachineMigrations",
},
// FIXME: make these durations configurable.
RetryRelistAfter: util.NewTimeRange(time.Second, 3, 5),
RetryWatchAfter: util.NewTimeRange(time.Second, 3, 5),
},
watch.Accessors[*vmapi.VirtualMachineMigrationList, vmapi.VirtualMachineMigration]{
Items: func(list *vmapi.VirtualMachineMigrationList) []vmapi.VirtualMachineMigration { return list.Items },
},
watch.InitModeSync,
metav1.ListOptions{
// NB: Including just the label itself means that we select for objects that *have* the
// label, without caring about the actual value.
//
// See also:
// https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#set-based-requirement
LabelSelector: LabelPluginCreatedMigration,
},
watch.HandlerFuncs[*vmapi.VirtualMachineMigration]{
UpdateFunc: func(oldObj, newObj *vmapi.VirtualMachineMigration) {
if e.state.conf.ignoredNamespace(newObj.Namespace) {
logger.Info(
"Received update event for ignored VM Migration",
zap.Object("virtualmachinemigration", util.GetNamespacedName(newObj)),
)
return
}
shouldDelete := newObj.Status.Phase != oldObj.Status.Phase &&
(newObj.Status.Phase == vmapi.VmmSucceeded || newObj.Status.Phase == vmapi.VmmFailed)
if shouldDelete {
callbacks.submitMigrationFinished(newObj)
}
},
},
)
}
package util
// Helper arithmetic methods
import (
"golang.org/x/exp/constraints"
)
// SaturatingSub returns x - y if x >= y, otherwise zero
func SaturatingSub[T constraints.Unsigned](x, y T) T {
if x >= y {
return x - y
} else {
var zero T
return zero
}
}
// Max returns the maximum of the two values
func Max[T constraints.Ordered](x, y T) T {
if x > y {
return x
} else {
return y
}
}
// Min returns the minimum of the two values
func Min[T constraints.Ordered](x, y T) T {
if x < y {
return x
} else {
return y
}
}
// AbsDiff returns the absolute value of the difference between x and y
func AbsDiff[T constraints.Unsigned](x, y T) T {
if x > y {
return x - y
} else {
return y - x
}
}
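// exampleUnsignedHelpersUsage is an illustrative usage sketch (not part of the original source):
// for unsigned types, plain subtraction wraps around on underflow, which is exactly what
// SaturatingSub and AbsDiff avoid. The values here are made up for the example.
func exampleUnsignedHelpersUsage() {
	var x, y uint64 = 3, 5
	_ = x - y               // wraps around to 18446744073709551614
	_ = SaturatingSub(x, y) // 0
	_ = AbsDiff(x, y)       // 2
}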
// AtomicInt represents the shared interface provided by various atomic.<NAME> integers
//
// This interface type is primarily used by AtomicMax.
type AtomicInt[I any] interface {
Add(delta I) (new I) //nolint:predeclared // same var names as methods
CompareAndSwap(old, new I) (swapped bool) //nolint:predeclared // same var names as methods
Load() I
Store(val I)
Swap(new I) (old I) //nolint:predeclared // same var names as methods
}
// AtomicMax atomically sets a to the maximum of *a and i, returning the old value at a.
//
// On ISAs without atomic maximum/minimum instructions, a fallback is typically implemented as the
// Load + CompareAndSwap loop that this function uses. At time of writing (Go 1.20), the Go standard
// library does not include atomic maximum/minimum functions.
//
// This function is lock-free but not wait-free.
func AtomicMax[A AtomicInt[I], I constraints.Integer](a A, i I) I {
for {
current := a.Load()
if current >= i {
return current
}
if a.CompareAndSwap(current, i) {
return current
}
}
}
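// exampleAtomicMaxUsage is an illustrative usage sketch (not part of the original source):
// several goroutines race to record a high-water mark, and AtomicMax keeps the largest value
// without needing a mutex. Assumes "sync" and "sync/atomic" are imported.
func exampleAtomicMaxUsage() uint64 {
	var highWater atomic.Uint64
	var wg sync.WaitGroup
	for _, v := range []uint64{3, 9, 4} {
		wg.Add(1)
		go func(v uint64) {
			defer wg.Done()
			AtomicMax(&highWater, v) // atomically: highWater = max(highWater, v)
		}(v)
	}
	wg.Wait()
	return highWater.Load() // 9
}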
package util
// A channel-based sync.Cond-like interface, with support for broadcast operations (but some
// additional restrictions). Refer to the documentation of Wait for detailed usage.
import (
"sync"
)
func NewBroadcaster() *Broadcaster {
return &Broadcaster{
mu: sync.Mutex{},
ch: make(chan struct{}),
sent: 0,
}
}
type Broadcaster struct {
mu sync.Mutex
ch chan struct{}
sent uint64
}
type BroadcastReceiver struct {
b *Broadcaster
viewed uint64
}
// Broadcast sends a signal to all receivers
func (b *Broadcaster) Broadcast() {
b.mu.Lock()
defer b.mu.Unlock()
close(b.ch)
b.ch = make(chan struct{})
b.sent += 1
}
// NewReceiver creates a new BroadcastReceiver that will receive only future broadcasted events.
//
// It's generally not recommended to call (*BroadcastReceiver).Wait() on a single BroadcastReceiver
// from more than one thread at a time, although it *is* thread-safe.
func (b *Broadcaster) NewReceiver() BroadcastReceiver {
b.mu.Lock()
defer b.mu.Unlock()
return BroadcastReceiver{
b: b,
viewed: b.sent,
}
}
var closedChannel = func() <-chan struct{} {
ch := make(chan struct{})
close(ch)
return ch
}()
// Wait returns a channel that will be closed once there has been an event broadcasted since
// the BroadcastReceiver was created, or the last call to Awake().
//
// Typical usage of Wait will involve selecting on the channel returned and calling Awake
// immediately in the branch handling the event, for example:
//
// select {
// case <-ctx.Done():
// return
// case <-receiver.Wait():
// receiver.Awake()
// ...
// }
func (r *BroadcastReceiver) Wait() <-chan struct{} {
r.b.mu.Lock()
defer r.b.mu.Unlock()
if r.b.sent == r.viewed {
return r.b.ch
} else {
return closedChannel
}
}
// Awake marks the most recent broadcast event as received, so that the next call to Wait returns a
// channel that will only be closed once there's been a new event after this call to Awake.
func (r *BroadcastReceiver) Awake() {
r.b.mu.Lock()
defer r.b.mu.Unlock()
r.viewed = r.b.sent
}
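// exampleBroadcasterUsage is an illustrative usage sketch (not part of the original source) of
// the intended Broadcast/Wait/Awake pattern: a worker re-checks some shared state every time an
// event is broadcast, without missing events that happen while it's busy. Assumes "context" is
// imported; the recheck callback is a stand-in for whatever work the caller needs to redo.
func exampleBroadcasterUsage(ctx context.Context, b *Broadcaster, recheck func()) {
	recv := b.NewReceiver() // only observes broadcasts after this point
	for {
		select {
		case <-ctx.Done():
			return
		case <-recv.Wait():
			recv.Awake() // mark the event as seen before handling it
			recheck()
		}
	}
}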
package util
// This file primarily exposes the GetBuildInfo function
import (
"runtime/debug"
)
// BuildGitInfo stores some pretty-formatted information about the repository and working tree at
// build time. It's set by the GIT_INFO argument in the Dockerfiles and set to the output of:
//
// git describe --long --dirty
//
// While public, this value is not expected to be used externally. You should use GetBuildInfo
// instead.
var BuildGitInfo string
// BuildInfo stores a little bit of information about the build of the current binary
//
// All strings are guaranteed to be non-empty.
type BuildInfo struct {
GitInfo string `json:"gitInfo"`
GoVersion string `json:"goVersion"`
}
// GetBuildInfo makes a best-effort attempt to return some information about how the currently
// running binary was built
func GetBuildInfo() BuildInfo {
goVersion := "<unknown>"
if buildInfo, ok := debug.ReadBuildInfo(); ok {
if buildInfo.GoVersion != "" {
goVersion = buildInfo.GoVersion
}
}
// FIXME: the "<unknown>" string is depended upon by the plugin's VirtualMachineMigration
// creation process. We should expose something better here.
gitInfo := BuildGitInfo
if BuildGitInfo == "" {
gitInfo = "<unknown>"
}
return BuildInfo{
GitInfo: gitInfo,
GoVersion: goVersion,
}
}
package util
// Implementation of a channel-based mutex, so that it can be combined with Context.Done and other
// select-able methods, without dealing with the hassle of creating separate goroutines
import (
"context"
"fmt"
"time"
)
// ChanMutex is a select-able mutex
//
// It is fair if and only if receiving on a channel is fair. As of Go 1.19/2022-01-17, receiving on
// a channel appears to be fair. However: this is a runtime implementation detail, and so it may
// change without notice in the future.
//
// Unlike sync.Mutex, ChanMutex requires initialization before use because it's basically just a
// channel.
//
// Also unlike sync.Mutex, a ChanMutex may be copied without issue (again, because it's just a
// channel).
type ChanMutex struct {
ch chan struct{}
}
// NewChanMutex creates a new ChanMutex
func NewChanMutex() ChanMutex {
ch := make(chan struct{}, 1)
ch <- struct{}{}
return ChanMutex{ch}
}
// Lock locks m
//
// This method is semantically equivalent to sync.Mutex.Lock
func (m *ChanMutex) Lock() {
if m.ch == nil {
panic("called Lock on uninitialized ChanMutex")
}
<-m.ch
}
// WaitLock is like Lock, but instead returns a channel
//
// If receiving on the channel succeeds, the caller "holds" the lock and must now be responsible for
// Unlock-ing it.
func (m *ChanMutex) WaitLock() <-chan struct{} {
if m.ch == nil {
panic("called WaitLock on uninitialized ChanMutex")
}
return m.ch
}
// TryLock blocks until locking m succeeds or the context is cancelled
//
// If the context is cancelled while waiting to lock m, the lock will be left unchanged and
// ctx.Err() will be returned.
func (m *ChanMutex) TryLock(ctx context.Context) error {
if m.ch == nil {
panic("called TryLock on uninitialized ChanMutex")
}
select {
case <-m.ch:
return nil
case <-ctx.Done():
return ctx.Err()
}
}
// Unlock unlocks m
//
// This method is semantically equivalent to sync.Mutex.Unlock
func (m *ChanMutex) Unlock() {
select {
case m.ch <- struct{}{}:
default:
panic("ChanMutex.Unlock called while already unlocked")
}
}
// DeadlockChecker creates a function that, when called, periodically attempts to acquire the lock,
// panicking if it fails
//
// The returned function exits when the context is done.
func (m *ChanMutex) DeadlockChecker(timeout, delay time.Duration) func(ctx context.Context) {
return func(ctx context.Context) {
for {
// Delay between checks
select {
case <-ctx.Done():
return
case <-time.After(delay):
}
select {
case <-ctx.Done():
return
case <-m.WaitLock():
m.Unlock()
case <-time.After(timeout):
panic(fmt.Errorf("likely deadlock detected, could not get lock after %s", timeout))
}
}
}
}
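// exampleChanMutexUsage is an illustrative usage sketch (not part of the original source):
// because the lock is exposed as a channel, acquiring it can be combined with context
// cancellation in a single select, which a sync.Mutex cannot do.
func exampleChanMutexUsage(ctx context.Context, mu *ChanMutex, critical func()) error {
	select {
	case <-ctx.Done():
		return ctx.Err() // gave up waiting; the lock is left unchanged
	case <-mu.WaitLock():
		defer mu.Unlock() // receiving from WaitLock means we now hold the lock
		critical()
		return nil
	}
}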
package util
// Utilities for errors
import (
"errors"
)
// RootError returns the root cause of the error, calling errors.Unwrap until it returns nil
func RootError(err error) error {
for {
next := errors.Unwrap(err)
if next == nil {
return err
}
err = next
}
}
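// exampleRootErrorUsage is an illustrative usage sketch (not part of the original source):
// RootError walks through fmt.Errorf("...: %w", ...) chains back to the innermost error.
// The error messages here are made up for the example; "fmt" is assumed to be imported.
func exampleRootErrorUsage() bool {
	base := errors.New("connection refused")
	wrapped := fmt.Errorf("scrape failed: %w", fmt.Errorf("http request failed: %w", base))
	return RootError(wrapped) == base // true
}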
package util
// Wrapper file for the AddHandler function
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"go.uber.org/zap"
)
// AddHandler is a helper function to wrap the handle function with JSON [de]serialization and check
// that the HTTP method is correct
//
// The provided logPrefix is prepended to every log line emitted by the wrapped handler function, to
// offer distinction where that's useful.
func AddHandler[T any, R any](
logger *zap.Logger,
mux *http.ServeMux,
endpoint string,
method string,
reqTypeName string,
handle func(context.Context, *zap.Logger, *T) (_ *R, statusCode int, _ error),
) {
errBadMethod := []byte("request method must be " + method)
logger = logger.With(zap.String("endpoint", endpoint))
hlogger := logger.Named("http")
mux.HandleFunc(endpoint, func(w http.ResponseWriter, r *http.Request) {
if r.Method != method {
w.WriteHeader(http.StatusMethodNotAllowed)
_, _ = w.Write(errBadMethod)
return
}
defer r.Body.Close()
var req T
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
hlogger.Error("Failed to read request body as JSON", zap.String("type", reqTypeName), zap.Error(err))
w.WriteHeader(400)
_, _ = w.Write([]byte("bad JSON"))
return
}
hlogger.Info(
"Received request",
zap.String("endpoint", endpoint),
zap.String("client", r.RemoteAddr),
zap.Any("request", req),
)
resp, status, err := handle(r.Context(), logger.With(zap.Any("request", req)), &req)
if err == nil && status != http.StatusOK {
err = errors.New("HTTP handler error: status != 200 OK, but no error message")
status = 500
}
var respBody []byte
var respBodyFormatted zap.Field
var logFunc func(string, ...zap.Field)
if err != nil {
if 500 <= status && status < 600 {
logFunc = hlogger.Error
} else if 400 <= status && status < 500 {
logFunc = hlogger.Warn
} else /* unexpected status */ {
err = fmt.Errorf("HTTP handler error: invalid status %d for error response: %w", status, err)
logFunc = hlogger.Error
}
respBodyFormatted = zap.NamedError("response", err)
respBody = []byte(err.Error())
} else {
if status == 0 {
hlogger.Warn("non-error response with status = 0")
}
respBodyFormatted = zap.Any("response", resp)
respBody, err = json.Marshal(resp)
if err != nil {
hlogger.Error("Failed to encode JSON response", respBodyFormatted)
w.WriteHeader(500)
_, _ = w.Write([]byte("Error encoding JSON response"))
return
}
logFunc = hlogger.Info
}
logFunc(
"Responding to request",
zap.String("endpoint", endpoint), zap.Int("status", status), respBodyFormatted,
)
w.WriteHeader(status)
_, _ = w.Write(respBody)
})
}
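// exampleAddHandlerUsage is an illustrative usage sketch (not part of the original source). The
// endpoint and the request/response types below are hypothetical; they only show the shape of a
// handler that AddHandler expects, with the [de]serialization and method check handled for us.
type exampleEchoRequest struct {
	Message string `json:"message"`
}
type exampleEchoResponse struct {
	Message string `json:"message"`
}
func exampleAddHandlerUsage(logger *zap.Logger, mux *http.ServeMux) {
	AddHandler(
		logger, mux, "/echo", http.MethodPost, "exampleEchoRequest",
		func(ctx context.Context, logger *zap.Logger, req *exampleEchoRequest) (*exampleEchoResponse, int, error) {
			return &exampleEchoResponse{Message: req.Message}, http.StatusOK, nil
		},
	)
}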
package util
// Kubernetes-specific utility functions
import (
"strings"
corev1 "k8s.io/api/core/v1"
)
// PodReady returns true iff the pod is marked as ready (as determined by the pod's
// Status.Conditions)
func PodReady(pod *corev1.Pod) bool {
for _, c := range pod.Status.Conditions {
if c.Type == corev1.PodReady {
return c.Status == corev1.ConditionTrue
}
}
return false
}
// PodCompleted returns true iff all of the Pod's containers have stopped and will not be restarted
func PodCompleted(pod *corev1.Pod) bool {
return pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed
}
// PodStartedBefore returns true iff Pod p started before Pod q
func PodStartedBefore(p, q *corev1.Pod) bool {
return p.Status.StartTime.Before(q.Status.StartTime)
}
func azForTerm(term corev1.NodeSelectorTerm) string {
for _, expr := range term.MatchExpressions {
isAZ := expr.Key == "topology.kubernetes.io/zone" &&
expr.Operator == corev1.NodeSelectorOpIn &&
len(expr.Values) == 1
if isAZ {
return expr.Values[0]
}
}
return ""
}
// PodPreferredAZIfPresent returns the desired availability zone of the Pod, if it has one
func PodPreferredAZIfPresent(pod *corev1.Pod) string {
if pod.Spec.Affinity == nil || pod.Spec.Affinity.NodeAffinity == nil {
return ""
}
affinity := pod.Spec.Affinity.NodeAffinity
// First, check required affinities for AZ:
if affinity.RequiredDuringSchedulingIgnoredDuringExecution != nil {
for _, term := range affinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms {
if az := azForTerm(term); az != "" {
return az
}
}
}
// Then, check preferred:
for _, term := range affinity.PreferredDuringSchedulingIgnoredDuringExecution {
if az := azForTerm(term.Preference); az != "" {
return az
}
}
// no AZ present
return ""
}
// TryPodOwnerVirtualMachine returns the name of the VirtualMachine that owns the pod, if there is
// one that does. Otherwise returns nil.
func TryPodOwnerVirtualMachine(pod *corev1.Pod) *NamespacedName {
for _, ref := range pod.OwnerReferences {
// For NeonVM, *at time of writing*, the OwnerReference has an APIVersion of
// "vm.neon.tech/v1". But:
//
// 1. It's good to be extra-safe around possible name collisions for the
// "VirtualMachineMigration" name, even though *practically* it's not going to happen;
// 2. We can disambiguate with the APIVersion; and
// 3. We don't want to match on a fixed version, in case we want to change the version
// number later.
//
// So, given that the format is "<NAME>/<VERSION>", we can just match on the "<NAME>/" part
// of the APIVersion to have the safety we want with the flexibility we need.
if strings.HasPrefix(ref.APIVersion, "vm.neon.tech/") && ref.Kind == "VirtualMachine" {
// note: OwnerReferences are not permitted to have a different namespace than the owned
// object, so because VirtualMachines are namespaced, the owner must have the same
// namespace as the Pod.
return &NamespacedName{Namespace: pod.Namespace, Name: ref.Name}
}
}
return nil
}
// TryPodOwnerVirtualMachineMigration returns the name of the VirtualMachineMigration that owns the
// pod, if there is one. Otherwise returns nil.
func TryPodOwnerVirtualMachineMigration(pod *corev1.Pod) *NamespacedName {
for _, ref := range pod.OwnerReferences {
if strings.HasPrefix(ref.APIVersion, "vm.neon.tech/") && ref.Kind == "VirtualMachineMigration" {
return &NamespacedName{Namespace: pod.Namespace, Name: ref.Name}
}
}
return nil
}
package util
import (
"context"
"errors"
"fmt"
"net"
"net/http"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.uber.org/zap"
)
func RegisterMetric[P prometheus.Collector](reg prometheus.Registerer, collector P) P {
reg.MustRegister(collector)
return collector
}
// Prometheus metrics server common to >1 component
// StartPrometheusMetricsServer starts the Prometheus metrics server in a background thread. It
// returns an error if binding on the port fails.
func StartPrometheusMetricsServer(ctx context.Context, logger *zap.Logger, port uint16, reg *prometheus.Registry) error {
// Separate binding from serving, so that we can catch any error in this thread, rather than the
// server's.
listener, err := net.ListenTCP("tcp", &net.TCPAddr{IP: net.IPv4zero, Port: int(port)})
if err != nil {
return fmt.Errorf("Error listening on TCP port %d: %w", port, err)
}
shutdownCtx, shutdown := context.WithCancel(ctx)
mux := http.NewServeMux()
mux.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{Registry: reg}))
baseContext := context.Background()
srv := &http.Server{Handler: mux, BaseContext: func(net.Listener) context.Context { return baseContext }}
go func() {
<-shutdownCtx.Done()
if err := srv.Shutdown(context.Background()); err != nil {
logger.Error("Error shutting down prometheus server", zap.Error(err))
}
}()
go func() {
// shutdown the shutdown watcher if we exit before it
defer shutdown()
if err := srv.Serve(listener); !errors.Is(err, http.ErrServerClosed) {
logger.Error("Prometheus server exited with unexpected error", zap.Error(err))
}
}()
return nil
}
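// exampleMetricsSetup is an illustrative usage sketch (not part of the original source) of
// combining RegisterMetric with StartPrometheusMetricsServer. The metric name and port are made
// up for the example.
func exampleMetricsSetup(ctx context.Context, logger *zap.Logger) (prometheus.Counter, error) {
	reg := prometheus.NewRegistry()
	requests := RegisterMetric(reg, prometheus.NewCounter(prometheus.CounterOpts{
		Name: "example_requests_total",
		Help: "Total number of example requests handled",
	}))
	// Serves /metrics on :9100 in the background; shuts down when ctx is done.
	if err := StartPrometheusMetricsServer(ctx, logger, 9100, reg); err != nil {
		return nil, err
	}
	return requests, nil
}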
package util
// same as k8s.io/apimachinery/pkg/types/namespacedname.go, but with JSON (de)serialization
import (
"fmt"
"go.uber.org/zap/zapcore"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
const Separator = '/'
// NamespacedName represents a resource name with the namespace it's in.
//
// When printed with '%v', NamespacedName is rendered as "<namespace>/<name>". Printing with
// '%+v' or '%#v' renders as it would normally.
type NamespacedName struct {
Namespace string `json:"namespace"`
Name string `json:"name"`
}
func GetNamespacedName(obj metav1.ObjectMetaAccessor) NamespacedName {
meta := obj.GetObjectMeta()
return NamespacedName{Namespace: meta.GetNamespace(), Name: meta.GetName()}
}
func (n NamespacedName) Format(state fmt.State, verb rune) {
switch {
case verb == 'v' && state.Flag('+'):
// Show fields, e.g. `{Namespace:foo Name:bar}`
_, _ = state.Write([]byte(string("{Namespace:")))
_, _ = state.Write([]byte(n.Namespace))
_, _ = state.Write([]byte(string(" Name:")))
_, _ = state.Write([]byte(n.Name))
_, _ = state.Write([]byte{'}'})
case verb == 'v' && state.Flag('#'):
// Go syntax representation, e.g. `util.NamespacedName{Namespace:"foo", Name:"bar"}`
_, _ = state.Write([]byte(fmt.Sprintf("util.NamespacedName{Namespace:%q, Name:%q}", n.Namespace, n.Name)))
default:
// Pretty-printed representation, e.g. `foo/bar`
_, _ = state.Write([]byte(n.Namespace))
_, _ = state.Write([]byte(string(Separator)))
_, _ = state.Write([]byte(n.Name))
}
}
// MarshalLogObject implements zapcore.ObjectMarshaler, so that NamespacedName can be used with zap.Object
func (n NamespacedName) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddString("namespace", n.Namespace)
enc.AddString("name", n.Name)
return nil
}
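// exampleNamespacedNameFormatting is an illustrative usage sketch (not part of the original
// source) of the three formatting behaviours implemented by Format above. Assumes "fmt" is
// imported; the names are made up for the example.
func exampleNamespacedNameFormatting() {
	n := NamespacedName{Namespace: "default", Name: "example-vm"}
	_ = fmt.Sprintf("%v", n)  // "default/example-vm"
	_ = fmt.Sprintf("%+v", n) // "{Namespace:default Name:example-vm}"
	_ = fmt.Sprintf("%#v", n) // `util.NamespacedName{Namespace:"default", Name:"example-vm"}`
}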
package util
import (
"net/http"
"net/http/pprof"
"time"
)
func MakePPROF(addr string) *http.Server {
mux := http.NewServeMux()
mux.HandleFunc("/debug/pprof/", pprof.Index)
mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
mux.HandleFunc("/debug/pprof/trace", pprof.Trace)
return &http.Server{
Addr: addr,
Handler: mux,
ReadHeaderTimeout: time.Second,
}
}
package util
import "time"
// RecentCounter is a struct that keeps track of recent timestamps within a given interval.
type RecentCounter struct {
interval time.Duration
timestamps []time.Time
}
func NewRecentCounter(interval time.Duration) *RecentCounter {
return &RecentCounter{
interval: interval,
timestamps: make([]time.Time, 0),
}
}
// cleanup removes all timestamps that are beyond the interval from the current time.
func (rc *RecentCounter) cleanup(now time.Time) {
checkpoint := now.Add(-rc.interval)
i := 0
for ; i < len(rc.timestamps); i++ {
if rc.timestamps[i].After(checkpoint) {
break
}
}
rc.timestamps = rc.timestamps[i:]
}
// inc is separated from its exported version to provide more flexibility around testing.
func (rc *RecentCounter) inc(now time.Time) {
rc.cleanup(now)
rc.timestamps = append(rc.timestamps, now)
}
// get is separated from its exported version to provide more flexibility around testing.
func (rc *RecentCounter) get(now time.Time) uint {
rc.cleanup(now)
return uint(len(rc.timestamps))
}
// Inc increments the counter and adds the current timestamp to the list of timestamps.
func (rc *RecentCounter) Inc() {
rc.inc(time.Now())
}
// Get returns the number of recent timestamps stored in the RecentCounter.
func (rc *RecentCounter) Get() uint {
return rc.get(time.Now())
}
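// exampleRecentCounterUsage is an illustrative usage sketch (not part of the original source):
// counting how many events happened within a sliding window, e.g. to cap retries. The one-minute
// window and the threshold of 5 are made up for the example.
func exampleRecentCounterUsage() bool {
	rc := NewRecentCounter(time.Minute)
	rc.Inc()            // record an event at the current time
	return rc.Get() < 5 // true while fewer than 5 events happened in the last minute
}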
package util
// Signalling primitives: single-signal sender/receiver pair and sync.Cond-ish exposed over a
// channel instead
import (
"sync"
)
func NewSingleSignalPair[T any]() (SignalSender[T], SignalReceiver[T]) {
sigCh := make(chan T, 1)
once := &sync.Once{}
closeSigCh := func() { once.Do(func() { close(sigCh) }) }
return SignalSender[T]{
send: func(data T) {
once.Do(func() {
sigCh <- data
close(sigCh)
})
},
}, SignalReceiver[T]{sigCh: sigCh, closeSigCh: closeSigCh}
}
type SignalSender[T any] struct {
send func(T)
}
type SignalReceiver[T any] struct {
sigCh chan T
closeSigCh func()
}
func (s SignalSender[T]) Send(data T) {
s.send(data)
}
func (s SignalReceiver[T]) Recv() <-chan T {
return s.sigCh
}
func (s SignalReceiver[T]) Close() {
s.closeSigCh()
}
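// exampleSignalPairUsage is an illustrative usage sketch (not part of the original source): the
// sender signals at most once, and the receiver can select on the signal alongside context
// cancellation. Assumes "context" is imported.
func exampleSignalPairUsage(ctx context.Context) (string, bool) {
	sender, receiver := NewSingleSignalPair[string]()
	defer receiver.Close() // if we return early, later Sends become no-ops
	go sender.Send("done") // only the first Send has any effect
	select {
	case <-ctx.Done():
		return "", false
	case msg := <-receiver.Recv():
		return msg, true
	}
}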
// NewCondChannelPair creates a sender/receiver pair for a sync.Cond-like interface
//
// The differences from sync.Cond are that receiving is exposed through a channel (so it can be
// select-ed) and there is no equivalent to (*Cond).Broadcast()
func NewCondChannelPair() (CondChannelSender, CondChannelReceiver) {
ch := make(chan struct{}, 1)
return CondChannelSender{ch: ch}, CondChannelReceiver{ch: ch}
}
// CondChannelSender is the sending half of a sync.Cond-like interface
type CondChannelSender struct {
ch chan struct{}
}
// CondChannelReceiver is the receiving half of a sync.Cond-like interface
type CondChannelReceiver struct {
ch chan struct{}
}
// Send performs a non-blocking notify of the associated CondChannelReceiver
//
// If there is currently a receiver waiting via Recv, then this will immediately wake them.
// Otherwise, the next receive on the channel returned by Recv will complete immediately.
func (c *CondChannelSender) Send() {
select {
case c.ch <- struct{}{}:
default:
}
}
// Unsend cancels an existing signal that has been sent but not yet received.
//
// It returns whether there was a signal to be cancelled.
func (c *CondChannelSender) Unsend() bool {
select {
case <-c.ch:
return true
default:
return false
}
}
// Consume removes any existing signal created by Send, requiring an additional Send to be made
// before the receiving on Recv will unblock
//
// This method is non-blocking.
func (c *CondChannelReceiver) Consume() {
select {
case <-c.ch:
default:
}
}
// Recv returns a channel for which receiving will complete either (a) immediately, if Send has been
// called without Consume or another receive since; or (b) as soon as Send is next called
//
// This method is non-blocking but receiving on the returned channel may block.
func (c *CondChannelReceiver) Recv() <-chan struct{} {
return c.ch
}
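// exampleCondChannelUsage is an illustrative usage sketch (not part of the original source): the
// sender half coalesces repeated notifications, and the receiver half re-runs some work each time
// it is woken, much like sync.Cond but usable inside a select. Assumes "context" is imported; the
// recheck callback is a stand-in for the caller's work.
func exampleCondChannelUsage(ctx context.Context, recheck func()) CondChannelSender {
	notify, wake := NewCondChannelPair()
	go func() {
		for {
			select {
			case <-ctx.Done():
				return
			case <-wake.Recv():
				// Multiple Send calls since the last wake-up collapse into one Recv.
				recheck()
			}
		}
	}()
	return notify // callers invoke notify.Send() whenever recheck should run again
}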
package stack
// Originally taken from https://github.com/sharnoff/chord
// TODO - want to have some kind of "N skipped" when (a) there's lots of frames and (b) many of
// those frames are duplicates
import (
"runtime"
"strconv"
"sync"
)
// StackTrace represents a collected stack trace, possibly with a parent (i.e. caller)
//
// StackTraces are designed to make it easy to track callers across goroutines. They are typically
// produced by [GetStackTrace]; refer to that function for more information.
type StackTrace struct {
// Frames provides the frames of this stack trace. Each frame's caller is at the index following
// it; the first frame is the direct caller.
Frames []StackFrame
// Parent, if not nil, provides the "parent" stack trace - typically the stack trace at the
// point this goroutine was spawned.
Parent *StackTrace
}
// Individual stack frame, contained in a [StackTrace], produced by [GetStackTrace].
type StackFrame struct {
// Function provides the name of the function being called, or the empty string if unknown.
Function string
// File gives the name of the file, or an empty string if the file is unknown.
File string
// Line gives the line number (starting from 1), or zero if the line number is unknown.
Line int
}
// GetStackTrace produces a StackTrace, optionally with a parent's stack trace to append.
//
// skip sets the number of initial calling stack frames to exclude. Setting skip to zero will
// produce a StackTrace where the first [StackFrame] represents the location where GetStackTrace was
// called.
func GetStackTrace(parent *StackTrace, skip uint) StackTrace {
frames := getFrames(skip + 1) // skip the additional frame introduced by GetStackTrace
return StackTrace{Frames: frames, Parent: parent}
}
// String produces a string representation of the stack trace, roughly similar to the default panic
// handler's.
//
// For some examples of formatting, refer to the StackTrace tests.
func (st StackTrace) String() string {
var buf []byte
for {
if len(st.Frames) == 0 {
buf = append(buf, "<empty stack>\n"...)
} else {
for _, f := range st.Frames {
var function, functionTail, file, fileLineSep, line string
if f.Function == "" {
function = "<unknown function>"
} else {
function = f.Function
functionTail = "(...)"
}
if f.File == "" {
file = "<unknown file>"
} else {
file = f.File
if f.Line != 0 {
fileLineSep = ":"
line = strconv.Itoa(f.Line)
}
}
buf = append(buf, function...)
buf = append(buf, functionTail...)
buf = append(buf, "\n\t"...)
buf = append(buf, file...)
buf = append(buf, fileLineSep...)
buf = append(buf, line...)
buf = append(buf, byte('\n'))
}
}
if st.Parent == nil {
break
}
st = *st.Parent
buf = append(buf, "called by "...)
continue
}
return string(buf)
}
var pcBufPool = sync.Pool{
New: func() any {
buf := make([]uintptr, 128)
return &buf
},
}
func putPCBuffer(buf *[]uintptr) {
if len(*buf) < 1024 {
pcBufPool.Put(buf)
}
}
func getFrames(skip uint) []StackFrame {
skip += 2 // skip the frame introduced by this function and runtime.Callers
pcBuf := pcBufPool.Get().(*[]uintptr)
defer putPCBuffer(pcBuf)
if len(*pcBuf) == 0 {
panic("internal error: len(*pcBuf) == 0")
}
// read program counters into the buffer, repeating until buffer is big enough.
//
// This is O(n log n), where n is the true number of program counters.
var pc []uintptr
for {
n := runtime.Callers(0, *pcBuf)
if n == 0 {
panic("runtime.Callers(0, ...) returned zero")
}
if n < len(*pcBuf) {
pc = (*pcBuf)[:n]
break
} else {
*pcBuf = make([]uintptr, 2*len(*pcBuf))
}
}
framesIter := runtime.CallersFrames(pc)
var frames []StackFrame
more := true
for more {
var frame runtime.Frame
frame, more = framesIter.Next()
if skip > 0 {
skip -= 1
continue
}
frames = append(frames, StackFrame{
Function: frame.Function,
File: frame.File,
Line: frame.Line,
})
}
return frames
}
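// exampleStackTraceUsage is an illustrative usage sketch (not part of the original source) of
// propagating a parent stack trace into a spawned goroutine, which is the main reason StackTrace
// has a Parent field.
func exampleStackTraceUsage(work func()) {
	parent := GetStackTrace(nil, 0) // trace of the spawning goroutine, starting here
	go func() {
		defer func() {
			if r := recover(); r != nil {
				// Combine this goroutine's trace with the spawner's for debugging.
				trace := GetStackTrace(&parent, 0)
				_ = trace.String()
			}
		}()
		work()
	}()
}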
// Originally taken from https://github.com/ptxmac/multierrgroup
// Package taskgroup provides a mix of multierr and errgroup
// See documentation for https://pkg.go.dev/go.uber.org/multierr and https://pkg.go.dev/golang.org/x/sync/errgroup
package taskgroup
import (
"context"
"fmt"
"sync"
"go.uber.org/multierr"
"go.uber.org/zap"
"github.com/neondatabase/autoscaling/pkg/util/stack"
)
// Group manages goroutines and collects all the errors.
// See https://pkg.go.dev/golang.org/x/sync/errgroup#group for more information
type Group interface {
Ctx() context.Context
Wait() error
Go(name string, f func(logger *zap.Logger) error)
}
type group struct {
cancel context.CancelFunc
ctx context.Context
logger *zap.Logger
panicHandler func(any)
wg sync.WaitGroup
errMutex sync.Mutex
err error
}
type GroupOption func(*group)
// WithParentContext sets the parent context for the group.
func WithParentContext(ctx context.Context) GroupOption {
return func(g *group) {
g.ctx, g.cancel = context.WithCancel(ctx)
}
}
// WithPanicHandler sets a panic handler for the group.
func WithPanicHandler(f func(any)) GroupOption {
return func(g *group) {
g.panicHandler = f
}
}
// NewGroup returns a new Group.
func NewGroup(logger *zap.Logger, opts ...GroupOption) Group {
g := &group{
cancel: nil, // Set separately by Ctx
ctx: nil, // Set separately by Ctx
panicHandler: nil, // Set separately by WithPanicHandler
logger: logger,
wg: sync.WaitGroup{},
errMutex: sync.Mutex{},
err: nil,
}
for _, opt := range opts {
opt(g)
}
if g.ctx == nil {
// If parent context is not set, use background context
WithParentContext(context.Background())(g)
}
return g
}
// Ctx returns the group's context, which is canceled once any task returns an error (or panics),
// or once Wait completes.
func (g *group) Ctx() context.Context {
return g.ctx
}
// Wait blocks until all goroutines have completed.
//
// All errors returned from the goroutines will be combined into one using multierr and returned from this method.
func (g *group) Wait() error {
g.wg.Wait()
if g.cancel != nil {
g.cancel()
}
return g.err
}
func (g *group) call(f func() error) (err error) {
defer func() {
if r := recover(); r != nil {
if g.panicHandler != nil {
g.panicHandler(r)
}
// Omit 1 frame - the f() call below
st := stack.GetStackTrace(nil, 1).String()
g.logger.Error("Task panicked", zap.Any("payload", r), zap.String("stack", st))
err = fmt.Errorf("panic: %v", r)
}
}()
err = f()
return err
}
// Go calls the function in a new goroutine.
// If a non-nil error is returned, the context is canceled and
// the error is collected using multierr and will be returned by Wait.
func (g *group) Go(name string, f func(logger *zap.Logger) error) {
g.wg.Add(1)
go func() {
defer g.wg.Done()
logger := g.logger.Named(name)
cb := func() error {
return f(logger)
}
if err := g.call(cb); err != nil {
err = fmt.Errorf("task %s failed: %w", name, err)
g.errMutex.Lock()
g.err = multierr.Append(g.err, err)
g.errMutex.Unlock()
logger.Error(err.Error())
if g.cancel != nil {
g.cancel()
}
}
}()
}
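// exampleTaskGroupUsage is an illustrative usage sketch (not part of the original source):
// running two named tasks, where any error (or panic, converted into an error) cancels the shared
// context and is collected into the combined error returned by Wait. The work callback is a
// stand-in for the caller's tasks; "context" and "go.uber.org/zap" are assumed to be imported.
func exampleTaskGroupUsage(ctx context.Context, logger *zap.Logger, work func(context.Context) error) error {
	g := NewGroup(logger, WithParentContext(ctx))
	for _, name := range []string{"worker-0", "worker-1"} {
		g.Go(name, func(logger *zap.Logger) error {
			// A non-nil error (or a panic, converted to an error) cancels g.Ctx() so that the
			// other task can stop early, and is included in the error from Wait.
			return work(g.Ctx())
		})
	}
	return g.Wait() // blocks until both tasks finish; returns the combined (multierr) error
}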
package util
import (
"errors"
"math/rand"
"time"
)
type TimeRange struct {
min int
max int
units time.Duration
}
func NewTimeRange(units time.Duration, min, max int) *TimeRange {
if min < 0 {
panic(errors.New("bad time range: min < 0"))
} else if min == 0 && max == 0 {
panic(errors.New("bad time range: min and max = 0"))
} else if max < min {
panic(errors.New("bad time range: max < min"))
}
return &TimeRange{min: min, max: max, units: units}
}
// Random returns a random time.Duration within the range
func (r TimeRange) Random() time.Duration {
if r.max == r.min {
return time.Duration(r.min) * r.units
}
count := rand.Intn(r.max-r.min) + r.min
return time.Duration(count) * r.units
}
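// exampleTimeRangeUsage is an illustrative usage sketch (not part of the original source): the
// retry backoff ranges used by the watch configs elsewhere in this codebase are built like this,
// i.e. "somewhere between 250ms and 750ms".
func exampleTimeRangeUsage() time.Duration {
	retryAfter := NewTimeRange(time.Millisecond, 250, 750)
	return retryAfter.Random() // a uniformly random duration in [250ms, 750ms)
}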
package util
// Helper for creating a zap.Field for a VM
import (
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
corev1 "k8s.io/api/core/v1"
vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
)
type nameFields struct {
virtualmachine NamespacedName
pod NamespacedName
}
// MarshalLogObject implements zapcore.ObjectMarshaler
func (f nameFields) MarshalLogObject(enc zapcore.ObjectEncoder) error {
if err := enc.AddObject("virtualmachine", f.virtualmachine); err != nil {
return err
}
if err := enc.AddObject("pod", f.pod); err != nil {
return err
}
return nil
}
func VMNameFields(vm *vmapi.VirtualMachine) zap.Field {
vmName := GetNamespacedName(vm)
// If the VM has a pod, log both the VM and the pod, otherwise just the VM.
if vm.Status.PodName == "" {
return zap.Object("virtualmachine", vmName)
} else {
podName := NamespacedName{Namespace: vm.Namespace, Name: vm.Status.PodName}
return zap.Inline(nameFields{
virtualmachine: vmName,
pod: podName,
})
}
}
func PodNameFields(pod *corev1.Pod) zap.Field {
podName := GetNamespacedName(pod)
if vmName, ok := pod.Labels[vmapi.VirtualMachineNameLabel]; ok {
vmName := NamespacedName{Namespace: pod.Namespace, Name: vmName}
return zap.Inline(nameFields{
virtualmachine: vmName,
pod: podName,
})
} else {
return zap.Object("pod", podName)
}
}
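// exampleNameFieldsUsage is an illustrative usage sketch (not part of the original source): both
// helpers produce a zap.Field, so they can be passed directly to a logging call.
func exampleNameFieldsUsage(logger *zap.Logger, vm *vmapi.VirtualMachine, pod *corev1.Pod) {
	// Logs {"virtualmachine": {...}} and, if the VM has a runner pod, {"pod": {...}} as well.
	logger.Info("Handling VM event", VMNameFields(vm))
	// Logs {"pod": {...}} and, if the pod belongs to a VM, {"virtualmachine": {...}} as well.
	logger.Info("Handling pod event", PodNameFields(pod))
}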