/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package eviction

import (
	"sort"
	"sync"
	"time"

	"github.com/golang/glog"

	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/client/record"
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
	"k8s.io/kubernetes/pkg/kubelet/qos"
	"k8s.io/kubernetes/pkg/kubelet/server/stats"
	"k8s.io/kubernetes/pkg/kubelet/util/format"
	"k8s.io/kubernetes/pkg/util/clock"
	"k8s.io/kubernetes/pkg/util/wait"
)

// managerImpl implements Manager.
type managerImpl struct {
	// used to track time
	clock clock.Clock
	// config is how the manager is configured
	config Config
	// the function to invoke to kill a pod
	killPodFunc KillPodFunc
	// the interface that knows how to do image gc
	imageGC ImageGC
	// protects access to internal state
	sync.RWMutex
	// node conditions are the set of conditions present
	nodeConditions []api.NodeConditionType
	// captures when a node condition was last observed based on a threshold being met
	nodeConditionsLastObservedAt nodeConditionsObservedAt
	// nodeRef is a reference to the node
	nodeRef *api.ObjectReference
	// used to record events about the node
	recorder record.EventRecorder
	// used to measure usage stats on system
	summaryProvider stats.SummaryProvider
	// records when a threshold was first observed
	thresholdsFirstObservedAt thresholdsObservedAt
	// records the set of thresholds that have been met (including grace period) but not yet resolved
	thresholdsMet []Threshold
	// resourceToRankFunc maps a resource to the ranking function for that resource.
	resourceToRankFunc map[api.ResourceName]rankFunc
	// resourceToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource.
	resourceToNodeReclaimFuncs map[api.ResourceName]nodeReclaimFuncs
}
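
// Locking discipline (descriptive note): synchronize is the only writer of the
// fields above and takes the full lock when it updates them, while Admit,
// IsUnderMemoryPressure, and IsUnderDiskPressure only read and therefore take
// the read lock. Readers observe the node conditions computed by the most
// recently completed synchronize pass.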

// ensure it implements the required interface
var _ Manager = &managerImpl{}

// NewManager returns a configured Manager and an associated admission handler to enforce eviction configuration.
func NewManager(
	summaryProvider stats.SummaryProvider,
	config Config,
	killPodFunc KillPodFunc,
	imageGC ImageGC,
	recorder record.EventRecorder,
	nodeRef *api.ObjectReference,
	clock clock.Clock) (Manager, lifecycle.PodAdmitHandler, error) {
	manager := &managerImpl{
		clock:           clock,
		killPodFunc:     killPodFunc,
		imageGC:         imageGC,
		config:          config,
		recorder:        recorder,
		summaryProvider: summaryProvider,
		nodeRef:         nodeRef,
		nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
		thresholdsFirstObservedAt:    thresholdsObservedAt{},
	}
	return manager, manager, nil
}
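
// exampleWireEvictionManager is an illustrative sketch only, not part of the
// upstream implementation: it shows how a caller such as the kubelet might
// construct the eviction manager, keep the returned admission handler for its
// pod admission chain, and start the monitoring loop. Every parameter here is
// a hypothetical input supplied by that caller, the 10s interval is just an
// example value, and the Manager interface is assumed to expose the Start
// method implemented below.
func exampleWireEvictionManager(
	summaryProvider stats.SummaryProvider,
	evictionConfig Config,
	killPodFunc KillPodFunc,
	imageGC ImageGC,
	recorder record.EventRecorder,
	nodeRef *api.ObjectReference,
	diskInfoProvider DiskInfoProvider,
	activePodsFunc ActivePodsFunc) (Manager, lifecycle.PodAdmitHandler, error) {
	// the manager and the admission handler are the same underlying object.
	evictionManager, admitHandler, err := NewManager(summaryProvider, evictionConfig, killPodFunc, imageGC, recorder, nodeRef, clock.RealClock{})
	if err != nil {
		return nil, nil, err
	}
	// begin observing the node; synchronize runs once per interval forever.
	if err := evictionManager.Start(diskInfoProvider, activePodsFunc, 10*time.Second); err != nil {
		return nil, nil, err
	}
	return evictionManager, admitHandler, nil
}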

// Admit rejects a pod if it is not safe to admit for node stability.
func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
	m.RLock()
	defer m.RUnlock()
	if len(m.nodeConditions) == 0 {
		return lifecycle.PodAdmitResult{Admit: true}
	}
	// Check the node conditions to identify the resource under pressure.
	// The resource can only be either disk or memory; default to disk.
	resource := api.ResourceStorage
	if hasNodeCondition(m.nodeConditions, api.NodeMemoryPressure) {
		resource = api.ResourceMemory
		// the node has memory pressure, admit the pod if it is not best-effort
		notBestEffort := qos.BestEffort != qos.GetPodQOS(attrs.Pod)
		if notBestEffort {
			return lifecycle.PodAdmitResult{Admit: true}
		}
	}
	// reject the pod: it is best-effort under memory pressure, or the node is under disk pressure.
	glog.Warningf("Failed to admit pod %q - node has conditions: %v", format.Pod(attrs.Pod), m.nodeConditions)
	return lifecycle.PodAdmitResult{
		Admit:   false,
		Reason:  reason,
		Message: getMessage(resource),
	}
}
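
// Summarizing the decision above: when the node reports MemoryPressure, only
// best-effort pods are rejected, while guaranteed and burstable pods are still
// admitted. For any other reported pressure condition, including DiskPressure,
// every pod is rejected, since there is no QOS-based distinction in this check
// for resources other than memory.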

// Start starts the control loop to observe and respond to low compute resources.
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, monitoringInterval time.Duration) error {
	// start the eviction manager monitoring
	go wait.Until(func() { m.synchronize(diskInfoProvider, podFunc) }, monitoringInterval, wait.NeverStop)
	return nil
}

// IsUnderMemoryPressure returns true if the node is under memory pressure.
func (m *managerImpl) IsUnderMemoryPressure() bool {
	m.RLock()
	defer m.RUnlock()
	return hasNodeCondition(m.nodeConditions, api.NodeMemoryPressure)
}

// IsUnderDiskPressure returns true if the node is under disk pressure.
func (m *managerImpl) IsUnderDiskPressure() bool {
	m.RLock()
	defer m.RUnlock()
	return hasNodeCondition(m.nodeConditions, api.NodeDiskPressure)
}

// synchronize is the main control loop that enforces eviction thresholds.
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) {
	// if we have nothing to do, just return
	thresholds := m.config.Thresholds
	if len(thresholds) == 0 {
		return
	}
	// build the ranking functions (if not yet known)
	// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
	if len(m.resourceToRankFunc) == 0 || len(m.resourceToNodeReclaimFuncs) == 0 {
		// this may error if cadvisor has yet to complete housekeeping, so we will just try again on the next pass.
		hasDedicatedImageFs, err := diskInfoProvider.HasDedicatedImageFs()
		if err != nil {
			return
		}
		m.resourceToRankFunc = buildResourceToRankFunc(hasDedicatedImageFs)
		m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasDedicatedImageFs)
	}
	// make observations and get a function to derive pod usage stats relative to those observations.
	observations, statsFunc, err := makeSignalObservations(m.summaryProvider)
	if err != nil {
		glog.Errorf("eviction manager: unexpected err: %v", err)
		return
	}
	// capture the current time; it anchors the grace period and transition period checks below
	now := m.clock.Now()
	// determine the set of thresholds met independent of grace period
	thresholds = thresholdsMet(thresholds, observations, false)
	// determine the set of thresholds previously met that have not yet satisfied the associated min-reclaim
	if len(m.thresholdsMet) > 0 {
		thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true)
		thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved)
	}
	// track when a threshold was first observed
	thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now)
	// the set of node conditions that are triggered by currently observed thresholds
	nodeConditions := nodeConditions(thresholds)
	// track when a node condition was last observed
	nodeConditionsLastObservedAt := nodeConditionsLastObservedAt(nodeConditions, m.nodeConditionsLastObservedAt, now)
	// a node condition is reported as long as it has been observed within the transition period window
	nodeConditions = nodeConditionsObservedSince(nodeConditionsLastObservedAt, m.config.PressureTransitionPeriod, now)
	// determine the set of thresholds we need to drive eviction behavior (i.e. all grace periods are met)
	thresholds = thresholdsMetGracePeriod(thresholdsFirstObservedAt, now)
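
	// To illustrate the timing semantics with hypothetical values (not taken
	// from this file): a soft threshold such as memory.available<1Gi with a
	// 30s grace period only drives eviction once at least 30s have elapsed
	// since it was first observed as met, while the resulting MemoryPressure
	// condition continues to be reported until it has not been observed for
	// at least PressureTransitionPeriod.
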
	// update internal state
	m.Lock()
	m.nodeConditions = nodeConditions
	m.thresholdsFirstObservedAt = thresholdsFirstObservedAt
	m.nodeConditionsLastObservedAt = nodeConditionsLastObservedAt
	m.thresholdsMet = thresholds
	m.Unlock()
	// determine the set of resources under starvation
	starvedResources := getStarvedResources(thresholds)
	if len(starvedResources) == 0 {
		glog.V(3).Infof("eviction manager: no resources are starved")
		return
	}
	// rank the resources to reclaim by eviction priority
	sort.Sort(byEvictionPriority(starvedResources))
	resourceToReclaim := starvedResources[0]
	glog.Warningf("eviction manager: attempting to reclaim %v", resourceToReclaim)
	// determine if this is a soft or hard eviction associated with the resource
	softEviction := isSoftEviction(thresholds, resourceToReclaim)
	// record an event about the resources we are now attempting to reclaim via eviction
	m.recorder.Eventf(m.nodeRef, api.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim)
	// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
	if m.reclaimNodeLevelResources(resourceToReclaim, observations) {
		glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim)
		return
	}
	glog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim)
	// rank the pods for eviction
	rank, ok := m.resourceToRankFunc[resourceToReclaim]
	if !ok {
		glog.Errorf("eviction manager: no ranking function for resource %s", resourceToReclaim)
		return
	}
	// the only candidates viable for eviction are pods that are currently active on the node.
	activePods := podFunc()
	if len(activePods) == 0 {
		glog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
		return
	}
	// rank the running pods for eviction for the specified resource
	rank(activePods, statsFunc)
	glog.Infof("eviction manager: pods ranked for eviction: %s", format.Pods(activePods))
	// we kill at most a single pod during each eviction interval
	message := getMessage(resourceToReclaim)
	for i := range activePods {
		pod := activePods[i]
		status := api.PodStatus{
			Phase:   api.PodFailed,
			Message: message,
			Reason:  reason,
		}
		// record that we are evicting the pod
		m.recorder.Eventf(pod, api.EventTypeWarning, reason, message)
		// hard evictions kill immediately; soft evictions allow up to the configured max grace period
		gracePeriodOverride := int64(0)
		if softEviction {
			gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
		}
		// this is a blocking call and should only return when the pod and its containers are killed.
		err := m.killPodFunc(pod, status, &gracePeriodOverride)
		if err != nil {
			glog.Infof("eviction manager: failed to evict pod %s: %v", format.Pod(pod), err)
			continue
		}
		// success, so we return until the next housekeeping interval
		glog.Infof("eviction manager: pod %s evicted successfully", format.Pod(pod))
		return
	}
	glog.Infof("eviction manager: unable to evict any pods from the node")
}

// reclaimNodeLevelResources attempts to reclaim node-level resources. It returns true if thresholds were satisfied and no pod eviction is required.
func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim api.ResourceName, observations signalObservations) bool {
	nodeReclaimFuncs := m.resourceToNodeReclaimFuncs[resourceToReclaim]
	for _, nodeReclaimFunc := range nodeReclaimFuncs {
		// attempt to reclaim the pressured resource.
		reclaimed, err := nodeReclaimFunc()
		if err == nil {
			// update our local observations based on the amount reported to have been reclaimed.
			// note: this is optimistic; other things could still have been consuming the pressured resource in the interim.
			signal := resourceToSignal[resourceToReclaim]
			value, ok := observations[signal]
			if !ok {
				glog.Errorf("eviction manager: unable to find value associated with signal %v", signal)
				continue
			}
			value.available.Add(*reclaimed)
			// evaluate all current thresholds to see if, with the adjusted observations, we think we have met min-reclaim goals
			if len(thresholdsMet(m.thresholdsMet, observations, true)) == 0 {
				return true
			}
		} else {
			glog.Errorf("eviction manager: unexpected error when attempting to reduce %v pressure: %v", resourceToReclaim, err)
		}
	}
	return false
}
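
// To make the reclaim arithmetic above concrete with hypothetical numbers (not
// taken from this file): suppose disk is the starved resource, its observed
// available value is 500Mi, and an image garbage collection pass reports
// reclaiming 2Gi. The local observation is bumped to 2.5Gi before the
// thresholds in m.thresholdsMet are re-evaluated with min-reclaim enforced; if
// none remain met, the pass ends without evicting pods, otherwise synchronize
// falls through to pod eviction.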