eviction_manager.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package eviction

import (
	"sort"
	"sync"
	"time"

	"github.com/golang/glog"

	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/client/record"
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
	"k8s.io/kubernetes/pkg/kubelet/qos"
	"k8s.io/kubernetes/pkg/kubelet/server/stats"
	"k8s.io/kubernetes/pkg/kubelet/util/format"
	"k8s.io/kubernetes/pkg/util/clock"
	"k8s.io/kubernetes/pkg/util/wait"
)

// managerImpl implements Manager.
type managerImpl struct {
	// used to track time
	clock clock.Clock
	// config is how the manager is configured
	config Config
	// the function to invoke to kill a pod
	killPodFunc KillPodFunc
	// the interface that knows how to do image gc
	imageGC ImageGC
	// protects access to internal state
	sync.RWMutex
	// node conditions are the set of conditions present
	nodeConditions []api.NodeConditionType
	// captures when a node condition was last observed based on a threshold being met
	nodeConditionsLastObservedAt nodeConditionsObservedAt
	// nodeRef is a reference to the node
	nodeRef *api.ObjectReference
	// used to record events about the node
	recorder record.EventRecorder
	// used to measure usage stats on the system
	summaryProvider stats.SummaryProvider
	// records when a threshold was first observed
	thresholdsFirstObservedAt thresholdsObservedAt
	// records the set of thresholds that have been met (including grace period) but not yet resolved
	thresholdsMet []Threshold
	// resourceToRankFunc maps a resource to the ranking function for that resource.
	resourceToRankFunc map[api.ResourceName]rankFunc
	// resourceToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource.
	resourceToNodeReclaimFuncs map[api.ResourceName]nodeReclaimFuncs
}

// ensure it implements the required interface
var _ Manager = &managerImpl{}

// NewManager returns a configured Manager and an associated admission handler to enforce eviction configuration.
func NewManager(
	summaryProvider stats.SummaryProvider,
	config Config,
	killPodFunc KillPodFunc,
	imageGC ImageGC,
	recorder record.EventRecorder,
	nodeRef *api.ObjectReference,
	clock clock.Clock) (Manager, lifecycle.PodAdmitHandler, error) {
	manager := &managerImpl{
		clock:                        clock,
		killPodFunc:                  killPodFunc,
		imageGC:                      imageGC,
		config:                       config,
		recorder:                     recorder,
		summaryProvider:              summaryProvider,
		nodeRef:                      nodeRef,
		nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
		thresholdsFirstObservedAt:    thresholdsObservedAt{},
	}
	return manager, manager, nil
}

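// Illustrative sketch, not part of the original file: how a caller such as the
// kubelet might wire the two values returned by NewManager into its pod admission
// chain and its dependencies. The identifiers below (klet, kubeDeps, evictionConfig,
// killPodNow, nodeRef) are hypothetical stand-ins for the caller's own plumbing.
//
//	evictionManager, evictionAdmitHandler, err := eviction.NewManager(
//		klet.resourceAnalyzer, evictionConfig, killPodNow(klet.podWorkers),
//		klet.imageManager, kubeDeps.Recorder, nodeRef, klet.clock)
//	if err != nil {
//		return nil, fmt.Errorf("failed to initialize eviction manager: %v", err)
//	}
//	klet.evictionManager = evictionManager
//	klet.admitHandlers.AddPodAdmitHandler(evictionAdmitHandler)
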
// Admit rejects a pod if it is not safe to admit for node stability.
func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
	m.RLock()
	defer m.RUnlock()
	if len(m.nodeConditions) == 0 {
		return lifecycle.PodAdmitResult{Admit: true}
	}
	// Check the node conditions to identify the resource under pressure.
	// The resource can only be either disk or memory; set the default to disk.
	resource := api.ResourceStorage
	if hasNodeCondition(m.nodeConditions, api.NodeMemoryPressure) {
		resource = api.ResourceMemory
		// the node has memory pressure, admit the pod if it is not best-effort
		notBestEffort := qos.BestEffort != qos.GetPodQOS(attrs.Pod)
		if notBestEffort {
			return lifecycle.PodAdmitResult{Admit: true}
		}
	}
	// reject pods when under memory pressure (if the pod is best-effort), or when under disk pressure.
	glog.Warningf("Failed to admit pod %q - node has conditions: %v", format.Pod(attrs.Pod), m.nodeConditions)
	return lifecycle.PodAdmitResult{
		Admit:   false,
		Reason:  reason,
		Message: getMessage(resource),
	}
}

// Start starts the control loop to observe and respond to low compute resources.
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, monitoringInterval time.Duration) error {
	// start the eviction manager monitoring
	go wait.Until(func() { m.synchronize(diskInfoProvider, podFunc) }, monitoringInterval, wait.NeverStop)
	return nil
}

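// Illustrative sketch, not part of the original file: after construction the caller
// starts the monitoring loop with its disk info provider, a function that lists the
// currently active pods, and the housekeeping interval it wants between passes.
// The names below are hypothetical.
//
//	klet.evictionManager.Start(diskInfoProvider, klet.getActivePods, 10*time.Second)
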
// IsUnderMemoryPressure returns true if the node is under memory pressure.
func (m *managerImpl) IsUnderMemoryPressure() bool {
	m.RLock()
	defer m.RUnlock()
	return hasNodeCondition(m.nodeConditions, api.NodeMemoryPressure)
}

// IsUnderDiskPressure returns true if the node is under disk pressure.
func (m *managerImpl) IsUnderDiskPressure() bool {
	m.RLock()
	defer m.RUnlock()
	return hasNodeCondition(m.nodeConditions, api.NodeDiskPressure)
}

// synchronize is the main control loop that enforces eviction thresholds.
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) {
	// if we have nothing to do, just return
	thresholds := m.config.Thresholds
	if len(thresholds) == 0 {
		return
	}

	// build the ranking functions (if not yet known)
	// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
	if len(m.resourceToRankFunc) == 0 || len(m.resourceToNodeReclaimFuncs) == 0 {
		// this may error if cadvisor has yet to complete housekeeping, so we will just try again in the next pass.
		hasDedicatedImageFs, err := diskInfoProvider.HasDedicatedImageFs()
		if err != nil {
			return
		}
		m.resourceToRankFunc = buildResourceToRankFunc(hasDedicatedImageFs)
		m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasDedicatedImageFs)
	}

	// make observations and get a function to derive pod usage stats relative to those observations.
	observations, statsFunc, err := makeSignalObservations(m.summaryProvider)
	if err != nil {
		glog.Errorf("eviction manager: unexpected err: %v", err)
		return
	}

	// capture the time of this observation pass
	now := m.clock.Now()
	// determine the set of thresholds met independent of grace period
	thresholds = thresholdsMet(thresholds, observations, false)
	// determine the set of thresholds previously met that have not yet satisfied the associated min-reclaim
	if len(m.thresholdsMet) > 0 {
		thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true)
		thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved)
	}

	// track when a threshold was first observed
	thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now)
	// the set of node conditions that are triggered by currently observed thresholds
	nodeConditions := nodeConditions(thresholds)
	// track when a node condition was last observed
	nodeConditionsLastObservedAt := nodeConditionsLastObservedAt(nodeConditions, m.nodeConditionsLastObservedAt, now)
	// node conditions report true if they have been observed within the transition period window
	nodeConditions = nodeConditionsObservedSince(nodeConditionsLastObservedAt, m.config.PressureTransitionPeriod, now)
	// determine the set of thresholds we need to drive eviction behavior (i.e. all grace periods are met)
	thresholds = thresholdsMetGracePeriod(thresholdsFirstObservedAt, now)

	// update internal state
	m.Lock()
	m.nodeConditions = nodeConditions
	m.thresholdsFirstObservedAt = thresholdsFirstObservedAt
	m.nodeConditionsLastObservedAt = nodeConditionsLastObservedAt
	m.thresholdsMet = thresholds
	m.Unlock()

	// determine the set of resources under starvation
	starvedResources := getStarvedResources(thresholds)
	if len(starvedResources) == 0 {
		glog.V(3).Infof("eviction manager: no resources are starved")
		return
	}

	// rank the resources to reclaim by eviction priority
	sort.Sort(byEvictionPriority(starvedResources))
	resourceToReclaim := starvedResources[0]
	glog.Warningf("eviction manager: attempting to reclaim %v", resourceToReclaim)
	// determine if this is a soft or hard eviction associated with the resource
	softEviction := isSoftEviction(thresholds, resourceToReclaim)
	// record an event about the resources we are now attempting to reclaim via eviction
	m.recorder.Eventf(m.nodeRef, api.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim)

	// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
	if m.reclaimNodeLevelResources(resourceToReclaim, observations) {
		glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim)
		return
	}
	glog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim)

	// rank the pods for eviction
	rank, ok := m.resourceToRankFunc[resourceToReclaim]
	if !ok {
		glog.Errorf("eviction manager: no ranking function for resource %s", resourceToReclaim)
		return
	}

	// the only candidates viable for eviction are those pods that have something running.
	activePods := podFunc()
	if len(activePods) == 0 {
		glog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
		return
	}

	// rank the running pods for eviction for the specified resource
	rank(activePods, statsFunc)
	glog.Infof("eviction manager: pods ranked for eviction: %s", format.Pods(activePods))

	// we kill at most a single pod during each eviction interval
	message := getMessage(resourceToReclaim)
	for i := range activePods {
		pod := activePods[i]
		status := api.PodStatus{
			Phase:   api.PodFailed,
			Message: message,
			Reason:  reason,
		}
		// record that we are evicting the pod
		m.recorder.Eventf(pod, api.EventTypeWarning, reason, message)
		gracePeriodOverride := int64(0)
		if softEviction {
			gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
		}
		// this is a blocking call and should only return when the pod and its containers are killed.
		err := m.killPodFunc(pod, status, &gracePeriodOverride)
		if err != nil {
			glog.Infof("eviction manager: failed to evict pod %s: %v", format.Pod(pod), err)
			continue
		}
		// success, so we return until the next housekeeping interval
		glog.Infof("eviction manager: pod %s evicted successfully", format.Pod(pod))
		return
	}
	glog.Infof("eviction manager: unable to evict any pods from the node")
}

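// Illustrative note, not part of the original file: the Thresholds evaluated above
// are typically derived from kubelet eviction flags. A hard threshold such as
// --eviction-hard=memory.available<100Mi drives eviction as soon as it is observed
// and pods are killed with a zero grace period, while a soft threshold such as
// --eviction-soft=memory.available<300Mi paired with
// --eviction-soft-grace-period=memory.available=1m must hold for its grace period
// before synchronize acts, and evicted pods are then granted up to
// --eviction-max-pod-grace-period (the MaxPodGracePeriodSeconds used above) to
// terminate. --eviction-pressure-transition-period controls how long a node
// condition lingers after the pressure resolves.
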
// reclaimNodeLevelResources attempts to reclaim node-level resources. It returns true if thresholds were satisfied and no pod eviction is required.
func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim api.ResourceName, observations signalObservations) bool {
	nodeReclaimFuncs := m.resourceToNodeReclaimFuncs[resourceToReclaim]
	for _, nodeReclaimFunc := range nodeReclaimFuncs {
		// attempt to reclaim the pressured resource.
		reclaimed, err := nodeReclaimFunc()
		if err == nil {
			// update our local observations based on the amount reported to have been reclaimed.
			// note: this is optimistic; other things could still have been consuming the pressured resource in the interim.
			signal := resourceToSignal[resourceToReclaim]
			value, ok := observations[signal]
			if !ok {
				glog.Errorf("eviction manager: unable to find value associated with signal %v", signal)
				continue
			}
			value.available.Add(*reclaimed)
			// evaluate all current thresholds to see if, with the adjusted observations, we think we have met min-reclaim goals
			if len(thresholdsMet(m.thresholdsMet, observations, true)) == 0 {
				return true
			}
		} else {
			glog.Errorf("eviction manager: unexpected error when attempting to reduce %v pressure: %v", resourceToReclaim, err)
		}
	}
	return false
}
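
// Illustrative sketch, not part of the original file: the nodeReclaimFuncs consumed
// by reclaimNodeLevelResources are built elsewhere in this package; each entry is
// expected to free some of the pressured resource and report how much was reclaimed.
// A minimal sketch of one such function wrapping the ImageGC dependency might look
// like the following (reclaimUnusedImages is a hypothetical name, and the
// DeleteUnusedImages signature shown is an assumption):
//
//	func reclaimUnusedImages(imageGC ImageGC) nodeReclaimFunc {
//		return func() (*resource.Quantity, error) {
//			glog.Infof("eviction manager: attempting to delete unused images")
//			bytesFreed, err := imageGC.DeleteUnusedImages()
//			if err != nil {
//				return nil, err
//			}
//			return resource.NewQuantity(bytesFreed, resource.BinarySI), nil
//		}
//	}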