helpers.go 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900
  1. /*
  2. Copyright 2016 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package eviction
  14. import (
  15. "fmt"
  16. "sort"
  17. "strconv"
  18. "strings"
  19. "time"
  20. "github.com/golang/glog"
  21. "k8s.io/kubernetes/pkg/api"
  22. "k8s.io/kubernetes/pkg/api/resource"
  23. statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats"
  24. "k8s.io/kubernetes/pkg/kubelet/qos"
  25. "k8s.io/kubernetes/pkg/kubelet/server/stats"
  26. "k8s.io/kubernetes/pkg/quota/evaluator/core"
  27. "k8s.io/kubernetes/pkg/util/sets"
  28. )
  29. const (
  30. unsupportedEvictionSignal = "unsupported eviction signal %v"
  31. // the reason reported back in status.
  32. reason = "Evicted"
  33. // the message format associated with the reason.
  34. messageFmt = "The node was low on %s."
  35. // disk, in bytes. internal to this module, used to account for local disk usage.
  36. resourceDisk api.ResourceName = "disk"
  37. // inodes, number. internal to this module, used to account for local disk inode consumption.
  38. resourceInodes api.ResourceName = "inodes"
  39. // imagefs, in bytes. internal to this module, used to account for local image filesystem usage.
  40. resourceImageFs api.ResourceName = "imagefs"
  41. // imagefs inodes, number. internal to this module, used to account for local image filesystem inodes.
  42. resourceImageFsInodes api.ResourceName = "imagefsInodes"
  43. // nodefs, in bytes. internal to this module, used to account for local node root filesystem usage.
  44. resourceNodeFs api.ResourceName = "nodefs"
  45. // nodefs inodes, number. internal to this module, used to account for local node root filesystem inodes.
  46. resourceNodeFsInodes api.ResourceName = "nodefsInodes"
  47. )
  48. var (
  49. // signalToNodeCondition maps a signal to the node condition to report if threshold is met.
  50. signalToNodeCondition map[Signal]api.NodeConditionType
  51. // signalToResource maps a Signal to its associated Resource.
  52. signalToResource map[Signal]api.ResourceName
  53. // resourceToSignal maps a Resource to its associated Signal
  54. resourceToSignal map[api.ResourceName]Signal
  55. )
  56. func init() {
  57. // map eviction signals to node conditions
  58. signalToNodeCondition = map[Signal]api.NodeConditionType{}
  59. signalToNodeCondition[SignalMemoryAvailable] = api.NodeMemoryPressure
  60. signalToNodeCondition[SignalImageFsAvailable] = api.NodeDiskPressure
  61. signalToNodeCondition[SignalNodeFsAvailable] = api.NodeDiskPressure
  62. signalToNodeCondition[SignalImageFsInodesFree] = api.NodeDiskPressure
  63. signalToNodeCondition[SignalNodeFsInodesFree] = api.NodeDiskPressure
  64. // map signals to resources (and vice-versa)
  65. signalToResource = map[Signal]api.ResourceName{}
  66. signalToResource[SignalMemoryAvailable] = api.ResourceMemory
  67. signalToResource[SignalImageFsAvailable] = resourceImageFs
  68. signalToResource[SignalImageFsInodesFree] = resourceImageFsInodes
  69. signalToResource[SignalNodeFsAvailable] = resourceNodeFs
  70. signalToResource[SignalNodeFsInodesFree] = resourceNodeFsInodes
  71. resourceToSignal = map[api.ResourceName]Signal{}
  72. for key, value := range signalToResource {
  73. resourceToSignal[value] = key
  74. }
  75. }
  76. // validSignal returns true if the signal is supported.
  77. func validSignal(signal Signal) bool {
  78. _, found := signalToResource[signal]
  79. return found
  80. }
  81. // ParseThresholdConfig parses the flags for thresholds.
  82. func ParseThresholdConfig(evictionHard, evictionSoft, evictionSoftGracePeriod, evictionMinimumReclaim string) ([]Threshold, error) {
  83. results := []Threshold{}
  84. hardThresholds, err := parseThresholdStatements(evictionHard)
  85. if err != nil {
  86. return nil, err
  87. }
  88. results = append(results, hardThresholds...)
  89. softThresholds, err := parseThresholdStatements(evictionSoft)
  90. if err != nil {
  91. return nil, err
  92. }
  93. gracePeriods, err := parseGracePeriods(evictionSoftGracePeriod)
  94. if err != nil {
  95. return nil, err
  96. }
  97. minReclaims, err := parseMinimumReclaims(evictionMinimumReclaim)
  98. if err != nil {
  99. return nil, err
  100. }
  101. for i := range softThresholds {
  102. signal := softThresholds[i].Signal
  103. period, found := gracePeriods[signal]
  104. if !found {
  105. return nil, fmt.Errorf("grace period must be specified for the soft eviction threshold %v", signal)
  106. }
  107. softThresholds[i].GracePeriod = period
  108. }
  109. results = append(results, softThresholds...)
  110. for i := range results {
  111. for signal, minReclaim := range minReclaims {
  112. if results[i].Signal == signal {
  113. results[i].MinReclaim = &minReclaim
  114. break
  115. }
  116. }
  117. }
  118. return results, nil
  119. }
  120. // parseThresholdStatements parses the input statements into a list of Threshold objects.
  121. func parseThresholdStatements(expr string) ([]Threshold, error) {
  122. if len(expr) == 0 {
  123. return nil, nil
  124. }
  125. results := []Threshold{}
  126. statements := strings.Split(expr, ",")
  127. signalsFound := sets.NewString()
  128. for _, statement := range statements {
  129. result, err := parseThresholdStatement(statement)
  130. if err != nil {
  131. return nil, err
  132. }
  133. if signalsFound.Has(string(result.Signal)) {
  134. return nil, fmt.Errorf("found duplicate eviction threshold for signal %v", result.Signal)
  135. }
  136. signalsFound.Insert(string(result.Signal))
  137. results = append(results, result)
  138. }
  139. return results, nil
  140. }
  141. // parseThresholdStatement parses a threshold statement.
  142. func parseThresholdStatement(statement string) (Threshold, error) {
  143. tokens2Operator := map[string]ThresholdOperator{
  144. "<": OpLessThan,
  145. }
  146. var (
  147. operator ThresholdOperator
  148. parts []string
  149. )
  150. for token := range tokens2Operator {
  151. parts = strings.Split(statement, token)
  152. // if we got a token, we know this was the operator...
  153. if len(parts) > 1 {
  154. operator = tokens2Operator[token]
  155. break
  156. }
  157. }
  158. if len(operator) == 0 || len(parts) != 2 {
  159. return Threshold{}, fmt.Errorf("invalid eviction threshold syntax %v, expected <signal><operator><value>", statement)
  160. }
  161. signal := Signal(parts[0])
  162. if !validSignal(signal) {
  163. return Threshold{}, fmt.Errorf(unsupportedEvictionSignal, signal)
  164. }
  165. quantityValue := parts[1]
  166. if strings.HasSuffix(quantityValue, "%") {
  167. percentage, err := parsePercentage(quantityValue)
  168. if err != nil {
  169. return Threshold{}, err
  170. }
  171. if percentage <= 0 {
  172. return Threshold{}, fmt.Errorf("eviction percentage threshold %v must be positive: %s", signal, quantityValue)
  173. }
  174. return Threshold{
  175. Signal: signal,
  176. Operator: operator,
  177. Value: ThresholdValue{
  178. Percentage: percentage,
  179. },
  180. }, nil
  181. }
  182. quantity, err := resource.ParseQuantity(quantityValue)
  183. if err != nil {
  184. return Threshold{}, err
  185. }
  186. if quantity.Sign() < 0 || quantity.IsZero() {
  187. return Threshold{}, fmt.Errorf("eviction threshold %v must be positive: %s", signal, &quantity)
  188. }
  189. return Threshold{
  190. Signal: signal,
  191. Operator: operator,
  192. Value: ThresholdValue{
  193. Quantity: &quantity,
  194. },
  195. }, nil
  196. }
  197. // parsePercentage parses a string representing a percentage value
  198. func parsePercentage(input string) (float32, error) {
  199. value, err := strconv.ParseFloat(strings.TrimRight(input, "%"), 32)
  200. if err != nil {
  201. return 0, err
  202. }
  203. return float32(value) / 100, nil
  204. }
  205. // parseGracePeriods parses the grace period statements
  206. func parseGracePeriods(expr string) (map[Signal]time.Duration, error) {
  207. if len(expr) == 0 {
  208. return nil, nil
  209. }
  210. results := map[Signal]time.Duration{}
  211. statements := strings.Split(expr, ",")
  212. for _, statement := range statements {
  213. parts := strings.Split(statement, "=")
  214. if len(parts) != 2 {
  215. return nil, fmt.Errorf("invalid eviction grace period syntax %v, expected <signal>=<duration>", statement)
  216. }
  217. signal := Signal(parts[0])
  218. if !validSignal(signal) {
  219. return nil, fmt.Errorf(unsupportedEvictionSignal, signal)
  220. }
  221. gracePeriod, err := time.ParseDuration(parts[1])
  222. if err != nil {
  223. return nil, err
  224. }
  225. if gracePeriod < 0 {
  226. return nil, fmt.Errorf("invalid eviction grace period specified: %v, must be a positive value", parts[1])
  227. }
  228. // check against duplicate statements
  229. if _, found := results[signal]; found {
  230. return nil, fmt.Errorf("duplicate eviction grace period specified for %v", signal)
  231. }
  232. results[signal] = gracePeriod
  233. }
  234. return results, nil
  235. }
  236. // parseMinimumReclaims parses the minimum reclaim statements
  237. func parseMinimumReclaims(expr string) (map[Signal]resource.Quantity, error) {
  238. if len(expr) == 0 {
  239. return nil, nil
  240. }
  241. results := map[Signal]resource.Quantity{}
  242. statements := strings.Split(expr, ",")
  243. for _, statement := range statements {
  244. parts := strings.Split(statement, "=")
  245. if len(parts) != 2 {
  246. return nil, fmt.Errorf("invalid eviction minimum reclaim syntax: %v, expected <signal>=<quantity>", statement)
  247. }
  248. signal := Signal(parts[0])
  249. if !validSignal(signal) {
  250. return nil, fmt.Errorf(unsupportedEvictionSignal, signal)
  251. }
  252. // check against duplicate statements
  253. if _, found := results[signal]; found {
  254. return nil, fmt.Errorf("duplicate eviction minimum reclaim specified for %v", signal)
  255. }
  256. quantity, err := resource.ParseQuantity(parts[1])
  257. if quantity.Sign() < 0 {
  258. return nil, fmt.Errorf("negative eviction minimum reclaim specified for %v", signal)
  259. }
  260. if err != nil {
  261. return nil, err
  262. }
  263. results[signal] = quantity
  264. }
  265. return results, nil
  266. }
  267. // diskUsage converts used bytes into a resource quantity.
  268. func diskUsage(fsStats *statsapi.FsStats) *resource.Quantity {
  269. if fsStats == nil || fsStats.UsedBytes == nil {
  270. return &resource.Quantity{Format: resource.BinarySI}
  271. }
  272. usage := int64(*fsStats.UsedBytes)
  273. return resource.NewQuantity(usage, resource.BinarySI)
  274. }
  275. // inodeUsage converts inodes consumed into a resource quantity.
  276. func inodeUsage(fsStats *statsapi.FsStats) *resource.Quantity {
  277. // TODO: cadvisor needs to support inodes used per container
  278. // right now, cadvisor reports total inodes and inodes free per filesystem.
  279. // this is insufficient to know how many inodes are consumed by the container.
  280. // for example, with the overlay driver, the rootfs and each container filesystem
  281. // will report the same total inode and inode free values but no way of knowing
  282. // how many inodes consumed in that filesystem are charged to this container.
  283. // for now, we report 0 as inode usage pending support in cadvisor.
  284. return resource.NewQuantity(int64(0), resource.BinarySI)
  285. }
  286. // memoryUsage converts working set into a resource quantity.
  287. func memoryUsage(memStats *statsapi.MemoryStats) *resource.Quantity {
  288. if memStats == nil || memStats.WorkingSetBytes == nil {
  289. return &resource.Quantity{Format: resource.BinarySI}
  290. }
  291. usage := int64(*memStats.WorkingSetBytes)
  292. return resource.NewQuantity(usage, resource.BinarySI)
  293. }
  294. // localVolumeNames returns the set of volumes for the pod that are local
  295. // TODO: sumamry API should report what volumes consume local storage rather than hard-code here.
  296. func localVolumeNames(pod *api.Pod) []string {
  297. result := []string{}
  298. for _, volume := range pod.Spec.Volumes {
  299. if volume.HostPath != nil ||
  300. (volume.EmptyDir != nil && volume.EmptyDir.Medium != api.StorageMediumMemory) ||
  301. volume.ConfigMap != nil ||
  302. volume.GitRepo != nil {
  303. result = append(result, volume.Name)
  304. }
  305. }
  306. return result
  307. }
  308. // podDiskUsage aggregates pod disk usage and inode consumption for the specified stats to measure.
  309. func podDiskUsage(podStats statsapi.PodStats, pod *api.Pod, statsToMeasure []fsStatsType) (api.ResourceList, error) {
  310. disk := resource.Quantity{Format: resource.BinarySI}
  311. inodes := resource.Quantity{Format: resource.BinarySI}
  312. for _, container := range podStats.Containers {
  313. if hasFsStatsType(statsToMeasure, fsStatsRoot) {
  314. disk.Add(*diskUsage(container.Rootfs))
  315. inodes.Add(*inodeUsage(container.Rootfs))
  316. }
  317. if hasFsStatsType(statsToMeasure, fsStatsLogs) {
  318. disk.Add(*diskUsage(container.Logs))
  319. inodes.Add(*inodeUsage(container.Logs))
  320. }
  321. }
  322. if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) {
  323. volumeNames := localVolumeNames(pod)
  324. for _, volumeName := range volumeNames {
  325. for _, volumeStats := range podStats.VolumeStats {
  326. if volumeStats.Name == volumeName {
  327. disk.Add(*diskUsage(&volumeStats.FsStats))
  328. inodes.Add(*inodeUsage(&volumeStats.FsStats))
  329. break
  330. }
  331. }
  332. }
  333. }
  334. return api.ResourceList{
  335. resourceDisk: disk,
  336. resourceInodes: inodes,
  337. }, nil
  338. }
  339. // podMemoryUsage aggregates pod memory usage.
  340. func podMemoryUsage(podStats statsapi.PodStats) (api.ResourceList, error) {
  341. disk := resource.Quantity{Format: resource.BinarySI}
  342. memory := resource.Quantity{Format: resource.BinarySI}
  343. for _, container := range podStats.Containers {
  344. // disk usage (if known)
  345. for _, fsStats := range []*statsapi.FsStats{container.Rootfs, container.Logs} {
  346. disk.Add(*diskUsage(fsStats))
  347. }
  348. // memory usage (if known)
  349. memory.Add(*memoryUsage(container.Memory))
  350. }
  351. return api.ResourceList{
  352. api.ResourceMemory: memory,
  353. resourceDisk: disk,
  354. }, nil
  355. }
  356. // formatThreshold formats a threshold for logging.
  357. func formatThreshold(threshold Threshold) string {
  358. return fmt.Sprintf("threshold(signal=%v, operator=%v, value=%v, gracePeriod=%v)", threshold.Signal, formatThresholdValue(threshold.Value), threshold.Operator, threshold.GracePeriod)
  359. }
  360. // formatThresholdValue formats a thresholdValue for logging.
  361. func formatThresholdValue(value ThresholdValue) string {
  362. if value.Quantity != nil {
  363. return value.Quantity.String()
  364. }
  365. return fmt.Sprintf("%f%%", value.Percentage*float32(100))
  366. }
  367. // cachedStatsFunc returns a statsFunc based on the provided pod stats.
  368. func cachedStatsFunc(podStats []statsapi.PodStats) statsFunc {
  369. uid2PodStats := map[string]statsapi.PodStats{}
  370. for i := range podStats {
  371. uid2PodStats[podStats[i].PodRef.UID] = podStats[i]
  372. }
  373. return func(pod *api.Pod) (statsapi.PodStats, bool) {
  374. stats, found := uid2PodStats[string(pod.UID)]
  375. return stats, found
  376. }
  377. }
  378. // Cmp compares p1 and p2 and returns:
  379. //
  380. // -1 if p1 < p2
  381. // 0 if p1 == p2
  382. // +1 if p1 > p2
  383. //
  384. type cmpFunc func(p1, p2 *api.Pod) int
  385. // multiSorter implements the Sort interface, sorting changes within.
  386. type multiSorter struct {
  387. pods []*api.Pod
  388. cmp []cmpFunc
  389. }
  390. // Sort sorts the argument slice according to the less functions passed to OrderedBy.
  391. func (ms *multiSorter) Sort(pods []*api.Pod) {
  392. ms.pods = pods
  393. sort.Sort(ms)
  394. }
  395. // OrderedBy returns a Sorter that sorts using the cmp functions, in order.
  396. // Call its Sort method to sort the data.
  397. func orderedBy(cmp ...cmpFunc) *multiSorter {
  398. return &multiSorter{
  399. cmp: cmp,
  400. }
  401. }
  402. // Len is part of sort.Interface.
  403. func (ms *multiSorter) Len() int {
  404. return len(ms.pods)
  405. }
  406. // Swap is part of sort.Interface.
  407. func (ms *multiSorter) Swap(i, j int) {
  408. ms.pods[i], ms.pods[j] = ms.pods[j], ms.pods[i]
  409. }
  410. // Less is part of sort.Interface.
  411. func (ms *multiSorter) Less(i, j int) bool {
  412. p1, p2 := ms.pods[i], ms.pods[j]
  413. var k int
  414. for k = 0; k < len(ms.cmp)-1; k++ {
  415. cmpResult := ms.cmp[k](p1, p2)
  416. // p1 is less than p2
  417. if cmpResult < 0 {
  418. return true
  419. }
  420. // p1 is greater than p2
  421. if cmpResult > 0 {
  422. return false
  423. }
  424. // we don't know yet
  425. }
  426. // the last cmp func is the final decider
  427. return ms.cmp[k](p1, p2) < 0
  428. }
  429. // qosComparator compares pods by QoS (BestEffort < Burstable < Guaranteed)
  430. func qosComparator(p1, p2 *api.Pod) int {
  431. qosP1 := qos.GetPodQOS(p1)
  432. qosP2 := qos.GetPodQOS(p2)
  433. // its a tie
  434. if qosP1 == qosP2 {
  435. return 0
  436. }
  437. // if p1 is best effort, we know p2 is burstable or guaranteed
  438. if qosP1 == qos.BestEffort {
  439. return -1
  440. }
  441. // we know p1 and p2 are not besteffort, so if p1 is burstable, p2 must be guaranteed
  442. if qosP1 == qos.Burstable {
  443. if qosP2 == qos.Guaranteed {
  444. return -1
  445. }
  446. return 1
  447. }
  448. // ok, p1 must be guaranteed.
  449. return 1
  450. }
  451. // memory compares pods by largest consumer of memory relative to request.
  452. func memory(stats statsFunc) cmpFunc {
  453. return func(p1, p2 *api.Pod) int {
  454. p1Stats, found := stats(p1)
  455. // if we have no usage stats for p1, we want p2 first
  456. if !found {
  457. return -1
  458. }
  459. // if we have no usage stats for p2, but p1 has usage, we want p1 first.
  460. p2Stats, found := stats(p2)
  461. if !found {
  462. return 1
  463. }
  464. // if we cant get usage for p1 measured, we want p2 first
  465. p1Usage, err := podMemoryUsage(p1Stats)
  466. if err != nil {
  467. return -1
  468. }
  469. // if we cant get usage for p2 measured, we want p1 first
  470. p2Usage, err := podMemoryUsage(p2Stats)
  471. if err != nil {
  472. return 1
  473. }
  474. // adjust p1, p2 usage relative to the request (if any)
  475. p1Memory := p1Usage[api.ResourceMemory]
  476. p1Spec := core.PodUsageFunc(p1)
  477. p1Request := p1Spec[api.ResourceRequestsMemory]
  478. p1Memory.Sub(p1Request)
  479. p2Memory := p2Usage[api.ResourceMemory]
  480. p2Spec := core.PodUsageFunc(p2)
  481. p2Request := p2Spec[api.ResourceRequestsMemory]
  482. p2Memory.Sub(p2Request)
  483. // if p2 is using more than p1, we want p2 first
  484. return p2Memory.Cmp(p1Memory)
  485. }
  486. }
  487. // disk compares pods by largest consumer of disk relative to request for the specified disk resource.
  488. func disk(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource api.ResourceName) cmpFunc {
  489. return func(p1, p2 *api.Pod) int {
  490. p1Stats, found := stats(p1)
  491. // if we have no usage stats for p1, we want p2 first
  492. if !found {
  493. return -1
  494. }
  495. // if we have no usage stats for p2, but p1 has usage, we want p1 first.
  496. p2Stats, found := stats(p2)
  497. if !found {
  498. return 1
  499. }
  500. // if we cant get usage for p1 measured, we want p2 first
  501. p1Usage, err := podDiskUsage(p1Stats, p1, fsStatsToMeasure)
  502. if err != nil {
  503. return -1
  504. }
  505. // if we cant get usage for p2 measured, we want p1 first
  506. p2Usage, err := podDiskUsage(p2Stats, p2, fsStatsToMeasure)
  507. if err != nil {
  508. return 1
  509. }
  510. // disk is best effort, so we don't measure relative to a request.
  511. // TODO: add disk as a guaranteed resource
  512. p1Disk := p1Usage[diskResource]
  513. p2Disk := p2Usage[diskResource]
  514. // if p2 is using more than p1, we want p2 first
  515. return p2Disk.Cmp(p1Disk)
  516. }
  517. }
  518. // rankMemoryPressure orders the input pods for eviction in response to memory pressure.
  519. func rankMemoryPressure(pods []*api.Pod, stats statsFunc) {
  520. orderedBy(qosComparator, memory(stats)).Sort(pods)
  521. }
  522. // rankDiskPressureFunc returns a rankFunc that measures the specified fs stats.
  523. func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType, diskResource api.ResourceName) rankFunc {
  524. return func(pods []*api.Pod, stats statsFunc) {
  525. orderedBy(qosComparator, disk(stats, fsStatsToMeasure, diskResource)).Sort(pods)
  526. }
  527. }
  528. // byEvictionPriority implements sort.Interface for []api.ResourceName.
  529. type byEvictionPriority []api.ResourceName
  530. func (a byEvictionPriority) Len() int { return len(a) }
  531. func (a byEvictionPriority) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
  532. // Less ranks memory before all other resources.
  533. func (a byEvictionPriority) Less(i, j int) bool {
  534. return a[i] == api.ResourceMemory
  535. }
  536. // makeSignalObservations derives observations using the specified summary provider.
  537. func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObservations, statsFunc, error) {
  538. summary, err := summaryProvider.Get()
  539. if err != nil {
  540. return nil, nil, err
  541. }
  542. // build the function to work against for pod stats
  543. statsFunc := cachedStatsFunc(summary.Pods)
  544. // build an evaluation context for current eviction signals
  545. result := signalObservations{}
  546. if memory := summary.Node.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil {
  547. result[SignalMemoryAvailable] = signalObservation{
  548. available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI),
  549. capacity: resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI),
  550. }
  551. }
  552. if nodeFs := summary.Node.Fs; nodeFs != nil {
  553. if nodeFs.AvailableBytes != nil && nodeFs.CapacityBytes != nil {
  554. result[SignalNodeFsAvailable] = signalObservation{
  555. available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI),
  556. capacity: resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI),
  557. }
  558. }
  559. if nodeFs.InodesFree != nil && nodeFs.Inodes != nil {
  560. result[SignalNodeFsInodesFree] = signalObservation{
  561. available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.BinarySI),
  562. capacity: resource.NewQuantity(int64(*nodeFs.Inodes), resource.BinarySI),
  563. }
  564. }
  565. }
  566. if summary.Node.Runtime != nil {
  567. if imageFs := summary.Node.Runtime.ImageFs; imageFs != nil {
  568. if imageFs.AvailableBytes != nil && imageFs.CapacityBytes != nil {
  569. result[SignalImageFsAvailable] = signalObservation{
  570. available: resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI),
  571. capacity: resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI),
  572. }
  573. if imageFs.InodesFree != nil && imageFs.Inodes != nil {
  574. result[SignalImageFsInodesFree] = signalObservation{
  575. available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.BinarySI),
  576. capacity: resource.NewQuantity(int64(*imageFs.Inodes), resource.BinarySI),
  577. }
  578. }
  579. }
  580. }
  581. }
  582. return result, statsFunc, nil
  583. }
  584. // thresholdsMet returns the set of thresholds that were met independent of grace period
  585. func thresholdsMet(thresholds []Threshold, observations signalObservations, enforceMinReclaim bool) []Threshold {
  586. results := []Threshold{}
  587. for i := range thresholds {
  588. threshold := thresholds[i]
  589. observed, found := observations[threshold.Signal]
  590. if !found {
  591. glog.Warningf("eviction manager: no observation found for eviction signal %v", threshold.Signal)
  592. continue
  593. }
  594. // determine if we have met the specified threshold
  595. thresholdMet := false
  596. quantity := getThresholdQuantity(threshold.Value, observed.capacity)
  597. // if enforceMinReclaim is specified, we compare relative to value - minreclaim
  598. if enforceMinReclaim && threshold.MinReclaim != nil {
  599. quantity.Add(*threshold.MinReclaim)
  600. }
  601. thresholdResult := quantity.Cmp(*observed.available)
  602. switch threshold.Operator {
  603. case OpLessThan:
  604. thresholdMet = thresholdResult > 0
  605. }
  606. if thresholdMet {
  607. results = append(results, threshold)
  608. }
  609. }
  610. return results
  611. }
  612. // getThresholdQuantity returns the expected quantity value for a thresholdValue
  613. func getThresholdQuantity(value ThresholdValue, capacity *resource.Quantity) *resource.Quantity {
  614. if value.Quantity != nil {
  615. return value.Quantity.Copy()
  616. }
  617. return resource.NewQuantity(int64(float64(capacity.Value())*float64(value.Percentage)), resource.BinarySI)
  618. }
  619. // thresholdsFirstObservedAt merges the input set of thresholds with the previous observation to determine when active set of thresholds were initially met.
  620. func thresholdsFirstObservedAt(thresholds []Threshold, lastObservedAt thresholdsObservedAt, now time.Time) thresholdsObservedAt {
  621. results := thresholdsObservedAt{}
  622. for i := range thresholds {
  623. observedAt, found := lastObservedAt[thresholds[i]]
  624. if !found {
  625. observedAt = now
  626. }
  627. results[thresholds[i]] = observedAt
  628. }
  629. return results
  630. }
  631. // thresholdsMetGracePeriod returns the set of thresholds that have satisfied associated grace period
  632. func thresholdsMetGracePeriod(observedAt thresholdsObservedAt, now time.Time) []Threshold {
  633. results := []Threshold{}
  634. for threshold, at := range observedAt {
  635. duration := now.Sub(at)
  636. if duration < threshold.GracePeriod {
  637. glog.V(2).Infof("eviction manager: eviction criteria not yet met for %v, duration: %v", formatThreshold(threshold), duration)
  638. continue
  639. }
  640. results = append(results, threshold)
  641. }
  642. return results
  643. }
  644. // nodeConditions returns the set of node conditions associated with a threshold
  645. func nodeConditions(thresholds []Threshold) []api.NodeConditionType {
  646. results := []api.NodeConditionType{}
  647. for _, threshold := range thresholds {
  648. if nodeCondition, found := signalToNodeCondition[threshold.Signal]; found {
  649. if !hasNodeCondition(results, nodeCondition) {
  650. results = append(results, nodeCondition)
  651. }
  652. }
  653. }
  654. return results
  655. }
  656. // nodeConditionsLastObservedAt merges the input with the previous observation to determine when a condition was most recently met.
  657. func nodeConditionsLastObservedAt(nodeConditions []api.NodeConditionType, lastObservedAt nodeConditionsObservedAt, now time.Time) nodeConditionsObservedAt {
  658. results := nodeConditionsObservedAt{}
  659. // the input conditions were observed "now"
  660. for i := range nodeConditions {
  661. results[nodeConditions[i]] = now
  662. }
  663. // the conditions that were not observed now are merged in with their old time
  664. for key, value := range lastObservedAt {
  665. _, found := results[key]
  666. if !found {
  667. results[key] = value
  668. }
  669. }
  670. return results
  671. }
  672. // nodeConditionsObservedSince returns the set of conditions that have been observed within the specified period
  673. func nodeConditionsObservedSince(observedAt nodeConditionsObservedAt, period time.Duration, now time.Time) []api.NodeConditionType {
  674. results := []api.NodeConditionType{}
  675. for nodeCondition, at := range observedAt {
  676. duration := now.Sub(at)
  677. if duration < period {
  678. results = append(results, nodeCondition)
  679. }
  680. }
  681. return results
  682. }
  683. // hasFsStatsType returns true if the fsStat is in the input list
  684. func hasFsStatsType(inputs []fsStatsType, item fsStatsType) bool {
  685. for _, input := range inputs {
  686. if input == item {
  687. return true
  688. }
  689. }
  690. return false
  691. }
  692. // hasNodeCondition returns true if the node condition is in the input list
  693. func hasNodeCondition(inputs []api.NodeConditionType, item api.NodeConditionType) bool {
  694. for _, input := range inputs {
  695. if input == item {
  696. return true
  697. }
  698. }
  699. return false
  700. }
  701. // mergeThresholds will merge both threshold lists eliminating duplicates.
  702. func mergeThresholds(inputsA []Threshold, inputsB []Threshold) []Threshold {
  703. results := inputsA
  704. for _, threshold := range inputsB {
  705. if !hasThreshold(results, threshold) {
  706. results = append(results, threshold)
  707. }
  708. }
  709. return results
  710. }
  711. // hasThreshold returns true if the threshold is in the input list
  712. func hasThreshold(inputs []Threshold, item Threshold) bool {
  713. for _, input := range inputs {
  714. if input.GracePeriod == item.GracePeriod && input.Operator == item.Operator && input.Signal == item.Signal && compareThresholdValue(input.Value, item.Value) {
  715. return true
  716. }
  717. }
  718. return false
  719. }
  720. // compareThresholdValue returns true if the two thresholdValue objects are logically the same
  721. func compareThresholdValue(a ThresholdValue, b ThresholdValue) bool {
  722. if a.Quantity != nil {
  723. if b.Quantity == nil {
  724. return false
  725. }
  726. return a.Quantity.Cmp(*b.Quantity) == 0
  727. }
  728. if b.Quantity != nil {
  729. return false
  730. }
  731. return a.Percentage == b.Percentage
  732. }
  733. // getStarvedResources returns the set of resources that are starved based on thresholds met.
  734. func getStarvedResources(thresholds []Threshold) []api.ResourceName {
  735. results := []api.ResourceName{}
  736. for _, threshold := range thresholds {
  737. if starvedResource, found := signalToResource[threshold.Signal]; found {
  738. results = append(results, starvedResource)
  739. }
  740. }
  741. return results
  742. }
  743. // isSoftEviction returns true if the thresholds met for the starved resource are only soft thresholds
  744. func isSoftEviction(thresholds []Threshold, starvedResource api.ResourceName) bool {
  745. for _, threshold := range thresholds {
  746. if resourceToCheck := signalToResource[threshold.Signal]; resourceToCheck != starvedResource {
  747. continue
  748. }
  749. if threshold.GracePeriod == time.Duration(0) {
  750. return false
  751. }
  752. }
  753. return true
  754. }
  755. // buildResourceToRankFunc returns ranking functions associated with resources
  756. func buildResourceToRankFunc(withImageFs bool) map[api.ResourceName]rankFunc {
  757. resourceToRankFunc := map[api.ResourceName]rankFunc{
  758. api.ResourceMemory: rankMemoryPressure,
  759. }
  760. // usage of an imagefs is optional
  761. if withImageFs {
  762. // with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes
  763. resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)
  764. resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
  765. // with an imagefs, imagefs pod rank func for eviction only includes rootfs
  766. resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceDisk)
  767. resourceToRankFunc[resourceImageFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceInodes)
  768. } else {
  769. // without an imagefs, nodefs pod rank func for eviction looks at all fs stats.
  770. // since imagefs and nodefs share a common device, they share common ranking functions.
  771. resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)
  772. resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
  773. resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)
  774. resourceToRankFunc[resourceImageFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
  775. }
  776. return resourceToRankFunc
  777. }
  778. // PodIsEvicted returns true if the reported pod status is due to an eviction.
  779. func PodIsEvicted(podStatus api.PodStatus) bool {
  780. return podStatus.Phase == api.PodFailed && podStatus.Reason == reason
  781. }
  782. // buildResourceToNodeReclaimFuncs returns reclaim functions associated with resources.
  783. func buildResourceToNodeReclaimFuncs(imageGC ImageGC, withImageFs bool) map[api.ResourceName]nodeReclaimFuncs {
  784. resourceToReclaimFunc := map[api.ResourceName]nodeReclaimFuncs{}
  785. // usage of an imagefs is optional
  786. if withImageFs {
  787. // with an imagefs, nodefs pressure should just delete logs
  788. resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs()}
  789. resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteLogs()}
  790. // with an imagefs, imagefs pressure should delete unused images
  791. resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteImages(imageGC, true)}
  792. resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{deleteImages(imageGC, false)}
  793. } else {
  794. // without an imagefs, nodefs pressure should delete logs, and unused images
  795. // since imagefs and nodefs share a common device, they share common reclaim functions
  796. resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, true)}
  797. resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, false)}
  798. resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, true)}
  799. resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, false)}
  800. }
  801. return resourceToReclaimFunc
  802. }
  803. // deleteLogs will delete logs to free up disk pressure.
  804. func deleteLogs() nodeReclaimFunc {
  805. return func() (*resource.Quantity, error) {
  806. // TODO: not yet supported.
  807. return resource.NewQuantity(int64(0), resource.BinarySI), nil
  808. }
  809. }
  810. // deleteImages will delete unused images to free up disk pressure.
  811. func deleteImages(imageGC ImageGC, reportBytesFreed bool) nodeReclaimFunc {
  812. return func() (*resource.Quantity, error) {
  813. glog.Infof("eviction manager: attempting to delete unused images")
  814. bytesFreed, err := imageGC.DeleteUnusedImages()
  815. if err != nil {
  816. return nil, err
  817. }
  818. reclaimed := int64(0)
  819. if reportBytesFreed {
  820. reclaimed = bytesFreed
  821. }
  822. return resource.NewQuantity(reclaimed, resource.BinarySI), nil
  823. }
  824. }
  825. func getMessage(resource api.ResourceName) string {
  826. return fmt.Sprintf(messageFmt, resource)
  827. }