kubelet_perf.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e

import (
    "fmt"
    "strings"
    "time"

    "k8s.io/kubernetes/pkg/api"
    client "k8s.io/kubernetes/pkg/client/unversioned"
    "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats"
    "k8s.io/kubernetes/pkg/util/sets"
    "k8s.io/kubernetes/pkg/util/uuid"
    "k8s.io/kubernetes/test/e2e/framework"

    . "github.com/onsi/ginkgo"
    . "github.com/onsi/gomega"
)
const (
    // Interval to poll /stats/container on a node.
    containerStatsPollingPeriod = 10 * time.Second
    // The monitoring time for one test.
    monitoringTime = 20 * time.Minute
    // The periodic reporting period.
    reportingPeriod = 5 * time.Minute
    // Timeout for waiting for the image prepulling to complete.
    imagePrePullingLongTimeout = time.Minute * 8
)
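// resourceTest describes a single resource tracking test case: the pod
// density to create on each node and the expected upper bounds on CPU and
// memory usage for the kubelet and runtime system containers.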
type resourceTest struct {
    podsPerNode int
    cpuLimits   framework.ContainersCPUSummary
    memLimits   framework.ResourceUsagePerContainer
}
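// logPodsOnNodes logs the number of pods currently running on each of the
// given nodes, as reported by the kubelet; nodes whose pod list cannot be
// retrieved are logged and skipped.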
func logPodsOnNodes(c *client.Client, nodeNames []string) {
    for _, n := range nodeNames {
        podList, err := framework.GetKubeletRunningPods(c, n)
        if err != nil {
            framework.Logf("Unable to retrieve kubelet pods for node %v", n)
            continue
        }
        framework.Logf("%d pods are running on node %v", len(podList.Items), n)
    }
}
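// runResourceTrackingTest creates an RC with podsPerNode pause pods per node,
// monitors resource usage for monitoringTime while periodically logging the
// running pods, then reports the collected usage, verifies it against the
// expected CPU and memory limits, and deletes the RC.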
func runResourceTrackingTest(f *framework.Framework, podsPerNode int, nodeNames sets.String, rm *framework.ResourceMonitor,
    expectedCPU map[string]map[float64]float64, expectedMemory framework.ResourceUsagePerContainer) {
    numNodes := nodeNames.Len()
    totalPods := podsPerNode * numNodes
    By(fmt.Sprintf("Creating a RC of %d pods and waiting until all pods of this RC are running", totalPods))
    rcName := fmt.Sprintf("resource%d-%s", totalPods, string(uuid.NewUUID()))

    // TODO: Use a more realistic workload
    Expect(framework.RunRC(framework.RCConfig{
        Client:    f.Client,
        Name:      rcName,
        Namespace: f.Namespace.Name,
        Image:     framework.GetPauseImageName(f.Client),
        Replicas:  totalPods,
    })).NotTo(HaveOccurred())

    // Log once and flush the stats.
    rm.LogLatest()
    rm.Reset()

    By("Start monitoring resource usage")
    // Periodically dump the CPU summary until the deadline is met.
    // Note that without calling framework.ResourceMonitor.Reset(), the stats
    // would occupy increasingly more memory. This should be fine
    // for the current test duration, but we should reclaim the
    // entries if we plan to monitor longer (e.g., 8 hours).
    deadline := time.Now().Add(monitoringTime)
    for time.Now().Before(deadline) {
        timeLeft := deadline.Sub(time.Now())
        framework.Logf("Still running...%v left", timeLeft)
        if timeLeft < reportingPeriod {
            time.Sleep(timeLeft)
        } else {
            time.Sleep(reportingPeriod)
        }
        logPodsOnNodes(f.Client, nodeNames.List())
    }

    By("Reporting overall resource usage")
    logPodsOnNodes(f.Client, nodeNames.List())
    usageSummary, err := rm.GetLatest()
    Expect(err).NotTo(HaveOccurred())
    // TODO(random-liu): Remove the original log when we migrate to new perfdash
    framework.Logf("%s", rm.FormatResourceUsage(usageSummary))
    // Log perf result
    framework.PrintPerfData(framework.ResourceUsageToPerfData(rm.GetMasterNodeLatest(usageSummary)))
    verifyMemoryLimits(f.Client, expectedMemory, usageSummary)

    cpuSummary := rm.GetCPUSummary()
    framework.Logf("%s", rm.FormatCPUSummary(cpuSummary))
    // Log perf result
    framework.PrintPerfData(framework.CPUUsageToPerfData(rm.GetMasterNodeCPUSummary(cpuSummary)))
    verifyCPULimits(expectedCPU, cpuSummary)

    By("Deleting the RC")
    framework.DeleteRCAndPods(f.Client, f.Namespace.Name, rcName)
}
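// verifyMemoryLimits compares the observed RSS memory usage of each expected
// container on every node against its limit and fails the test if any limit
// is exceeded. Kubelet heap stats are logged for offending nodes to help
// debugging.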
func verifyMemoryLimits(c *client.Client, expected framework.ResourceUsagePerContainer, actual framework.ResourceUsagePerNode) {
    if expected == nil {
        return
    }
    var errList []string
    for nodeName, nodeSummary := range actual {
        var nodeErrs []string
        for cName, expectedResult := range expected {
            container, ok := nodeSummary[cName]
            if !ok {
                nodeErrs = append(nodeErrs, fmt.Sprintf("container %q: missing", cName))
                continue
            }

            expectedValue := expectedResult.MemoryRSSInBytes
            actualValue := container.MemoryRSSInBytes
            if expectedValue != 0 && actualValue > expectedValue {
                nodeErrs = append(nodeErrs, fmt.Sprintf("container %q: expected RSS memory (bytes) < %d; got %d",
                    cName, expectedValue, actualValue))
            }
        }
        if len(nodeErrs) > 0 {
            errList = append(errList, fmt.Sprintf("node %v:\n %s", nodeName, strings.Join(nodeErrs, ", ")))
            heapStats, err := framework.GetKubeletHeapStats(c, nodeName)
            if err != nil {
                framework.Logf("Unable to get heap stats from %q", nodeName)
            } else {
                framework.Logf("Heap stats on %q:\n%v", nodeName, heapStats)
            }
        }
    }
    if len(errList) > 0 {
        framework.Failf("Memory usage exceeding limits:\n %s", strings.Join(errList, "\n"))
    }
}
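// verifyCPULimits compares the observed CPU usage percentiles of each
// expected container on every node against the expected values and fails the
// test if any percentile exceeds its limit.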
func verifyCPULimits(expected framework.ContainersCPUSummary, actual framework.NodesCPUSummary) {
    if expected == nil {
        return
    }
    var errList []string
    for nodeName, perNodeSummary := range actual {
        var nodeErrs []string
        for cName, expectedResult := range expected {
            perContainerSummary, ok := perNodeSummary[cName]
            if !ok {
                nodeErrs = append(nodeErrs, fmt.Sprintf("container %q: missing", cName))
                continue
            }
            for p, expectedValue := range expectedResult {
                actualValue, ok := perContainerSummary[p]
                if !ok {
                    nodeErrs = append(nodeErrs, fmt.Sprintf("container %q: missing percentile %v", cName, p))
                    continue
                }
                if actualValue > expectedValue {
                    nodeErrs = append(nodeErrs, fmt.Sprintf("container %q: expected %.0fth%% usage < %.3f; got %.3f",
                        cName, p*100, expectedValue, actualValue))
                }
            }
        }
        if len(nodeErrs) > 0 {
            errList = append(errList, fmt.Sprintf("node %v:\n %s", nodeName, strings.Join(nodeErrs, ", ")))
        }
    }
    if len(errList) > 0 {
        framework.Failf("CPU usage exceeding limits:\n %s", strings.Join(errList, "\n"))
    }
}
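// The spec below monitors the kubelet and container runtime system containers
// on all schedulable nodes while running an increasing number of pause pods
// per node, and verifies the measured usage against the expected limits.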
// Slow by design (1 hour)
var _ = framework.KubeDescribe("Kubelet [Serial] [Slow]", func() {
    var nodeNames sets.String
    f := framework.NewDefaultFramework("kubelet-perf")
    var om *framework.RuntimeOperationMonitor
    var rm *framework.ResourceMonitor

    BeforeEach(func() {
        // Wait until the image prepull pods have completed so that they don't
        // affect the runtime CPU usage. Fail the test if prepulling cannot
        // finish in time.
        if err := framework.WaitForPodsSuccess(f.Client, api.NamespaceSystem, framework.ImagePullerLabels, imagePrePullingLongTimeout); err != nil {
            framework.Failf("Image puller didn't complete in %v, not running resource usage test since the metrics might be adulterated", imagePrePullingLongTimeout)
        }
        nodes := framework.GetReadySchedulableNodesOrDie(f.Client)
        nodeNames = sets.NewString()
        for _, node := range nodes.Items {
            nodeNames.Insert(node.Name)
        }
        om = framework.NewRuntimeOperationMonitor(f.Client)
        rm = framework.NewResourceMonitor(f.Client, framework.TargetContainers(), containerStatsPollingPeriod)
        rm.Start()
    })

    AfterEach(func() {
        rm.Stop()
        result := om.GetLatestRuntimeOperationErrorRate()
        framework.Logf("runtime operation error metrics:\n%s", framework.FormatRuntimeOperationErrorRate(result))
    })

    framework.KubeDescribe("regular resource usage tracking", func() {
        // We assume that the scheduler will make reasonable scheduling choices
        // and assign ~N pods to each node.
        // Although we want to track N pods per node, there are N + add-on pods
        // in the cluster. The cluster add-on pods can be distributed unevenly
        // among the nodes because they are created during cluster
        // initialization. This *noise* is obvious when N is small. We
        // deliberately set higher resource usage limits to account for the
        // noise.
        rTests := []resourceTest{
            {
                podsPerNode: 0,
                cpuLimits: framework.ContainersCPUSummary{
                    stats.SystemContainerKubelet: {0.50: 0.06, 0.95: 0.08},
                    stats.SystemContainerRuntime: {0.50: 0.05, 0.95: 0.06},
                },
                // We set the memory limits generously because the distribution
                // of the add-on pods affects the memory usage on each node.
                memLimits: framework.ResourceUsagePerContainer{
                    stats.SystemContainerKubelet: &framework.ContainerResourceUsage{MemoryRSSInBytes: 70 * 1024 * 1024},
                    // The detail can be found at https://github.com/kubernetes/kubernetes/issues/28384#issuecomment-244158892
                    stats.SystemContainerRuntime: &framework.ContainerResourceUsage{MemoryRSSInBytes: 125 * 1024 * 1024},
                },
            },
            {
                podsPerNode: 35,
                cpuLimits: framework.ContainersCPUSummary{
                    stats.SystemContainerKubelet: {0.50: 0.12, 0.95: 0.14},
                    stats.SystemContainerRuntime: {0.50: 0.05, 0.95: 0.07},
                },
                // We set the memory limits generously because the distribution
                // of the add-on pods affects the memory usage on each node.
                memLimits: framework.ResourceUsagePerContainer{
                    stats.SystemContainerKubelet: &framework.ContainerResourceUsage{MemoryRSSInBytes: 70 * 1024 * 1024},
                    stats.SystemContainerRuntime: &framework.ContainerResourceUsage{MemoryRSSInBytes: 200 * 1024 * 1024},
                },
            },
            {
                podsPerNode: 100,
                cpuLimits: framework.ContainersCPUSummary{
                    stats.SystemContainerKubelet: {0.50: 0.17, 0.95: 0.22},
                    stats.SystemContainerRuntime: {0.50: 0.06, 0.95: 0.09},
                },
                // We set the memory limits generously because the distribution
                // of the add-on pods affects the memory usage on each node.
                memLimits: framework.ResourceUsagePerContainer{
                    stats.SystemContainerKubelet: &framework.ContainerResourceUsage{MemoryRSSInBytes: 80 * 1024 * 1024},
                    stats.SystemContainerRuntime: &framework.ContainerResourceUsage{MemoryRSSInBytes: 300 * 1024 * 1024},
                },
            },
        }
        for _, testArg := range rTests {
            itArg := testArg
            podsPerNode := itArg.podsPerNode
            name := fmt.Sprintf(
                "resource tracking for %d pods per node", podsPerNode)
            It(name, func() {
                runResourceTrackingTest(f, podsPerNode, nodeNames, rm, itArg.cpuLimits, itArg.memLimits)
            })
        }
    })

    framework.KubeDescribe("experimental resource usage tracking [Feature:ExperimentalResourceUsageTracking]", func() {
        density := []int{100}
        for i := range density {
            podsPerNode := density[i]
            name := fmt.Sprintf(
                "resource tracking for %d pods per node", podsPerNode)
            It(name, func() {
                runResourceTrackingTest(f, podsPerNode, nodeNames, rm, nil, nil)
            })
        }
    })
})