// +build linux

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e_node

import (
	"fmt"
	"strings"
	"time"

	client "k8s.io/kubernetes/pkg/client/unversioned"
	"k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats"
	"k8s.io/kubernetes/test/e2e/framework"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)
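
// This suite measures the steady-state CPU and memory usage of the Kubelet
// and the container runtime on a single node while it runs batches of pods.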
var _ = framework.KubeDescribe("Resource-usage [Serial] [Slow]", func() {
	const (
		// Interval to poll /stats/container on a node
		containerStatsPollingPeriod = 10 * time.Second
	)

	var (
		ns string
		rc *ResourceCollector
		om *framework.RuntimeOperationMonitor
	)

	f := framework.NewDefaultFramework("resource-usage")

	BeforeEach(func() {
		ns = f.Namespace.Name
		om = framework.NewRuntimeOperationMonitor(f.Client)
		// The test collects resource usage from a standalone cAdvisor pod.
		// The cAdvisor embedded in the Kubelet has a housekeeping interval
		// of 10s, which is too coarse to capture resource usage spikes, and
		// shortening that interval would increase the Kubelet's overhead.
		// Hence we use a standalone cAdvisor pod instead.
		f.PodClient().CreateSync(getCadvisorPod())
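		// Poll the standalone cAdvisor pod for container stats every
		// containerStatsPollingPeriod.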
		rc = NewResourceCollector(containerStatsPollingPeriod)
	})
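
	// Log the runtime operation error rates observed during the test.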
	AfterEach(func() {
		result := om.GetLatestRuntimeOperationErrorRate()
		framework.Logf("runtime operation error metrics:\n%s", framework.FormatRuntimeOperationErrorRate(result))
	})
	// This test measures and verifies that the steady-state resource usage of
	// the node stays within the given limits. It collects data from a
	// standalone cAdvisor with a 1s housekeeping interval, and verifies the
	// CPU usage percentiles and the latest memory usage.
	Context("regular resource usage tracking", func() {
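		// Each entry sets the number of pods to run plus per-container CPU
		// limits (in cores, keyed by usage percentile) and memory RSS limits
		// to verify for the Kubelet and the container runtime.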
		rTests := []resourceTest{
			{
				podsNr: 10,
				cpuLimits: framework.ContainersCPUSummary{
					stats.SystemContainerKubelet: {0.50: 0.30, 0.95: 0.35},
					stats.SystemContainerRuntime: {0.50: 0.30, 0.95: 0.40},
				},
				memLimits: framework.ResourceUsagePerContainer{
					stats.SystemContainerKubelet: &framework.ContainerResourceUsage{MemoryRSSInBytes: 100 * 1024 * 1024},
					stats.SystemContainerRuntime: &framework.ContainerResourceUsage{MemoryRSSInBytes: 400 * 1024 * 1024},
				},
			},
		}

		for _, testArg := range rTests {
			itArg := testArg
			It(fmt.Sprintf("resource tracking for %d pods per node", itArg.podsNr), func() {
				runResourceUsageTest(f, rc, itArg)
				// Log and verify resource usage
				logAndVerifyResource(f, rc, itArg.cpuLimits, itArg.memLimits, itArg.getTestName(), true)
			})
		}
	})
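
	// The benchmark variant runs larger batches of pods without limits; it
	// only collects and reports usage data.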
	Context("regular resource usage tracking [Benchmark]", func() {
		rTests := []resourceTest{
			{
				podsNr: 10,
			},
			{
				podsNr: 35,
			},
			{
				podsNr: 105,
			},
		}

		for _, testArg := range rTests {
			itArg := testArg
			It(fmt.Sprintf("resource tracking for %d pods per node [Benchmark]", itArg.podsNr), func() {
				runResourceUsageTest(f, rc, itArg)
				// Log and verify resource usage
				logAndVerifyResource(f, rc, itArg.cpuLimits, itArg.memLimits, itArg.getTestName(), true)
			})
		}
	})
})
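
// resourceTest describes one test case: the number of pods to create and the
// per-container CPU and memory limits to verify (nil limits skip verification).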
type resourceTest struct {
	podsNr    int
	cpuLimits framework.ContainersCPUSummary
	memLimits framework.ResourceUsagePerContainer
}

func (rt *resourceTest) getTestName() string {
	return fmt.Sprintf("resource_%d", rt.podsNr)
}
// runResourceUsageTest creates a batch of pods and monitors the node's
// resource usage for monitoringTime, logging interim results every
// reportingPeriod.
func runResourceUsageTest(f *framework.Framework, rc *ResourceCollector, testArg resourceTest) {
	const (
		// The monitoring time for one test
		monitoringTime = 10 * time.Minute
		// How often to log interim results while monitoring
		reportingPeriod = 5 * time.Minute
		// Time to wait after pod creation so that measurements reflect steady state
		sleepAfterCreatePods = 10 * time.Second
	)
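	// Use pause pods, which consume almost no resources themselves, so that
	// the measured usage reflects the Kubelet and runtime overhead.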
	pods := newTestPods(testArg.podsNr, ImageRegistry[pauseImage], "test_pod")

	rc.Start()
	// Explicitly delete pods so that cleanup by the namespace controller does not time out.
	defer deletePodsSync(f, append(pods, getCadvisorPod()))
	defer rc.Stop()

	By("Creating a batch of Pods")
	f.PodClient().CreateBatch(pods)

	// Wait for the node to settle after pod creation
	time.Sleep(sleepAfterCreatePods)

	// Log once and flush the stats.
	rc.LogLatest()
	rc.Reset()
- By("Start monitoring resource usage")
- // Periodically dump the cpu summary until the deadline is met.
- // Note that without calling framework.ResourceMonitor.Reset(), the stats
- // would occupy increasingly more memory. This should be fine
- // for the current test duration, but we should reclaim the
- // entries if we plan to monitor longer (e.g., 8 hours).
- deadline := time.Now().Add(monitoringTime)
- for time.Now().Before(deadline) {
- timeLeft := deadline.Sub(time.Now())
- framework.Logf("Still running...%v left", timeLeft)
- if timeLeft < reportingPeriod {
- time.Sleep(timeLeft)
- } else {
- time.Sleep(reportingPeriod)
- }
- logPods(f.Client)
- }
- By("Reporting overall resource usage")
- logPods(f.Client)
- }
// logAndVerifyResource prints the resource usage as perf data and verifies whether resource usage satisfies the limits.
func logAndVerifyResource(f *framework.Framework, rc *ResourceCollector, cpuLimits framework.ContainersCPUSummary,
	memLimits framework.ResourceUsagePerContainer, testName string, isVerify bool) {
	nodeName := framework.TestContext.NodeName

	// Obtain memory PerfData
	usagePerContainer, err := rc.GetLatest()
	Expect(err).NotTo(HaveOccurred())
	framework.Logf("%s", formatResourceUsageStats(usagePerContainer))

	usagePerNode := make(framework.ResourceUsagePerNode)
	usagePerNode[nodeName] = usagePerContainer

	// Obtain CPU PerfData
	cpuSummary := rc.GetCPUSummary()
	framework.Logf("%s", formatCPUSummary(cpuSummary))

	cpuSummaryPerNode := make(framework.NodesCPUSummary)
	cpuSummaryPerNode[nodeName] = cpuSummary

	// Print resource usage
	framework.PrintPerfData(framework.ResourceUsageToPerfDataWithLabels(usagePerNode,
		map[string]string{"test": testName, "node": nodeName}))
	framework.PrintPerfData(framework.CPUUsageToPerfDataWithLabels(cpuSummaryPerNode,
		map[string]string{"test": testName, "node": nodeName}))

	// Verify resource usage
	if isVerify {
		verifyMemoryLimits(f.Client, memLimits, usagePerNode)
		verifyCPULimits(cpuLimits, cpuSummaryPerNode)
	}
}
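
// verifyMemoryLimits checks that the latest memory RSS of each listed
// container stays below its expected limit, and dumps the Kubelet's heap
// stats for any node that exceeds them.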
func verifyMemoryLimits(c *client.Client, expected framework.ResourceUsagePerContainer, actual framework.ResourceUsagePerNode) {
	if expected == nil {
		return
	}
	var errList []string
	for nodeName, nodeSummary := range actual {
		var nodeErrs []string
		for cName, expectedResult := range expected {
			container, ok := nodeSummary[cName]
			if !ok {
				nodeErrs = append(nodeErrs, fmt.Sprintf("container %q: missing", cName))
				continue
			}

			expectedValue := expectedResult.MemoryRSSInBytes
			actualValue := container.MemoryRSSInBytes
			if expectedValue != 0 && actualValue > expectedValue {
				nodeErrs = append(nodeErrs, fmt.Sprintf("container %q: expected RSS memory < %d bytes; got %d",
					cName, expectedValue, actualValue))
			}
		}
		if len(nodeErrs) > 0 {
			errList = append(errList, fmt.Sprintf("node %v:\n %s", nodeName, strings.Join(nodeErrs, ", ")))
			heapStats, err := framework.GetKubeletHeapStats(c, nodeName)
			if err != nil {
				framework.Logf("Unable to get heap stats from %q", nodeName)
			} else {
- framework.Logf("Heap stats on %q\n:%v", nodeName, heapStats)
			}
		}
	}
	if len(errList) > 0 {
		framework.Failf("Memory usage exceeding limits:\n %s", strings.Join(errList, "\n"))
	}
}
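
// verifyCPULimits checks that the measured CPU usage at each expected
// percentile does not exceed the corresponding limit for any container.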
func verifyCPULimits(expected framework.ContainersCPUSummary, actual framework.NodesCPUSummary) {
	if expected == nil {
		return
	}
	var errList []string
	for nodeName, perNodeSummary := range actual {
		var nodeErrs []string
		for cName, expectedResult := range expected {
			perContainerSummary, ok := perNodeSummary[cName]
			if !ok {
				nodeErrs = append(nodeErrs, fmt.Sprintf("container %q: missing", cName))
				continue
			}

			for p, expectedValue := range expectedResult {
				actualValue, ok := perContainerSummary[p]
				if !ok {
					nodeErrs = append(nodeErrs, fmt.Sprintf("container %q: missing percentile %v", cName, p))
					continue
				}
				if actualValue > expectedValue {
					nodeErrs = append(nodeErrs, fmt.Sprintf("container %q: expected %.0fth%% usage < %.3f; got %.3f",
						cName, p*100, expectedValue, actualValue))
				}
			}
		}
		if len(nodeErrs) > 0 {
			errList = append(errList, fmt.Sprintf("node %v:\n %s", nodeName, strings.Join(nodeErrs, ", ")))
		}
	}
	if len(errList) > 0 {
		framework.Failf("CPU usage exceeding limits:\n %s", strings.Join(errList, "\n"))
	}
}
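
// logPods logs how many pods are currently running on the node under test.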
func logPods(c *client.Client) {
	nodeName := framework.TestContext.NodeName
	podList, err := framework.GetKubeletRunningPods(c, nodeName)
	if err != nil {
		framework.Logf("Unable to retrieve kubelet pods for node %v: %v", nodeName, err)
		return
	}
	framework.Logf("%d pods are running on node %v", len(podList.Items), nodeName)
}