- // +build linux
- /*
- Copyright 2015 The Kubernetes Authors.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
- package cm
- import (
- "fmt"
- "io/ioutil"
- "os"
- "path"
- "strconv"
- "sync"
- "time"
- "github.com/blang/semver"
- "github.com/golang/glog"
- "github.com/opencontainers/runc/libcontainer/cgroups"
- "github.com/opencontainers/runc/libcontainer/cgroups/fs"
- "github.com/opencontainers/runc/libcontainer/configs"
- "k8s.io/kubernetes/pkg/api"
- "k8s.io/kubernetes/pkg/api/resource"
- "k8s.io/kubernetes/pkg/kubelet/cadvisor"
- "k8s.io/kubernetes/pkg/kubelet/qos"
- "k8s.io/kubernetes/pkg/util"
- utilerrors "k8s.io/kubernetes/pkg/util/errors"
- "k8s.io/kubernetes/pkg/util/mount"
- "k8s.io/kubernetes/pkg/util/oom"
- "k8s.io/kubernetes/pkg/util/procfs"
- "k8s.io/kubernetes/pkg/util/runtime"
- "k8s.io/kubernetes/pkg/util/sets"
- utilsysctl "k8s.io/kubernetes/pkg/util/sysctl"
- "k8s.io/kubernetes/pkg/util/wait"
- )
- const (
- // The percent of machine memory capacity used to compute the docker memory
- // resource container's hard limit, working around a docker memory leak.
- // See kubernetes/issues/9881 for more detail.
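- // For example, on a node with 8GiB of memory the computed hard limit is
- // 8GiB * 70 / 100 = 5.6GiB, comfortably above MinDockerMemoryLimit.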
- DockerMemoryLimitThresholdPercent = 70
- // The minimum memory limit allocated to the docker container: 150Mi.
- MinDockerMemoryLimit = 150 * 1024 * 1024
- dockerProcessName = "docker"
- dockerPidFile = "/var/run/docker.pid"
- containerdProcessName = "docker-containerd"
- containerdPidFile = "/run/docker/libcontainerd/docker-containerd.pid"
- )
- var (
- // The docker version in which containerd was introduced.
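- // From this version on, docker runs a separate docker-containerd process
- // whose pid must also be moved into the runtime cgroup (see
- // ensureDockerInContainer below).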
- containerdVersion = semver.MustParse("1.11.0")
- )
- // A non-user container tracked by the Kubelet.
- type systemContainer struct {
- // Absolute name of the container.
- name string
- // CPU limit in millicores.
- cpuMillicores int64
- // Function that ensures the state of the container.
- // m is the cgroup manager for the specified container.
- ensureStateFunc func(m *fs.Manager) error
- // Manager for the cgroups of the external container.
- manager *fs.Manager
- }
- func newSystemCgroups(containerName string) *systemContainer {
- return &systemContainer{
- name: containerName,
- manager: createManager(containerName),
- }
- }
- type containerManagerImpl struct {
- sync.RWMutex
- cadvisorInterface cadvisor.Interface
- mountUtil mount.Interface
- NodeConfig
- status Status
- // External containers being managed.
- systemContainers []*systemContainer
- qosContainers QOSContainersInfo
- periodicTasks []func()
- // holds all the mounted cgroup subsystems
- subsystems *CgroupSubsystems
- nodeInfo *api.Node
- }
- type features struct {
- cpuHardcapping bool
- }
- var _ ContainerManager = &containerManagerImpl{}
- // checks if the required cgroup subsystems are mounted.
- // As of now, 'cpu', 'cpuacct', 'cpuset' and 'memory' are required.
- // cpu quota is a soft requirement.
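- // A cgroup v1 mount advertises its subsystems as mount options, e.g.
- // (illustrative): cgroup /sys/fs/cgroup/cpu,cpuacct cgroup rw,cpu,cpuacct 0 0;
- // the loop below matches those options against the expected subsystem names.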
- func validateSystemRequirements(mountUtil mount.Interface) (features, error) {
- const (
- cgroupMountType = "cgroup"
- localErr = "system validation failed"
- )
- var (
- cpuMountPoint string
- f features
- )
- mountPoints, err := mountUtil.List()
- if err != nil {
- return f, fmt.Errorf("%s - %v", localErr, err)
- }
- expectedCgroups := sets.NewString("cpu", "cpuacct", "cpuset", "memory")
- for _, mountPoint := range mountPoints {
- if mountPoint.Type == cgroupMountType {
- for _, opt := range mountPoint.Opts {
- if expectedCgroups.Has(opt) {
- expectedCgroups.Delete(opt)
- }
- if opt == "cpu" {
- cpuMountPoint = mountPoint.Path
- }
- }
- }
- }
- if expectedCgroups.Len() > 0 {
- return f, fmt.Errorf("%s - Following Cgroup subsystem not mounted: %v", localErr, expectedCgroups.List())
- }
- // Check if cpu quota is available.
- // The cpu cgroup is required, so it is expected to be mounted at this point.
- periodExists, err := util.FileExists(path.Join(cpuMountPoint, "cpu.cfs_period_us"))
- if err != nil {
- glog.Errorf("failed to detect if CPU cgroup cpu.cfs_period_us is available - %v", err)
- }
- quotaExists, err := util.FileExists(path.Join(cpuMountPoint, "cpu.cfs_quota_us"))
- if err != nil {
- glog.Errorf("failed to detect if CPU cgroup cpu.cfs_quota_us is available - %v", err)
- }
- if quotaExists && periodExists {
- f.cpuHardcapping = true
- }
- return f, nil
- }
- // TODO(vmarmol): Add limits to the system containers.
- // Takes the absolute name of the specified containers.
- // An empty container name disables use of that container.
- func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface, nodeConfig NodeConfig) (ContainerManager, error) {
- // Check if Cgroup-root actually exists on the node
- if nodeConfig.CgroupsPerQOS {
- if nodeConfig.CgroupRoot == "" {
- return nil, fmt.Errorf("invalid configuration: cgroups-per-qos was specified and cgroup-root was not specified. To enable the QoS cgroup hierarchy you need to specify a valid cgroup-root")
- }
- if _, err := os.Stat(nodeConfig.CgroupRoot); err != nil {
- return nil, fmt.Errorf("invalid configuration: cgroup-root doesn't exist : %v", err)
- }
- }
- subsystems, err := GetCgroupSubsystems()
- if err != nil {
- return nil, fmt.Errorf("failed to get mounted cgroup subsystems: %v", err)
- }
- return &containerManagerImpl{
- cadvisorInterface: cadvisorInterface,
- mountUtil: mountUtil,
- NodeConfig: nodeConfig,
- subsystems: subsystems,
- }, nil
- }
- // NewPodContainerManager is a factory method that returns a PodContainerManager object.
- // If QoS cgroups are enabled it returns the general pod container manager,
- // otherwise it returns a no-op manager which essentially does nothing.
- func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
- if cm.NodeConfig.CgroupsPerQOS {
- return &podContainerManagerImpl{
- qosContainersInfo: cm.qosContainers,
- nodeInfo: cm.nodeInfo,
- subsystems: cm.subsystems,
- cgroupManager: NewCgroupManager(cm.subsystems),
- }
- }
- return &podContainerManagerNoop{
- cgroupRoot: cm.NodeConfig.CgroupRoot,
- }
- }
- // Create a cgroup container manager.
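- // For example, createManager("/system") returns a manager that, on Apply,
- // creates the cgroup <subsystem-mount>/system for each mounted subsystem
- // (a sketch of libcontainer's cgroupfs driver behavior).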
- func createManager(containerName string) *fs.Manager {
- allowAllDevices := true
- return &fs.Manager{
- Cgroups: &configs.Cgroup{
- Parent: "/",
- Name: containerName,
- Resources: &configs.Resources{
- AllowAllDevices: &allowAllDevices,
- },
- },
- }
- }
- type KernelTunableBehavior string
- const (
- KernelTunableWarn KernelTunableBehavior = "warn"
- KernelTunableError KernelTunableBehavior = "error"
- KernelTunableModify KernelTunableBehavior = "modify"
- )
- // InitQOS creates the top level qos cgroup containers
- // We create top level QoS containers for only Burstable and Best Effort
- // and not Guaranteed QoS class. All guaranteed pods are nested under the
- // RootContainer by default. InitQOS is called only once during kubelet bootstrapping.
- // TODO(@dubstack) Add support for cgroup-root to work on both systemd and cgroupfs
- // drivers. Currently we only support systems running the cgroupfs driver.
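- // For example, with rootContainer "/kube" InitQOS produces the hierarchy:
- //   /kube            (Guaranteed pods stay here)
- //   /kube/Burstable
- //   /kube/BestEffort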
- func InitQOS(rootContainer string, subsystems *CgroupSubsystems) (QOSContainersInfo, error) {
- cm := NewCgroupManager(subsystems)
- // Top-level QoS containers are created only for the Burstable
- // and BestEffort classes.
- qosClasses := [2]qos.QOSClass{qos.Burstable, qos.BestEffort}
- // Create containers for both qos classes
- for _, qosClass := range qosClasses {
- // get the container's absolute name
- absoluteContainerName := path.Join(rootContainer, string(qosClass))
- // containerConfig object stores the cgroup specifications
- containerConfig := &CgroupConfig{
- Name: absoluteContainerName,
- ResourceParameters: &ResourceConfig{},
- }
- // TODO(@dubstack) Add support on systemd cgroups driver
- if err := cm.Create(containerConfig); err != nil {
- return QOSContainersInfo{}, fmt.Errorf("failed to create top level %v QoS cgroup: %v", qosClass, err)
- }
- }
- // Store the top level qos container names
- qosContainersInfo := QOSContainersInfo{
- Guaranteed: rootContainer,
- Burstable: path.Join(rootContainer, string(qos.Burstable)),
- BestEffort: path.Join(rootContainer, string(qos.BestEffort)),
- }
- return qosContainersInfo, nil
- }
- // setupKernelTunables validates that kernel tunable flags are set as expected.
- // Depending on the specified option, it will warn, error, or modify the kernel tunable flags.
- func setupKernelTunables(option KernelTunableBehavior) error {
- desiredState := map[string]int{
- utilsysctl.VmOvercommitMemory: utilsysctl.VmOvercommitMemoryAlways,
- utilsysctl.VmPanicOnOOM: utilsysctl.VmPanicOnOOMInvokeOOMKiller,
- utilsysctl.KernelPanic: utilsysctl.KernelPanicRebootTimeout,
- utilsysctl.KernelPanicOnOops: utilsysctl.KernelPanicOnOopsAlways,
- }
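- // With the utilsysctl constants at the time of writing, this desired state
- // corresponds to vm.overcommit_memory=1, vm.panic_on_oom=0, kernel.panic=10
- // and kernel.panic_on_oops=1.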
- sysctl := utilsysctl.New()
- errList := []error{}
- for flag, expectedValue := range desiredState {
- val, err := sysctl.GetSysctl(flag)
- if err != nil {
- errList = append(errList, err)
- continue
- }
- if val == expectedValue {
- continue
- }
- switch option {
- case KernelTunableError:
- errList = append(errList, fmt.Errorf("Invalid kernel flag: %v, expected value: %v, actual value: %v", flag, expectedValue, val))
- case KernelTunableWarn:
- glog.V(2).Infof("Invalid kernel flag: %v, expected value: %v, actual value: %v", flag, expectedValue, val)
- case KernelTunableModify:
- glog.V(2).Infof("Updating kernel flag: %v, expected value: %v, actual value: %v", flag, expectedValue, val)
- err = sysctl.SetSysctl(flag, expectedValue)
- if err != nil {
- errList = append(errList, err)
- }
- }
- }
- return utilerrors.NewAggregate(errList)
- }
- func (cm *containerManagerImpl) setupNode() error {
- f, err := validateSystemRequirements(cm.mountUtil)
- if err != nil {
- return err
- }
- if !f.cpuHardcapping {
- cm.status.SoftRequirements = fmt.Errorf("CPU hardcapping unsupported")
- }
- b := KernelTunableModify
- if cm.GetNodeConfig().ProtectKernelDefaults {
- b = KernelTunableError
- }
- if err := setupKernelTunables(b); err != nil {
- return err
- }
- // Setup top level qos containers only if CgroupsPerQOS flag is specified as true
- if cm.NodeConfig.CgroupsPerQOS {
- qosContainersInfo, err := InitQOS(cm.NodeConfig.CgroupRoot, cm.subsystems)
- if err != nil {
- return fmt.Errorf("failed to initialise top level QOS containers: %v", err)
- }
- cm.qosContainers = qosContainersInfo
- }
- systemContainers := []*systemContainer{}
- if cm.ContainerRuntime == "docker" {
- if cm.RuntimeCgroupsName != "" {
- cont := newSystemCgroups(cm.RuntimeCgroupsName)
- info, err := cm.cadvisorInterface.MachineInfo()
- var capacity = api.ResourceList{}
- if err == nil {
- capacity = cadvisor.CapacityFromMachineInfo(info)
- }
- // 70% of machine memory; a missing capacity (cadvisor error) yields 0 here and is clamped to the minimum below.
- memoryLimit := capacity.Memory().Value() * DockerMemoryLimitThresholdPercent / 100
- if memoryLimit < MinDockerMemoryLimit {
- glog.Warningf("Memory limit %d for container %s is too small, reset it to %d", memoryLimit, cm.RuntimeCgroupsName, MinDockerMemoryLimit)
- memoryLimit = MinDockerMemoryLimit
- }
- glog.V(2).Infof("Configure resource-only container %s with memory limit: %d", cm.RuntimeCgroupsName, memoryLimit)
- allowAllDevices := true
- dockerContainer := &fs.Manager{
- Cgroups: &configs.Cgroup{
- Parent: "/",
- Name: cm.RuntimeCgroupsName,
- Resources: &configs.Resources{
- Memory: memoryLimit,
- MemorySwap: -1,
- AllowAllDevices: &allowAllDevices,
- },
- },
- }
- dockerVersion := getDockerVersion(cm.cadvisorInterface)
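- // -900 is a strongly negative oom_score_adj (valid range -1000..1000), so
- // the kernel OOM killer picks the runtime daemon only as a last resort.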
- cont.ensureStateFunc = func(manager *fs.Manager) error {
- return ensureDockerInContainer(dockerVersion, -900, dockerContainer)
- }
- systemContainers = append(systemContainers, cont)
- } else {
- cm.periodicTasks = append(cm.periodicTasks, func() {
- cont, err := getContainerNameForProcess(dockerProcessName, dockerPidFile)
- if err != nil {
- glog.Error(err)
- return
- }
- glog.V(2).Infof("Discovered runtime cgroups name: %s", cont)
- cm.Lock()
- defer cm.Unlock()
- cm.RuntimeCgroupsName = cont
- })
- }
- }
- if cm.SystemCgroupsName != "" {
- if cm.SystemCgroupsName == "/" {
- return fmt.Errorf("system container cannot be root (\"/\")")
- }
- cont := newSystemCgroups(cm.SystemCgroupsName)
- rootContainer := &fs.Manager{
- Cgroups: &configs.Cgroup{
- Parent: "/",
- Name: "/",
- },
- }
- cont.ensureStateFunc = func(manager *fs.Manager) error {
- return ensureSystemCgroups(rootContainer, manager)
- }
- systemContainers = append(systemContainers, cont)
- }
- if cm.KubeletCgroupsName != "" {
- cont := newSystemCgroups(cm.KubeletCgroupsName)
- allowAllDevices := true
- manager := fs.Manager{
- Cgroups: &configs.Cgroup{
- Parent: "/",
- Name: cm.KubeletCgroupsName,
- Resources: &configs.Resources{
- AllowAllDevices: &allowAllDevices,
- },
- },
- }
- cont.ensureStateFunc = func(_ *fs.Manager) error {
- return manager.Apply(os.Getpid())
- }
- systemContainers = append(systemContainers, cont)
- } else {
- cm.periodicTasks = append(cm.periodicTasks, func() {
- cont, err := getContainer(os.Getpid())
- if err != nil {
- glog.Errorf("failed to find cgroups of kubelet - %v", err)
- return
- }
- cm.Lock()
- defer cm.Unlock()
- cm.KubeletCgroupsName = cont
- })
- }
- cm.systemContainers = systemContainers
- return nil
- }
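- // getContainerNameForProcess returns the cgroup path of the named process.
- // For example, on a systemd host it would typically return something like
- // "/system.slice/docker.service" for the docker daemon (illustrative value).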
- func getContainerNameForProcess(name, pidFile string) (string, error) {
- pids, err := getPidsForProcess(name, pidFile)
- if err != nil {
- return "", fmt.Errorf("failed to detect process id for %q - %v", name, err)
- }
- if len(pids) == 0 {
- return "", nil
- }
- cont, err := getContainer(pids[0])
- if err != nil {
- return "", err
- }
- return cont, nil
- }
- func (cm *containerManagerImpl) GetNodeConfig() NodeConfig {
- cm.RLock()
- defer cm.RUnlock()
- return cm.NodeConfig
- }
- func (cm *containerManagerImpl) GetMountedSubsystems() *CgroupSubsystems {
- return cm.subsystems
- }
- func (cm *containerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
- return cm.qosContainers
- }
- func (cm *containerManagerImpl) Status() Status {
- cm.RLock()
- defer cm.RUnlock()
- return cm.status
- }
- func (cm *containerManagerImpl) Start(node *api.Node) error {
- // Cache the node info, including the node's resource capacity and
- // allocatable.
- cm.nodeInfo = node
- // Setup the node
- if err := cm.setupNode(); err != nil {
- return err
- }
- // Don't run a background thread if there are no ensureStateFuncs.
- numEnsureStateFuncs := 0
- for _, cont := range cm.systemContainers {
- if cont.ensureStateFunc != nil {
- numEnsureStateFuncs++
- }
- }
- if numEnsureStateFuncs > 0 {
- // Run ensure state functions every minute.
- go wait.Until(func() {
- for _, cont := range cm.systemContainers {
- if cont.ensureStateFunc != nil {
- if err := cont.ensureStateFunc(cont.manager); err != nil {
- glog.Warningf("[ContainerManager] Failed to ensure state of %q: %v", cont.name, err)
- }
- }
- }
- }, time.Minute, wait.NeverStop)
- }
- if len(cm.periodicTasks) > 0 {
- go wait.Until(func() {
- for _, task := range cm.periodicTasks {
- if task != nil {
- task()
- }
- }
- }, 5*time.Minute, wait.NeverStop)
- }
- return nil
- }
- func (cm *containerManagerImpl) SystemCgroupsLimit() api.ResourceList {
- cpuLimit := int64(0)
- // Sum up resources of all external containers.
- for _, cont := range cm.systemContainers {
- cpuLimit += cont.cpuMillicores
- }
- return api.ResourceList{
- api.ResourceCPU: *resource.NewMilliQuantity(
- cpuLimit,
- resource.DecimalSI),
- }
- }
- func isProcessRunningInHost(pid int) (bool, error) {
- // Get the init process's mount namespace. Every container runs in its own
- // mount namespace, so a process that shares init's mount namespace is
- // running on the host.
- initMntNs, err := os.Readlink("/proc/1/ns/mnt")
- if err != nil {
- return false, fmt.Errorf("failed to find mount namespace of init process")
- }
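- // The link target has the form "mnt:[<inode>]", e.g. "mnt:[4026531840]";
- // two processes share a mount namespace exactly when these targets match.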
- processMntNs, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/mnt", pid))
- if err != nil {
- return false, fmt.Errorf("failed to find mount namespace of process %q", pid)
- }
- return initMntNs == processMntNs, nil
- }
- func getPidFromPidFile(pidFile string) (int, error) {
- file, err := os.Open(pidFile)
- if err != nil {
- return 0, fmt.Errorf("error opening pid file %s: %v", pidFile, err)
- }
- defer file.Close()
- data, err := ioutil.ReadAll(file)
- if err != nil {
- return 0, fmt.Errorf("error reading pid file %s: %v", pidFile, err)
- }
- pid, err := strconv.Atoi(strings.TrimSpace(string(data)))
- if err != nil {
- return 0, fmt.Errorf("error parsing %q as a number: %v", string(data), err)
- }
- return pid, nil
- }
- func getPidsForProcess(name, pidFile string) ([]int, error) {
- if len(pidFile) > 0 {
- if pid, err := getPidFromPidFile(pidFile); err == nil {
- return []int{pid}, nil
- } else {
- // log the error and fall back to pidof
- runtime.HandleError(err)
- }
- }
- return procfs.PidOf(name)
- }
- // Ensures that the Docker daemon is in the desired container.
- func ensureDockerInContainer(dockerVersion semver.Version, oomScoreAdj int, manager *fs.Manager) error {
- type process struct{ name, file string }
- dockerProcs := []process{{dockerProcessName, dockerPidFile}}
- if dockerVersion.GTE(containerdVersion) {
- dockerProcs = append(dockerProcs, process{containerdProcessName, containerdPidFile})
- }
- var errs []error
- for _, proc := range dockerProcs {
- pids, err := getPidsForProcess(proc.name, proc.file)
- if err != nil {
- errs = append(errs, fmt.Errorf("failed to get pids for %q: %v", proc.name, err))
- continue
- }
- // Move if the pid is not already in the desired container.
- for _, pid := range pids {
- if err := ensureProcessInContainer(pid, oomScoreAdj, manager); err != nil {
- errs = append(errs, fmt.Errorf("errors moving %q pid: %v", proc.name, err))
- }
- }
- }
- return utilerrors.NewAggregate(errs)
- }
- func ensureProcessInContainer(pid int, oomScoreAdj int, manager *fs.Manager) error {
- if runningInHost, err := isProcessRunningInHost(pid); err != nil {
- // Err on the side of caution. Avoid moving the docker daemon unless we are able to identify its context.
- return err
- } else if !runningInHost {
- // Process is running inside a container. Don't touch that.
- return nil
- }
- var errs []error
- cont, err := getContainer(pid)
- if err != nil {
- errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err))
- }
- if cont != manager.Cgroups.Name {
- err = manager.Apply(pid)
- if err != nil {
- errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q", pid, cont, manager.Cgroups.Name))
- }
- }
- // Also apply oom-score-adj to the process.
- oomAdjuster := oom.NewOOMAdjuster()
- if err := oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err != nil {
- errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d: %v", oomScoreAdj, pid, err))
- }
- return utilerrors.NewAggregate(errs)
- }
- // getContainer returns the cgroup associated with the specified pid.
- // It enforces a unified hierarchy for memory and cpu cgroups.
- // On systemd environments, it uses the name=systemd cgroup for the specified pid.
- func getContainer(pid int) (string, error) {
- cgs, err := cgroups.ParseCgroupFile(fmt.Sprintf("/proc/%d/cgroup", pid))
- if err != nil {
- return "", err
- }
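- // cgs maps subsystem name to cgroup path; each /proc/<pid>/cgroup line has
- // the form <hierarchy-id>:<subsystems>:<path>, so an entry such as
- // "4:memory:/system.slice/docker.service" (illustrative) yields
- // cgs["memory"] == "/system.slice/docker.service".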
- cpu, found := cgs["cpu"]
- if !found {
- return "", cgroups.NewNotFoundError("cpu")
- }
- memory, found := cgs["memory"]
- if !found {
- return "", cgroups.NewNotFoundError("memory")
- }
- // since we use this container for accounting, we need to ensure it's a unified hierarchy.
- if cpu != memory {
- return "", fmt.Errorf("cpu and memory cgroup hierarchy not unified. cpu: %s, memory: %s", cpu, memory)
- }
- // on systemd, every pid is in a unified cgroup hierarchy (name=systemd as seen in systemd-cgls)
- // cpu and memory accounting is off by default, users may choose to enable it per unit or globally.
- // users could enable CPU and memory accounting globally via /etc/systemd/system.conf (DefaultCPUAccounting=true DefaultMemoryAccounting=true).
- // users could also enable CPU and memory accounting per unit via CPUAccounting=true and MemoryAccounting=true
- // we only warn if accounting is not enabled for CPU or memory so as to not break local development flows where kubelet is launched in a terminal.
- // for example, the cgroup for the user session will be something like /user.slice/user-X.slice/session-X.scope, but the cpu and memory
- // cgroup will be the closest ancestor where accounting is performed (most likely /) on systems that launch docker containers.
- // as a result, on those systems, you will not get cpu or memory accounting statistics for kubelet.
- // in addition, you would not get memory or cpu accounting for the runtime unless accounting was enabled on its unit (or globally).
- if systemd, found := cgs["name=systemd"]; found {
- if systemd != cpu {
- glog.Warningf("CPUAccounting not enabled for pid: %d", pid)
- }
- if systemd != memory {
- glog.Warningf("MemoryAccounting not enabled for pid: %d", pid)
- }
- return systemd, nil
- }
- return cpu, nil
- }
- // Ensures the system container is created and that all non-kernel processes
- // without a container are moved into it (PID 1 is deliberately left alone).
- //
- // The reason for leaving kernel threads in the root cgroup is that we don't
- // want to tie their execution to the to-be-defined /system quota, which
- // could create priority inversions.
- //
- func ensureSystemCgroups(rootContainer *fs.Manager, manager *fs.Manager) error {
- // Move non-kernel PIDs to the system container.
- attemptsRemaining := 10
- var errs []error
- for attemptsRemaining >= 0 {
- // Only keep errors on latest attempt.
- errs = []error{}
- attemptsRemaining--
- allPids, err := rootContainer.GetPids()
- if err != nil {
- errs = append(errs, fmt.Errorf("failed to list PIDs for root: %v", err))
- continue
- }
- // Remove kernel pids and other protected PIDs (pid 1, PIDs already in system & kubelet containers)
- pids := make([]int, 0, len(allPids))
- for _, pid := range allPids {
- if pid == 1 || isKernelPid(pid) {
- continue
- }
- pids = append(pids, pid)
- }
- glog.Infof("Found %d PIDs in root, %d of them are not to be moved", len(allPids), len(allPids)-len(pids))
- // Check if we have moved all the non-kernel PIDs.
- if len(pids) == 0 {
- break
- }
- glog.Infof("Moving non-kernel processes: %v", pids)
- for _, pid := range pids {
- err := manager.Apply(pid)
- if err != nil {
- errs = append(errs, fmt.Errorf("failed to move PID %d into the system container %q: %v", pid, manager.Cgroups.Name, err))
- }
- }
- }
- if attemptsRemaining < 0 {
- errs = append(errs, fmt.Errorf("ran out of attempts to create system containers %q", manager.Cgroups.Name))
- }
- return utilerrors.NewAggregate(errs)
- }
- // Determines whether the specified PID is a kernel PID.
- func isKernelPid(pid int) bool {
- // Kernel threads have no associated executable.
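- // For example, PID 2 (kthreadd) has no /proc/2/exe target, so the readlink
- // fails and the PID is classified as a kernel thread.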
- _, err := os.Readlink(fmt.Sprintf("/proc/%d/exe", pid))
- return err != nil
- }
- // Helper for getting the docker version.
- func getDockerVersion(cadvisor cadvisor.Interface) semver.Version {
- var fallback semver.Version // Fallback to zero-value by default.
- versions, err := cadvisor.VersionInfo()
- if err != nil {
- glog.Errorf("Error requesting cAdvisor VersionInfo: %v", err)
- return fallback
- }
- dockerVersion, err := semver.Parse(versions.DockerVersion)
- if err != nil {
- glog.Errorf("Error parsing docker version %q: %v", versions.DockerVersion, err)
- return fallback
- }
- return dockerVersion
- }
|