node_problem_detector.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. /*
  2. Copyright 2016 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package e2e
  14. import (
  15. "fmt"
  16. "strings"
  17. "time"
  18. "k8s.io/kubernetes/pkg/api"
  19. client "k8s.io/kubernetes/pkg/client/unversioned"
  20. "k8s.io/kubernetes/pkg/fields"
  21. "k8s.io/kubernetes/pkg/labels"
  22. "k8s.io/kubernetes/pkg/util/system"
  23. "k8s.io/kubernetes/pkg/util/uuid"
  24. "k8s.io/kubernetes/test/e2e/framework"
  25. . "github.com/onsi/ginkgo"
  26. . "github.com/onsi/gomega"
  27. )
  28. var _ = framework.KubeDescribe("NodeProblemDetector", func() {
  29. const (
  30. pollInterval = 1 * time.Second
  31. pollConsistent = 5 * time.Second
  32. pollTimeout = 1 * time.Minute
  33. image = "gcr.io/google_containers/node-problem-detector:v0.1"
  34. )
  35. f := framework.NewDefaultFramework("node-problem-detector")
  36. var c *client.Client
  37. var uid string
  38. var ns, name, configName, eventNamespace string
  39. BeforeEach(func() {
  40. c = f.Client
  41. ns = f.Namespace.Name
  42. uid = string(uuid.NewUUID())
  43. name = "node-problem-detector-" + uid
  44. configName = "node-problem-detector-config-" + uid
  45. // There is no namespace for Node, event recorder will set default namespace for node events.
  46. eventNamespace = api.NamespaceDefault
  47. })
  48. // Test kernel monitor. We may add other tests if we have more problem daemons in the future.
  49. framework.KubeDescribe("KernelMonitor", func() {
  50. const (
  51. // Use test condition to avoid conflict with real node problem detector
  52. // TODO(random-liu): Now node condition could be arbitrary string, consider wether we need to
  53. // add TestCondition when switching to predefined condition list.
  54. condition = api.NodeConditionType("TestCondition")
  55. defaultReason = "Default"
  56. defaultMessage = "default message"
  57. logDir = "/log"
  58. logFile = "test.log"
  59. configDir = "/config"
  60. configFile = "testconfig.json"
  61. tempReason = "Temporary"
  62. tempMessage = "temporary error"
  63. permReason = "Permanent"
  64. permMessage = "permanent error"
  65. configVolume = "config"
  66. logVolume = "log"
  67. )
  68. var source, config, tmpDir string
  69. var node *api.Node
  70. var eventListOptions api.ListOptions
  71. injectCommand := func(err string, num int) string {
  72. var commands []string
  73. for i := 0; i < num; i++ {
  74. commands = append(commands, fmt.Sprintf("echo kernel: [%d.000000] %s >> %s/%s", i, err, tmpDir, logFile))
  75. }
  76. return strings.Join(commands, ";")
  77. }
  78. BeforeEach(func() {
  79. framework.SkipUnlessProviderIs(framework.ProvidersWithSSH...)
  80. // Randomize the source name to avoid conflict with real node problem detector
  81. source = "kernel-monitor-" + uid
  82. config = `
  83. {
  84. "logPath": "` + logDir + "/" + logFile + `",
  85. "bufferSize": 10,
  86. "source": "` + source + `",
  87. "conditions": [
  88. {
  89. "type": "` + string(condition) + `",
  90. "reason": "` + defaultReason + `",
  91. "message": "` + defaultMessage + `"
  92. }
  93. ],
  94. "rules": [
  95. {
  96. "type": "temporary",
  97. "reason": "` + tempReason + `",
  98. "pattern": "` + tempMessage + `"
  99. },
  100. {
  101. "type": "permanent",
  102. "condition": "` + string(condition) + `",
  103. "reason": "` + permReason + `",
  104. "pattern": "` + permMessage + `"
  105. }
  106. ]
  107. }`
  108. By("Get a non master node to run the pod")
  109. nodes, err := c.Nodes().List(api.ListOptions{})
  110. Expect(err).NotTo(HaveOccurred())
  111. node = nil
  112. for _, n := range nodes.Items {
  113. if !system.IsMasterNode(&n) {
  114. node = &n
  115. break
  116. }
  117. }
  118. Expect(node).NotTo(BeNil())
  119. By("Generate event list options")
  120. selector := fields.Set{
  121. "involvedObject.kind": "Node",
  122. "involvedObject.name": node.Name,
  123. "involvedObject.namespace": api.NamespaceAll,
  124. "source": source,
  125. }.AsSelector()
  126. eventListOptions = api.ListOptions{FieldSelector: selector}
  127. By("Create the test log file")
  128. tmpDir = "/tmp/" + name
  129. cmd := fmt.Sprintf("mkdir %s; > %s/%s", tmpDir, tmpDir, logFile)
  130. Expect(framework.IssueSSHCommand(cmd, framework.TestContext.Provider, node)).To(Succeed())
  131. By("Create config map for the node problem detector")
  132. _, err = c.ConfigMaps(ns).Create(&api.ConfigMap{
  133. ObjectMeta: api.ObjectMeta{
  134. Name: configName,
  135. },
  136. Data: map[string]string{configFile: config},
  137. })
  138. Expect(err).NotTo(HaveOccurred())
  139. By("Create the node problem detector")
  140. _, err = c.Pods(ns).Create(&api.Pod{
  141. ObjectMeta: api.ObjectMeta{
  142. Name: name,
  143. },
  144. Spec: api.PodSpec{
  145. NodeName: node.Name,
  146. SecurityContext: &api.PodSecurityContext{HostNetwork: true},
  147. Volumes: []api.Volume{
  148. {
  149. Name: configVolume,
  150. VolumeSource: api.VolumeSource{
  151. ConfigMap: &api.ConfigMapVolumeSource{
  152. LocalObjectReference: api.LocalObjectReference{Name: configName},
  153. },
  154. },
  155. },
  156. {
  157. Name: logVolume,
  158. VolumeSource: api.VolumeSource{
  159. HostPath: &api.HostPathVolumeSource{Path: tmpDir},
  160. },
  161. },
  162. },
  163. Containers: []api.Container{
  164. {
  165. Name: name,
  166. Image: image,
  167. Command: []string{"/node-problem-detector", "--kernel-monitor=" + configDir + "/" + configFile},
  168. VolumeMounts: []api.VolumeMount{
  169. {
  170. Name: logVolume,
  171. MountPath: logDir,
  172. },
  173. {
  174. Name: configVolume,
  175. MountPath: configDir,
  176. },
  177. },
  178. },
  179. },
  180. },
  181. })
  182. Expect(err).NotTo(HaveOccurred())
  183. By("Wait for node problem detector running")
  184. Expect(f.WaitForPodRunning(name)).To(Succeed())
  185. })
  186. It("should generate node condition and events for corresponding errors", func() {
  187. By("Make sure no events are generated")
  188. Consistently(func() error {
  189. return verifyNoEvents(c.Events(eventNamespace), eventListOptions)
  190. }, pollConsistent, pollInterval).Should(Succeed())
  191. By("Make sure the default node condition is generated")
  192. Eventually(func() error {
  193. return verifyCondition(c.Nodes(), node.Name, condition, api.ConditionFalse, defaultReason, defaultMessage)
  194. }, pollConsistent, pollInterval).Should(Succeed())
  195. num := 3
  196. By(fmt.Sprintf("Inject %d temporary errors", num))
  197. Expect(framework.IssueSSHCommand(injectCommand(tempMessage, num), framework.TestContext.Provider, node)).To(Succeed())
  198. By(fmt.Sprintf("Wait for %d events generated", num))
  199. Eventually(func() error {
  200. return verifyEvents(c.Events(eventNamespace), eventListOptions, num, tempReason, tempMessage)
  201. }, pollTimeout, pollInterval).Should(Succeed())
  202. By(fmt.Sprintf("Make sure only %d events generated", num))
  203. Consistently(func() error {
  204. return verifyEvents(c.Events(eventNamespace), eventListOptions, num, tempReason, tempMessage)
  205. }, pollConsistent, pollInterval).Should(Succeed())
  206. By("Make sure the node condition is still false")
  207. Expect(verifyCondition(c.Nodes(), node.Name, condition, api.ConditionFalse, defaultReason, defaultMessage)).To(Succeed())
  208. By("Inject 1 permanent error")
  209. Expect(framework.IssueSSHCommand(injectCommand(permMessage, 1), framework.TestContext.Provider, node)).To(Succeed())
  210. By("Make sure the corresponding node condition is generated")
  211. Eventually(func() error {
  212. return verifyCondition(c.Nodes(), node.Name, condition, api.ConditionTrue, permReason, permMessage)
  213. }, pollTimeout, pollInterval).Should(Succeed())
  214. By("Make sure no new events are generated")
  215. Consistently(func() error {
  216. return verifyEvents(c.Events(eventNamespace), eventListOptions, num, tempReason, tempMessage)
  217. }, pollConsistent, pollInterval).Should(Succeed())
  218. })
  219. AfterEach(func() {
  220. By("Delete the node problem detector")
  221. c.Pods(ns).Delete(name, api.NewDeleteOptions(0))
  222. By("Wait for the node problem detector to disappear")
  223. Expect(framework.WaitForPodToDisappear(c, ns, name, labels.Everything(), pollInterval, pollTimeout)).To(Succeed())
  224. By("Delete the config map")
  225. c.ConfigMaps(ns).Delete(configName)
  226. By("Clean up the events")
  227. Expect(c.Events(eventNamespace).DeleteCollection(api.NewDeleteOptions(0), eventListOptions)).To(Succeed())
  228. By("Clean up the node condition")
  229. patch := []byte(fmt.Sprintf(`{"status":{"conditions":[{"$patch":"delete","type":"%s"}]}}`, condition))
  230. c.Patch(api.StrategicMergePatchType).Resource("nodes").Name(node.Name).SubResource("status").Body(patch).Do()
  231. By("Clean up the temporary directory")
  232. framework.IssueSSHCommand(fmt.Sprintf("rm -r %s", tmpDir), framework.TestContext.Provider, node)
  233. })
  234. })
  235. })
  236. // verifyEvents verifies there are num specific events generated
  237. func verifyEvents(e client.EventInterface, options api.ListOptions, num int, reason, message string) error {
  238. events, err := e.List(options)
  239. if err != nil {
  240. return err
  241. }
  242. count := 0
  243. for _, event := range events.Items {
  244. if event.Reason != reason || event.Message != message {
  245. return fmt.Errorf("unexpected event: %v", event)
  246. }
  247. count += int(event.Count)
  248. }
  249. if count != num {
  250. return fmt.Errorf("expect event number %d, got %d: %v", num, count, events.Items)
  251. }
  252. return nil
  253. }
  254. // verifyNoEvents verifies there is no event generated
  255. func verifyNoEvents(e client.EventInterface, options api.ListOptions) error {
  256. events, err := e.List(options)
  257. if err != nil {
  258. return err
  259. }
  260. if len(events.Items) != 0 {
  261. return fmt.Errorf("unexpected events: %v", events.Items)
  262. }
  263. return nil
  264. }
  265. // verifyCondition verifies specific node condition is generated, if reason and message are empty, they will not be checked
  266. func verifyCondition(n client.NodeInterface, nodeName string, condition api.NodeConditionType, status api.ConditionStatus, reason, message string) error {
  267. node, err := n.Get(nodeName)
  268. if err != nil {
  269. return err
  270. }
  271. _, c := api.GetNodeCondition(&node.Status, condition)
  272. if c == nil {
  273. return fmt.Errorf("node condition %q not found", condition)
  274. }
  275. if c.Status != status || c.Reason != reason || c.Message != message {
  276. return fmt.Errorf("unexpected node condition %q: %+v", condition, c)
  277. }
  278. return nil
  279. }