vxlan.go 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. // Copyright 2015 flannel authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. // +build !windows
  15. package vxlan
  16. // Some design notes and history:
  17. // VXLAN encapsulates L2 packets (though flannel is L3 only so don't expect to be able to send L2 packets across hosts)
  18. // The first versions of vxlan for flannel registered the flannel daemon as a handler for both "L2" and "L3" misses
  19. // - When a container sends a packet to a new IP address on the flannel network (but on a different host) this generates
  20. // an L2 miss (i.e. an ARP lookup)
  21. // - The flannel daemon knows which flannel host the packet is destined for so it can supply the VTEP MAC to use.
  22. // This is stored in the ARP table (with a timeout) to avoid constantly looking it up.
  23. // - The packet can then be encapsulated but the host needs to know where to send it. This creates another callout from
  24. // the kernel vxlan code to the flannel daemon to get the public IP that should be used for that VTEP (this gets called
  25. // an L3 miss). The L2/L3 miss hooks are registered when the vxlan device is created. At the same time a device route
  26. // is created to the whole flannel network so that non-local traffic is sent over the vxlan device.
  27. //
  28. // In this scheme the scaling of table entries (per host) is:
  29. // - 1 route (for the configured network out the vxlan device)
  30. // - One arp entry for each remote container that this host has recently contacted
  31. // - One FDB entry for each remote host
  32. //
  33. // The second version of flannel vxlan removed the need for the L3MISS callout. When a new remote host is found (either
  34. // during startup or when it's created), flannel simply adds the required entries so that no further lookup/callout is required.
  35. //
  36. //
  37. // The latest version of the vxlan backend removes the need for the L2MISS too, which means that the flannel daemon is not
  38. // listening for any netlink messages anymore. This improves reliability (no problems with timeouts if
  39. // flannel crashes or restarts) and simplifies upgrades.
  40. //
  41. // How it works:
  42. // Create the vxlan device but don't register for any L2MISS or L3MISS messages
  43. // Then, as each remote host is discovered (either on startup or when they are added), do the following
  44. // 1) Create routing table entry for the remote subnet. It goes via the vxlan device but also specifies a next hop (of the remote flannel host).
  45. // 2) Create a static ARP entry for the remote flannel host IP address (and the VTEP MAC)
  46. // 3) Create an FDB entry with the VTEP MAC and the public IP of the remote flannel daemon.
  47. //
  48. // In this scheme the scaling of table entries is linear to the number of remote hosts - 1 route, 1 arp entry and 1 FDB entry per host
  49. //
  50. // In this newest scheme, there is also the option of skipping the use of vxlan for hosts that are on the same subnet,
  51. // this is called "directRouting"
  52. import (
  53. "encoding/json"
  54. "fmt"
  55. "net"
  56. "sync"
  57. "golang.org/x/net/context"
  58. "github.com/flannel-io/flannel/backend"
  59. "github.com/flannel-io/flannel/pkg/ip"
  60. "github.com/flannel-io/flannel/subnet"
  61. log "k8s.io/klog"
  62. )
  63. func init() {
  64. backend.Register("vxlan", New)
  65. }
  // defaultVNI is the VNI that RegisterNetwork starts from before the backend
  // configuration (if any) is decoded on top of it.
  const defaultVNI = 1
  69. type VXLANBackend struct {
  70. subnetMgr subnet.Manager
  71. extIface *backend.ExternalInterface
  72. }
  73. func New(sm subnet.Manager, extIface *backend.ExternalInterface) (backend.Backend, error) {
  74. backend := &VXLANBackend{
  75. subnetMgr: sm,
  76. extIface: extIface,
  77. }
  78. return backend, nil
  79. }
  80. func newSubnetAttrs(publicIP net.IP, publicIPv6 net.IP, vnid uint16, dev, v6Dev *vxlanDevice) (*subnet.LeaseAttrs, error) {
  81. leaseAttrs := &subnet.LeaseAttrs{
  82. BackendType: "vxlan",
  83. }
  84. if publicIP != nil && dev != nil {
  85. data, err := json.Marshal(&vxlanLeaseAttrs{
  86. VNI: vnid,
  87. VtepMAC: hardwareAddr(dev.MACAddr()),
  88. })
  89. if err != nil {
  90. return nil, err
  91. }
  92. leaseAttrs.PublicIP = ip.FromIP(publicIP)
  93. leaseAttrs.BackendData = json.RawMessage(data)
  94. }
  95. if publicIPv6 != nil && v6Dev != nil {
  96. data, err := json.Marshal(&vxlanLeaseAttrs{
  97. VNI: vnid,
  98. VtepMAC: hardwareAddr(v6Dev.MACAddr()),
  99. })
  100. if err != nil {
  101. return nil, err
  102. }
  103. leaseAttrs.PublicIPv6 = ip.FromIP6(publicIPv6)
  104. leaseAttrs.BackendV6Data = json.RawMessage(data)
  105. }
  106. return leaseAttrs, nil
  107. }
  108. func (be *VXLANBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup, config *subnet.Config) (backend.Network, error) {
  109. // Parse our configuration
  110. cfg := struct {
  111. VNI int
  112. Port int
  113. GBP bool
  114. Learning bool
  115. DirectRouting bool
  116. }{
  117. VNI: defaultVNI,
  118. }
  119. if len(config.Backend) > 0 {
  120. if err := json.Unmarshal(config.Backend, &cfg); err != nil {
  121. return nil, fmt.Errorf("error decoding VXLAN backend config: %v", err)
  122. }
  123. }
  124. log.Infof("VXLAN config: VNI=%d Port=%d GBP=%v Learning=%v DirectRouting=%v", cfg.VNI, cfg.Port, cfg.GBP, cfg.Learning, cfg.DirectRouting)
  125. var dev, v6Dev *vxlanDevice
  126. var err error
  127. if config.EnableIPv4 {
  128. devAttrs := vxlanDeviceAttrs{
  129. vni: uint32(cfg.VNI),
  130. name: fmt.Sprintf("flannel.%v", cfg.VNI),
  131. vtepIndex: be.extIface.Iface.Index,
  132. vtepAddr: be.extIface.IfaceAddr,
  133. vtepPort: cfg.Port,
  134. gbp: cfg.GBP,
  135. learning: cfg.Learning,
  136. }
  137. dev, err = newVXLANDevice(&devAttrs)
  138. if err != nil {
  139. return nil, err
  140. }
  141. dev.directRouting = cfg.DirectRouting
  142. }
  143. if config.EnableIPv6 {
  144. v6DevAttrs := vxlanDeviceAttrs{
  145. vni: uint32(cfg.VNI),
  146. name: fmt.Sprintf("flannel-v6.%v", cfg.VNI),
  147. vtepIndex: be.extIface.Iface.Index,
  148. vtepAddr: be.extIface.IfaceV6Addr,
  149. vtepPort: cfg.Port,
  150. gbp: cfg.GBP,
  151. learning: cfg.Learning,
  152. }
  153. v6Dev, err = newVXLANDevice(&v6DevAttrs)
  154. if err != nil {
  155. return nil, err
  156. }
  157. v6Dev.directRouting = cfg.DirectRouting
  158. }
  159. subnetAttrs, err := newSubnetAttrs(be.extIface.ExtAddr, be.extIface.ExtV6Addr, uint16(cfg.VNI), dev, v6Dev)
  160. if err != nil {
  161. return nil, err
  162. }
  163. lease, err := be.subnetMgr.AcquireLease(ctx, subnetAttrs)
  164. switch err {
  165. case nil:
  166. case context.Canceled, context.DeadlineExceeded:
  167. return nil, err
  168. default:
  169. return nil, fmt.Errorf("failed to acquire lease: %v", err)
  170. }
  171. // Ensure that the device has a /32 address so that no broadcast routes are created.
  172. // This IP is just used as a source address for host to workload traffic (so
  173. // the return path for the traffic has an address on the flannel network to use as the destination)
  174. if config.EnableIPv4 {
  175. if err := dev.Configure(ip.IP4Net{IP: lease.Subnet.IP, PrefixLen: 32}, config.Network); err != nil {
  176. return nil, fmt.Errorf("failed to configure interface %s: %w", dev.link.Attrs().Name, err)
  177. }
  178. }
  179. if config.EnableIPv6 {
  180. if err := v6Dev.ConfigureIPv6(ip.IP6Net{IP: lease.IPv6Subnet.IP, PrefixLen: 128}, config.IPv6Network); err != nil {
  181. return nil, fmt.Errorf("failed to configure interface %s: %w", v6Dev.link.Attrs().Name, err)
  182. }
  183. }
  184. return newNetwork(be.subnetMgr, be.extIface, dev, v6Dev, ip.IP4Net{}, lease)
  185. }
  // hardwareAddr is a net.HardwareAddr that can be marshaled to and unmarshaled
  // from a JSON string.
  type hardwareAddr net.HardwareAddr

  // MarshalJSON renders the address in net.HardwareAddr's textual form, wrapped
  // in double quotes. It never returns an error.
  func (hw hardwareAddr) MarshalJSON() ([]byte, error) {
  	text := net.HardwareAddr(hw).String()
  	return []byte(fmt.Sprintf("%q", text)), nil
  }

  // UnmarshalJSON accepts a double-quoted string and parses what is between the
  // quotes with net.ParseMAC. Input that is shorter than two bytes or is not
  // enclosed in double quotes is rejected; a net.ParseMAC failure is returned
  // unchanged. The receiver is only written on success.
  func (hw *hardwareAddr) UnmarshalJSON(bytes []byte) error {
  	n := len(bytes)
  	if n < 2 || bytes[0] != '"' || bytes[n-1] != '"' {
  		return fmt.Errorf("error parsing hardware addr")
  	}

  	// Drop the surrounding quotes before handing the text to the parser.
  	parsed, err := net.ParseMAC(string(bytes[1 : n-1]))
  	if err != nil {
  		return err
  	}

  	*hw = hardwareAddr(parsed)
  	return nil
  }