123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173 |
- // Copyright 2015 flannel authors
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- package vxlan
- // Some design notes and history:
- // VXLAN encapsulates L2 packets (though flannel is L3 only so don't expect to be able to send L2 packets across hosts)
- // The first versions of vxlan for flannel registered the flannel daemon as a handler for both "L2" and "L3" misses
- // - When a container sends a packet to a new IP address on the flannel network (but on a different host) this generates
- // an L2 miss (i.e. an ARP lookup)
- // - The flannel daemon knows which flannel host the packet is destined for so it can supply the VTEP MAC to use.
- // This is stored in the ARP table (with a timeout) to avoid constantly looking it up.
- // - The packet can then be encapsulated but the host needs to know where to send it. This creates another callout from
- // the kernel vxlan code to the flannel daemon to get the public IP that should be used for that VTEP (this gets called
- // an L3 miss). The L2/L3 miss hooks are registered when the vxlan device is created. At the same time a device route
- // is created to the whole flannel network so that non-local traffic is sent over the vxlan device.
- //
- // In this scheme the scaling of table entries (per host) is:
- // - 1 route (for the configured network out the vxlan device)
- // - One arp entry for each remote container that this host has recently contacted
- // - One FDB entry for each remote host
- //
- // The second version of flannel vxlan removed the need for the L3MISS callout. When a new remote host is found (either
- // during startup or when it's created), flannel simply adds the required entries so that no further lookup/callout is required.
- //
- //
- // The latest version of the vxlan backend removes the need for the L2MISS too, which means that the flannel daemon is not
- // listening for any netlink messages anymore. This improves reliability (no problems with timeouts if
- // flannel crashes or restarts) and simplifies upgrades.
- //
- // How it works:
- // Create the vxlan device but don't register for any L2MISS or L3MISS messages
- // Then, as each remote host is discovered (either on startup or when they are added), do the following
- // 1) create routing table entry for the remote subnet. It goes via the vxlan device but also specifies a next hop (of the remote flannel host).
- // 2) Create a static ARP entry for the remote flannel host IP address (and the VTEP MAC)
- // 3) Create an FDB entry with the VTEP MAC and the public IP of the remote flannel daemon.
- //
- // In this scheme the scaling of table entries is linear in the number of remote hosts - 1 route, 1 arp entry and 1 FDB entry per host
- import (
- "encoding/json"
- "fmt"
- "net"
- "golang.org/x/net/context"
- "github.com/coreos/flannel/backend"
- "github.com/coreos/flannel/pkg/ip"
- "github.com/coreos/flannel/subnet"
- )
- func init() {
- backend.Register("vxlan", New)
- }
// defaultVNI is the VXLAN network identifier used when the backend
// configuration does not specify one.
const defaultVNI = 1
- type VXLANBackend struct {
- subnetMgr subnet.Manager
- extIface *backend.ExternalInterface
- }
- func New(sm subnet.Manager, extIface *backend.ExternalInterface) (backend.Backend, error) {
- backend := &VXLANBackend{
- subnetMgr: sm,
- extIface: extIface,
- }
- return backend, nil
- }
- func newSubnetAttrs(publicIP net.IP, mac net.HardwareAddr) (*subnet.LeaseAttrs, error) {
- data, err := json.Marshal(&vxlanLeaseAttrs{hardwareAddr(mac)})
- if err != nil {
- return nil, err
- }
- return &subnet.LeaseAttrs{
- PublicIP: ip.FromIP(publicIP),
- BackendType: "vxlan",
- BackendData: json.RawMessage(data),
- }, nil
- }
- func (be *VXLANBackend) RegisterNetwork(ctx context.Context, config *subnet.Config) (backend.Network, error) {
- // Parse our configuration
- cfg := struct {
- VNI int
- Port int
- GBP bool
- }{
- VNI: defaultVNI,
- }
- if len(config.Backend) > 0 {
- if err := json.Unmarshal(config.Backend, &cfg); err != nil {
- return nil, fmt.Errorf("error decoding VXLAN backend config: %v", err)
- }
- }
- devAttrs := vxlanDeviceAttrs{
- vni: uint32(cfg.VNI),
- name: fmt.Sprintf("flannel.%v", cfg.VNI),
- vtepIndex: be.extIface.Iface.Index,
- vtepAddr: be.extIface.IfaceAddr,
- vtepPort: cfg.Port,
- gbp: cfg.GBP,
- }
- dev, err := newVXLANDevice(&devAttrs)
- if err != nil {
- return nil, err
- }
- subnetAttrs, err := newSubnetAttrs(be.extIface.ExtAddr, dev.MACAddr())
- if err != nil {
- return nil, err
- }
- lease, err := be.subnetMgr.AcquireLease(ctx, subnetAttrs)
- switch err {
- case nil:
- case context.Canceled, context.DeadlineExceeded:
- return nil, err
- default:
- return nil, fmt.Errorf("failed to acquire lease: %v", err)
- }
- // Ensure that the device has a /32 address so that no broadcast routes are created.
- // This IP is just used as a source address for host to workload traffic (so
- // the return path for the traffic has an address on the flannel network to use as the destination)
- if err := dev.Configure(ip.IP4Net{IP: lease.Subnet.IP, PrefixLen: 32}); err != nil {
- return nil, fmt.Errorf("failed to configure interface %s: %s", dev.link.Attrs().Name, err)
- }
- return newNetwork(be.subnetMgr, be.extIface, dev, ip.IP4Net{}, lease)
- }
// hardwareAddr is a net.HardwareAddr with custom JSON (un)marshalling: it is
// encoded as a quoted string in the textual form produced by
// net.HardwareAddr.String and decoded with net.ParseMAC.
type hardwareAddr net.HardwareAddr

// MarshalJSON encodes the address as a double-quoted string. It never returns
// a non-nil error.
func (hw hardwareAddr) MarshalJSON() ([]byte, error) {
	text := net.HardwareAddr(hw).String()
	return []byte(fmt.Sprintf("%q", text)), nil
}

// UnmarshalJSON decodes a double-quoted MAC address into hw. Input that is not
// wrapped in double quotes is rejected, and any error from net.ParseMAC on the
// quoted contents is returned as-is. hw is only modified on success.
func (hw *hardwareAddr) UnmarshalJSON(data []byte) error {
	n := len(data)
	quoted := n >= 2 && data[0] == '"' && data[n-1] == '"'
	if !quoted {
		return fmt.Errorf("error parsing hardware addr")
	}

	// Strip the surrounding quotes and parse what is left.
	mac, err := net.ParseMAC(string(data[1 : n-1]))
	if err != nil {
		return err
	}

	*hw = hardwareAddr(mac)
	return nil
}
|