瀏覽代碼

backend/vxlan: simplify vxlan processing

Tom Denham 8 年之前
父節點
當前提交
5d3d664255
共有 5 個文件被更改,包括 136 次插入327 次删除
  1. 37 114
      backend/vxlan/device.go
  2. 0 57
      backend/vxlan/routes.go
  3. 40 11
      backend/vxlan/vxlan.go
  4. 56 142
      backend/vxlan/vxlan_network.go
  5. 3 3
      dist/functional-test.sh

+ 37 - 114
backend/vxlan/device.go

@@ -17,13 +17,10 @@ package vxlan
 import (
 	"fmt"
 	"net"
-	"os"
 	"syscall"
-	"time"
 
 	log "github.com/golang/glog"
 	"github.com/vishvananda/netlink"
-	"github.com/vishvananda/netlink/nl"
 
 	"github.com/coreos/flannel/pkg/ip"
 )
@@ -41,17 +38,6 @@ type vxlanDevice struct {
 	link *netlink.Vxlan
 }
 
-func sysctlSet(path, value string) error {
-	f, err := os.Create(path)
-	if err != nil {
-		return err
-	}
-	defer f.Close()
-
-	_, err = f.Write([]byte(value))
-	return err
-}
-
 func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) {
 	link := &netlink.Vxlan{
 		LinkAttrs: netlink.LinkAttrs{
@@ -69,12 +55,6 @@ func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) {
 	if err != nil {
 		return nil, err
 	}
-	// this enables ARP requests being sent to userspace via netlink
-	sysctlPath := fmt.Sprintf("/proc/sys/net/ipv4/neigh/%s/app_solicit", devAttrs.name)
-	if err := sysctlSet(sysctlPath, "3"); err != nil {
-		return nil, err
-	}
-
 	return &vxlanDevice{
 		link: link,
 	}, nil
@@ -84,6 +64,7 @@ func ensureLink(vxlan *netlink.Vxlan) (*netlink.Vxlan, error) {
 	err := netlink.LinkAdd(vxlan)
 	if err == syscall.EEXIST {
 		// it's ok if the device already exists as long as config is similar
+		log.V(1).Infof("VXLAN device already exists")
 		existing, err := netlink.LinkByName(vxlan.Name)
 		if err != nil {
 			return nil, err
@@ -91,6 +72,7 @@ func ensureLink(vxlan *netlink.Vxlan) (*netlink.Vxlan, error) {
 
 		incompat := vxlanLinksIncompat(vxlan, existing)
 		if incompat == "" {
+			log.V(1).Infof("Returning existing device")
 			return existing.(*netlink.Vxlan), nil
 		}
 
@@ -123,28 +105,37 @@ func ensureLink(vxlan *netlink.Vxlan) (*netlink.Vxlan, error) {
 }
 
 func (dev *vxlanDevice) Configure(ipn ip.IP4Net) error {
-	setAddr4(dev.link, ipn.ToIPNet())
+	addr := netlink.Addr{IPNet: ipn.ToIPNet()}
+	existingAddrs, err := netlink.AddrList(dev.link, netlink.FAMILY_V4)
+	if err != nil {
+		return err
+	}
 
-	if err := netlink.LinkSetUp(dev.link); err != nil {
-		return fmt.Errorf("failed to set interface %s to UP state: %s", dev.link.Attrs().Name, err)
+	// flannel will never make this happen. This situation can only be caused by a user, so get them to sort it out.
+	if len(existingAddrs) > 1 {
+		return fmt.Errorf("link has incompatible addresses. Remove additional addresses and try again. %s", dev.link)
 	}
 
-	// explicitly add a route since there might be a route for a subnet already
-	// installed by Docker and then it won't get auto added
-	route := netlink.Route{
-		LinkIndex: dev.link.Attrs().Index,
-		Scope:     netlink.SCOPE_UNIVERSE,
-		Dst:       ipn.Network().ToIPNet(),
+	// If the device has an incompatible address then delete it. This can happen if the lease changes for example.
+	if len(existingAddrs) == 1 && !existingAddrs[0].Equal(addr) {
+		if err := netlink.AddrDel(dev.link, &existingAddrs[0]); err != nil {
+			return fmt.Errorf("failed to remove IP address %s from %s: %s", ipn.String(), dev.link.Attrs().Name, err)
+		}
+		existingAddrs = []netlink.Addr{}
 	}
-	if err := netlink.RouteAdd(&route); err != nil && err != syscall.EEXIST {
-		return fmt.Errorf("failed to add route (%s -> %s): %v", ipn.Network().String(), dev.link.Attrs().Name, err)
+
+	// Actually add the desired address to the interface if needed.
+	if len(existingAddrs) == 0 {
+		if err := netlink.AddrAdd(dev.link, &addr); err != nil {
+			return fmt.Errorf("failed to add IP address %s to %s: %s", ipn.String(), dev.link.Attrs().Name, err)
+		}
 	}
 
-	return nil
-}
+	if err := netlink.LinkSetUp(dev.link); err != nil {
+		return fmt.Errorf("failed to set interface %s to UP state: %s", dev.link.Attrs().Name, err)
+	}
 
-func (dev *vxlanDevice) Destroy() {
-	netlink.LinkDel(dev.link)
+	return nil
 }
 
 func (dev *vxlanDevice) MACAddr() net.HardwareAddr {
@@ -160,14 +151,9 @@ type neighbor struct {
 	IP  ip.IP4
 }
 
-func (dev *vxlanDevice) GetL2List() ([]netlink.Neigh, error) {
-	log.V(4).Infof("calling GetL2List() dev.link.Index: %d ", dev.link.Index)
-	return netlink.NeighList(dev.link.Index, syscall.AF_BRIDGE)
-}
-
-func (dev *vxlanDevice) AddL2(n neighbor) error {
-	log.V(4).Infof("calling NeighAdd: %v, %v", n.IP, n.MAC)
-	return netlink.NeighAdd(&netlink.Neigh{
+func (dev *vxlanDevice) AddFDB(n neighbor) error {
+	log.V(4).Infof("calling AddFDB: %v, %v", n.IP, n.MAC)
+	return netlink.NeighSet(&netlink.Neigh{
 		LinkIndex:    dev.link.Index,
 		State:        netlink.NUD_PERMANENT,
 		Family:       syscall.AF_BRIDGE,
@@ -177,8 +163,8 @@ func (dev *vxlanDevice) AddL2(n neighbor) error {
 	})
 }
 
-func (dev *vxlanDevice) DelL2(n neighbor) error {
-	log.V(4).Infof("calling NeighDel: %v, %v", n.IP, n.MAC)
+func (dev *vxlanDevice) DelFDB(n neighbor) error {
+	log.V(4).Infof("calling DelFDB: %v, %v", n.IP, n.MAC)
 	return netlink.NeighDel(&netlink.Neigh{
 		LinkIndex:    dev.link.Index,
 		Family:       syscall.AF_BRIDGE,
@@ -188,77 +174,28 @@ func (dev *vxlanDevice) DelL2(n neighbor) error {
 	})
 }
 
-func (dev *vxlanDevice) AddL3(n neighbor) error {
-	log.V(4).Infof("calling NeighSet: %v, %v", n.IP, n.MAC)
+func (dev *vxlanDevice) AddARP(n neighbor) error {
+	log.V(4).Infof("calling AddARP: %v, %v", n.IP, n.MAC)
 	return netlink.NeighSet(&netlink.Neigh{
 		LinkIndex:    dev.link.Index,
-		State:        netlink.NUD_REACHABLE,
+		State:        netlink.NUD_PERMANENT,
 		Type:         syscall.RTN_UNICAST,
 		IP:           n.IP.ToIP(),
 		HardwareAddr: n.MAC,
 	})
 }
 
-func (dev *vxlanDevice) DelL3(n neighbor) error {
-	log.V(4).Infof("calling NeighDel: %v, %v", n.IP, n.MAC)
+func (dev *vxlanDevice) DelARP(n neighbor) error {
+	log.V(4).Infof("calling DelARP: %v, %v", n.IP, n.MAC)
 	return netlink.NeighDel(&netlink.Neigh{
 		LinkIndex:    dev.link.Index,
-		State:        netlink.NUD_REACHABLE,
+		State:        netlink.NUD_PERMANENT,
 		Type:         syscall.RTN_UNICAST,
 		IP:           n.IP.ToIP(),
 		HardwareAddr: n.MAC,
 	})
 }
 
-func (dev *vxlanDevice) MonitorMisses(misses chan *netlink.Neigh) {
-	nlsock, err := nl.Subscribe(syscall.NETLINK_ROUTE, syscall.RTNLGRP_NEIGH)
-	if err != nil {
-		log.Error("Failed to subscribe to netlink RTNLGRP_NEIGH messages")
-		return
-	}
-
-	for {
-		msgs, err := nlsock.Receive()
-		if err != nil {
-			log.Errorf("Failed to receive from netlink: %v ", err)
-
-			time.Sleep(1 * time.Second)
-			continue
-		}
-
-		for _, msg := range msgs {
-			dev.processNeighMsg(msg, misses)
-		}
-	}
-}
-
-func isNeighResolving(state int) bool {
-	return (state & (netlink.NUD_INCOMPLETE | netlink.NUD_STALE | netlink.NUD_DELAY | netlink.NUD_PROBE)) != 0
-}
-
-func (dev *vxlanDevice) processNeighMsg(msg syscall.NetlinkMessage, misses chan *netlink.Neigh) {
-	neigh, err := netlink.NeighDeserialize(msg.Data)
-	if err != nil {
-		log.Error("Failed to deserialize netlink ndmsg: %v", err)
-		return
-	}
-
-	if neigh.LinkIndex != dev.link.Index {
-		return
-	}
-
-	if msg.Header.Type != syscall.RTM_GETNEIGH && msg.Header.Type != syscall.RTM_NEWNEIGH {
-		return
-	}
-
-	if !isNeighResolving(neigh.State) {
-		// misses come with NUD_STALE bit set
-		return
-	}
-
-	misses <- neigh
-}
-
 func vxlanLinksIncompat(l1, l2 netlink.Link) string {
 	if l1.Type() != l2.Type() {
 		return fmt.Sprintf("link type: %v vs %v", l1.Type(), l2.Type())
@@ -297,17 +234,3 @@ func vxlanLinksIncompat(l1, l2 netlink.Link) string {
 
 	return ""
 }
-
-// sets IP4 addr on link
-func setAddr4(link *netlink.Vxlan, ipn *net.IPNet) error {
-	// Ensure that the device has a /32 address so that no broadcast routes are created.
-	// This IP is just used as a source address for host to workload traffic (so
-	// the return path for the traffic has a decent address to use as the destination)
-	ipn.Mask = net.CIDRMask(32, 32)
-	addr := netlink.Addr{IPNet: ipn, Label: ""}
-	if err := netlink.AddrAdd(link, &addr); err != nil {
-		return fmt.Errorf("failed to add IP address %s to %s: %s", ipn.String(), link.Attrs().Name, err)
-	}
-
-	return nil
-}

+ 0 - 57
backend/vxlan/routes.go

@@ -1,57 +0,0 @@
-// Copyright 2015 flannel authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vxlan
-
-import (
-	"net"
-
-	"github.com/coreos/flannel/pkg/ip"
-)
-
-type route struct {
-	network ip.IP4Net
-	vtepMAC net.HardwareAddr
-}
-
-type routes []route
-
-func (rts *routes) set(nw ip.IP4Net, vtepMAC net.HardwareAddr) {
-	for i, rt := range *rts {
-		if rt.network.Equal(nw) {
-			(*rts)[i].vtepMAC = vtepMAC
-			return
-		}
-	}
-	*rts = append(*rts, route{nw, vtepMAC})
-}
-
-func (rts *routes) remove(nw ip.IP4Net) {
-	for i, rt := range *rts {
-		if rt.network.Equal(nw) {
-			(*rts)[i] = (*rts)[len(*rts)-1]
-			(*rts) = (*rts)[0 : len(*rts)-1]
-			return
-		}
-	}
-}
-
-func (rts routes) findByNetwork(ipAddr ip.IP4) *route {
-	for i, rt := range rts {
-		if rt.network.Contains(ipAddr) {
-			return &rts[i]
-		}
-	}
-	return nil
-}

+ 40 - 11
backend/vxlan/vxlan.go

@@ -14,6 +14,40 @@
 
 package vxlan
 
+// Some design notes and history:
+// VXLAN encapsulates L2 packets (though flannel is L3 only so don't expect to be able to send L2 packets across hosts)
+// The first versions of vxlan for flannel registered the flannel daemon as a handler for both "L2" and "L3" misses
+// - When a container sends a packet to a new IP address on the flannel network (but on a different host) this generates
+//   an L2 miss (i.e. an ARP lookup)
+// - The flannel daemon knows which flannel host the packet is destined for so it can supply the VTEP MAC to use.
+//   This is stored in the ARP table (with a timeout) to avoid constantly looking it up.
+// - The packet can then be encapsulated but the host needs to know where to send it. This creates another callout from
+//   the kernal vxlan code to the flannel daemon to get the public IP that should be used for that VTEP (this gets called
+//   an L3 miss). The L2/L3 miss hooks are registered when the vxlan device is created. At the same time a device route
+//   is created to the whole flannel network so that non-local traffic is sent over the vxlan device.
+//
+// In this scheme the scaling of table entries (per host) is:
+//  - 1 route (for the configured network out the vxlan device)
+//  - One arp entry for each remote container that this host has recently contacted
+//  - One FDB entry for each remote host
+//
+// The second version of flannel vxlan removed the need for the L3MISS callout. When a new remote host is found (either
+// during startup or when it's created), flannel simply adds the required entries so that no further lookup/callout is required.
+//
+//
+// The latest version of the vxlan backend  removes the need for the L2MISS too, which means that the flannel deamon is not
+// listening for any netlink messages anymore. This improves reliability (no problems with timeouts if
+// flannel crashes or restarts) and simplifies upgrades.
+//
+// How it works:
+// Create the vxlan device but don't register for any L2MISS or L3MISS messages
+// Then, as each remote host is discovered (either on startup or when they are added), do the following
+// 1) create routing table entry for the remote subnet. It goes via the vxlan device but also specifies a next hop (of the remote flannel host).
+// 2) Create a static ARP entry for the remote flannel host IP address (and the VTEP MAC)
+// 3) Create an FDB entry with the VTEP MAC and the public IP of the remote flannel daemon.
+//
+// In this scheme the scaling of table entries is linear to the number of remote hosts - 1 route, 1 arp entry and 1 FDB entry per host
+
 import (
 	"encoding/json"
 	"fmt"
@@ -99,25 +133,20 @@ func (be *VXLANBackend) RegisterNetwork(ctx context.Context, config *subnet.Conf
 	lease, err := be.subnetMgr.AcquireLease(ctx, subnetAttrs)
 	switch err {
 	case nil:
-
 	case context.Canceled, context.DeadlineExceeded:
 		return nil, err
-
 	default:
 		return nil, fmt.Errorf("failed to acquire lease: %v", err)
 	}
 
-	// vxlan's subnet is that of the whole overlay network (e.g. /16)
-	// and not that of the individual host (e.g. /24)
-	vxlanNet := ip.IP4Net{
-		IP:        lease.Subnet.IP,
-		PrefixLen: config.Network.PrefixLen,
-	}
-	if err = dev.Configure(vxlanNet); err != nil {
-		return nil, err
+	// Ensure that the device has a /32 address so that no broadcast routes are created.
+	// This IP is just used as a source address for host to workload traffic (so
+	// the return path for the traffic has an address on the flannel network to use as the destination)
+	if err := dev.Configure(ip.IP4Net{IP: lease.Subnet.IP, PrefixLen: 32}); err != nil {
+		return nil, fmt.Errorf("failed to configure interface %s: %s", dev.link.Attrs().Name, err)
 	}
 
-	return newNetwork(be.subnetMgr, be.extIface, dev, vxlanNet, lease)
+	return newNetwork(be.subnetMgr, be.extIface, dev, ip.IP4Net{}, lease)
 }
 
 // So we can make it JSON (un)marshalable

+ 56 - 142
backend/vxlan/vxlan_network.go

@@ -15,17 +15,16 @@
 package vxlan
 
 import (
-	"bytes"
 	"encoding/json"
-	"fmt"
 	"net"
 	"sync"
-	"time"
 
 	log "github.com/golang/glog"
 	"github.com/vishvananda/netlink"
 	"golang.org/x/net/context"
 
+	"syscall"
+
 	"github.com/coreos/flannel/backend"
 	"github.com/coreos/flannel/pkg/ip"
 	"github.com/coreos/flannel/subnet"
@@ -35,7 +34,6 @@ type network struct {
 	backend.SimpleNetwork
 	extIface  *backend.ExternalInterface
 	dev       *vxlanDevice
-	routes    routes
 	subnetMgr subnet.Manager
 }
 
@@ -53,15 +51,9 @@ func newNetwork(subnetMgr subnet.Manager, extIface *backend.ExternalInterface, d
 }
 
 func (nw *network) Run(ctx context.Context) {
-	log.V(0).Info("Watching for L3 misses")
-	misses := make(chan *netlink.Neigh, 100)
-	// Unfortunately MonitorMisses does not take a cancel channel
-	// as there's no wait to interrupt netlink socket recv
-	go nw.dev.MonitorMisses(misses)
-
 	wg := sync.WaitGroup{}
 
-	log.V(0).Info("Watching for new subnet leases")
+	log.V(0).Info("watching for new subnet leases")
 	events := make(chan []subnet.Event)
 	wg.Add(1)
 	go func() {
@@ -72,26 +64,8 @@ func (nw *network) Run(ctx context.Context) {
 
 	defer wg.Wait()
 
-	select {
-	case initialEventsBatch := <-events:
-		for {
-			err := nw.handleInitialSubnetEvents(initialEventsBatch)
-			if err == nil {
-				break
-			}
-			log.Error(err, " About to retry")
-			time.Sleep(time.Second)
-		}
-
-	case <-ctx.Done():
-		return
-	}
-
 	for {
 		select {
-		case miss := <-misses:
-			nw.handleMiss(miss)
-
 		case evtBatch := <-events:
 			nw.handleSubnetEvents(evtBatch)
 
@@ -111,140 +85,80 @@ type vxlanLeaseAttrs struct {
 
 func (nw *network) handleSubnetEvents(batch []subnet.Event) {
 	for _, event := range batch {
+		if event.Lease.Attrs.BackendType != "vxlan" {
+			log.Warningf("ignoring non-vxlan subnet(%s): type=%v", event.Lease.Subnet, event.Lease.Attrs.BackendType)
+			continue
+		}
+
+		var attrs vxlanLeaseAttrs
+		if err := json.Unmarshal(event.Lease.Attrs.BackendData, &attrs); err != nil {
+			log.Error("error decoding subnet lease JSON: ", err)
+			continue
+		}
+
+		route := netlink.Route{
+			LinkIndex: nw.dev.link.Attrs().Index,
+			Scope:     netlink.SCOPE_UNIVERSE,
+			Dst:       event.Lease.Subnet.ToIPNet(),
+			Gw:        event.Lease.Subnet.IP.ToIP(),
+		}
+		route.SetFlag(syscall.RTNH_F_ONLINK)
+
 		switch event.Type {
 		case subnet.EventAdded:
-			log.V(1).Info("Subnet added: ", event.Lease.Subnet)
+			log.V(2).Infof("adding subnet: %s PublicIP: %s VtepMAC: %s", event.Lease.Subnet, event.Lease.Attrs.PublicIP, net.HardwareAddr(attrs.VtepMAC))
 
-			if event.Lease.Attrs.BackendType != "vxlan" {
-				log.Warningf("Ignoring non-vxlan subnet: type=%v", event.Lease.Attrs.BackendType)
+			if err := nw.dev.AddARP(neighbor{IP: event.Lease.Subnet.IP, MAC: net.HardwareAddr(attrs.VtepMAC)}); err != nil {
+				log.Error("AddARP failed: ", err)
 				continue
 			}
 
-			var attrs vxlanLeaseAttrs
-			if err := json.Unmarshal(event.Lease.Attrs.BackendData, &attrs); err != nil {
-				log.Error("Error decoding subnet lease JSON: ", err)
-				continue
-			}
-			nw.routes.set(event.Lease.Subnet, net.HardwareAddr(attrs.VtepMAC))
-			nw.dev.AddL2(neighbor{IP: event.Lease.Attrs.PublicIP, MAC: net.HardwareAddr(attrs.VtepMAC)})
+			if err := nw.dev.AddFDB(neighbor{IP: event.Lease.Attrs.PublicIP, MAC: net.HardwareAddr(attrs.VtepMAC)}); err != nil {
+				log.Error("AddFDB failed: ", err)
 
-		case subnet.EventRemoved:
-			log.V(1).Info("Subnet removed: ", event.Lease.Subnet)
-
-			if event.Lease.Attrs.BackendType != "vxlan" {
-				log.Warningf("Ignoring non-vxlan subnet: type=%v", event.Lease.Attrs.BackendType)
-				continue
-			}
+				// Try to clean up the ARP entry then continue
+				if err := nw.dev.DelARP(neighbor{IP: event.Lease.Subnet.IP, MAC: net.HardwareAddr(attrs.VtepMAC)}); err != nil {
+					log.Error("DelARP failed: ", err)
+				}
 
-			var attrs vxlanLeaseAttrs
-			if err := json.Unmarshal(event.Lease.Attrs.BackendData, &attrs); err != nil {
-				log.Error("Error decoding subnet lease JSON: ", err)
 				continue
 			}
 
-			if len(attrs.VtepMAC) > 0 {
-				nw.dev.DelL2(neighbor{IP: event.Lease.Attrs.PublicIP, MAC: net.HardwareAddr(attrs.VtepMAC)})
-			}
-			nw.routes.remove(event.Lease.Subnet)
+			// Set the route - the kernel would ARP for the Gw IP address if it hadn't already been set above so make sure
+			// this is done last.
+			if err := netlink.RouteReplace(&route); err != nil {
+				log.Errorf("failed to add route (%s -> %s): %v", route.Dst, route.Gw, err)
 
-		default:
-			log.Error("Internal error: unknown event type: ", int(event.Type))
-		}
-	}
-}
+				// Try to clean up both the ARP and FDB entries then continue
+				if err := nw.dev.DelARP(neighbor{IP: event.Lease.Subnet.IP, MAC: net.HardwareAddr(attrs.VtepMAC)}); err != nil {
+					log.Error("DelARP failed: ", err)
+				}
 
-func (nw *network) handleInitialSubnetEvents(batch []subnet.Event) error {
-	log.V(1).Infof("Handling initial subnet events")
-	fdbTable, err := nw.dev.GetL2List()
-	if err != nil {
-		return fmt.Errorf("error fetching L2 table: %v", err)
-	}
+				if err := nw.dev.DelFDB(neighbor{IP: event.Lease.Attrs.PublicIP, MAC: net.HardwareAddr(attrs.VtepMAC)}); err != nil {
+					log.Error("DelFDB failed: ", err)
+				}
 
-	// Log the existing VTEP -> Public IP mappings
-	for _, fdbEntry := range fdbTable {
-		log.V(1).Infof("fdb already populated with: %s %s ", fdbEntry.IP, fdbEntry.HardwareAddr)
-	}
-
-	// "marked" events are skipped at the end.
-	eventMarker := make([]bool, len(batch))
-	leaseAttrsList := make([]vxlanLeaseAttrs, len(batch))
-	fdbEntryMarker := make([]bool, len(fdbTable))
-
-	// Run through the events "marking" ones that should be skipped
-	for eventMarkerIndex, evt := range batch {
-		if evt.Lease.Attrs.BackendType != "vxlan" {
-			log.Warningf("Ignoring non-vxlan subnet(%s): type=%v", evt.Lease.Subnet, evt.Lease.Attrs.BackendType)
-			eventMarker[eventMarkerIndex] = true
-			continue
-		}
-
-		// Parse the vxlan specific backend data
-		if err := json.Unmarshal(evt.Lease.Attrs.BackendData, &leaseAttrsList[eventMarkerIndex]); err != nil {
-			log.Error("Error decoding subnet lease JSON: ", err)
-			eventMarker[eventMarkerIndex] = true
-			continue
-		}
-
-		// Check the existing VTEP->Public IP mappings.
-		// If there's already an entry with the right VTEP and Public IP then the event can be skipped and the FDB entry can be retained
-		for j, fdbEntry := range fdbTable {
-			if evt.Lease.Attrs.PublicIP.ToIP().Equal(fdbEntry.IP) && bytes.Equal([]byte(leaseAttrsList[eventMarkerIndex].VtepMAC), []byte(fdbEntry.HardwareAddr)) {
-				eventMarker[eventMarkerIndex] = true
-				fdbEntryMarker[j] = true
-				break
+				continue
 			}
-		}
 
-		// Store off the subnet lease and VTEP
-		nw.routes.set(evt.Lease.Subnet, net.HardwareAddr(leaseAttrsList[eventMarkerIndex].VtepMAC))
-		log.V(2).Infof("Adding subnet: %s PublicIP: %s VtepMAC: %s", evt.Lease.Subnet, evt.Lease.Attrs.PublicIP, net.HardwareAddr(leaseAttrsList[eventMarkerIndex].VtepMAC))
-	}
+		case subnet.EventRemoved:
+			log.V(2).Infof("removing subnet: %s PublicIP: %s VtepMAC: %s", event.Lease.Subnet, event.Lease.Attrs.PublicIP, net.HardwareAddr(attrs.VtepMAC))
 
-	// Loop over the existing FDB entries, deleting any that shouldn't be there
-	for j, marker := range fdbEntryMarker {
-		if !marker && fdbTable[j].IP != nil {
-			err := nw.dev.DelL2(neighbor{IP: ip.FromIP(fdbTable[j].IP), MAC: fdbTable[j].HardwareAddr})
-			if err != nil {
-				log.Error("Delete L2 failed: ", err)
+			// Try to remove all entries - don't bail out if one of them fails.
+			if err := nw.dev.DelARP(neighbor{IP: event.Lease.Subnet.IP, MAC: net.HardwareAddr(attrs.VtepMAC)}); err != nil {
+				log.Error("DelARP failed: ", err)
 			}
-		}
-	}
 
-	// Loop over the events (skipping marked ones), adding them to the FDB table.
-	for i, marker := range eventMarker {
-		if !marker {
-			err := nw.dev.AddL2(neighbor{IP: batch[i].Lease.Attrs.PublicIP, MAC: net.HardwareAddr(leaseAttrsList[i].VtepMAC)})
-			if err != nil {
-				log.Error("Add L2 failed: ", err)
+			if err := nw.dev.DelFDB(neighbor{IP: event.Lease.Attrs.PublicIP, MAC: net.HardwareAddr(attrs.VtepMAC)}); err != nil {
+				log.Error("DelFDB failed: ", err)
 			}
-		}
-	}
-	return nil
-}
 
-func (nw *network) handleMiss(miss *netlink.Neigh) {
-	switch {
-	case len(miss.IP) == 0 && len(miss.HardwareAddr) == 0:
-		log.V(2).Info("Ignoring nil miss")
-
-	case len(miss.HardwareAddr) == 0:
-		nw.handleL3Miss(miss)
-
-	default:
-		log.V(4).Infof("Ignoring not a miss: %v, %v", miss.HardwareAddr, miss.IP)
-	}
-}
-
-func (nw *network) handleL3Miss(miss *netlink.Neigh) {
-	route := nw.routes.findByNetwork(ip.FromIP(miss.IP))
-	if route == nil {
-		log.V(0).Infof("L3 miss but route for %v not found", miss.IP)
-		return
-	}
+			if err := netlink.RouteDel(&route); err != nil {
+				log.Errorf("failed to delete route (%s -> %s): %v", route.Dst, route.Gw, err)
+			}
 
-	if err := nw.dev.AddL3(neighbor{IP: ip.FromIP(miss.IP), MAC: route.vtepMAC}); err != nil {
-		log.Errorf("AddL3 failed: %v", err)
-	} else {
-		log.V(2).Infof("L3 miss: AddL3 for %s succeeded", miss.IP)
+		default:
+			log.Error("internal error: unknown event type: ", int(event.Type))
+		}
 	}
 }

+ 3 - 3
dist/functional-test.sh

@@ -56,14 +56,14 @@ run_test() {
 
 	# rm any old flannel container that maybe running, ignore error as it might not exist
 	docker rm -f flannel-e2e-test-flannel1 2>/dev/null
-	docker run --name=flannel-e2e-test-flannel1 -d --privileged $flannel_img --etcd-endpoints=$etcd_endpt
+	docker run --name=flannel-e2e-test-flannel1 -d --privileged $flannel_img --etcd-endpoints=$etcd_endpt -v 10
 	if [ $? -ne 0 ]; then
 		exit 1
 	fi
 
 	# rm any old flannel container that maybe running, ignore error as it might not exist
 	docker rm -f flannel-e2e-test-flannel2 2>/dev/null
-	docker run --name=flannel-e2e-test-flannel2 -d --privileged $flannel_img --etcd-endpoints=$etcd_endpt
+	docker run --name=flannel-e2e-test-flannel2 -d --privileged $flannel_img --etcd-endpoints=$etcd_endpt -v 10
 	if [ $? -ne 0 ]; then
 		exit 1
 	fi
@@ -96,7 +96,7 @@ run_test() {
         docker rm -f flannel-e2e-test-flannel1-iperf 2>/dev/null
         docker run -d --name flannel-e2e-test-flannel1-iperf --net=container:flannel-e2e-test-flannel1 mlabbe/iperf3
         docker run --rm --net=container:flannel-e2e-test-flannel2 mlabbe/iperf3 -c $ping_dest
- fi
+    fi
 
 	docker stop flannel-e2e-test-flannel1 flannel-e2e-test-flannel2 >/dev/null