Procházet zdrojové kódy

Dynamically handles L3 misses

Since L2 misses are still handled statically DOVE extensions are
no longer required.routes.go maintains the mappings. devices.go now monitors for L3 misses and vxlan is
updated to handleL3Misses.

Fixes #203
Mohammad Ahmad před 9 roky
rodič
revize
c1234031a8
3 změnil soubory, kde provedl 162 přidání a 33 odebrání
  1. 56 7
      backend/vxlan/device.go
  2. 57 0
      backend/vxlan/routes.go
  3. 49 26
      backend/vxlan/vxlan.go

+ 56 - 7
backend/vxlan/device.go

@@ -19,9 +19,11 @@ import (
 	"net"
 	"os"
 	"syscall"
+	"time"
 
 	log "github.com/coreos/flannel/Godeps/_workspace/src/github.com/golang/glog"
 	"github.com/coreos/flannel/Godeps/_workspace/src/github.com/vishvananda/netlink"
+	"github.com/coreos/flannel/Godeps/_workspace/src/github.com/vishvananda/netlink/nl"
 
 	"github.com/coreos/flannel/pkg/ip"
 )
@@ -65,6 +67,9 @@ func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) {
 	if err != nil {
 		return nil, err
 	}
+	// this enables ARP requests being sent to userspace via netlink
+	sysctlPath := fmt.Sprintf("/proc/sys/net/ipv4/neigh/%s/app_solicit", devAttrs.name)
+	sysctlSet(sysctlPath, "3")
 
 	return &vxlanDevice{
 		link: link,
@@ -155,11 +160,6 @@ func (dev *vxlanDevice) GetL2List() ([]netlink.Neigh, error) {
 	return netlink.NeighList(dev.link.Index, syscall.AF_BRIDGE)
 }
 
-func (dev *vxlanDevice) GetL3List() ([]netlink.Neigh, error) {
-	log.Infof("calling GetL3List() dev.link.Index: %d ", dev.link.Index)
-	return netlink.NeighList(dev.link.Index, syscall.AF_INET)
-}
-
 func (dev *vxlanDevice) AddL2(n neigh) error {
 	log.Infof("calling NeighAdd: %v, %v", n.IP, n.MAC)
 	return netlink.NeighAdd(&netlink.Neigh{
@@ -187,7 +187,7 @@ func (dev *vxlanDevice) AddL3(n neigh) error {
 	log.Infof("calling NeighSet: %v, %v", n.IP, n.MAC)
 	return netlink.NeighSet(&netlink.Neigh{
 		LinkIndex:    dev.link.Index,
-		State:        netlink.NUD_PERMANENT,
+		State:        netlink.NUD_REACHABLE,
 		Type:         syscall.RTN_UNICAST,
 		IP:           n.IP.ToIP(),
 		HardwareAddr: n.MAC,
@@ -198,13 +198,62 @@ func (dev *vxlanDevice) DelL3(n neigh) error {
 	log.Infof("calling NeighDel: %v, %v", n.IP, n.MAC)
 	return netlink.NeighDel(&netlink.Neigh{
 		LinkIndex:    dev.link.Index,
-		State:        netlink.NUD_PERMANENT,
+		State:        netlink.NUD_REACHABLE,
 		Type:         syscall.RTN_UNICAST,
 		IP:           n.IP.ToIP(),
 		HardwareAddr: n.MAC,
 	})
 }
 
+func (dev *vxlanDevice) MonitorMisses(misses chan *netlink.Neigh) {
+	nlsock, err := nl.Subscribe(syscall.NETLINK_ROUTE, syscall.RTNLGRP_NEIGH)
+	if err != nil {
+		log.Error("Failed to subscribe to netlink RTNLGRP_NEIGH messages")
+		return
+	}
+
+	for {
+		msgs, err := nlsock.Receive()
+		if err != nil {
+			log.Errorf("Failed to receive from netlink: %v ", err)
+
+			time.Sleep(1 * time.Second)
+			continue
+		}
+
+		for _, msg := range msgs {
+			dev.processNeighMsg(msg, misses)
+		}
+	}
+}
+
+func isNeighResolving(state int) bool {
+	return (state & (netlink.NUD_INCOMPLETE | netlink.NUD_STALE | netlink.NUD_DELAY | netlink.NUD_PROBE)) != 0
+}
+
+func (dev *vxlanDevice) processNeighMsg(msg syscall.NetlinkMessage, misses chan *netlink.Neigh) {
+	neigh, err := netlink.NeighDeserialize(msg.Data)
+	if err != nil {
+		log.Error("Failed to deserialize netlink ndmsg: %v", err)
+		return
+	}
+
+	if int(neigh.LinkIndex) != dev.link.Index {
+		return
+	}
+
+	if msg.Header.Type != syscall.RTM_GETNEIGH && msg.Header.Type != syscall.RTM_NEWNEIGH {
+		return
+	}
+
+	if !isNeighResolving(neigh.State) {
+		// misses come with NUD_STALE bit set
+		return
+	}
+
+	misses <- neigh
+}
+
 func vxlanLinksIncompat(l1, l2 netlink.Link) string {
 	if l1.Type() != l2.Type() {
 		return fmt.Sprintf("link type: %v vs %v", l1.Type(), l2.Type())

+ 57 - 0
backend/vxlan/routes.go

@@ -0,0 +1,57 @@
+// Copyright 2015 CoreOS, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vxlan
+
+import (
+	"net"
+
+	"github.com/coreos/flannel/pkg/ip"
+)
+
+type route struct {
+	network ip.IP4Net
+	vtepMAC net.HardwareAddr
+}
+
+type routes []route
+
+func (rts *routes) set(nw ip.IP4Net, vtepMAC net.HardwareAddr) {
+	for i, rt := range *rts {
+		if rt.network.Equal(nw) {
+			(*rts)[i].vtepMAC = vtepMAC
+			return
+		}
+	}
+	*rts = append(*rts, route{nw, vtepMAC})
+}
+
+func (rts *routes) remove(nw ip.IP4Net) {
+	for i, rt := range *rts {
+		if rt.network.Equal(nw) {
+			(*rts)[i] = (*rts)[len(*rts)-1]
+			(*rts) = (*rts)[0 : len(*rts)-1]
+			return
+		}
+	}
+}
+
+func (rts routes) findByNetwork(ipAddr ip.IP4) *route {
+	for i, rt := range rts {
+		if rt.network.Contains(ipAddr) {
+			return &rts[i]
+		}
+	}
+	return nil
+}

+ 49 - 26
backend/vxlan/vxlan.go

@@ -18,14 +18,17 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
+	"net"
+	"sync"
+	"time"
+
 	log "github.com/coreos/flannel/Godeps/_workspace/src/github.com/golang/glog"
+	"github.com/coreos/flannel/Godeps/_workspace/src/github.com/vishvananda/netlink"
 	"github.com/coreos/flannel/Godeps/_workspace/src/golang.org/x/net/context"
+
 	"github.com/coreos/flannel/backend"
 	"github.com/coreos/flannel/pkg/ip"
 	"github.com/coreos/flannel/subnet"
-	"net"
-	"sync"
-	"time"
 )
 
 const (
@@ -45,6 +48,7 @@ type VXLANBackend struct {
 	ctx    context.Context
 	cancel context.CancelFunc
 	wg     sync.WaitGroup
+	rts    routes
 }
 
 func New(sm subnet.Manager, network string, config *subnet.Config) backend.Backend {
@@ -146,6 +150,12 @@ func (vb *VXLANBackend) Run() {
 		vb.wg.Done()
 	}()
 
+	log.Info("Watching for L3 misses")
+	misses := make(chan *netlink.Neigh, 100)
+	// Unfrtunately MonitorMisses does not take a cancel channel
+	// as there's no wait to interrupt netlink socket recv
+	go vb.dev.MonitorMisses(misses)
+
 	log.Info("Watching for new subnet leases")
 	evts := make(chan []subnet.Event)
 	vb.wg.Add(1)
@@ -168,6 +178,9 @@ func (vb *VXLANBackend) Run() {
 
 	for {
 		select {
+		case miss := <-misses:
+			vb.handleMiss(miss)
+
 		case evtBatch := <-evts:
 			vb.handleSubnetEvents(evtBatch)
 
@@ -228,8 +241,8 @@ func (vb *VXLANBackend) handleSubnetEvents(batch []subnet.Event) {
 				log.Error("Error decoding subnet lease JSON: ", err)
 				continue
 			}
+			vb.rts.set(evt.Lease.Subnet, net.HardwareAddr(attrs.VtepMAC))
 			vb.dev.AddL2(neigh{IP: evt.Lease.Attrs.PublicIP, MAC: net.HardwareAddr(attrs.VtepMAC)})
-			vb.dev.AddL3(neigh{IP: evt.Lease.Subnet.IP, MAC: net.HardwareAddr(attrs.VtepMAC)})
 
 		case subnet.SubnetRemoved:
 			log.Info("Subnet removed: ", evt.Lease.Subnet)
@@ -247,8 +260,8 @@ func (vb *VXLANBackend) handleSubnetEvents(batch []subnet.Event) {
 
 			if len(attrs.VtepMAC) > 0 {
 				vb.dev.DelL2(neigh{IP: evt.Lease.Attrs.PublicIP, MAC: net.HardwareAddr(attrs.VtepMAC)})
-				vb.dev.DelL3(neigh{IP: evt.Lease.Subnet.IP, MAC: net.HardwareAddr(attrs.VtepMAC)})
 			}
+			vb.rts.remove(evt.Lease.Subnet)
 
 		default:
 			log.Error("Internal error: unknown event type: ", int(evt.Type))
@@ -263,19 +276,10 @@ func (vb *VXLANBackend) handleInitialSubnetEvents(batch []subnet.Event) error {
 		return fmt.Errorf("Error fetching L2 table: %v", err)
 	}
 
-	l3Table, err := vb.dev.GetL3List()
-	if err != nil {
-		return fmt.Errorf("Error fetching L3 table: %v", err)
-	}
-
 	for _, fdbEntry := range fdbTable {
 		log.Infof("fdb already populated with: %s %s ", fdbEntry.IP, fdbEntry.HardwareAddr)
 	}
 
-	for _, l3Entry := range l3Table {
-		log.Infof("l3 table already populated with: %s %s", l3Entry.IP, l3Entry.HardwareAddr)
-	}
-
 	evtMarker := make([]bool, len(batch))
 	leaseAttrsList := make([]vxlanLeaseAttrs, len(batch))
 	fdbEntryMarker := make([]bool, len(fdbTable))
@@ -300,10 +304,7 @@ func (vb *VXLANBackend) handleInitialSubnetEvents(batch []subnet.Event) error {
 				break
 			}
 		}
-	}
-
-	for _, l3Entry := range l3Table {
-		vb.dev.DelL3(neigh{IP: ip.FromIP(l3Entry.IP), MAC: l3Entry.HardwareAddr})
+		vb.rts.set(evt.Lease.Subnet, net.HardwareAddr(leaseAttrsList[i].VtepMAC))
 	}
 
 	for j, marker := range fdbEntryMarker {
@@ -321,15 +322,37 @@ func (vb *VXLANBackend) handleInitialSubnetEvents(batch []subnet.Event) error {
 			if err != nil {
 				log.Error("Add L2 failed: ", err)
 			}
-		}
-		err := vb.dev.AddL3(neigh{IP: batch[i].Lease.Subnet.IP, MAC: net.HardwareAddr(leaseAttrsList[i].VtepMAC)})
-		if err != nil {
-			log.Error("Add L3 failed: ", err)
-			err1 := vb.dev.DelL2(neigh{IP: batch[i].Lease.Attrs.PublicIP, MAC: net.HardwareAddr(leaseAttrsList[i].VtepMAC)})
-			if err1 != nil {
-				log.Error("Attempt to remove matching L2 entry failed: ", err1)
-			}
+
 		}
 	}
 	return nil
 }
+
+func (vb *VXLANBackend) handleMiss(miss *netlink.Neigh) {
+	switch {
+	case len(miss.IP) == 0 && len(miss.HardwareAddr) == 0:
+		log.Info("Ignoring nil miss")
+
+	case len(miss.HardwareAddr) == 0:
+		vb.handleL3Miss(miss)
+
+	default:
+		log.Infof("Ignoring not a miss: %v, %v", miss.HardwareAddr, miss.IP)
+	}
+}
+
+func (vb *VXLANBackend) handleL3Miss(miss *netlink.Neigh) {
+	log.Infof("L3 miss: %v", miss.IP)
+
+	rt := vb.rts.findByNetwork(ip.FromIP(miss.IP))
+	if rt == nil {
+		log.Infof("Route for %v not found", miss.IP)
+		return
+	}
+
+	if err := vb.dev.AddL3(neigh{IP: ip.FromIP(miss.IP), MAC: rt.vtepMAC}); err != nil {
+		log.Errorf("AddL3 failed: %v", err)
+	} else {
+		log.Info("AddL3 succeeded")
+	}
+}