Преглед на файлове

Windows "host-gw" and "vxlan" support

    - Add Windows host-gw
        (*) patch for https://github.com/coreos/flannel/pull/921
    - Add windows vxlan
        (*) patch for https://github.com/coreos/flannel/pull/922

Co-authored-by: rakelkar <rakelkar@outlook.com>
Co-authored-by: madhanrm <madhanrajm@outlook.com>
MaiWJ преди 6 години
родител
ревизия
e6fc2a2946
променени са 6 файла, в които са добавени 905 реда и са изтрити 11 реда
  1. 5 4
      Documentation/backends.md
  2. 232 5
      backend/hostgw/hostgw_windows.go
  3. 199 0
      backend/route_network_windows.go
  4. 224 0
      backend/vxlan/device_windows.go
  5. 125 0
      backend/vxlan/vxlan_network_windows.go
  6. 120 2
      backend/vxlan/vxlan_windows.go

+ 5 - 4
Documentation/backends.md

@@ -19,10 +19,11 @@ Use in-kernel VXLAN to encapsulate the packets.
 
 Type and options:
 * `Type` (string): `vxlan`
-* `VNI` (number): VXLAN Identifier (VNI) to be used. Defaults to 1.
-* `Port` (number): UDP port to use for sending encapsulated packets. Defaults to kernel default, currently 8472.
-* `GBP` (Boolean): Enable [VXLAN Group Based Policy](https://github.com/torvalds/linux/commit/3511494ce2f3d3b77544c79b87511a4ddb61dc89).  Defaults to `false`.
-* `DirectRouting` (Boolean): Enable direct routes (like `host-gw`) when the hosts are on the same subnet. VXLAN will only be used to encapsulate packets to hosts on different subnets. Defaults to `false`.
+* `VNI` (number): VXLAN Identifier (VNI) to be used. On Linux, defaults to 1. On Windows should be greater than or equal to 4096. 
+* `Port` (number): UDP port to use for sending encapsulated packets. On Linux, defaults to kernel default, currently 8472, but on Windows, must be 4789.
+* `GBP` (Boolean): Enable [VXLAN Group Based Policy](https://github.com/torvalds/linux/commit/3511494ce2f3d3b77544c79b87511a4ddb61dc89).  Defaults to `false`. GBP is not supported on Windows
+* `DirectRouting` (Boolean): Enable direct routes (like `host-gw`) when the hosts are on the same subnet. VXLAN will only be used to encapsulate packets to hosts on different subnets. Defaults to `false`. DirectRouting is not supported on Windows.
+* `MacPrefix` (String): Only use on Windows, set to the MAC prefix. Defaults to `0E-2A`.
 
 ### host-gw
 

+ 232 - 5
backend/hostgw/hostgw_windows.go

@@ -1,6 +1,4 @@
-// +build windows
-
-// Copyright 2015 flannel authors
+// Copyright 2018 flannel authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,14 +11,243 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// +build windows
 
 package hostgw
 
 import (
+	"fmt"
+	"strconv"
+	"sync"
+	"time"
+
+	"github.com/Microsoft/hcsshim"
+	"github.com/coreos/flannel/backend"
+	"github.com/coreos/flannel/pkg/ip"
+	"github.com/coreos/flannel/subnet"
 	log "github.com/golang/glog"
+	"github.com/juju/errors"
+	"github.com/rakelkar/gonetsh/netroute"
+	"github.com/rakelkar/gonetsh/netsh"
+	"golang.org/x/net/context"
+	"k8s.io/apimachinery/pkg/util/json"
+	"k8s.io/apimachinery/pkg/util/wait"
+	utilexec "k8s.io/utils/exec"
 )
 
 func init() {
-	log.Infof("hostgw is not supported on this platform")
+	backend.Register("host-gw", New)
+}
+
+type HostgwBackend struct {
+	sm       subnet.Manager
+	extIface *backend.ExternalInterface
+}
+
+func New(sm subnet.Manager, extIface *backend.ExternalInterface) (backend.Backend, error) {
+	if !extIface.ExtAddr.Equal(extIface.IfaceAddr) {
+		return nil, fmt.Errorf("your PublicIP differs from interface IP, meaning that probably you're on a NAT, which is not supported by host-gw backend")
+	}
+
+	be := &HostgwBackend{
+		sm:       sm,
+		extIface: extIface,
+	}
+
+	return be, nil
+}
+
+func (be *HostgwBackend) RegisterNetwork(ctx context.Context, wg sync.WaitGroup, config *subnet.Config) (backend.Network, error) {
+	// 1. Parse configuration
+	cfg := struct {
+		Name          string
+		DNSServerList string
+	}{}
+	if len(config.Backend) > 0 {
+		if err := json.Unmarshal(config.Backend, &cfg); err != nil {
+			return nil, errors.Annotate(err, "error decoding windows host-gw backend config")
+		}
+	}
+	if len(cfg.Name) == 0 {
+		cfg.Name = "cbr0"
+	}
+	log.Infof("HOST-GW config: %+v", cfg)
+
+	n := &backend.RouteNetwork{
+		SimpleNetwork: backend.SimpleNetwork{
+			ExtIface: be.extIface,
+		},
+		SM:          be.sm,
+		BackendType: "host-gw",
+		Mtu:         be.extIface.Iface.MTU,
+		LinkIndex:   be.extIface.Iface.Index,
+	}
+	n.GetRoute = func(lease *subnet.Lease) *netroute.Route {
+		return &netroute.Route{
+			DestinationSubnet: lease.Subnet.ToIPNet(),
+			GatewayAddress:    lease.Attrs.PublicIP.ToIP(),
+			LinkIndex:         n.LinkIndex,
+		}
+	}
+
+	// 2. Acquire the lease form subnet manager
+	attrs := subnet.LeaseAttrs{
+		PublicIP:    ip.FromIP(be.extIface.ExtAddr),
+		BackendType: "host-gw",
+	}
+
+	l, err := be.sm.AcquireLease(ctx, &attrs)
+	switch err {
+	case nil:
+		n.SubnetLease = l
+
+	case context.Canceled, context.DeadlineExceeded:
+		return nil, err
+
+	default:
+		return nil, errors.Annotate(err, "failed to acquire lease")
+	}
+
+	// 3. Check if the network exists and has the expected settings
+	netshHelper := netsh.New(utilexec.New())
+	createNewNetwork := true
+	expectedSubnet := n.SubnetLease.Subnet
+	expectedAddressPrefix := expectedSubnet.String()
+	expectedGatewayAddress := (expectedSubnet.IP + 1).String()
+	expectedPodGatewayAddress := expectedSubnet.IP + 2
+	networkName := cfg.Name
+	var waitErr, lastErr error
+
+	existingNetwork, err := hcsshim.GetHNSNetworkByName(networkName)
+	if err == nil {
+		for _, subnet := range existingNetwork.Subnets {
+			if subnet.AddressPrefix == expectedAddressPrefix && subnet.GatewayAddress == expectedGatewayAddress {
+				createNewNetwork = false
+				log.Infof("Found existing HNSNetwork %s", networkName)
+				break
+			}
+		}
+	}
+
+	// 4. Create a new HNSNetwork
+	expectedNetwork := existingNetwork
+	if createNewNetwork {
+		if existingNetwork != nil {
+			if _, err := existingNetwork.Delete(); err != nil {
+				return nil, errors.Annotatef(err, "failed to delete existing HNSNetwork %s", networkName)
+			}
+			log.Infof("Deleted stale HNSNetwork %s", networkName)
+		}
+
+		expectedNetwork = &hcsshim.HNSNetwork{
+			Name:          networkName,
+			Type:          "L2Bridge",
+			DNSServerList: cfg.DNSServerList,
+			Subnets: []hcsshim.Subnet{
+				{
+					AddressPrefix:  expectedAddressPrefix,
+					GatewayAddress: expectedGatewayAddress,
+				},
+			},
+		}
+		jsonRequest, err := json.Marshal(expectedNetwork)
+		if err != nil {
+			return nil, errors.Annotatef(err, "failed to marshal %+v", expectedNetwork)
+		}
+
+		log.Infof("Attempting to create HNSNetwork %s", string(jsonRequest))
+		newNetwork, err := hcsshim.HNSNetworkRequest("POST", "", string(jsonRequest))
+		if err != nil {
+			return nil, errors.Annotatef(err, "failed to create HNSNetwork %s", networkName)
+		}
+
+		// Wait for the network to populate Management IP
+		log.Infof("Waiting to get ManagementIP from HNSNetwork %s", networkName)
+		waitErr = wait.Poll(500*time.Millisecond, 5*time.Second, func() (done bool, err error) {
+			newNetwork, lastErr = hcsshim.HNSNetworkRequest("GET", newNetwork.Id, "")
+			return newNetwork != nil && len(newNetwork.ManagementIP) != 0, nil
+		})
+		if waitErr == wait.ErrWaitTimeout {
+			return nil, errors.Annotatef(lastErr, "timeout, failed to get management IP from HNSNetwork %s", networkName)
+		}
+
+		// Wait for the interface with the management IP
+		log.Infof("Waiting to get net interface for HNSNetwork %s (%s)", networkName, newNetwork.ManagementIP)
+		waitErr = wait.Poll(500*time.Millisecond, 5*time.Second, func() (done bool, err error) {
+			_, lastErr = netshHelper.GetInterfaceByIP(newNetwork.ManagementIP)
+			return lastErr == nil, nil
+		})
+		if waitErr == wait.ErrWaitTimeout {
+			return nil, errors.Annotatef(lastErr, "timeout, failed to get net interface for HNSNetwork %s (%s)", networkName, newNetwork.ManagementIP)
+		}
+
+		log.Infof("Created HNSNetwork %s", networkName)
+		expectedNetwork = newNetwork
+	}
+
+	// 5. Ensure a 1.2 endpoint on this network in the host compartment
+	createNewBridgeEndpoint := true
+	bridgeEndpointName := networkName + "_ep"
+	existingBridgeEndpoint, err := hcsshim.GetHNSEndpointByName(bridgeEndpointName)
+	if err == nil && existingBridgeEndpoint.IPAddress.String() == expectedPodGatewayAddress.String() {
+		log.Infof("Found existing bridge HNSEndpoint %s", bridgeEndpointName)
+		createNewBridgeEndpoint = false
+	}
+
+	// 6. Create a bridge HNSEndpoint
+	expectedBridgeEndpoint := existingBridgeEndpoint
+	if createNewBridgeEndpoint {
+		if existingBridgeEndpoint != nil {
+			if _, err = existingBridgeEndpoint.Delete(); err != nil {
+				return nil, errors.Annotatef(err, "failed to delete existing bridge HNSEndpoint %s", bridgeEndpointName)
+			}
+			log.Infof("Deleted stale bridge HNSEndpoint %s", bridgeEndpointName)
+		}
+
+		expectedBridgeEndpoint = &hcsshim.HNSEndpoint{
+			Name:           bridgeEndpointName,
+			IPAddress:      expectedPodGatewayAddress.ToIP(),
+			VirtualNetwork: expectedNetwork.Id,
+		}
+
+		log.Infof("Attempting to create bridge HNSEndpoint %+v", expectedBridgeEndpoint)
+		if expectedBridgeEndpoint, err = expectedBridgeEndpoint.Create(); err != nil {
+			return nil, errors.Annotatef(err, "failed to create bridge HNSEndpoint %s", bridgeEndpointName)
+		}
+
+		log.Infof("Created bridge HNSEndpoint %s", bridgeEndpointName)
+	}
+
+	// Wait for the bridgeEndpoint to attach to the host
+	log.Infof("Waiting to attach bridge endpoint %s to host", bridgeEndpointName)
+	waitErr = wait.Poll(500*time.Millisecond, 5*time.Second, func() (done bool, err error) {
+		lastErr = expectedBridgeEndpoint.HostAttach(1)
+		return lastErr == nil, nil
+	})
+	if waitErr == wait.ErrWaitTimeout {
+		return nil, errors.Annotatef(lastErr, "failed to hot attach bridge HNSEndpoint %s to host compartment", bridgeEndpointName)
+	}
+	log.Infof("Attached bridge endpoint %s to host successfully", bridgeEndpointName)
+
+	// 7. Enable forwarding on the host interface and endpoint
+	for _, interfaceIpAddress := range []string{expectedNetwork.ManagementIP, expectedBridgeEndpoint.IPAddress.String()} {
+		netInterface, err := netshHelper.GetInterfaceByIP(interfaceIpAddress)
+		if err != nil {
+			return nil, errors.Annotatef(err, "failed to find interface for IP Address %s", interfaceIpAddress)
+		}
+		log.Infof("Found %+v interface with IP %s", netInterface, interfaceIpAddress)
+
+		// When a new hns network is created, the interface is modified, esp the name, index
+		if expectedNetwork.ManagementIP == netInterface.IpAddress {
+			n.LinkIndex = netInterface.Idx
+			n.Name = netInterface.Name
+		}
+
+		interfaceIdx := strconv.Itoa(netInterface.Idx)
+		if err := netshHelper.EnableForwarding(interfaceIdx); err != nil {
+			return nil, errors.Annotatef(err, "failed to enable forwarding on %s index %s", netInterface.Name, interfaceIdx)
+		}
+		log.Infof("Enabled forwarding on %s index %s", netInterface.Name, interfaceIdx)
+	}
+
+	return n, nil
 }

+ 199 - 0
backend/route_network_windows.go

@@ -0,0 +1,199 @@
+// Copyright 2018 flannel authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package backend
+
+import (
+	"sync"
+	"time"
+
+	log "github.com/golang/glog"
+	"github.com/rakelkar/gonetsh/netroute"
+	"golang.org/x/net/context"
+
+	"github.com/coreos/flannel/subnet"
+	"strings"
+)
+
+const (
+	routeCheckRetries = 10
+)
+
+type RouteNetwork struct {
+	SimpleNetwork
+	Name        string
+	BackendType string
+	SM          subnet.Manager
+	GetRoute    func(lease *subnet.Lease) *netroute.Route
+	Mtu         int
+	LinkIndex   int
+	routes      []netroute.Route
+}
+
+func (n *RouteNetwork) MTU() int {
+	return n.Mtu
+}
+
+func (n *RouteNetwork) Run(ctx context.Context) {
+	wg := sync.WaitGroup{}
+
+	log.Info("Watching for new subnet leases")
+	evts := make(chan []subnet.Event)
+	wg.Add(1)
+	go func() {
+		subnet.WatchLeases(ctx, n.SM, n.SubnetLease, evts)
+		wg.Done()
+	}()
+
+	n.routes = make([]netroute.Route, 0, 10)
+	wg.Add(1)
+	go func() {
+		n.routeCheck(ctx)
+		wg.Done()
+	}()
+
+	defer wg.Wait()
+
+	for {
+		select {
+		case evtBatch := <-evts:
+			n.handleSubnetEvents(evtBatch)
+
+		case <-ctx.Done():
+			return
+		}
+	}
+}
+
+func (n *RouteNetwork) handleSubnetEvents(batch []subnet.Event) {
+	netrouteHelper := netroute.New()
+
+	for _, evt := range batch {
+		leaseSubnet := evt.Lease.Subnet
+		leaseAttrs := evt.Lease.Attrs
+		if !strings.EqualFold(leaseAttrs.BackendType, n.BackendType) {
+			log.Warningf("Ignoring non-%v subnet(%v): type=%v", n.BackendType, leaseSubnet, leaseAttrs.BackendType)
+			continue
+		}
+
+		expectedRoute := n.GetRoute(&evt.Lease)
+
+		switch evt.Type {
+		case subnet.EventAdded:
+			log.Infof("Subnet added: %v via %v", leaseSubnet, leaseAttrs.PublicIP)
+
+			existingRoutes, _ := netrouteHelper.GetNetRoutes(expectedRoute.LinkIndex, expectedRoute.DestinationSubnet)
+			if len(existingRoutes) > 0 {
+				existingRoute := existingRoutes[0]
+				if existingRoute.Equal(*expectedRoute) {
+					continue
+				}
+
+				log.Warningf("Replacing existing route %v via %v with %v via %v", leaseSubnet, existingRoute.GatewayAddress, leaseSubnet, leaseAttrs.PublicIP)
+				err := netrouteHelper.RemoveNetRoute(existingRoute.LinkIndex, existingRoute.DestinationSubnet, existingRoute.GatewayAddress)
+				if err != nil {
+					log.Errorf("Error removing route: %v", err)
+					continue
+				}
+			}
+
+			err := netrouteHelper.NewNetRoute(expectedRoute.LinkIndex, expectedRoute.DestinationSubnet, expectedRoute.GatewayAddress)
+			if err != nil {
+				log.Errorf("Error creating route: %v", err)
+				continue
+			}
+
+			n.addToRouteList(expectedRoute)
+
+		case subnet.EventRemoved:
+			log.Infof("Subnet removed: %v", leaseSubnet)
+
+			existingRoutes, _ := netrouteHelper.GetNetRoutes(expectedRoute.LinkIndex, expectedRoute.DestinationSubnet)
+			if len(existingRoutes) > 0 {
+				existingRoute := existingRoutes[0]
+				if existingRoute.Equal(*expectedRoute) {
+					log.Infof("Removing existing route %v via %v", leaseSubnet, existingRoute.GatewayAddress)
+
+					err := netrouteHelper.RemoveNetRoute(existingRoute.LinkIndex, existingRoute.DestinationSubnet, existingRoute.GatewayAddress)
+					if err != nil {
+						log.Warningf("Error removing route: %v", err)
+					}
+				}
+			}
+
+			n.removeFromRouteList(expectedRoute)
+
+		default:
+			log.Error("Internal error: unknown event type: ", int(evt.Type))
+		}
+	}
+}
+
+func (n *RouteNetwork) addToRouteList(newRoute *netroute.Route) {
+	for _, route := range n.routes {
+		if route.Equal(*newRoute) {
+			return
+		}
+	}
+
+	n.routes = append(n.routes, *newRoute)
+}
+
+func (n *RouteNetwork) removeFromRouteList(oldRoute *netroute.Route) {
+	for index, route := range n.routes {
+		if route.Equal(*oldRoute) {
+			n.routes = append(n.routes[:index], n.routes[index+1:]...)
+			return
+		}
+	}
+}
+
+func (n *RouteNetwork) routeCheck(ctx context.Context) {
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-time.After(routeCheckRetries * time.Second):
+			n.checkSubnetExistInRoutes()
+		}
+	}
+}
+
+func (n *RouteNetwork) checkSubnetExistInRoutes() {
+	netrouteHelper := netroute.New()
+
+	existingRoutes, err := netrouteHelper.GetNetRoutesAll()
+	if err != nil {
+		log.Errorf("Error enumerating routes: %v", err)
+		return
+	}
+	for _, expectedRoute := range n.routes {
+		exist := false
+		for _, existingRoute := range existingRoutes {
+			if expectedRoute.Equal(existingRoute) {
+				exist = true
+				break
+			}
+		}
+
+		if !exist {
+			err := netrouteHelper.NewNetRoute(expectedRoute.LinkIndex, expectedRoute.DestinationSubnet, expectedRoute.GatewayAddress)
+			if err != nil {
+				log.Warningf("Error recovering route to %v via %v on %v (%v).", expectedRoute.DestinationSubnet, expectedRoute.GatewayAddress, expectedRoute.LinkIndex, err)
+				continue
+			}
+			log.Infof("Recovered route to %v via %v on %v.", expectedRoute.DestinationSubnet, expectedRoute.GatewayAddress, expectedRoute.LinkIndex)
+		}
+	}
+}

+ 224 - 0
backend/vxlan/device_windows.go

@@ -0,0 +1,224 @@
+// Copyright 2018 flannel authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vxlan
+
+import (
+	"encoding/json"
+	"fmt"
+	"github.com/Microsoft/hcsshim"
+	"github.com/buger/jsonparser"
+	"github.com/coreos/flannel/pkg/ip"
+	log "github.com/golang/glog"
+	"github.com/juju/errors"
+	"github.com/rakelkar/gonetsh/netsh"
+	"k8s.io/apimachinery/pkg/util/wait"
+	utilexec "k8s.io/utils/exec"
+	"time"
+)
+
+type vxlanDeviceAttrs struct {
+	vni           uint32
+	name          string
+	gbp           bool
+	addressPrefix ip.IP4Net
+}
+
+type vxlanDevice struct {
+	link          *hcsshim.HNSNetwork
+	macPrefix     string
+	directRouting bool
+}
+
+func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) {
+	hnsNetwork := &hcsshim.HNSNetwork{
+		Name:    devAttrs.name,
+		Type:    "Overlay",
+		Subnets: make([]hcsshim.Subnet, 0, 1),
+	}
+
+	hnsNetwork, err := ensureNetwork(hnsNetwork, int64(devAttrs.vni), devAttrs.addressPrefix.String(), (devAttrs.addressPrefix.IP + 1).String())
+	if err != nil {
+		return nil, err
+	}
+
+	return &vxlanDevice{
+		link: hnsNetwork,
+	}, nil
+}
+
+func ensureNetwork(expectedNetwork *hcsshim.HNSNetwork, expectedVSID int64, expectedAddressPrefix, expectedGW string) (*hcsshim.HNSNetwork, error) {
+	createNetwork := true
+	networkName := expectedNetwork.Name
+
+	// 1. Check if the HNSNetwork exists and has the expected settings
+	existingNetwork, err := hcsshim.GetHNSNetworkByName(networkName)
+	if err == nil {
+		if existingNetwork.Type == expectedNetwork.Type {
+			for _, existingSubnet := range existingNetwork.Subnets {
+				if existingSubnet.AddressPrefix == expectedAddressPrefix && existingSubnet.GatewayAddress == expectedGW {
+					createNetwork = false
+					log.Infof("Found existing HNSNetwork %s", networkName)
+					break
+				}
+			}
+		}
+	}
+
+	// 2. Create a new HNSNetwork
+	if createNetwork {
+		if existingNetwork != nil {
+			if _, err := existingNetwork.Delete(); err != nil {
+				return nil, errors.Annotatef(err, "failed to delete existing HNSNetwork %s", networkName)
+			}
+			log.Infof("Deleted stale HNSNetwork %s", networkName)
+		}
+
+		// Add a VxLan subnet
+		expectedNetwork.Subnets = append(expectedNetwork.Subnets, hcsshim.Subnet{
+			AddressPrefix:  expectedAddressPrefix,
+			GatewayAddress: expectedGW,
+			Policies: []json.RawMessage{
+				[]byte(fmt.Sprintf(`{"Type":"VSID","VSID":%d}`, expectedVSID)),
+			},
+		})
+
+		// Config request params
+		jsonRequest, err := json.Marshal(expectedNetwork)
+		if err != nil {
+			return nil, errors.Annotatef(err, "failed to marshal %+v", expectedNetwork)
+		}
+
+		log.Infof("Attempting to create HNSNetwork %s", string(jsonRequest))
+		newNetwork, err := hcsshim.HNSNetworkRequest("POST", "", string(jsonRequest))
+		if err != nil {
+			return nil, errors.Annotatef(err, "failed to create HNSNetwork %s", networkName)
+		}
+
+		var waitErr, lastErr error
+		// Wait for the network to populate Management IP
+		log.Infof("Waiting to get ManagementIP from HNSNetwork %s", networkName)
+		waitErr = wait.Poll(500*time.Millisecond, 5*time.Second, func() (done bool, err error) {
+			newNetwork, lastErr = hcsshim.HNSNetworkRequest("GET", newNetwork.Id, "")
+			return newNetwork != nil && len(newNetwork.ManagementIP) != 0, nil
+		})
+		if waitErr == wait.ErrWaitTimeout {
+			return nil, errors.Annotatef(lastErr, "timeout, failed to get management IP from HNSNetwork %s", networkName)
+		}
+
+		// Wait for the interface with the management IP
+		netshHelper := netsh.New(utilexec.New())
+		log.Infof("Waiting to get net interface for HNSNetwork %s (%s)", networkName, newNetwork.ManagementIP)
+		waitErr = wait.Poll(500*time.Millisecond, 5*time.Second, func() (done bool, err error) {
+			_, lastErr = netshHelper.GetInterfaceByIP(newNetwork.ManagementIP)
+			return lastErr == nil, nil
+		})
+		if waitErr == wait.ErrWaitTimeout {
+			return nil, errors.Annotatef(lastErr, "timeout, failed to get net interface for HNSNetwork %s (%s)", networkName, newNetwork.ManagementIP)
+		}
+
+		log.Infof("Created HNSNetwork %s", networkName)
+		return newNetwork, nil
+	}
+
+	return existingNetwork, nil
+}
+
+type neighbor struct {
+	MAC               string
+	IP                ip.IP4
+	ManagementAddress string
+}
+
+func (dev *vxlanDevice) AddEndpoint(n *neighbor) error {
+	endpointName := createEndpointName(n.IP)
+
+	// 1. Check if the HNSEndpoint exists and has the expected settings
+	existingEndpoint, err := hcsshim.GetHNSEndpointByName(endpointName)
+	if err == nil && existingEndpoint.VirtualNetwork == dev.link.Id {
+		// Check policies if there is PA type
+		targetType := "PA"
+		for _, policy := range existingEndpoint.Policies {
+			policyType, _ := jsonparser.GetUnsafeString(policy, "Type")
+			if policyType == targetType {
+				actualPaIP, _ := jsonparser.GetUnsafeString(policy, targetType)
+				if actualPaIP == n.ManagementAddress {
+					log.Infof("Found existing remote HNSEndpoint %s", endpointName)
+					return nil
+				}
+			}
+		}
+	}
+
+	// 2. Create a new HNSNetwork
+	if existingEndpoint != nil {
+		if _, err := existingEndpoint.Delete(); err != nil {
+			return errors.Annotatef(err, "failed to delete existing remote HNSEndpoint %s", endpointName)
+		}
+		log.V(4).Infof("Deleted stale HNSEndpoint %s", endpointName)
+	}
+
+	newEndpoint := &hcsshim.HNSEndpoint{
+		Name:             endpointName,
+		IPAddress:        n.IP.ToIP(),
+		MacAddress:       n.MAC,
+		VirtualNetwork:   dev.link.Id,
+		IsRemoteEndpoint: true,
+		Policies: []json.RawMessage{
+			[]byte(fmt.Sprintf(`{"Type":"PA","PA":"%s"}`, n.ManagementAddress)),
+		},
+	}
+	if _, err := newEndpoint.Create(); err != nil {
+		return errors.Annotatef(err, "failed to create remote HNSEndpoint %s", endpointName)
+	}
+	log.V(4).Infof("Created HNSEndpoint %s", endpointName)
+
+	return nil
+}
+
+func (dev *vxlanDevice) DelEndpoint(n *neighbor) error {
+	endpointName := createEndpointName(n.IP)
+
+	existingEndpoint, err := hcsshim.GetHNSEndpointByName(endpointName)
+	if err == nil && existingEndpoint.VirtualNetwork == dev.link.Id {
+		// Check policies if there is PA type
+		targetType := "PA"
+		for _, policy := range existingEndpoint.Policies {
+			policyType, _ := jsonparser.GetUnsafeString(policy, "Type")
+			if policyType == targetType {
+				actualPaIP, _ := jsonparser.GetUnsafeString(policy, targetType)
+				if actualPaIP == n.ManagementAddress {
+					// Found it and delete
+					if _, err := existingEndpoint.Delete(); err != nil {
+						return errors.Annotatef(err, "failed to delete remote HNSEndpoint %s", endpointName)
+					}
+
+					log.V(4).Infof("Deleted HNSEndpoint %s", endpointName)
+					break
+				}
+			}
+		}
+	}
+
+	return nil
+}
+
+func (dev *vxlanDevice) ConjureMac(targetIP ip.IP4) string {
+	a, b, c, d := targetIP.Octets()
+	return fmt.Sprintf("%v-%02x-%02x-%02x-%02x", dev.macPrefix, a, b, c, d)
+}
+
+func createEndpointName(targetIP ip.IP4) string {
+	return "remote_" + targetIP.String()
+}

+ 125 - 0
backend/vxlan/vxlan_network_windows.go

@@ -0,0 +1,125 @@
+// Copyright 2015 flannel authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vxlan
+
+import (
+	log "github.com/golang/glog"
+	"golang.org/x/net/context"
+	"sync"
+
+	"github.com/coreos/flannel/backend"
+	"github.com/coreos/flannel/subnet"
+
+	"github.com/coreos/flannel/pkg/ip"
+	"strings"
+)
+
+type network struct {
+	backend.SimpleNetwork
+	dev       *vxlanDevice
+	subnetMgr subnet.Manager
+}
+
+const (
+	encapOverhead = 50
+)
+
+func newNetwork(subnetMgr subnet.Manager, extIface *backend.ExternalInterface, dev *vxlanDevice, _ ip.IP4Net, lease *subnet.Lease) (*network, error) {
+	nw := &network{
+		SimpleNetwork: backend.SimpleNetwork{
+			SubnetLease: lease,
+			ExtIface:    extIface,
+		},
+		subnetMgr: subnetMgr,
+		dev:       dev,
+	}
+
+	return nw, nil
+}
+
+func (nw *network) Run(ctx context.Context) {
+	wg := sync.WaitGroup{}
+
+	log.V(0).Info("Watching for new subnet leases")
+	events := make(chan []subnet.Event)
+	wg.Add(1)
+	go func() {
+		subnet.WatchLeases(ctx, nw.subnetMgr, nw.SubnetLease, events)
+		log.V(1).Info("WatchLeases exited")
+		wg.Done()
+	}()
+
+	defer wg.Wait()
+
+	for {
+		select {
+		case evtBatch := <-events:
+			nw.handleSubnetEvents(evtBatch)
+
+		case <-ctx.Done():
+			return
+		}
+	}
+}
+
+func (nw *network) MTU() int {
+	return nw.ExtIface.Iface.MTU - encapOverhead
+}
+
+func (nw *network) handleSubnetEvents(batch []subnet.Event) {
+	for _, event := range batch {
+		leaseSubnet := event.Lease.Subnet
+		leaseAttrs := event.Lease.Attrs
+		if !strings.EqualFold(leaseAttrs.BackendType, "vxlan") {
+			log.Warningf("ignoring non-vxlan subnet(%v): type=%v", leaseSubnet, leaseAttrs.BackendType)
+			continue
+		}
+
+		publicIP := leaseAttrs.PublicIP.String()
+		remoteIP := leaseSubnet.IP + 2
+		lastIP := leaseSubnet.Next().IP - 1
+
+		switch event.Type {
+		case subnet.EventAdded:
+			for ; remoteIP < lastIP; remoteIP++ {
+				n := &neighbor{
+					IP:                remoteIP,
+					MAC:               nw.dev.ConjureMac(remoteIP),
+					ManagementAddress: publicIP,
+				}
+
+				log.V(2).Infof("adding subnet: %v publicIP: %s vtepMAC: %s", leaseSubnet, n.ManagementAddress, n.MAC)
+				if err := nw.dev.AddEndpoint(n); err != nil {
+					log.Error(err)
+				}
+			}
+		case subnet.EventRemoved:
+			for ; remoteIP < lastIP; remoteIP++ {
+				n := &neighbor{
+					IP:                remoteIP,
+					MAC:               nw.dev.ConjureMac(remoteIP),
+					ManagementAddress: publicIP,
+				}
+
+				log.V(2).Infof("removing subnet: %v publicIP: %s vtepMAC: %s", leaseSubnet, n.ManagementAddress, n.MAC)
+				if err := nw.dev.DelEndpoint(n); err != nil {
+					log.Error(err)
+				}
+			}
+		default:
+			log.Error("internal error: unknown event type: ", int(event.Type))
+		}
+	}
+}

+ 120 - 2
backend/vxlan/vxlan_windows.go

@@ -11,14 +11,132 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// +build windows
 
 package vxlan
 
+// Some design notes:
+// VXLAN encapsulates L2 packets (though flannel is L3 only so don't expect to be able to send L2 packets across hosts)
+// Windows overlay decap works at L2 and so it needs the correct destination MAC for the remote host to work.
+// Windows does not expose an L3Miss interface so for now all possible remote IP/MAC pairs have to be configured upfront.
+//
+// In this scheme the scaling of table entries (per host) is:
+//  - 1 network entry for the overlay network
+//  - 1 endpoint per local container
+//  - N remote endpoints remote node (total endpoints =
 import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"sync"
+
 	log "github.com/golang/glog"
+
+	"golang.org/x/net/context"
+
+	"github.com/coreos/flannel/backend"
+	"github.com/coreos/flannel/pkg/ip"
+	"github.com/coreos/flannel/subnet"
+	"net"
 )
 
 func init() {
-	log.Infof("vxlan is not supported on this platform")
+	backend.Register("vxlan", New)
+}
+
+const (
+	defaultVNI = 4096
+	vxlanPort  = 4789
+)
+
+type VXLANBackend struct {
+	subnetMgr subnet.Manager
+	extIface  *backend.ExternalInterface
+}
+
+func New(sm subnet.Manager, extIface *backend.ExternalInterface) (backend.Backend, error) {
+	backend := &VXLANBackend{
+		subnetMgr: sm,
+		extIface:  extIface,
+	}
+
+	return backend, nil
+}
+
+func newSubnetAttrs(publicIP net.IP) (*subnet.LeaseAttrs, error) {
+	return &subnet.LeaseAttrs{
+		PublicIP:    ip.FromIP(publicIP),
+		BackendType: "vxlan",
+	}, nil
+}
+
+func (be *VXLANBackend) RegisterNetwork(ctx context.Context, wg sync.WaitGroup, config *subnet.Config) (backend.Network, error) {
+	// 1. Parse configuration
+	cfg := struct {
+		Name          string
+		MacPrefix     string
+		VNI           int
+		Port          int
+		GBP           bool
+		DirectRouting bool
+	}{
+		VNI:       defaultVNI,
+		Port:      vxlanPort,
+		MacPrefix: "0E-2A",
+	}
+
+	if len(config.Backend) > 0 {
+		if err := json.Unmarshal(config.Backend, &cfg); err != nil {
+			return nil, fmt.Errorf("error decoding VXLAN backend config: %v", err)
+		}
+	}
+
+	// 2. Verify configuration
+	if cfg.VNI < defaultVNI {
+		return nil, fmt.Errorf("invalid VXLAN backend config. VNI [%v] must be greater than or equal to %v on Windows", cfg.VNI, defaultVNI)
+	}
+	if cfg.Port != vxlanPort {
+		return nil, fmt.Errorf("invalid VXLAN backend config. Port [%v] is not supported on Windows. Omit the setting to default to port %v", cfg.Port, vxlanPort)
+	}
+	if cfg.DirectRouting {
+		return nil, errors.New("invalid VXLAN backend config. DirectRouting is not supported on Windows")
+	}
+	if cfg.GBP {
+		return nil, errors.New("invalid VXLAN backend config. GBP is not supported on Windows")
+	}
+	if len(cfg.MacPrefix) == 0 || len(cfg.MacPrefix) != 5 || cfg.MacPrefix[2] != '-' {
+		return nil, fmt.Errorf("invalid VXLAN backend config.MacPrefix [%v] is invalid, prefix must be of the format xx-xx e.g. 0E-2A", cfg.MacPrefix)
+	}
+	if len(cfg.Name) == 0 {
+		cfg.Name = fmt.Sprintf("flannel.%v", cfg.VNI)
+	}
+	log.Infof("VXLAN config: Name=%s MacPrefix=%s VNI=%d Port=%d GBP=%v DirectRouting=%v", cfg.Name, cfg.MacPrefix, cfg.VNI, cfg.Port, cfg.GBP, cfg.DirectRouting)
+
+	subnetAttrs, err := newSubnetAttrs(be.extIface.ExtAddr)
+	if err != nil {
+		return nil, err
+	}
+
+	lease, err := be.subnetMgr.AcquireLease(ctx, subnetAttrs)
+	switch err {
+	case nil:
+	case context.Canceled, context.DeadlineExceeded:
+		return nil, err
+	default:
+		return nil, fmt.Errorf("failed to acquire lease: %v", err)
+	}
+
+	devAttrs := vxlanDeviceAttrs{
+		vni:           uint32(cfg.VNI),
+		name:          cfg.Name,
+		addressPrefix: lease.Subnet,
+	}
+
+	dev, err := newVXLANDevice(&devAttrs)
+	if err != nil {
+		return nil, err
+	}
+	dev.directRouting = cfg.DirectRouting
+	dev.macPrefix = cfg.MacPrefix
+
+	return newNetwork(be.subnetMgr, be.extIface, dev, ip.IP4Net{}, lease)
 }