subnet.go 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431
  1. package subnet
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "net"
  7. "regexp"
  8. "strconv"
  9. "time"
  10. "github.com/coreos/rudder/Godeps/_workspace/src/github.com/coreos/go-etcd/etcd"
  11. log "github.com/coreos/rudder/Godeps/_workspace/src/github.com/golang/glog"
  12. "github.com/coreos/rudder/pkg/ip"
  13. "github.com/coreos/rudder/pkg/task"
  14. )
  // Lease acquisition and renewal tuning.
  const (
  	// registerRetries caps the number of attempts acquireLeaseOnce makes
  	// before it gives up.
  	registerRetries = 10

  	// subnetTTL is the TTL handed to the registry when a subnet key is
  	// created or updated (presumably seconds, i.e. one day -- confirm
  	// against the registry implementation).
  	subnetTTL = 24 * 3600

  	// renewMargin is how long before the lease expiration LeaseRenewer
  	// schedules the renewal.
  	renewMargin = time.Hour
  )
  // etcd error codes, compared against the ErrorCode field of etcd.EtcdError.
  const (
  	// etcdKeyNotFound: the requested key does not exist.
  	etcdKeyNotFound = 100
  	// etcdKeyAlreadyExists: a create hit a key that is already present.
  	etcdKeyAlreadyExists = 105
  	// etcdEventIndexCleared: the requested watch index is outside etcd's
  	// event history window.
  	etcdEventIndexCleared = 401
  )
  // Kinds of lease change carried in Event.Type.
  const (
  	// SubnetAdded reports a lease that appeared or was updated.
  	SubnetAdded = iota
  	// SubnetRemoved reports a lease that was deleted or expired.
  	SubnetRemoved
  )
  var (
  	// subnetRegex picks a "<dotted-quad>-<prefix length>" subnet out of a
  	// registry key. Submatch 1 is the address, submatch 2 the prefix length.
  	// Every dot is escaped so that only a literal '.' separates the octets;
  	// an unescaped '.' would accept any character in that position.
  	subnetRegex = regexp.MustCompile(`(\d+\.\d+\.\d+\.\d+)-(\d+)`)
  )
  33. type SubnetLease struct {
  34. Network ip.IP4Net
  35. Data string
  36. }
  37. type SubnetManager struct {
  38. registry subnetRegistry
  39. config *Config
  40. myLease SubnetLease
  41. leaseExp time.Time
  42. lastIndex uint64
  43. leases []SubnetLease
  44. }
  // EventType identifies the kind of change an Event describes
  // (SubnetAdded or SubnetRemoved).
  type EventType int
  46. type Event struct {
  47. Type EventType
  48. Lease SubnetLease
  49. }
  50. type EventBatch []Event
  51. func NewSubnetManager(etcdEndpoint, prefix string) (*SubnetManager, error) {
  52. esr := newEtcdSubnetRegistry(etcdEndpoint, prefix)
  53. return newSubnetManager(esr)
  54. }
  55. func (sm *SubnetManager) AcquireLease(extIP ip.IP4, data interface{}, cancel chan bool) (ip.IP4Net, error) {
  56. dataBytes, err := json.Marshal(data)
  57. if err != nil {
  58. return ip.IP4Net{}, err
  59. }
  60. var sn ip.IP4Net
  61. for {
  62. sn, err = sm.acquireLeaseOnce(extIP, string(dataBytes), cancel)
  63. switch {
  64. case err == nil:
  65. log.Info("Subnet lease acquired: ", sn)
  66. return sn, nil
  67. case err == task.ErrCanceled:
  68. return ip.IP4Net{}, err
  69. default:
  70. log.Error("Failed to acquire subnet: ", err)
  71. }
  72. select {
  73. case <-time.After(time.Second):
  74. case <-cancel:
  75. return ip.IP4Net{}, task.ErrCanceled
  76. }
  77. }
  78. }
  79. func (sm *SubnetManager) acquireLeaseOnce(extIP ip.IP4, data string, cancel chan bool) (ip.IP4Net, error) {
  80. for i := 0; i < registerRetries; i++ {
  81. var err error
  82. sm.leases, err = sm.getLeases()
  83. if err != nil {
  84. return ip.IP4Net{}, err
  85. }
  86. // try to reuse a subnet if there's one that matches our IP
  87. for _, l := range sm.leases {
  88. var ba BaseAttrs
  89. err = json.Unmarshal([]byte(l.Data), &ba)
  90. if err != nil {
  91. log.Error("Error parsing subnet lease JSON: ", err)
  92. } else {
  93. if extIP == ba.PublicIP {
  94. resp, err := sm.registry.updateSubnet(l.Network.StringSep(".", "-"), data, subnetTTL)
  95. if err != nil {
  96. return ip.IP4Net{}, err
  97. }
  98. sm.myLease.Network = l.Network
  99. sm.leaseExp = *resp.Node.Expiration
  100. return l.Network, nil
  101. }
  102. }
  103. }
  104. // no existing match, grab a new one
  105. sn, err := sm.allocateSubnet()
  106. if err != nil {
  107. return ip.IP4Net{}, err
  108. }
  109. resp, err := sm.registry.createSubnet(sn.StringSep(".", "-"), data, subnetTTL)
  110. switch {
  111. case err == nil:
  112. sm.myLease.Network = sn
  113. sm.leaseExp = *resp.Node.Expiration
  114. return sn, nil
  115. // if etcd returned Key Already Exists, try again.
  116. case err.(*etcd.EtcdError).ErrorCode == etcdKeyAlreadyExists:
  117. break
  118. default:
  119. return ip.IP4Net{}, err
  120. }
  121. // before moving on, check for cancel
  122. if interrupted(cancel) {
  123. return ip.IP4Net{}, task.ErrCanceled
  124. }
  125. }
  126. return ip.IP4Net{}, errors.New("Max retries reached trying to acquire a subnet")
  127. }
  128. func (sm *SubnetManager) UpdateSubnet(data string) error {
  129. resp, err := sm.registry.updateSubnet(sm.myLease.Network.StringSep(".", "-"), data, subnetTTL)
  130. sm.leaseExp = *resp.Node.Expiration
  131. return err
  132. }
  133. func (sm *SubnetManager) GetConfig() *Config {
  134. return sm.config
  135. }
  136. /// Implementation
  137. func parseSubnetKey(s string) (ip.IP4Net, error) {
  138. if parts := subnetRegex.FindStringSubmatch(s); len(parts) == 3 {
  139. snIp := net.ParseIP(parts[1]).To4()
  140. prefixLen, err := strconv.ParseUint(parts[2], 10, 5)
  141. if snIp != nil && err == nil {
  142. return ip.IP4Net{IP: ip.FromIP(snIp), PrefixLen: uint(prefixLen)}, nil
  143. }
  144. }
  145. return ip.IP4Net{}, errors.New("Error parsing IP Subnet")
  146. }
  147. func newSubnetManager(r subnetRegistry) (*SubnetManager, error) {
  148. cfgResp, err := r.getConfig()
  149. if err != nil {
  150. return nil, err
  151. }
  152. cfg, err := ParseConfig(cfgResp.Node.Value)
  153. if err != nil {
  154. return nil, err
  155. }
  156. sm := SubnetManager{
  157. registry: r,
  158. config: cfg,
  159. }
  160. return &sm, nil
  161. }
  162. func (sm *SubnetManager) getLeases() ([]SubnetLease, error) {
  163. resp, err := sm.registry.getSubnets()
  164. var leases []SubnetLease
  165. switch {
  166. case err == nil:
  167. for _, node := range resp.Node.Nodes {
  168. sn, err := parseSubnetKey(node.Key)
  169. if err == nil {
  170. lease := SubnetLease{sn, node.Value}
  171. leases = append(leases, lease)
  172. }
  173. }
  174. sm.lastIndex = resp.EtcdIndex
  175. case err.(*etcd.EtcdError).ErrorCode == etcdKeyNotFound:
  176. // key not found: treat it as empty set
  177. sm.lastIndex = err.(*etcd.EtcdError).Index
  178. default:
  179. return nil, err
  180. }
  181. return leases, nil
  182. }
  183. func deleteLease(l []SubnetLease, i int) []SubnetLease {
  184. l[i], l = l[len(l)-1], l[:len(l)-1]
  185. return l
  186. }
  187. func (sm *SubnetManager) applyLeases(newLeases []SubnetLease) EventBatch {
  188. var batch EventBatch
  189. for _, l := range newLeases {
  190. // skip self
  191. if l.Network.Equal(sm.myLease.Network) {
  192. continue
  193. }
  194. found := false
  195. for i, c := range sm.leases {
  196. if c.Network.Equal(l.Network) {
  197. sm.leases = deleteLease(sm.leases, i)
  198. found = true
  199. break
  200. }
  201. }
  202. if !found {
  203. // new subnet
  204. batch = append(batch, Event{SubnetAdded, l})
  205. }
  206. }
  207. // everything left in sm.leases has been deleted
  208. for _, c := range sm.leases {
  209. batch = append(batch, Event{SubnetRemoved, c})
  210. }
  211. sm.leases = newLeases
  212. return batch
  213. }
  214. func (sm *SubnetManager) applySubnetChange(action string, ipn ip.IP4Net, data string) Event {
  215. switch action {
  216. case "delete", "expire":
  217. for i, l := range sm.leases {
  218. if l.Network.Equal(ipn) {
  219. deleteLease(sm.leases, i)
  220. return Event{SubnetRemoved, l}
  221. }
  222. }
  223. log.Errorf("Removed subnet (%s) was not found", ipn)
  224. return Event{
  225. SubnetRemoved,
  226. SubnetLease{ipn, ""},
  227. }
  228. default:
  229. for i, l := range sm.leases {
  230. if l.Network.Equal(ipn) {
  231. sm.leases[i] = SubnetLease{ipn, data}
  232. return Event{SubnetAdded, sm.leases[i]}
  233. }
  234. }
  235. sm.leases = append(sm.leases, SubnetLease{ipn, data})
  236. return Event{SubnetAdded, sm.leases[len(sm.leases)-1]}
  237. }
  238. }
  239. type BaseAttrs struct {
  240. PublicIP ip.IP4
  241. }
  242. func (sm *SubnetManager) allocateSubnet() (ip.IP4Net, error) {
  243. log.Infof("Picking subnet in range %s ... %s", sm.config.SubnetMin, sm.config.SubnetMax)
  244. var bag []ip.IP4
  245. sn := ip.IP4Net{IP: sm.config.SubnetMin, PrefixLen: sm.config.SubnetLen}
  246. OuterLoop:
  247. for ; sn.IP <= sm.config.SubnetMax && len(bag) < 100; sn = sn.Next() {
  248. for _, l := range sm.leases {
  249. if sn.Overlaps(l.Network) {
  250. continue OuterLoop
  251. }
  252. }
  253. bag = append(bag, sn.IP)
  254. }
  255. if len(bag) == 0 {
  256. return ip.IP4Net{}, errors.New("out of subnets")
  257. } else {
  258. i := randInt(0, len(bag))
  259. return ip.IP4Net{IP: bag[i], PrefixLen: sm.config.SubnetLen}, nil
  260. }
  261. }
  262. func (sm *SubnetManager) WatchLeases(receiver chan EventBatch, cancel chan bool) {
  263. // "catch up" by replaying all the leases we discovered during
  264. // AcquireLease
  265. var batch EventBatch
  266. for _, l := range sm.leases {
  267. if !sm.myLease.Network.Equal(l.Network) {
  268. batch = append(batch, Event{SubnetAdded, l})
  269. }
  270. }
  271. if len(batch) > 0 {
  272. receiver <- batch
  273. }
  274. for {
  275. resp, err := sm.registry.watchSubnets(sm.lastIndex+1, cancel)
  276. // watchSubnets exited by cancel chan being signaled
  277. if err == nil && resp == nil {
  278. return
  279. }
  280. var batch *EventBatch
  281. if err == nil {
  282. batch, err = sm.parseSubnetWatchResponse(resp)
  283. } else {
  284. batch, err = sm.parseSubnetWatchError(err)
  285. }
  286. if err != nil {
  287. log.Errorf("%v", err)
  288. time.Sleep(time.Second)
  289. continue
  290. }
  291. if batch != nil {
  292. receiver <- *batch
  293. }
  294. }
  295. }
  296. func (sm *SubnetManager) parseSubnetWatchResponse(resp *etcd.Response) (batch *EventBatch, err error) {
  297. sm.lastIndex = resp.EtcdIndex
  298. sn, err := parseSubnetKey(resp.Node.Key)
  299. if err != nil {
  300. err = fmt.Errorf("Error parsing subnet IP: %s", resp.Node.Key)
  301. return
  302. }
  303. // Don't process our own changes
  304. if !sm.myLease.Network.Equal(sn) {
  305. evt := sm.applySubnetChange(resp.Action, sn, resp.Node.Value)
  306. batch = &EventBatch{evt}
  307. }
  308. return
  309. }
  310. func (sm *SubnetManager) parseSubnetWatchError(err error) (batch *EventBatch, out error) {
  311. etcdErr, ok := err.(*etcd.EtcdError)
  312. if ok && etcdErr.ErrorCode == etcdEventIndexCleared {
  313. // etcd maintains a history window for events and it's possible to fall behind.
  314. // to recover, get the current state and then "diff" against our cache to generate
  315. // events for the caller
  316. log.Warning("Watch of subnet leases failed because etcd index outside history window")
  317. leases, err := sm.getLeases()
  318. if err == nil {
  319. lb := sm.applyLeases(leases)
  320. batch = &lb
  321. } else {
  322. out = fmt.Errorf("Failed to retrieve subnet leases: %v", err)
  323. }
  324. } else {
  325. out = fmt.Errorf("Watch of subnet leases failed: %v", err)
  326. }
  327. return
  328. }
  329. func (sm *SubnetManager) LeaseRenewer(cancel chan bool) {
  330. dur := sm.leaseExp.Sub(time.Now()) - renewMargin
  331. for {
  332. select {
  333. case <-time.After(dur):
  334. resp, err := sm.registry.updateSubnet(sm.myLease.Network.StringSep(".", "-"), sm.myLease.Data, subnetTTL)
  335. if err != nil {
  336. log.Error("Error renewing lease (trying again in 1 min): ", err)
  337. dur = time.Minute
  338. continue
  339. }
  340. sm.leaseExp = *resp.Node.Expiration
  341. log.Info("Lease renewed, new expiration: ", sm.leaseExp)
  342. dur = sm.leaseExp.Sub(time.Now()) - renewMargin
  343. case <-cancel:
  344. return
  345. }
  346. }
  347. }
  // interrupted reports, without blocking, whether a value can be received
  // from cancel right now. A pending value is consumed; a closed channel
  // always reports true.
  func interrupted(cancel chan bool) bool {
  	select {
  	case <-cancel:
  		return true
  	default:
  	}
  	return false
  }
  355. }