proxy_amd64.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. // +build !windows
  15. #include <stdlib.h>
  16. #include <stdio.h>
  17. #include <stdarg.h>
  18. #include <memory.h>
  19. #include <assert.h>
  20. #include <errno.h>
  21. #include <poll.h>
  22. #include <unistd.h>
  23. #include <sys/types.h>
  24. #include <arpa/inet.h>
  25. #include <netinet/in.h>
  26. #include <linux/ip.h>
  27. #include <linux/icmp.h>
  28. #include <fcntl.h>
  29. #define CMD_DEFINE
  30. #include "proxy_amd64.h"
  /*
   * An IPv4 network described by a base address and a netmask.
   * The mask is produced by netmask() (network byte order); ip is stored
   * already masked so it can be compared directly with a masked address.
   */
  struct ip_net {
      in_addr_t ip;    /* network base address */
      in_addr_t mask;  /* netmask selecting the network bits */
  };
  35. struct route_entry {
  36. struct ip_net dst;
  37. struct sockaddr_in next_hop;
  38. };
  /*
   * Wire layout of an ICMP error message as built by this proxy:
   * outer IP header, ICMP header, then the quoted part of the offending
   * packet.
   *
   * A destination-unreachable message must quote the original IP header
   * plus the first 8 bytes of its upper-layer payload, so `data` is sized
   * for the largest IP header (with options) plus those 8 bytes.
   *
   * The 4-byte alignment allows the headers to be summed as 32-bit words.
   */
  typedef struct icmp_pkt {
      struct iphdr iph;
      struct icmphdr icmph;
      char data[sizeof(struct iphdr) + MAX_IPOPTLEN + 8];
  } __attribute__ ((aligned (4))) icmp_pkt;
  /*
   * Header checksums are computed by reading packet memory as 32-bit words.
   * The may_alias attribute tells the compiler that this type is allowed to
   * alias objects of other types, so those reads are not subject to
   * strict-aliasing assumptions.
   */
  typedef uint32_t __attribute__((__may_alias__)) aliasing_uint32_t;
  /*
   * Routing table: a heap array grown by set_route().  routes_cnt entries
   * are in use out of routes_alloc allocated slots.
   */
  struct route_entry *routes;
  size_t routes_alloc;
  size_t routes_cnt;

  /* Source address used for ICMP messages the proxy generates; set by run_proxy(). */
  in_addr_t tun_addr;

  /* Non-zero makes log_error() write to stderr; set by run_proxy(). */
  int log_enabled;

  /* Set to 1 by the CMD_STOP control command; ends the run_proxy() loop. */
  int exit_flag;
  /*
   * Build the IPv4 netmask for a CIDR prefix length.
   *
   * prefix_len - number of leading one bits; values outside [0, 32] are
   *              clamped to that range.
   *
   * Returns the mask in network byte order: 0 for /0, all ones for /32.
   *
   * The end cases are handled explicitly because shifting a 32-bit value
   * by 32 bits (prefix 0) or by a negative count (prefix > 32) is undefined
   * behaviour in C.
   */
  static inline in_addr_t netmask(int prefix_len) {
      if (prefix_len <= 0) {
          return 0;
      }
      if (prefix_len >= 32) {
          return htonl(~((uint32_t)0));
      }
      return htonl(~((uint32_t)0) << (32 - prefix_len));
  }
  57. static inline int contains(struct ip_net net, in_addr_t ip) {
  58. return net.ip == (ip & net.mask);
  59. }
  60. static void log_error(const char *fmt, ...) {
  61. va_list ap;
  62. if( log_enabled ) {
  63. va_start(ap, fmt);
  64. vfprintf(stderr, fmt, ap);
  65. va_end(ap);
  66. }
  67. }
  68. /* fast version -- only works with mults of 4 bytes */
  69. static uint16_t cksum(aliasing_uint32_t *buf, int len) {
  70. uint32_t sum = 0;
  71. uint16_t t1, t2;
  72. for( ; len > 0; len-- ) {
  73. uint32_t s = *buf++;
  74. sum += s;
  75. if( sum < s )
  76. sum++;
  77. }
  78. /* Fold down to 16 bits */
  79. t1 = sum;
  80. t2 = sum >> 16;
  81. t1 += t2;
  82. if( t1 < t2 )
  83. t1++;
  84. return ~t1;
  85. }
  86. static void send_net_unreachable(int tun, char *offender) {
  87. icmp_pkt pkt;
  88. int off_iph_len;
  89. struct iphdr *off_iph = (struct iphdr *)offender;
  90. size_t pktlen, nsent;
  91. off_iph_len = off_iph->ihl * 4;
  92. if( off_iph_len >= sizeof(struct iphdr) + MAX_IPOPTLEN ) {
  93. log_error("not sending net unreachable: mulformed ip pkt: iph=%d\n", (int)off_iph_len);
  94. return; /* ip pkt mulformed */
  95. }
  96. if( off_iph->protocol == IPPROTO_ICMP ) {
  97. /* To avoid infinite loops, RFC 792 instructs not to send ICMPs
  98. * about ICMPs */
  99. return;
  100. }
  101. /* Lower 3 bits (in network order) of frag_off is actually flags */
  102. if( (off_iph->frag_off & htons(0x1FFF)) != 0 ) {
  103. /* ICMP messages are only sent for first fragemnt */
  104. return;
  105. }
  106. pktlen = sizeof(struct iphdr) + sizeof(struct icmphdr) + off_iph_len + 8;
  107. memset(&pkt, 0, sizeof(pkt));
  108. /* Fill in the IP header */
  109. pkt.iph.ihl = sizeof(struct iphdr) / 4;
  110. pkt.iph.version = IPVERSION;
  111. pkt.iph.tot_len = htons(pktlen);
  112. pkt.iph.ttl = 8;
  113. pkt.iph.protocol = IPPROTO_ICMP;
  114. pkt.iph.saddr = tun_addr;
  115. pkt.iph.daddr = off_iph->saddr;
  116. pkt.iph.check = cksum((aliasing_uint32_t*) &pkt.iph, sizeof(struct iphdr) / sizeof(aliasing_uint32_t));
  117. /* Fill in the ICMP header */
  118. pkt.icmph.type = ICMP_DEST_UNREACH;
  119. pkt.icmph.code = ICMP_NET_UNREACH;
  120. /* Copy the offenders IP hdr + first 8 bytes of IP payload */
  121. memcpy(pkt.data, offender, off_iph_len + 8);
  122. /* Compute the checksum over the ICMP header and data */
  123. pkt.icmph.checksum = cksum((aliasing_uint32_t*) &pkt.icmph,
  124. (sizeof(struct icmphdr) + off_iph_len + 8) / sizeof(aliasing_uint32_t));
  125. /* Kick it back */
  126. nsent = write(tun, &pkt, pktlen);
  127. if( nsent < 0 ) {
  128. log_error("failed to send ICMP net unreachable: %s\n", strerror(errno));
  129. } else if( nsent != pktlen ) {
  130. log_error("failed to send ICMP net unreachable: only %d out of %d byte sent\n", (int)nsent, (int)pktlen);
  131. }
  132. }
  133. static int set_route(struct ip_net dst, struct sockaddr_in *next_hop) {
  134. size_t i;
  135. for( i = 0; i < routes_cnt; i++ ) {
  136. if( dst.ip == routes[i].dst.ip && dst.mask == routes[i].dst.mask ) {
  137. routes[i].next_hop = *next_hop;
  138. return 0;
  139. }
  140. }
  141. if( routes_alloc == routes_cnt ) {
  142. int new_alloc = (routes_alloc ? 2*routes_alloc : 8);
  143. struct route_entry *new_routes = (struct route_entry *) realloc(routes, new_alloc*sizeof(struct route_entry));
  144. if( !new_routes )
  145. return ENOMEM;
  146. routes = new_routes;
  147. routes_alloc = new_alloc;
  148. }
  149. routes[routes_cnt].dst = dst;
  150. routes[routes_cnt].next_hop = *next_hop;
  151. routes_cnt++;
  152. return 0;
  153. }
  154. static int del_route(struct ip_net dst) {
  155. size_t i;
  156. for( i = 0; i < routes_cnt; i++ ) {
  157. if( dst.ip == routes[i].dst.ip && dst.mask == routes[i].dst.mask ) {
  158. routes[i] = routes[routes_cnt-1];
  159. routes_cnt--;
  160. return 0;
  161. }
  162. }
  163. return ENOENT;
  164. }
  165. static struct sockaddr_in *find_route(in_addr_t dst) {
  166. size_t i;
  167. for( i = 0; i < routes_cnt; i++ ) {
  168. if( contains(routes[i].dst, dst) ) {
  169. // packets for same dest tend to come in bursts. swap to front make it faster for subsequent ones
  170. if( i != 0 ) {
  171. struct route_entry tmp = routes[i];
  172. routes[i] = routes[0];
  173. routes[0] = tmp;
  174. }
  175. return &routes[0].next_hop;
  176. }
  177. }
  178. return NULL;
  179. }
  /*
   * Format IPv4 address `a` as dotted-quad text into a caller buffer.
   *
   * a   - the address, in the form stored in struct in_addr.s_addr.
   * buf - destination buffer.
   * len - size of buf in bytes; the text is truncated to fit and is always
   *       NUL-terminated.  With len == 0 the buffer is not touched.
   *
   * Returns buf.  Copying out of inet_ntoa()'s static storage lets several
   * addresses be formatted for a single log call.
   */
  static char *inaddr_str(in_addr_t a, char *buf, size_t len) {
      struct in_addr addr;

      addr.s_addr = a;

      if (len > 0) {
          snprintf(buf, len, "%s", inet_ntoa(addr));
      }

      return buf;
  }
  /*
   * Read one packet from the TUN device.
   *
   * tun    - TUN file descriptor.
   * buf    - destination buffer.
   * buflen - size of buf in bytes.
   *
   * Returns the packet length (at least sizeof(struct iphdr)), or -1 when
   * nothing usable was read: a read error, no data available, or a packet
   * too short to hold an IP header.  Errors other than EAGAIN/EWOULDBLOCK
   * and short packets are logged.
   */
  static ssize_t tun_recv_packet(int tun, char *buf, size_t buflen) {
      ssize_t nread = read(tun, buf, buflen);

      /* Test the sign first: comparing a negative ssize_t with sizeof()
       * converts it to a huge unsigned value and would hide the error. */
      if (nread < 0) {
          if (errno != EAGAIN && errno != EWOULDBLOCK) {
              log_error("TUN recv failed: %s\n", strerror(errno));
          }
          return -1;
      }

      if ((size_t)nread < sizeof(struct iphdr)) {
          log_error("TUN recv packet too small: %d bytes\n", (int)nread);
          return -1;
      }

      return nread;
  }
  /*
   * Receive one encapsulated packet from the UDP socket without blocking.
   *
   * sock   - UDP socket descriptor.
   * buf    - destination buffer.
   * buflen - size of buf in bytes.
   *
   * Returns the packet length (at least sizeof(struct iphdr)), or -1 when
   * nothing usable was received: a receive error, no data available, or a
   * datagram too short to hold an IP header.  Errors other than
   * EAGAIN/EWOULDBLOCK and short packets are logged.
   */
  static ssize_t sock_recv_packet(int sock, char *buf, size_t buflen) {
      ssize_t nread = recv(sock, buf, buflen, MSG_DONTWAIT);

      /* Test the sign first: comparing a negative ssize_t with sizeof()
       * converts it to a huge unsigned value and would hide the error. */
      if (nread < 0) {
          if (errno != EAGAIN && errno != EWOULDBLOCK) {
              log_error("UDP recv failed: %s\n", strerror(errno));
          }
          return -1;
      }

      if ((size_t)nread < sizeof(struct iphdr)) {
          log_error("UDP recv packet too small: %d bytes\n", (int)nread);
          return -1;
      }

      return nread;
  }
  /*
   * Send one packet to `dst` over the UDP socket.
   *
   * sock   - UDP socket descriptor.
   * pkt    - packet bytes to send.
   * pktlen - number of bytes in pkt.
   * dst    - destination address and port.
   *
   * A failed or partial send is logged; nothing is reported to the caller.
   */
  static void sock_send_packet(int sock, char *pkt, size_t pktlen, struct sockaddr_in *dst) {
      ssize_t sent = sendto(sock, pkt, pktlen, 0, (struct sockaddr *)dst, sizeof(*dst));

      if ((size_t)sent == pktlen) {
          return;
      }

      if (sent >= 0) {
          log_error("Was only able to send %d out of %d bytes to %s:%hu\n",
                    (int)sent, (int)pktlen, inet_ntoa(dst->sin_addr), ntohs(dst->sin_port));
          return;
      }

      log_error("UDP send to %s:%hu failed: %s\n",
                inet_ntoa(dst->sin_addr), ntohs(dst->sin_port), strerror(errno));
  }
  /*
   * Write one packet to the TUN device.
   *
   * tun    - TUN file descriptor.
   * pkt    - packet bytes to write.
   * pktlen - number of bytes in pkt.
   *
   * A write that fails with EAGAIN/EWOULDBLOCK is retried immediately until
   * it either goes through or fails differently.  Other failures and
   * partial writes are logged; nothing is reported to the caller.
   */
  static void tun_send_packet(int tun, char *pkt, size_t pktlen) {
      for (;;) {
          ssize_t written = write(tun, pkt, pktlen);

          if ((size_t)written == pktlen) {
              return;
          }

          if (written >= 0) {
              log_error("Was only able to send %d out of %d bytes to TUN\n", (int)written, (int)pktlen);
              return;
          }

          if (errno != EAGAIN && errno != EWOULDBLOCK) {
              log_error("TUN send failed: %s\n", strerror(errno));
              return;
          }
          /* device not ready to accept the packet yet: try again */
      }
  }
  /*
   * Apply the forwarding TTL decrement to an IPv4 header in place and keep
   * its checksum consistent.
   *
   * iph - header of the packet being forwarded; modified.
   *
   * Returns 1 when the packet may be forwarded, 0 when its TTL is exhausted
   * and it must be dropped (the drop is logged, and ttl is left at 0).
   *
   * A packet that already arrives with TTL 0 is dropped as well; a plain
   * decrement would wrap its TTL to 255 and forward it.
   */
  static inline int decrement_ttl(struct iphdr *iph) {
      if (iph->ttl <= 1) {
          /* one buffer per address: both are formatted for the same call */
          char saddr[32], daddr[32];

          iph->ttl = 0;
          log_error("Discarding IP fragment %s -> %s due to zero TTL\n",
                    inaddr_str(iph->saddr, saddr, sizeof(saddr)),
                    inaddr_str(iph->daddr, daddr, sizeof(daddr)));
          return 0;
      }

      iph->ttl--;

      /* patch up IP checksum (see RFC 1624): lowering the TTL byte by one
       * raises the stored checksum by htons(0x100), with the carry out of
       * 16 bits added back in */
      if (iph->check >= htons(0xFFFFu - 0x100)) {
          iph->check += htons(0x100) + 1;
      } else {
          iph->check += htons(0x100);
      }

      return 1;
  }
  /*
   * Move at most one packet from the TUN device to the UDP socket.
   *
   * tun    - TUN file descriptor to read from.
   * sock   - UDP socket to forward on.
   * buf    - scratch buffer for the packet.
   * buflen - size of buf in bytes.
   *
   * The packet is routed on its destination address.  Without a matching
   * route an ICMP net-unreachable is sent back through the TUN; with an
   * exhausted TTL the packet is dropped.
   *
   * Returns 0 when no packet was read, 1 when one was read (whether it was
   * forwarded or not).
   */
  static int tun_to_udp(int tun, int sock, char *buf, size_t buflen) {
      ssize_t pktlen = tun_recv_packet(tun, buf, buflen);
      struct sockaddr_in *next_hop;
      struct iphdr *iph;

      if (pktlen < 0) {
          return 0;
      }

      iph = (struct iphdr *)buf;
      next_hop = find_route((in_addr_t)iph->daddr);

      if (next_hop == NULL) {
          send_net_unreachable(tun, buf);
      } else if (decrement_ttl(iph)) {
          sock_send_packet(sock, buf, pktlen, next_hop);
      }
      /* else: TTL went to 0, discard.
       * TODO: send back ICMP Time Exceeded
       */

      return 1;
  }
  /*
   * Move at most one packet from the UDP socket to the TUN device.
   *
   * sock   - UDP socket to receive from.
   * tun    - TUN file descriptor to write to.
   * buf    - scratch buffer for the packet.
   * buflen - size of buf in bytes.
   *
   * The TTL is decremented before delivery; a packet whose TTL is exhausted
   * is dropped.
   *
   * Returns 0 when no packet was received, 1 when one was received (whether
   * it was delivered or not).
   */
  static int udp_to_tun(int sock, int tun, char *buf, size_t buflen) {
      ssize_t pktlen = sock_recv_packet(sock, buf, buflen);

      if (pktlen < 0) {
          return 0;
      }

      if (decrement_ttl((struct iphdr *)buf)) {
          tun_send_packet(tun, buf, pktlen);
      }
      /* else: TTL went to 0, discard.
       * TODO: send back ICMP Time Exceeded
       */

      return 1;
  }
  293. static void process_cmd(int ctl) {
  294. struct command cmd;
  295. struct ip_net ipn;
  296. struct sockaddr_in sa = {
  297. .sin_family = AF_INET
  298. };
  299. ssize_t nrecv = recv(ctl, (char *) &cmd, sizeof(cmd), 0);
  300. if( nrecv < 0 ) {
  301. log_error("CTL recv failed: %s\n", strerror(errno));
  302. return;
  303. }
  304. if( cmd.cmd == CMD_SET_ROUTE ) {
  305. ipn.mask = netmask(cmd.dest_net_len);
  306. ipn.ip = cmd.dest_net & ipn.mask;
  307. sa.sin_addr.s_addr = cmd.next_hop_ip;
  308. sa.sin_port = htons(cmd.next_hop_port);
  309. set_route(ipn, &sa);
  310. } else if( cmd.cmd == CMD_DEL_ROUTE ) {
  311. ipn.mask = netmask(cmd.dest_net_len);
  312. ipn.ip = cmd.dest_net & ipn.mask;
  313. del_route(ipn);
  314. } else if( cmd.cmd == CMD_STOP ) {
  315. exit_flag = 1;
  316. }
  317. }
  /* Indices into the pollfd array used by run_proxy(). */
  enum PFD {
      PFD_TUN  = 0,  /* TUN device */
      PFD_SOCK = 1,  /* UDP socket */
      PFD_CTL  = 2,  /* control socket */
      PFD_CNT  = 3   /* number of polled descriptors */
  };
  324. void run_proxy(int tun, int sock, int ctl, in_addr_t tun_ip, size_t tun_mtu, int log_errors) {
  325. char *buf;
  326. struct pollfd fds[PFD_CNT] = {
  327. {
  328. .fd = tun,
  329. .events = POLLIN
  330. },
  331. {
  332. .fd = sock,
  333. .events = POLLIN
  334. },
  335. {
  336. .fd = ctl,
  337. .events = POLLIN
  338. },
  339. };
  340. exit_flag = 0;
  341. tun_addr = tun_ip;
  342. log_enabled = log_errors;
  343. buf = (char *) malloc(tun_mtu);
  344. if( !buf ) {
  345. log_error("Failed to allocate %d byte buffer\n", tun_mtu);
  346. exit(1);
  347. }
  348. fcntl(tun, F_SETFL, O_NONBLOCK);
  349. while( !exit_flag ) {
  350. int nfds = poll(fds, PFD_CNT, -1), activity;
  351. if( nfds < 0 ) {
  352. if( errno == EINTR )
  353. continue;
  354. log_error("Poll failed: %s\n", strerror(errno));
  355. exit(1);
  356. }
  357. if( fds[PFD_CTL].revents & POLLIN )
  358. process_cmd(ctl);
  359. if( fds[PFD_TUN].revents & POLLIN || fds[PFD_SOCK].revents & POLLIN )
  360. do {
  361. activity = 0;
  362. activity += tun_to_udp(tun, sock, buf, tun_mtu);
  363. activity += udp_to_tun(sock, tun, buf, tun_mtu);
  364. /* As long as tun or udp is readable bypass poll().
  365. * We'll just occasionally get EAGAIN on an unreadable fd which
  366. * is cheaper than the poll() call, the rest of the time the
  367. * read/recvfrom call moves data which poll() never does for us.
  368. *
  369. * This is at the expense of the ctl socket, a counter could be
  370. * used to place an upper bound on how long we may neglect ctl.
  371. */
  372. } while( activity );
  373. }
  374. free(buf);
  375. }