proxy.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. #include <stdlib.h>
  15. #include <stdio.h>
  16. #include <stdarg.h>
  17. #include <memory.h>
  18. #include <assert.h>
  19. #include <errno.h>
  20. #include <poll.h>
  21. #include <unistd.h>
  22. #include <sys/types.h>
  23. #include <arpa/inet.h>
  24. #include <netinet/in.h>
  25. #include <linux/ip.h>
  26. #include <linux/icmp.h>
  27. #include <fcntl.h>
  28. #define CMD_DEFINE
  29. #include "proxy.h"
  /* An IPv4 network described by a base address and a netmask. */
  struct ip_net {
      in_addr_t ip;   /* network address; compared against (addr & mask) */
      in_addr_t mask; /* netmask applied to an address before comparing */
  };
  34. struct route_entry {
  35. struct ip_net dst;
  36. struct sockaddr_in next_hop;
  37. };
  /* Layout of an outgoing ICMP error packet: IP header, ICMP header, payload. */
  typedef struct icmp_pkt {
      struct iphdr iph;
      struct icmphdr icmph;
      /* A destination-unreachable message must carry the IP header of the
       * original packet plus the first 8 bytes of its upper-layer protocol,
       * so leave room for the largest possible header. */
      char data[sizeof(struct iphdr) + MAX_IPOPTLEN + 8];
  } __attribute__ ((aligned (4))) icmp_pkt;
  /* Header checksums are computed by reading packet structs as 32-bit words;
   * this type is marked may_alias so such reads may alias other types. */
  typedef uint32_t __attribute__((__may_alias__)) aliasing_uint32_t;
  /* Routing table: heap array grown by set_route(). */
  struct route_entry *routes;
  /* Number of entries `routes` has room for. */
  size_t routes_alloc;
  /* Number of entries of `routes` currently in use. */
  size_t routes_cnt;
  /* Address of the TUN interface; used as the source of generated ICMP replies. */
  in_addr_t tun_addr;
  /* Non-zero makes log_error() write to stderr. */
  int log_enabled;
  /* Set to non-zero to make run_proxy() leave its main loop. */
  int exit_flag;
  /*
   * Build an IPv4 netmask, in network byte order, whose top `prefix_len` bits
   * are set.
   *
   * A prefix_len of 0 or less yields an all-zero mask and 32 or more yields an
   * all-ones mask. Those cases are handled without shifting because shifting a
   * 32-bit value by 32 or more bits (or by a negative count) is undefined in C.
   */
  static inline in_addr_t netmask(int prefix_len) {
      const uint32_t all_ones = ~((uint32_t)0);

      if( prefix_len <= 0 )
          return 0;
      if( prefix_len >= 32 )
          return htonl(all_ones);
      return htonl(all_ones << (32 - prefix_len));
  }
  56. static inline int contains(struct ip_net net, in_addr_t ip) {
  57. return net.ip == (ip & net.mask);
  58. }
  59. static void log_error(const char *fmt, ...) {
  60. va_list ap;
  61. if( log_enabled ) {
  62. va_start(ap, fmt);
  63. vfprintf(stderr, fmt, ap);
  64. va_end(ap);
  65. }
  66. }
  67. /* fast version -- only works with mults of 4 bytes */
  68. static uint16_t cksum(aliasing_uint32_t *buf, int len) {
  69. uint32_t sum = 0;
  70. uint16_t t1, t2;
  71. for( ; len > 0; len-- ) {
  72. uint32_t s = *buf++;
  73. sum += s;
  74. if( sum < s )
  75. sum++;
  76. }
  77. /* Fold down to 16 bits */
  78. t1 = sum;
  79. t2 = sum >> 16;
  80. t1 += t2;
  81. if( t1 < t2 )
  82. t1++;
  83. return ~t1;
  84. }
  85. static void send_net_unreachable(int tun, char *offender) {
  86. icmp_pkt pkt;
  87. int off_iph_len;
  88. struct iphdr *off_iph = (struct iphdr *)offender;
  89. size_t pktlen, nsent;
  90. off_iph_len = off_iph->ihl * 4;
  91. if( off_iph_len >= sizeof(struct iphdr) + MAX_IPOPTLEN ) {
  92. log_error("not sending net unreachable: mulformed ip pkt: iph=%d\n", (int)off_iph_len);
  93. return; /* ip pkt mulformed */
  94. }
  95. if( off_iph->protocol == IPPROTO_ICMP ) {
  96. /* To avoid infinite loops, RFC 792 instructs not to send ICMPs
  97. * about ICMPs */
  98. return;
  99. }
  100. /* Lower 3 bits (in network order) of frag_off is actually flags */
  101. if( (off_iph->frag_off & htons(0x1FFF)) != 0 ) {
  102. /* ICMP messages are only sent for first fragemnt */
  103. return;
  104. }
  105. pktlen = sizeof(struct iphdr) + sizeof(struct icmphdr) + off_iph_len + 8;
  106. memset(&pkt, 0, sizeof(pkt));
  107. /* Fill in the IP header */
  108. pkt.iph.ihl = sizeof(struct iphdr) / 4;
  109. pkt.iph.version = IPVERSION;
  110. pkt.iph.tot_len = htons(pktlen);
  111. pkt.iph.ttl = 8;
  112. pkt.iph.protocol = IPPROTO_ICMP;
  113. pkt.iph.saddr = tun_addr;
  114. pkt.iph.daddr = off_iph->saddr;
  115. pkt.iph.check = cksum((aliasing_uint32_t*) &pkt.iph, sizeof(struct iphdr) / sizeof(aliasing_uint32_t));
  116. /* Fill in the ICMP header */
  117. pkt.icmph.type = ICMP_DEST_UNREACH;
  118. pkt.icmph.code = ICMP_NET_UNREACH;
  119. /* Copy the offenders IP hdr + first 8 bytes of IP payload */
  120. memcpy(pkt.data, offender, off_iph_len + 8);
  121. /* Compute the checksum over the ICMP header and data */
  122. pkt.icmph.checksum = cksum((aliasing_uint32_t*) &pkt.icmph,
  123. (sizeof(struct icmphdr) + off_iph_len + 8) / sizeof(aliasing_uint32_t));
  124. /* Kick it back */
  125. nsent = write(tun, &pkt, pktlen);
  126. if( nsent < 0 ) {
  127. log_error("failed to send ICMP net unreachable: %s\n", strerror(errno));
  128. } else if( nsent != pktlen ) {
  129. log_error("failed to send ICMP net unreachable: only %d out of %d byte sent\n", (int)nsent, (int)pktlen);
  130. }
  131. }
  132. static int set_route(struct ip_net dst, struct sockaddr_in *next_hop) {
  133. size_t i;
  134. for( i = 0; i < routes_cnt; i++ ) {
  135. if( dst.ip == routes[i].dst.ip && dst.mask == routes[i].dst.mask ) {
  136. routes[i].next_hop = *next_hop;
  137. return 0;
  138. }
  139. }
  140. if( routes_alloc == routes_cnt ) {
  141. int new_alloc = (routes_alloc ? 2*routes_alloc : 8);
  142. struct route_entry *new_routes = (struct route_entry *) realloc(routes, new_alloc*sizeof(struct route_entry));
  143. if( !new_routes )
  144. return ENOMEM;
  145. routes = new_routes;
  146. routes_alloc = new_alloc;
  147. }
  148. routes[routes_cnt].dst = dst;
  149. routes[routes_cnt].next_hop = *next_hop;
  150. routes_cnt++;
  151. return 0;
  152. }
  153. static int del_route(struct ip_net dst) {
  154. size_t i;
  155. for( i = 0; i < routes_cnt; i++ ) {
  156. if( dst.ip == routes[i].dst.ip && dst.mask == routes[i].dst.mask ) {
  157. routes[i] = routes[routes_cnt-1];
  158. routes_cnt--;
  159. return 0;
  160. }
  161. }
  162. return ENOENT;
  163. }
  164. static struct sockaddr_in *find_route(in_addr_t dst) {
  165. size_t i;
  166. for( i = 0; i < routes_cnt; i++ ) {
  167. if( contains(routes[i].dst, dst) ) {
  168. // packets for same dest tend to come in bursts. swap to front make it faster for subsequent ones
  169. if( i != 0 ) {
  170. struct route_entry tmp = routes[i];
  171. routes[i] = routes[0];
  172. routes[0] = tmp;
  173. }
  174. return &routes[0].next_hop;
  175. }
  176. }
  177. return NULL;
  178. }
  /*
   * Format IPv4 address `a` as dotted-decimal text into `buf`.
   *
   * At most `len` bytes are written, always including a terminating NUL;
   * longer text is truncated. With len == 0 nothing is written at all.
   * Returns `buf`.
   */
  static char *inaddr_str(in_addr_t a, char *buf, size_t len) {
      struct in_addr addr;

      /* No room even for the terminator: leave the buffer untouched. */
      if( len == 0 )
          return buf;

      addr.s_addr = a;
      snprintf(buf, len, "%s", inet_ntoa(addr));
      return buf;
  }
  /*
   * Read one packet from the TUN descriptor into `buf` (at most `buflen`
   * bytes).
   *
   * Returns the packet length, or -1 when the read failed or the packet is
   * shorter than an IP header. Failures other than EAGAIN/EWOULDBLOCK are
   * logged, as are too-short packets.
   */
  static ssize_t tun_recv_packet(int tun, char *buf, size_t buflen) {
      ssize_t nread = read(tun, buf, buflen);

      /* Test for failure before comparing against sizeof: in a mixed
       * signed/unsigned comparison -1 turns into a huge unsigned value. */
      if( nread < 0 ) {
          if( errno != EAGAIN && errno != EWOULDBLOCK )
              log_error("TUN recv failed: %s\n", strerror(errno));
          return -1;
      }

      if( (size_t)nread < sizeof(struct iphdr) ) {
          log_error("TUN recv packet too small: %d bytes\n", (int)nread);
          return -1;
      }

      return nread;
  }
  /*
   * Receive one packet from the UDP socket into `buf` (at most `buflen`
   * bytes) without blocking.
   *
   * Returns the packet length, or -1 when the receive failed or the packet
   * is shorter than an IP header. Failures other than EAGAIN/EWOULDBLOCK are
   * logged, as are too-short packets.
   */
  static ssize_t sock_recv_packet(int sock, char *buf, size_t buflen) {
      ssize_t nread = recv(sock, buf, buflen, MSG_DONTWAIT);

      /* Test for failure before comparing against sizeof: in a mixed
       * signed/unsigned comparison -1 turns into a huge unsigned value. */
      if( nread < 0 ) {
          if( errno != EAGAIN && errno != EWOULDBLOCK )
              log_error("UDP recv failed: %s\n", strerror(errno));
          return -1;
      }

      if( (size_t)nread < sizeof(struct iphdr) ) {
          log_error("UDP recv packet too small: %d bytes\n", (int)nread);
          return -1;
      }

      return nread;
  }
  /*
   * Send `pktlen` bytes at `pkt` to `dst` over the UDP socket.
   * A failed or partial send is logged; nothing is reported to the caller.
   */
  static void sock_send_packet(int sock, char *pkt, size_t pktlen, struct sockaddr_in *dst) {
      ssize_t nsent = sendto(sock, pkt, pktlen, 0, (struct sockaddr *)dst, sizeof(*dst));

      if( nsent == pktlen )
          return;

      if( nsent < 0 ) {
          log_error("UDP send to %s:%hu failed: %s\n",
              inet_ntoa(dst->sin_addr), ntohs(dst->sin_port), strerror(errno));
          return;
      }

      log_error("Was only able to send %d out of %d bytes to %s:%hu\n",
          (int)nsent, (int)pktlen, inet_ntoa(dst->sin_addr), ntohs(dst->sin_port));
  }
  /*
   * Write `pktlen` bytes at `pkt` to the TUN descriptor.
   * The write is repeated for as long as it fails with EAGAIN/EWOULDBLOCK;
   * any other failure, or a partial write, is logged and the packet dropped.
   */
  static void tun_send_packet(int tun, char *pkt, size_t pktlen) {
      for( ;; ) {
          ssize_t nsent = write(tun, pkt, pktlen);

          if( nsent == pktlen )
              return;

          if( nsent >= 0 ) {
              log_error("Was only able to send %d out of %d bytes to TUN\n", (int)nsent, (int)pktlen);
              return;
          }

          if( errno != EAGAIN && errno != EWOULDBLOCK ) {
              log_error("TUN send failed: %s\n", strerror(errno));
              return;
          }
          /* descriptor not ready yet: try again */
      }
  }
  /*
   * Decrement the TTL of an IP header in place and patch its checksum.
   *
   * Returns 1 when the packet may be forwarded. Returns 0 (after logging)
   * when the TTL is exhausted: that is, when it arrived as 1 or already as
   * 0. In that case the TTL field is left at 0 and the checksum is not
   * updated, since the caller discards the packet.
   */
  inline static int decrement_ttl(struct iphdr *iph) {
      /* Check before decrementing: an incoming TTL of 0 would otherwise wrap
       * around to 255 and the packet would be forwarded. */
      if( iph->ttl <= 1 ) {
          char saddr[32], daddr[32];

          iph->ttl = 0;
          log_error("Discarding IP fragment %s -> %s due to zero TTL\n",
              inaddr_str(iph->saddr, saddr, sizeof(saddr)),
              inaddr_str(iph->daddr, daddr, sizeof(daddr)));
          return 0;
      }

      iph->ttl--;

      /* Patch up the IP checksum incrementally (see RFC 1624): the checksum
       * is increased by htons(0x100) to account for the TTL decrement, with
       * an extra 1 added when the 16-bit sum would overflow. */
      if( iph->check >= htons(0xFFFFu - 0x100) ) {
          iph->check += htons(0x100) + 1;
      } else {
          iph->check += htons(0x100);
      }

      return 1;
  }
  /*
   * Move one packet from the TUN descriptor to the UDP socket.
   *
   * The packet is sent to the next hop found for its destination address.
   * Without a route an ICMP net-unreachable reply is generated instead, and
   * a packet whose TTL runs out is dropped.
   *
   * Returns 0 when no packet could be read, 1 when one was read (whether or
   * not it ended up being forwarded).
   */
  static int tun_to_udp(int tun, int sock, char *buf, size_t buflen) {
      struct iphdr *iph = (struct iphdr *)buf;
      struct sockaddr_in *next_hop;
      ssize_t pktlen;

      pktlen = tun_recv_packet(tun, buf, buflen);
      if( pktlen < 0 )
          return 0;

      next_hop = find_route((in_addr_t) iph->daddr);
      if( !next_hop ) {
          send_net_unreachable(tun, buf);
      } else if( decrement_ttl(iph) ) {
          sock_send_packet(sock, buf, pktlen, next_hop);
      }
      /* else: TTL went to 0, discard.
       * TODO: send back ICMP Time Exceeded
       */

      return 1;
  }
  /*
   * Move one packet from the UDP socket to the TUN descriptor, dropping it
   * if its TTL runs out.
   *
   * Returns 0 when no packet could be received, 1 when one was received
   * (whether or not it ended up being written to the TUN descriptor).
   */
  static int udp_to_tun(int sock, int tun, char *buf, size_t buflen) {
      struct iphdr *iph = (struct iphdr *)buf;
      ssize_t pktlen;

      pktlen = sock_recv_packet(sock, buf, buflen);
      if( pktlen < 0 )
          return 0;

      if( decrement_ttl(iph) ) {
          tun_send_packet(tun, buf, pktlen);
      }
      /* else: TTL went to 0, discard.
       * TODO: send back ICMP Time Exceeded
       */

      return 1;
  }
  292. static void process_cmd(int ctl) {
  293. struct command cmd;
  294. struct ip_net ipn;
  295. struct sockaddr_in sa = {
  296. .sin_family = AF_INET
  297. };
  298. ssize_t nrecv = recv(ctl, (char *) &cmd, sizeof(cmd), 0);
  299. if( nrecv < 0 ) {
  300. log_error("CTL recv failed: %s\n", strerror(errno));
  301. return;
  302. }
  303. if( cmd.cmd == CMD_SET_ROUTE ) {
  304. ipn.mask = netmask(cmd.dest_net_len);
  305. ipn.ip = cmd.dest_net & ipn.mask;
  306. sa.sin_addr.s_addr = cmd.next_hop_ip;
  307. sa.sin_port = htons(cmd.next_hop_port);
  308. set_route(ipn, &sa);
  309. } else if( cmd.cmd == CMD_DEL_ROUTE ) {
  310. ipn.mask = netmask(cmd.dest_net_len);
  311. ipn.ip = cmd.dest_net & ipn.mask;
  312. del_route(ipn);
  313. } else if( cmd.cmd == CMD_STOP ) {
  314. exit_flag = 1;
  315. }
  316. }
  /* Indices into the pollfd array used by run_proxy(). */
  enum PFD {
      PFD_TUN = 0, /* TUN descriptor */
      PFD_SOCK,    /* UDP socket */
      PFD_CTL,     /* control socket */
      PFD_CNT      /* number of polled descriptors */
  };
  323. void run_proxy(int tun, int sock, int ctl, in_addr_t tun_ip, size_t tun_mtu, int log_errors) {
  324. char *buf;
  325. struct pollfd fds[PFD_CNT] = {
  326. {
  327. .fd = tun,
  328. .events = POLLIN
  329. },
  330. {
  331. .fd = sock,
  332. .events = POLLIN
  333. },
  334. {
  335. .fd = ctl,
  336. .events = POLLIN
  337. },
  338. };
  339. exit_flag = 0;
  340. tun_addr = tun_ip;
  341. log_enabled = log_errors;
  342. buf = (char *) malloc(tun_mtu);
  343. if( !buf ) {
  344. log_error("Failed to allocate %d byte buffer\n", tun_mtu);
  345. exit(1);
  346. }
  347. fcntl(tun, F_SETFL, O_NONBLOCK);
  348. while( !exit_flag ) {
  349. int nfds = poll(fds, PFD_CNT, -1), activity;
  350. if( nfds < 0 ) {
  351. if( errno == EINTR )
  352. continue;
  353. log_error("Poll failed: %s\n", strerror(errno));
  354. exit(1);
  355. }
  356. if( fds[PFD_CTL].revents & POLLIN )
  357. process_cmd(ctl);
  358. if( fds[PFD_TUN].revents & POLLIN || fds[PFD_SOCK].revents & POLLIN )
  359. do {
  360. activity = 0;
  361. activity += tun_to_udp(tun, sock, buf, tun_mtu);
  362. activity += udp_to_tun(sock, tun, buf, tun_mtu);
  363. /* As long as tun or udp is readable bypass poll().
  364. * We'll just occasionally get EAGAIN on an unreadable fd which
  365. * is cheaper than the poll() call, the rest of the time the
  366. * read/recvfrom call moves data which poll() never does for us.
  367. *
  368. * This is at the expense of the ctl socket, a counter could be
  369. * used to place an upper bound on how long we may neglect ctl.
  370. */
  371. } while( activity );
  372. }
  373. free(buf);
  374. }