3 * Report MTU on path to specified host
5 * (c) 2008 Straylight/Edgeware
8 /*----- Licensing notice --------------------------------------------------*
10 * This file is part of Trivial IP Encryption (TrIPE).
12 * TrIPE is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * TrIPE is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with TrIPE; if not, write to the Free Software Foundation,
24 * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
27 /*----- Header files ------------------------------------------------------*/
42 #include <sys/types.h>
46 #include <sys/socket.h>
47 #include <netinet/in.h>
48 #include <arpa/inet.h>
51 #include <netinet/in_systm.h>
52 #include <netinet/ip.h>
53 #include <netinet/ip_icmp.h>
54 #include <netinet/udp.h>
58 #include <sys/ioctl.h>
60 #include <mLib/alloc.h>
61 #include <mLib/bits.h>
62 #include <mLib/dstr.h>
64 #include <mLib/mdwopt.h>
65 #include <mLib/quis.h>
66 #include <mLib/report.h>
69 /*----- Static variables --------------------------------------------------*/
71 static unsigned char buf[65536];
75 /*----- Utility functions -------------------------------------------------*/
77 /* Step a value according to a simple LFSR. */
79 do (q) = ((q) & 0x8000) ? ((q) << 1) ^ POLY : ((q) << 1); while (0)
81 /* Fill buffer with a constant but pseudorandom string. Uses a simple
84 static void fillbuffer(unsigned char *p, size_t sz)
86 unsigned int y = 0xbc20;
87 const unsigned char *l = p + sz;
92 for (i = 0; i < 8; i++) STEP(y);
96 /* Convert a string to floating point. */
97 static double s2f(const char *s, const char *what)
104 if (errno || *q) die(EXIT_FAILURE, "bad %s", what);
108 /* Convert a floating-point value into a struct timeval. */
109 static void f2tv(struct timeval *tv, double t)
110 { tv->tv_sec = t; tv->tv_usec = (t - tv->tv_sec)*MILLION; }
112 /*----- Main algorithm skeleton -------------------------------------------*/
115 unsigned f; /* Various flags */
116 #define F_VERBOSE 1u /* Give a running commentary */
117 double retx; /* Initial retransmit interval */
118 double regr; /* Retransmit growth factor */
119 double timeout; /* Retransmission timeout */
120 int seqoff; /* Offset to write sequence number */
121 const struct probe_ops *pops; /* Probe algorithm description */
122 struct sockaddr_in sin; /* Destination address */
126 const struct param *pp;
132 const struct probe_ops *next;
134 int (*setup)(void *, int, const struct param *);
135 void (*finish)(void *);
136 void (*selprep)(void *, int *, fd_set *);
137 int (*xmit)(void *, int);
138 int (*selproc)(void *, fd_set *, struct probestate *);
149 /* or a positive MTU upper-bound */
152 /* Add a file descriptor FD to the set `fd_in', updating `*maxfd'. */
154 do { FD_SET(fd, fd_in); if (*maxfd < fd) *maxfd = fd; } while (0)
156 /* Check whether a buffer contains a packet from our current probe. */
157 static int mypacketp(struct probestate *ps,
158 const unsigned char *p, size_t sz)
160 const struct param *pp = ps->pp;
162 return (sz >= pp->seqoff + 2 && LOAD16(p + pp->seqoff) == ps->q);
165 /* See whether MTU is an acceptable MTU value. Return an appropriate
166 * RC_... code or a new suggested MTU.
168 static int probe(struct probestate *ps, void *st, int mtu)
170 const struct param *pp = ps->pp;
172 struct timeval tv, now, when, done;
173 double timer = pp->retx;
176 /* Set up the first retransmit and give-up timers. */
177 gettimeofday(&now, 0);
178 f2tv(&tv, pp->timeout); TV_ADD(&done, &now, &tv);
179 f2tv(&tv, timer); TV_ADD(&when, &now, &tv);
180 if (TV_CMP(&when, >, &done)) when = done;
182 /* Send the initial probe. */
183 if (pp->f & F_VERBOSE)
184 moan("sending probe of size %d (seq = %04x)", mtu, ps->q);
186 STORE16(buf + pp->seqoff, ps->q);
187 if ((rc = pp->pops->xmit(st, mtu)) != RC_OK) return (rc);
191 /* Wait for something interesting to happen. */
192 maxfd = 0; FD_ZERO(&fd_in);
193 pp->pops->selprep(st, &maxfd, &fd_in);
194 TV_SUB(&tv, &when, &now);
195 if (select(maxfd + 1, &fd_in, 0, 0, &tv) < 0) return (RC_FAIL);
196 gettimeofday(&now, 0);
198 /* See whether the probe method has any answers for us. */
199 if ((rc = pp->pops->selproc(st, &fd_in, ps)) != RC_OK) return (rc);
201 /* If we've waited too long, give up. If we should retransmit, do
204 if (TV_CMP(&now, >, &done))
206 else if (TV_CMP(&now, >, &when)) {
207 if (pp->f & F_VERBOSE) moan("re-sending probe of size %d", mtu);
208 if ((rc = pp->pops->xmit(st, mtu)) != RC_OK) return (rc);
210 timer *= pp->regr; f2tv(&tv, timer); TV_ADD(&when, &when, &tv);
211 } while (TV_CMP(&when, <, &now));
212 if (TV_CMP(&when, >, &done)) when = done;
217 /* Discover the path MTU to the destination address. */
218 static int pathmtu(const struct param *pp)
224 struct probestate ps;
226 /* Build and connect a UDP socket. We'll need this to know the local port
227 * number to use if nothing else. Set other stuff up.
229 if ((sk = socket(PF_INET, SOCK_DGRAM, 0)) < 0) goto fail_0;
230 if (connect(sk, (struct sockaddr *)&pp->sin, sizeof(pp->sin))) goto fail_1;
231 st = xmalloc(pp->pops->statesz);
232 if ((mtu = pp->pops->setup(st, sk, pp)) < 0) goto fail_2;
233 ps.pp = pp; ps.q = rand() & 0xffff;
236 /* And now we do a thing which is sort of like a binary search, except that
237 * we also take explicit clues as establishing a new upper bound, and we
238 * try to hug that initially.
241 rc = probe(&ps, st, mtu);
245 if (pp->f & F_VERBOSE) moan("probe failed");
249 /* If we've not seen a dropped packet before then we don't know what
250 * this means yet -- in particular, we don't know which bit of the
251 * network is swallowing packets. Send a minimum-size probe. If
252 * that doesn't come back then assume that the remote host is
253 * swallowing our packets. If it does, then we assume that dropped
254 * packets are a result of ICMP fragmentation-needed reports being
255 * lost or suppressed.
257 if (pp->f & F_VERBOSE) moan("gave up: black hole detected");
259 if (pp->f & F_VERBOSE) moan("sending minimum-size probe");
260 switch (probe(&ps, st, lo)) {
264 if (pp->f & F_VERBOSE) {
265 moan("no reply from min-size probe: "
266 "assume black hole at target");
271 if (pp->f & F_VERBOSE) {
272 moan("reply from min-size probe OK: "
273 "assume black hole in network");
278 if (pp->f & F_VERBOSE)
279 moan("unexpected return code from probe");
285 if (droppy) goto higher; else goto lower;
290 if (pp->f & F_VERBOSE)
291 moan("probe returned: remote host is not a black hole");
295 if (pp->f & F_VERBOSE) moan("probe returned: found correct MTU");
298 if (pp->f & F_VERBOSE) moan("probe returned: guessing higher");
300 mtu += (hi - lo + 1)/2;
306 if (pp->f & F_VERBOSE) moan("error returned: found correct MTU");
309 if (pp->f & F_VERBOSE) moan("error returned: guessing lower");
311 mtu -= (hi - lo + 1)/2;
315 if (pp->f & F_VERBOSE) moan("error returned with new MTU estimate");
322 /* Clean up and return our result. */
323 pp->pops->finish(st);
329 pp->pops->finish(st);
338 /*----- Doing it the hard way ---------------------------------------------*/
340 #if defined(linux) || defined(__OpenBSD__)
345 # define sane_htons htons
346 # define sane_htonl htonl
352 static int rawicmp = -1, rawudp = -1, rawerr = 0;
354 #define IPCK_INIT 0xffff
356 /* Compute an IP checksum over some data. This is a restartable interface:
357 * initialize A to `IPCK_INIT' for the first call.
359 static unsigned ipcksum(const void *buf, size_t n, unsigned a)
361 unsigned long aa = a ^ 0xffff;
362 const unsigned char *p = buf, *l = p + n;
364 while (p < l - 1) { aa += LOAD16_B(p); p += 2; }
365 if (p < l) { aa += (unsigned)(*p) << 8; }
366 do aa = (aa & 0xffff) + (aa >> 16); while (aa >= 0x10000);
367 return (aa == 0xffff ? aa : aa ^ 0xffff);
370 /* TCP/UDP pseudoheader structure. */
372 struct in_addr ph_src, ph_dst;
378 struct sockaddr_in me, sin;
379 int sk, rawicmp, rawudp;
383 static int raw_setup(void *stv, int sk, const struct param *pp)
385 struct raw_state *st = stv;
388 struct ifaddrs *ifa, *ifaa, *ifap;
391 /* If we couldn't acquire raw sockets, we fail here. */
392 if (rawerr) { errno = rawerr; goto fail_0; }
393 st->rawicmp = rawicmp; st->rawudp = rawudp; st->sk = sk;
395 /* Initialize the sequence number. */
396 st->q = rand() & 0xffff;
398 /* Snaffle the local and remote address and port number. */
401 if (getsockname(sk, (struct sockaddr *)&st->me, &sz))
404 /* There isn't a portable way to force the DF flag onto a packet through
405 * UDP, or even through raw IP, unless we write the entire IP header
406 * ourselves. This is somewhat annoying, especially since we have an
407 * uphill struggle keeping track of which systems randomly expect which
408 * header fields to be presented in host byte order. Oh, well.
411 if (setsockopt(rawudp, IPPROTO_IP, IP_HDRINCL, &i, sizeof(i))) goto fail_0;
413 /* Find an upper bound on the MTU. Do two passes over the interface
414 * list. If we can find matches for our local address then use the
415 * highest one of those; otherwise do a second pass and simply take the
416 * highest MTU of any network interface.
418 if (getifaddrs(&ifaa)) goto fail_0;
419 for (i = 0; i < 2; i++) {
420 for (ifap = 0, ifa = ifaa; ifa; ifa = ifa->ifa_next) {
421 if (!(ifa->ifa_flags & IFF_UP) || !ifa->ifa_addr ||
422 ifa->ifa_addr->sa_family != AF_INET ||
424 ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr !=
425 st->me.sin_addr.s_addr) ||
426 (i == 1 && ifap && strcmp(ifap->ifa_name, ifa->ifa_name) == 0) ||
427 strlen(ifa->ifa_name) >= sizeof(ifr.ifr_name))
430 strcpy(ifr.ifr_name, ifa->ifa_name);
431 if (ioctl(sk, SIOCGIFMTU, &ifr)) goto fail_1;
432 if (mtu < ifr.ifr_mtu) mtu = ifr.ifr_mtu;
436 if (mtu < 0) { errno = ENOTCONN; goto fail_1; }
448 static void raw_finish(void *stv) { ; }
450 static void raw_selprep(void *stv, int *maxfd, fd_set *fd_in)
451 { struct raw_state *st = stv; ADDFD(st->sk); ADDFD(st->rawicmp); }
453 static int raw_xmit(void *stv, int mtu)
455 struct raw_state *st = stv;
456 unsigned char b[65536], *p;
462 /* Build the IP header. */
465 ip->ip_hl = sizeof(*ip)/4;
466 ip->ip_tos = IPTOS_RELIABILITY;
467 ip->ip_len = sane_htons(mtu);
468 STEP(st->q); ip->ip_id = htons(st->q);
469 ip->ip_off = sane_htons(0 | IP_DF);
471 ip->ip_p = IPPROTO_UDP;
473 ip->ip_src = st->me.sin_addr;
474 ip->ip_dst = st->sin.sin_addr;
476 /* Build a UDP packet in the output buffer. */
477 udp = (struct udphdr *)(ip + 1);
478 udp->uh_sport = st->me.sin_port;
479 udp->uh_dport = st->sin.sin_port;
480 udp->uh_ulen = htons(mtu - sizeof(*ip));
483 /* Copy the payload. */
484 p = (unsigned char *)(udp + 1);
485 memcpy(p, buf, mtu - (p - b));
487 /* Calculate the UDP checksum. */
488 ph.ph_src = ip->ip_src;
489 ph.ph_dst = ip->ip_dst;
491 ph.ph_p = IPPROTO_UDP;
492 ph.ph_len = udp->uh_ulen;
494 ck = ipcksum(&ph, sizeof(ph), ck);
495 ck = ipcksum(udp, mtu - sizeof(*ip), ck);
496 udp->uh_sum = htons(ck);
498 /* Send the whole thing off. If we're too big for the interface then we
499 * might need to trim immediately.
501 if (sendto(st->rawudp, b, mtu, 0,
502 (struct sockaddr *)&st->sin, sizeof(st->sin)) < 0) {
503 if (errno == EMSGSIZE) return (RC_LOWER);
514 static int raw_selproc(void *stv, fd_set *fd_in, struct probestate *ps)
516 struct raw_state *st = stv;
517 unsigned char b[65536];
523 /* An ICMP packet: see what's inside. */
524 if (FD_ISSET(st->rawicmp, fd_in)) {
525 if ((n = read(st->rawicmp, b, sizeof(b))) < 0) goto fail_0;
528 if (n < sizeof(*ip) || n < sizeof(4*ip->ip_hl) ||
529 ip->ip_v != 4 || ip->ip_p != IPPROTO_ICMP)
531 n -= sizeof(4*ip->ip_hl);
533 icmp = (struct icmp *)(b + 4*ip->ip_hl);
534 if (n < sizeof(*icmp) || icmp->icmp_type != ICMP_UNREACH)
536 n -= offsetof(struct icmp, icmp_ip);
539 if (n < sizeof(*ip) ||
540 ip->ip_p != IPPROTO_UDP || ip->ip_hl != sizeof(*ip)/4 ||
541 ip->ip_id != htons(st->q) ||
542 ip->ip_src.s_addr != st->me.sin_addr.s_addr ||
543 ip->ip_dst.s_addr != st->sin.sin_addr.s_addr)
547 udp = (struct udphdr *)(ip + 1);
548 if (n < sizeof(udp) || udp->uh_sport != st->me.sin_port ||
549 udp->uh_dport != st->sin.sin_port)
553 if (icmp->icmp_code == ICMP_UNREACH_PORT) return (RC_HIGHER);
554 else if (icmp->icmp_code != ICMP_UNREACH_NEEDFRAG) goto skip_icmp;
555 else if (icmp->icmp_nextmtu) return (htons(icmp->icmp_nextmtu));
556 else return (RC_LOWER);
560 /* If we got a reply to the current probe then we're good. If we got an
561 * error, or the packet's sequence number is wrong, then ignore it.
563 if (FD_ISSET(st->sk, fd_in)) {
564 if ((n = read(st->sk, b, sizeof(b))) < 0) return (RC_OK);
565 else if (mypacketp(ps, b, n)) return (RC_HIGHER);
575 static const struct probe_ops raw_ops = {
576 "raw", OPS_CHAIN, sizeof(struct raw_state),
577 raw_setup, raw_finish,
578 raw_selprep, raw_xmit, raw_selproc
582 #define OPS_CHAIN &raw_ops
584 /*----- Doing the job on Linux --------------------------------------------*/
589 # define IP_MTU 14 /* Blech! */
596 static int linux_setup(void *stv, int sk, const struct param *pp)
598 struct linux_state *st = stv;
602 /* Snaffle the UDP socket. */
605 /* Turn on kernel path-MTU discovery and force DF on. */
607 if (setsockopt(st->sk, IPPROTO_IP, IP_MTU_DISCOVER, &i, sizeof(i)))
610 /* Read the initial MTU guess back and report it. */
612 if (getsockopt(st->sk, IPPROTO_IP, IP_MTU, &mtu, &sz))
619 static void linux_finish(void *stv) { ; }
621 static void linux_selprep(void *stv, int *maxfd, fd_set *fd_in)
622 { struct linux_state *st = stv; ADDFD(st->sk); }
624 static int linux_xmit(void *stv, int mtu)
626 struct linux_state *st = stv;
628 /* Write the packet. */
629 if (write(st->sk, buf, mtu - 28) >= 0) return (RC_OK);
630 else if (errno == EMSGSIZE) return (RC_LOWER);
631 else return (RC_FAIL);
634 static int linux_selproc(void *stv, fd_set *fd_in, struct probestate *ps)
636 struct linux_state *st = stv;
640 unsigned char b[65536];
642 /* Read an answer. If it looks like the right kind of error then report a
643 * success. This is potentially wrong, since we can't tell whether an
644 * error was delayed from an earlier probe. However, we never return
645 * RC_LOWER from this method, so the packet sizes ought to be monotonically
646 * decreasing and this won't cause trouble. Otherwise update from the
647 * kernel's idea of the right MTU.
649 if (FD_ISSET(st->sk, fd_in)) {
650 n = read(st->sk, &buf, sizeof(buf));
652 mypacketp(ps, b, n) :
653 errno == ECONNREFUSED || errno == EHOSTUNREACH)
656 if (getsockopt(st->sk, IPPROTO_IP, IP_MTU, &mtu, &sz))
663 static const struct probe_ops linux_ops = {
664 "linux", OPS_CHAIN, sizeof(struct linux_state),
665 linux_setup, linux_finish,
666 linux_selprep, linux_xmit, linux_selproc
670 #define OPS_CHAIN &linux_ops
674 /*----- Help options ------------------------------------------------------*/
676 static const struct probe_ops *probe_ops = OPS_CHAIN;
678 static void version(FILE *fp)
679 { pquis(fp, "$, TrIPE version " VERSION "\n"); }
681 static void usage(FILE *fp)
683 pquis(fp, "Usage: $ [-H HEADER] [-m METHOD]\n\
684 [-r SECS] [-g FACTOR] [-t SECS] HOST [PORT]\n");
687 static void help(FILE *fp)
689 const struct probe_ops *ops;
698 -h, --help Show this help text.\n\
699 -v, --version Show version number.\n\
700 -u, --usage Show brief usage message.\n\
702 -g, --growth=FACTOR Growth factor for retransmit interval.\n\
703 -m, --method=METHOD Use METHOD to probe for MTU.\n\
704 -r, --retransmit=SECS Retransmit if no reply after SEC.\n\
705 -t, --timeout=SECS Give up expecting a reply after SECS.\n\
706 -H, --header=HEX Packet header, in hexadecimal.\n\
710 for (ops = probe_ops; ops; ops = ops->next)
711 printf("\t%s\n", ops->name);
714 /*----- Main code ---------------------------------------------------------*/
716 int main(int argc, char *argv[])
718 struct param pp = { 0, 0.333, 3.0, 8.0, 0, OPS_CHAIN };
731 if ((rawicmp = socket(PF_INET, SOCK_RAW, IPPROTO_ICMP)) < 0 ||
732 (rawudp = socket(PF_INET, SOCK_RAW, IPPROTO_UDP)) < 0)
734 if (setuid(getuid()))
738 fillbuffer(buf, sizeof(buf));
739 pp.sin.sin_port = htons(7);
742 static const struct option opts[] = {
743 { "help", 0, 0, 'h' },
744 { "version", 0, 0, 'V' },
745 { "usage", 0, 0, 'u' },
746 { "header", OPTF_ARGREQ, 0, 'H' },
747 { "growth", OPTF_ARGREQ, 0, 'g' },
748 { "method", OPTF_ARGREQ, 0, 'm' },
749 { "retransmit", OPTF_ARGREQ, 0, 'r' },
750 { "timeout", OPTF_ARGREQ, 0, 't' },
751 { "verbose", 0, 0, 'v' },
755 i = mdwopt(argc, argv, "hVu" "H:g:m:r:t:v", opts, 0, 0, 0);
758 case 'h': help(stdout); exit(0);
759 case 'V': version(stdout); exit(0);
760 case 'u': usage(stdout); exit(0);
765 hex_decode(&hc, optarg, strlen(optarg), &d);
766 hex_decode(&hc, 0, 0, &d);
767 sz = d.len < 532 ? d.len : 532;
768 memcpy(buf, d.buf, sz);
772 case 'g': pp.regr = s2f(optarg, "retransmit growth factor"); break;
773 case 'r': pp.retx = s2f(optarg, "retransmit interval"); break;
774 case 't': pp.timeout = s2f(optarg, "timeout"); break;
777 for (pp.pops = OPS_CHAIN; pp.pops; pp.pops = pp.pops->next)
778 if (strcmp(pp.pops->name, optarg) == 0) goto found_alg;
779 die(EXIT_FAILURE, "unknown probe algorithm `%s'", optarg);
783 case 'v': pp.f |= F_VERBOSE; break;
790 argv += optind; argc -= optind;
791 if ((f & f_bogus) || 1 > argc || argc > 2) {
796 if ((h = gethostbyname(*argv)) == 0)
797 die(EXIT_FAILURE, "unknown host `%s': %s", *argv, hstrerror(h_errno));
798 if (h->h_addrtype != AF_INET)
799 die(EXIT_FAILURE, "unsupported address family for host `%s'", *argv);
800 memcpy(&pp.sin.sin_addr, h->h_addr, sizeof(struct in_addr));
805 u = strtoul(*argv, &q, 0);
807 pp.sin.sin_port = htons(u);
808 else if ((s = getservbyname(*argv, "udp")) == 0)
809 die(EXIT_FAILURE, "unknown UDP service `%s'", *argv);
811 pp.sin.sin_port = s->s_port;
814 pp.sin.sin_family = AF_INET;
817 die(EXIT_FAILURE, "failed to discover MTU: %s", strerror(errno));
819 if (ferror(stdout) || fflush(stdout) || fclose(stdout))
820 die(EXIT_FAILURE, "failed to write result: %s", strerror(errno));
824 /*----- That's all, folks -------------------------------------------------*/