chiark / gitweb /
pathmtu/pathmtu.c: Place `addreq' with the rest of the `raw' strategy.
[tripe] / pathmtu / pathmtu.c
CommitLineData
c64d8cd5
MW
1/* -*-c-*-
2 *
3 * Report MTU on path to specified host
4 *
5 * (c) 2008 Straylight/Edgeware
6 */
7
8/*----- Licensing notice --------------------------------------------------*
9 *
10 * This file is part of Trivial IP Encryption (TrIPE).
11 *
11ad66c2
MW
12 * TrIPE is free software: you can redistribute it and/or modify it under
13 * the terms of the GNU General Public License as published by the Free
14 * Software Foundation; either version 3 of the License, or (at your
15 * option) any later version.
c64d8cd5 16 *
11ad66c2
MW
17 * TrIPE is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
20 * for more details.
c64d8cd5
MW
21 *
22 * You should have received a copy of the GNU General Public License
11ad66c2 23 * along with TrIPE. If not, see <https://www.gnu.org/licenses/>.
c64d8cd5
MW
24 */
25
26/*----- Header files ------------------------------------------------------*/
27
28#include "config.h"
29
d245350a 30#include <assert.h>
c64d8cd5 31#include <errno.h>
88510d86 32#include <stddef.h>
c64d8cd5
MW
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <time.h>
37
38#include <sys/types.h>
39#include <sys/time.h>
40#include <unistd.h>
41
42#include <sys/socket.h>
43#include <netinet/in.h>
44#include <arpa/inet.h>
45#include <netdb.h>
46
88510d86
MW
47#include <netinet/in_systm.h>
48#include <netinet/ip.h>
49#include <netinet/ip_icmp.h>
102fa2f0
MW
50#include <netinet/ip6.h>
51#include <netinet/icmp6.h>
88510d86
MW
52#include <netinet/udp.h>
53
54#include <net/if.h>
55#include <ifaddrs.h>
56#include <sys/ioctl.h>
57
58#include <mLib/alloc.h>
59#include <mLib/bits.h>
c64d8cd5
MW
60#include <mLib/dstr.h>
61#include <mLib/hex.h>
62#include <mLib/mdwopt.h>
63#include <mLib/quis.h>
64#include <mLib/report.h>
65#include <mLib/tv.h>
66
67/*----- Static variables --------------------------------------------------*/
68
/* The probe payload.  It is filled with pseudorandom filler at startup; a
 * user-supplied header and the probe sequence number are written over the
 * front of it, and the probe strategies transmit a prefix of the result.
 */
static unsigned char buf[65536];

/* Feedback polynomial for the 16-bit LFSR implemented by `STEP'. */
#define POLY 0x1002d
72
c64d8cd5
MW
73/*----- Utility functions -------------------------------------------------*/
74
88510d86
MW
/* Advance the LFSR state held in the lvalue Q by one step: shift it left one
 * place and, if bit 15 was set beforehand, fold in the feedback polynomial
 * `POLY'.  Note that Q is evaluated more than once.
 */
#define STEP(q)                                                         \
  do {                                                                  \
    if ((q) & 0x8000) (q) = ((q) << 1) ^ POLY;                          \
    else (q) = ((q) << 1);                                              \
  } while (0)
78
c64d8cd5
MW
/* Fill the SZ bytes at P with a fixed pseudorandom string.  Each byte is the
 * low half of the state of a simple LFSR, which always starts from the same
 * seed and is stepped eight times between outputs, so the contents are the
 * same on every call.
 */
static void fillbuffer(unsigned char *p, size_t sz)
{
  unsigned int state = 0xbc20;
  size_t i;
  int k;

  for (i = 0; i < sz; i++) {
    p[i] = state & 0xff;
    for (k = 8; k; k--) STEP(state);
  }
}
93
88510d86
MW
/* Convert the string S to floating point and return the result.  WHAT
 * describes the quantity being read, for the error message.  If nothing can
 * be converted (e.g., S is empty), if there is trailing junk, or if
 * strtod(3) sets `errno', then report a fatal error via `die'.
 */
static double s2f(const char *s, const char *what)
{
  double f;
  char *q;

  errno = 0;
  f = strtod(s, &q);

  /* `q == s' means that no conversion was performed at all; checking `*q'
   * alone would let an empty string through as zero.
   */
  if (errno || q == s || *q) die(EXIT_FAILURE, "bad %s", what);
  return (f);
}
c64d8cd5 105
88510d86
MW
106/* Convert a floating-point value into a struct timeval. */
107static void f2tv(struct timeval *tv, double t)
108 { tv->tv_sec = t; tv->tv_usec = (t - tv->tv_sec)*MILLION; }
109
454f5a1a
MW
/* A socket address of any of the families we can handle. */
union addr {
  struct sockaddr sa;                   /* Generic view, for `sa_family' */
  struct sockaddr_in sin;               /* IPv4 address and port */
  struct sockaddr_in6 sin6;             /* IPv6 address and port */
};
115
22062fb6
MW
/* Return 1 if AF is an address family we have any support for, or 0 if
 * not.
 */
static int addrfamok(int af)
  { return (af == AF_INET || af == AF_INET6); }
124
454f5a1a
MW
125/* Return the size of a socket address. */
126static size_t addrsz(const union addr *a)
127{
128 switch (a->sa.sa_family) {
129 case AF_INET: return (sizeof(a->sin));
22062fb6 130 case AF_INET6: return (sizeof(a->sin6));
454f5a1a
MW
131 default: abort();
132 }
133}
134
88510d86
MW
135/*----- Main algorithm skeleton -------------------------------------------*/
136
137struct param {
138 unsigned f; /* Various flags */
139#define F_VERBOSE 1u /* Give a running commentary */
140 double retx; /* Initial retransmit interval */
141 double regr; /* Retransmit growth factor */
142 double timeout; /* Retransmission timeout */
143 int seqoff; /* Offset to write sequence number */
144 const struct probe_ops *pops; /* Probe algorithm description */
454f5a1a 145 union addr a; /* Destination address */
88510d86
MW
146};
147
/* State shared by all of the probes in a discovery run. */
struct probestate {
  const struct param *pp;               /* The parameters for this run */
  unsigned q;                           /* Current probe sequence number */
};
152
153struct probe_ops {
154 const char *name;
155 const struct probe_ops *next;
156 size_t statesz;
157 int (*setup)(void *, int, const struct param *);
158 void (*finish)(void *);
159 void (*selprep)(void *, int *, fd_set *);
160 int (*xmit)(void *, int);
161 int (*selproc)(void *, fd_set *, struct probestate *);
162};
163
164#define OPS_CHAIN 0
165
/* Result codes from probing.  A positive value is not one of these: it's a
 * new upper bound on the MTU.
 */
enum {
  RC_FAIL = -99,                        /* Something went wrong: give up */
  RC_OK = 0,                            /* Nothing to report (yet) */
  RC_LOWER = -1,                        /* Probe was too big: guess lower */
  RC_HIGHER = -2,                       /* Probe got through: guess higher */
  RC_NOREPLY = -3                       /* Timed out waiting for an answer */
};
174
/* Add the file descriptor FD to the set `fd_in', raising `*maxfd' if
 * necessary.  Both `fd_in' and `maxfd' are taken from the scope in which the
 * macro is used.
 */
#define ADDFD(fd)                                                       \
  do {                                                                  \
    FD_SET(fd, fd_in);                                                  \
    if (*maxfd < fd) *maxfd = fd;                                       \
  } while (0)
178
179/* Check whether a buffer contains a packet from our current probe. */
180static int mypacketp(struct probestate *ps,
181 const unsigned char *p, size_t sz)
182{
183 const struct param *pp = ps->pp;
c64d8cd5 184
88510d86
MW
185 return (sz >= pp->seqoff + 2 && LOAD16(p + pp->seqoff) == ps->q);
186}
187
188/* See whether MTU is an acceptable MTU value. Return an appropriate
189 * RC_... code or a new suggested MTU.
190 */
191static int probe(struct probestate *ps, void *st, int mtu)
c64d8cd5 192{
88510d86 193 const struct param *pp = ps->pp;
c64d8cd5 194 fd_set fd_in;
88510d86
MW
195 struct timeval tv, now, when, done;
196 double timer = pp->retx;
197 int rc, maxfd;
198
199 /* Set up the first retransmit and give-up timers. */
200 gettimeofday(&now, 0);
201 f2tv(&tv, pp->timeout); TV_ADD(&done, &now, &tv);
202 f2tv(&tv, timer); TV_ADD(&when, &now, &tv);
203 if (TV_CMP(&when, >, &done)) when = done;
204
205 /* Send the initial probe. */
206 if (pp->f & F_VERBOSE)
207 moan("sending probe of size %d (seq = %04x)", mtu, ps->q);
208 STEP(ps->q);
209 STORE16(buf + pp->seqoff, ps->q);
210 if ((rc = pp->pops->xmit(st, mtu)) != RC_OK) return (rc);
211
212 for (;;) {
213
214 /* Wait for something interesting to happen. */
215 maxfd = 0; FD_ZERO(&fd_in);
216 pp->pops->selprep(st, &maxfd, &fd_in);
217 TV_SUB(&tv, &when, &now);
218 if (select(maxfd + 1, &fd_in, 0, 0, &tv) < 0) return (RC_FAIL);
219 gettimeofday(&now, 0);
220
221 /* See whether the probe method has any answers for us. */
222 if ((rc = pp->pops->selproc(st, &fd_in, ps)) != RC_OK) return (rc);
223
224 /* If we've waited too long, give up. If we should retransmit, do
225 * that.
226 */
227 if (TV_CMP(&now, >, &done))
228 return (RC_NOREPLY);
229 else if (TV_CMP(&now, >, &when)) {
230 if (pp->f & F_VERBOSE) moan("re-sending probe of size %d", mtu);
231 if ((rc = pp->pops->xmit(st, mtu)) != RC_OK) return (rc);
232 do {
233 timer *= pp->regr; f2tv(&tv, timer); TV_ADD(&when, &when, &tv);
234 } while (TV_CMP(&when, <, &now));
235 if (TV_CMP(&when, >, &done)) when = done;
236 }
237 }
238}
c64d8cd5 239
88510d86
MW
240/* Discover the path MTU to the destination address. */
241static int pathmtu(const struct param *pp)
242{
243 int sk;
244 int mtu, lo, hi;
245 int rc, droppy = -1;
246 void *st;
247 struct probestate ps;
248
249 /* Build and connect a UDP socket. We'll need this to know the local port
250 * number to use if nothing else. Set other stuff up.
251 */
454f5a1a
MW
252 if ((sk = socket(pp->a.sa.sa_family, SOCK_DGRAM, IPPROTO_UDP)) < 0)
253 goto fail_0;
254 if (connect(sk, &pp->a.sa, addrsz(&pp->a))) goto fail_1;
88510d86
MW
255 st = xmalloc(pp->pops->statesz);
256 if ((mtu = pp->pops->setup(st, sk, pp)) < 0) goto fail_2;
257 ps.pp = pp; ps.q = rand() & 0xffff;
22062fb6
MW
258 switch (pp->a.sa.sa_family) {
259 case AF_INET: lo = 576; break;
260 case AF_INET6: lo = 1280; break;
261 default: abort();
262 }
263 hi = mtu;
264 if (hi < lo) { errno = EMSGSIZE; return (-1); }
88510d86
MW
265
266 /* And now we do a thing which is sort of like a binary search, except that
267 * we also take explicit clues as establishing a new upper bound, and we
268 * try to hug that initially.
269 */
c64d8cd5 270 for (;;) {
d245350a
MW
271 assert(lo <= mtu && mtu <= hi);
272 if (pp->f & F_VERBOSE) moan("probe: %d <= %d <= %d", lo, mtu, hi);
88510d86
MW
273 rc = probe(&ps, st, mtu);
274 switch (rc) {
275
276 case RC_FAIL:
277 if (pp->f & F_VERBOSE) moan("probe failed");
278 goto fail_3;
279
280 case RC_NOREPLY:
281 /* If we've not seen a dropped packet before then we don't know what
282 * this means yet -- in particular, we don't know which bit of the
283 * network is swallowing packets. Send a minimum-size probe. If
284 * that doesn't come back then assume that the remote host is
285 * swallowing our packets. If it does, then we assume that dropped
286 * packets are a result of ICMP fragmentation-needed reports being
287 * lost or suppressed.
288 */
289 if (pp->f & F_VERBOSE) moan("gave up: black hole detected");
290 if (droppy == -1) {
291 if (pp->f & F_VERBOSE) moan("sending minimum-size probe");
292 switch (probe(&ps, st, lo)) {
293 case RC_FAIL:
294 goto fail_3;
295 case RC_NOREPLY:
296 if (pp->f & F_VERBOSE) {
297 moan("no reply from min-size probe: "
298 "assume black hole at target");
299 }
300 droppy = 1;
301 break;
302 case RC_HIGHER:
303 if (pp->f & F_VERBOSE) {
304 moan("reply from min-size probe OK: "
305 "assume black hole in network");
306 }
307 droppy = 0;
308 break;
309 default:
310 if (pp->f & F_VERBOSE)
311 moan("unexpected return code from probe");
312 errno = ENOTCONN;
313 goto fail_3;
314 }
315 }
316
317 if (droppy) goto higher; else goto lower;
318
319 case RC_HIGHER:
320 higher:
321 if (droppy == -1) {
322 if (pp->f & F_VERBOSE)
323 moan("probe returned: remote host is not a black hole");
324 droppy = 0;
325 }
326 if (mtu == hi) {
327 if (pp->f & F_VERBOSE) moan("probe returned: found correct MTU");
328 goto done;
329 }
88510d86 330 lo = mtu;
d245350a
MW
331
332 /* Now we must make a new guess, between lo and hi. We know that lo
333 * is good; but we're not so sure about hi here. We know that hi >
334 * lo, so this will find an approximate midpoint, greater than lo and
335 * no more than hi.
336 */
337 if (pp->f & F_VERBOSE) moan("probe returned: guessing higher");
88510d86
MW
338 mtu += (hi - lo + 1)/2;
339 break;
340
341 case RC_LOWER:
342 lower:
d245350a
MW
343 /* If this didn't work, and we're already at the bottom of our
344 * possible range, then something has gone horribly wrong.
345 */
346 assert(lo < mtu);
347 hi = mtu - 1;
348 if (lo == hi) {
88510d86 349 if (pp->f & F_VERBOSE) moan("error returned: found correct MTU");
d245350a 350 mtu = lo;
88510d86
MW
351 goto done;
352 }
d245350a
MW
353
354 /* We must make a new guess, between lo and hi. We're probably
355 * fairly sure that lo will succeed, since either it's the minimum
356 * MTU or we've tested it already; but we're not quite sure about hi,
357 * so we want to aim high.
358 */
88510d86 359 if (pp->f & F_VERBOSE) moan("error returned: guessing lower");
88510d86
MW
360 mtu -= (hi - lo + 1)/2;
361 break;
362
363 default:
364 if (pp->f & F_VERBOSE) moan("error returned with new MTU estimate");
365 mtu = hi = rc;
366 break;
367 }
c64d8cd5 368 }
88510d86
MW
369
370done:
371 /* Clean up and return our result. */
372 pp->pops->finish(st);
373 xfree(st);
c64d8cd5
MW
374 close(sk);
375 return (mtu);
376
88510d86
MW
377fail_3:
378 pp->pops->finish(st);
379fail_2:
380 xfree(st);
c64d8cd5
MW
381fail_1:
382 close(sk);
383fail_0:
384 return (-1);
385}
386
88510d86
MW
387/*----- Doing it the hard way ---------------------------------------------*/
388
/* Systems on which the IP header fields we hand to a raw socket are all
 * converted with `htons'/`htonl'.  (Presumably the others want some fields
 * left in host byte order: confirm before adding a system here.)
 */
#if defined(linux) || defined(__OpenBSD__)
#  define IPHDR_SANE
#endif

/* Conversions for the affected header fields: the real thing on `sane'
 * systems, and nothing at all elsewhere.
 */
#ifdef IPHDR_SANE
#  define sane_htons htons
#  define sane_htonl htonl
#else
#  define sane_htons
#  define sane_htonl
#endif

/* The raw sockets, opened by `main'.  If they couldn't be opened for a
 * family, the corresponding `rawerr' variable holds the `errno' value.
 */
static int rawicmp = -1;
static int rawudp = -1;
static int rawerr = 0;
static int rawicmp6 = -1;
static int rawudp6 = -1;
static int rawerr6 = 0;

/* Initial accumulator value for `ipcksum'. */
#define IPCK_INIT 0xffff
405
f03efaf5
MW
406/* Compare two addresses. Maybe compare the port numbers too. */
407#define AEF_PORT 1u
408static int addreq(const union addr *a, const union addr *b, unsigned f)
409{
410 switch (a->sa.sa_family) {
411 case AF_INET:
412 return (a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr &&
413 (!(f&AEF_PORT) || a->sin.sin_port == b->sin.sin_port));
414 case AF_INET6:
415 return (!memcmp(a->sin6.sin6_addr.s6_addr,
416 b->sin6.sin6_addr.s6_addr, 16) &&
417 (!(f&AEF_PORT) || a->sin6.sin6_port == b->sin6.sin6_port));
418 default:
419 abort();
420 }
421}
422
88510d86
MW
/* Compute an IP checksum over the N bytes at BUF, treated as big-endian
 * 16-bit words; an odd trailing byte is taken as the high half of a final
 * word.  This is a restartable interface: initialize A to `IPCK_INIT' for
 * the first call, and pass the previous result for subsequent calls.
 *
 * Returns the complement of the folded sum -- except that if the complement
 * would be zero, the uncomplemented sum 0xffff is returned instead.
 */
static unsigned ipcksum(const void *buf, size_t n, unsigned a)
{
  unsigned long aa = a ^ 0xffff;
  const unsigned char *p = buf;

  /* Count bytes down rather than comparing against an end pointer, so that
   * we never form a pointer before the start of an empty buffer.
   */
  for (; n >= 2; p += 2, n -= 2) aa += ((unsigned long)p[0] << 8) | p[1];
  if (n) aa += (unsigned long)p[0] << 8;

  /* Fold the carries back in until the sum fits in 16 bits. */
  do aa = (aa & 0xffff) + (aa >> 16); while (aa >= 0x10000);
  return (aa == 0xffff ? aa : aa ^ 0xffff);
}
436
/* TCP/UDP pseudoheader structures, for IPv4 and IPv6 respectively.  These
 * are fed to `ipcksum' ahead of the UDP packet proper.
 */
struct phdr {
  struct in_addr ph_src;                /* Source address */
  struct in_addr ph_dst;                /* Destination address */
  uint8_t ph_z;                         /* Zero */
  uint8_t ph_p;                         /* Protocol number */
  uint16_t ph_len;                      /* UDP length */
};

struct phdr6 {
  struct in6_addr ph6_src;              /* Source address */
  struct in6_addr ph6_dst;              /* Destination address */
  uint32_t ph6_len;                     /* UDP length */
  uint8_t ph6_z0, ph6_z1, ph6_z2;       /* Zero */
  uint8_t ph6_nxt;                      /* Next-header (protocol) number */
};
88510d86
MW
448
449struct raw_state {
454f5a1a 450 union addr me, a;
88510d86 451 int sk, rawicmp, rawudp;
5854b1cc 452 uint16_t srcport, dstport;
88510d86
MW
453 unsigned q;
454};
455
456static int raw_setup(void *stv, int sk, const struct param *pp)
457{
458 struct raw_state *st = stv;
cb160b86 459 socklen_t sz;
88510d86
MW
460 int i, mtu = -1;
461 struct ifaddrs *ifa, *ifaa, *ifap;
462 struct ifreq ifr;
102fa2f0 463 struct icmp6_filter f6;
88510d86 464
454f5a1a
MW
465 /* Check that the address is OK, and that we have the necessary raw
466 * sockets.
102fa2f0
MW
467 *
468 * For IPv6, also set the filter so we don't get too many useless wakeups.
454f5a1a
MW
469 */
470 switch (pp->a.sa.sa_family) {
471 case AF_INET:
472 if (rawerr) { errno = rawerr; goto fail_0; }
473 st->rawicmp = rawicmp; st->rawudp = rawudp; st->sk = sk;
102fa2f0
MW
474 /* IPv4 filtering is available on Linux but isn't portable. */
475 break;
476 case AF_INET6:
477 if (rawerr6) { errno = rawerr6; goto fail_0; }
478 st->rawicmp = rawicmp6; st->rawudp = rawudp6; st->sk = sk;
479 ICMP6_FILTER_SETBLOCKALL(&f6);
480 ICMP6_FILTER_SETPASS(ICMP6_PACKET_TOO_BIG, &f6);
481 ICMP6_FILTER_SETPASS(ICMP6_DST_UNREACH, &f6);
482 if (setsockopt(st->rawicmp, IPPROTO_ICMPV6, ICMP6_FILTER,
483 &f6, sizeof(f6))) {
484 die(EXIT_FAILURE, "failed to set icmpv6 filter: %s",
485 strerror(errno));
486 }
454f5a1a
MW
487 break;
488 default:
489 errno = EPFNOSUPPORT; goto fail_0;
490 }
88510d86
MW
491
492 /* Initialize the sequence number. */
493 st->q = rand() & 0xffff;
494
495 /* Snaffle the local and remote address and port number. */
454f5a1a 496 st->a = pp->a;
88510d86 497 sz = sizeof(st->me);
454f5a1a 498 if (getsockname(sk, &st->me.sa, &sz))
88510d86
MW
499 goto fail_0;
500
102fa2f0
MW
501 /* Only now do some fiddling because Linux doesn't like port numbers in
502 * IPv6 raw destination addresses...
503 */
b9e97e20
MW
504 switch (pp->a.sa.sa_family) {
505 case AF_INET:
506 st->srcport = st->me.sin.sin_port; st->me.sin.sin_port = 0;
507 st->dstport = st->a.sin.sin_port; st->a.sin.sin_port = 0;
508 break;
102fa2f0
MW
509 case AF_INET6:
510 st->srcport = st->me.sin6.sin6_port; st->me.sin6.sin6_port = 0;
511 st->dstport = st->a.sin6.sin6_port; st->a.sin6.sin6_port = 0;
512 break;
b9e97e20
MW
513 default:
514 abort();
515 }
5854b1cc 516
88510d86
MW
517 /* There isn't a portable way to force the DF flag onto a packet through
518 * UDP, or even through raw IP, unless we write the entire IP header
519 * ourselves. This is somewhat annoying, especially since we have an
520 * uphill struggle keeping track of which systems randomly expect which
521 * header fields to be presented in host byte order. Oh, well.
522 */
523 i = 1;
524 if (setsockopt(rawudp, IPPROTO_IP, IP_HDRINCL, &i, sizeof(i))) goto fail_0;
525
526 /* Find an upper bound on the MTU. Do two passes over the interface
527 * list. If we can find matches for our local address then use the
528 * highest one of those; otherwise do a second pass and simply take the
529 * highest MTU of any network interface.
530 */
531 if (getifaddrs(&ifaa)) goto fail_0;
532 for (i = 0; i < 2; i++) {
533 for (ifap = 0, ifa = ifaa; ifa; ifa = ifa->ifa_next) {
534 if (!(ifa->ifa_flags & IFF_UP) || !ifa->ifa_addr ||
454f5a1a 535 ifa->ifa_addr->sa_family != st->me.sa.sa_family ||
88510d86 536 (i == 0 &&
454f5a1a 537 !addreq((union addr *)ifa->ifa_addr, &st->me, 0)) ||
88510d86
MW
538 (i == 1 && ifap && strcmp(ifap->ifa_name, ifa->ifa_name) == 0) ||
539 strlen(ifa->ifa_name) >= sizeof(ifr.ifr_name))
540 continue;
541 ifap = ifa;
542 strcpy(ifr.ifr_name, ifa->ifa_name);
543 if (ioctl(sk, SIOCGIFMTU, &ifr)) goto fail_1;
544 if (mtu < ifr.ifr_mtu) mtu = ifr.ifr_mtu;
545 }
546 if (mtu > 0) break;
547 }
548 if (mtu < 0) { errno = ENOTCONN; goto fail_1; }
549 freeifaddrs(ifaa);
550
551 /* Done. */
552 return (mtu);
553
554fail_1:
555 freeifaddrs(ifaa);
556fail_0:
557 return (-1);
558}
559
/* Tear down the `raw' strategy state.  There's nothing to do. */
static void raw_finish(void *stv)
{
  (void)stv;
}
561
562static void raw_selprep(void *stv, int *maxfd, fd_set *fd_in)
563 { struct raw_state *st = stv; ADDFD(st->sk); ADDFD(st->rawicmp); }
564
565static int raw_xmit(void *stv, int mtu)
566{
567 struct raw_state *st = stv;
568 unsigned char b[65536], *p;
569 struct ip *ip;
102fa2f0 570 struct ip6_hdr *ip6;
88510d86
MW
571 struct udphdr *udp;
572 struct phdr ph;
102fa2f0 573 struct phdr6 ph6;
88510d86
MW
574 unsigned ck;
575
b9e97e20
MW
576 switch (st->a.sa.sa_family) {
577
578 case AF_INET:
579
580 /* Build the IP header. */
581 ip = (struct ip *)b;
582 ip->ip_v = 4;
583 ip->ip_hl = sizeof(*ip)/4;
584 ip->ip_tos = IPTOS_RELIABILITY;
585 ip->ip_len = sane_htons(mtu);
586 STEP(st->q); ip->ip_id = htons(st->q);
587 ip->ip_off = sane_htons(0 | IP_DF);
588 ip->ip_ttl = 64;
589 ip->ip_p = IPPROTO_UDP;
590 ip->ip_sum = 0;
591 ip->ip_src = st->me.sin.sin_addr;
592 ip->ip_dst = st->a.sin.sin_addr;
593
594 /* Build a UDP packet in the output buffer. */
595 udp = (struct udphdr *)(ip + 1);
596 udp->uh_sport = st->srcport;
597 udp->uh_dport = st->dstport;
598 udp->uh_ulen = htons(mtu - sizeof(*ip));
599 udp->uh_sum = 0;
600
601 /* Copy the payload. */
602 p = (unsigned char *)(udp + 1);
603 memcpy(p, buf, mtu - (p - b));
604
605 /* Calculate the UDP checksum. */
606 ph.ph_src = ip->ip_src;
607 ph.ph_dst = ip->ip_dst;
608 ph.ph_z = 0;
609 ph.ph_p = IPPROTO_UDP;
610 ph.ph_len = udp->uh_ulen;
611 ck = IPCK_INIT;
612 ck = ipcksum(&ph, sizeof(ph), ck);
613 ck = ipcksum(udp, mtu - sizeof(*ip), ck);
614 udp->uh_sum = htons(ck);
615
616 break;
617
102fa2f0
MW
618 case AF_INET6:
619
620 /* Build the IP header. */
621 ip6 = (struct ip6_hdr *)b;
622 STEP(st->q); ip6->ip6_flow = htonl(0x60000000 | st->q);
623 ip6->ip6_plen = htons(mtu - sizeof(*ip6));
624 ip6->ip6_nxt = IPPROTO_UDP;
625 ip6->ip6_hlim = 64;
626 ip6->ip6_src = st->me.sin6.sin6_addr;
627 ip6->ip6_dst = st->a.sin6.sin6_addr;
628
629 /* Build a UDP packet in the output buffer. */
630 udp = (struct udphdr *)(ip6 + 1);
631 udp->uh_sport = st->srcport;
632 udp->uh_dport = st->dstport;
633 udp->uh_ulen = htons(mtu - sizeof(*ip6));
634 udp->uh_sum = 0;
635
636 /* Copy the payload. */
637 p = (unsigned char *)(udp + 1);
638 memcpy(p, buf, mtu - (p - b));
639
640 /* Calculate the UDP checksum. */
641 ph6.ph6_src = ip6->ip6_src;
642 ph6.ph6_dst = ip6->ip6_dst;
643 ph6.ph6_len = udp->uh_ulen;
644 ph6.ph6_z0 = ph6.ph6_z1 = ph6.ph6_z2 = 0;
645 ph6.ph6_nxt = IPPROTO_UDP;
646 ck = IPCK_INIT;
647 ck = ipcksum(&ph6, sizeof(ph6), ck);
648 ck = ipcksum(udp, mtu - sizeof(*ip6), ck);
649 udp->uh_sum = htons(ck);
650
651 break;
652
b9e97e20
MW
653 default:
654 abort();
655 }
88510d86
MW
656
657 /* Send the whole thing off. If we're too big for the interface then we
658 * might need to trim immediately.
659 */
454f5a1a 660 if (sendto(st->rawudp, b, mtu, 0, &st->a.sa, addrsz(&st->a)) < 0) {
88510d86
MW
661 if (errno == EMSGSIZE) return (RC_LOWER);
662 else goto fail_0;
663 }
664
665 /* Done. */
666 return (RC_OK);
667
668fail_0:
669 return (RC_FAIL);
670}
671
672static int raw_selproc(void *stv, fd_set *fd_in, struct probestate *ps)
673{
674 struct raw_state *st = stv;
675 unsigned char b[65536];
676 struct ip *ip;
102fa2f0 677 struct ip6_hdr *ip6;
88510d86 678 struct icmp *icmp;
102fa2f0 679 struct icmp6_hdr *icmp6;
88510d86 680 struct udphdr *udp;
9ad20ce0 681 const unsigned char *payload;
88510d86
MW
682 ssize_t n;
683
684 /* An ICMP packet: see what's inside. */
685 if (FD_ISSET(st->rawicmp, fd_in)) {
686 if ((n = read(st->rawicmp, b, sizeof(b))) < 0) goto fail_0;
687
b9e97e20
MW
688 switch (st->me.sa.sa_family) {
689
690 case AF_INET:
691
692 ip = (struct ip *)b;
693 if (n < sizeof(*ip) || n < sizeof(4*ip->ip_hl) ||
694 ip->ip_v != 4 || ip->ip_p != IPPROTO_ICMP)
695 goto skip_icmp;
696 n -= sizeof(4*ip->ip_hl);
697
698 icmp = (struct icmp *)(b + 4*ip->ip_hl);
699 if (n < sizeof(*icmp) || icmp->icmp_type != ICMP_UNREACH)
700 goto skip_icmp;
701 n -= offsetof(struct icmp, icmp_ip);
702
703 ip = &icmp->icmp_ip;
704 if (n < sizeof(*ip) ||
705 ip->ip_p != IPPROTO_UDP || ip->ip_hl != sizeof(*ip)/4 ||
706 ip->ip_id != htons(st->q) ||
707 ip->ip_src.s_addr != st->me.sin.sin_addr.s_addr ||
708 ip->ip_dst.s_addr != st->a.sin.sin_addr.s_addr)
709 goto skip_icmp;
710 n -= sizeof(*ip);
711
712 udp = (struct udphdr *)(ip + 1);
713 if (n < sizeof(*udp) || udp->uh_sport != st->srcport ||
714 udp->uh_dport != st->dstport)
715 goto skip_icmp;
716 n -= sizeof(*udp);
717
718 payload = (const unsigned char *)(udp + 1);
719 if (!mypacketp(ps, payload, n)) goto skip_icmp;
720
721 if (icmp->icmp_code == ICMP_UNREACH_PORT) return (RC_HIGHER);
722 else if (icmp->icmp_code != ICMP_UNREACH_NEEDFRAG) goto skip_icmp;
723 else if (icmp->icmp_nextmtu) return (htons(icmp->icmp_nextmtu));
724 else return (RC_LOWER);
725
726 break;
727
102fa2f0
MW
728 case AF_INET6:
729 icmp6 = (struct icmp6_hdr *)b;
730 if (n < sizeof(*icmp6) ||
731 (icmp6->icmp6_type != ICMP6_PACKET_TOO_BIG &&
732 icmp6->icmp6_type != ICMP6_DST_UNREACH))
733 goto skip_icmp;
734 n -= sizeof(*icmp6);
735
736 ip6 = (struct ip6_hdr *)(icmp6 + 1);
737 if (n < sizeof(*ip6) || ip6->ip6_nxt != IPPROTO_UDP ||
738 memcmp(ip6->ip6_src.s6_addr,
739 st->me.sin6.sin6_addr.s6_addr, 16) ||
740 memcmp(ip6->ip6_dst.s6_addr,
741 st->a.sin6.sin6_addr.s6_addr, 16) ||
742 (ntohl(ip6->ip6_flow)&0xffff) != st->q)
743 goto skip_icmp;
744 n -= sizeof(*ip6);
745
746 udp = (struct udphdr *)(ip6 + 1);
747 if (n < sizeof(*udp) || udp->uh_sport != st->srcport ||
748 udp->uh_dport != st->dstport)
749 goto skip_icmp;
750 n -= sizeof(*udp);
751
752 payload = (const unsigned char *)(udp + 1);
753 if (!mypacketp(ps, payload, n)) goto skip_icmp;
754
755 if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG)
756 return (ntohs(icmp6->icmp6_mtu));
757 else switch (icmp6->icmp6_code) {
758 case ICMP6_DST_UNREACH_ADMIN:
759 case ICMP6_DST_UNREACH_NOPORT:
760 return (RC_HIGHER);
761 default:
762 goto skip_icmp;
763 }
764 break;
765
b9e97e20
MW
766 default:
767 abort();
768 }
88510d86 769 }
b9e97e20 770
88510d86
MW
771skip_icmp:;
772
773 /* If we got a reply to the current probe then we're good. If we got an
774 * error, or the packet's sequence number is wrong, then ignore it.
775 */
776 if (FD_ISSET(st->sk, fd_in)) {
777 if ((n = read(st->sk, b, sizeof(b))) < 0) return (RC_OK);
778 else if (mypacketp(ps, b, n)) return (RC_HIGHER);
779 else return (RC_OK);
780 }
781
782 return (RC_OK);
783
784fail_0:
785 return (RC_FAIL);
786}
787
788static const struct probe_ops raw_ops = {
789 "raw", OPS_CHAIN, sizeof(struct raw_state),
790 raw_setup, raw_finish,
791 raw_selprep, raw_xmit, raw_selproc
792};
793
794#undef OPS_CHAIN
795#define OPS_CHAIN &raw_ops
796
797/*----- Doing the job on Linux --------------------------------------------*/
798
799#if defined(linux)
800
/* Supply the option number ourselves if the system headers don't. */
#ifndef IP_MTU
#  define IP_MTU 14                     /* Blech! */
#endif

/* Private state for the `linux' strategy. */
struct linux_state {
  int sol;                              /* Socket option level */
  int so_mtu_discover;                  /* Option: path-MTU discovery mode */
  int so_mtu;                           /* Option: current path MTU */
  int sk;                               /* The connected UDP socket */
  size_t hdrlen;                        /* Overhead subtracted from the MTU
                                         * to get the payload size
                                         */
};
810
811static int linux_setup(void *stv, int sk, const struct param *pp)
812{
813 struct linux_state *st = stv;
814 int i, mtu;
cb160b86 815 socklen_t sz;
88510d86 816
454f5a1a
MW
817 /* Check that the address is OK. */
818 switch (pp->a.sa.sa_family) {
10583b59
MW
819 case AF_INET:
820 st->sol = IPPROTO_IP;
821 st->so_mtu_discover = IP_MTU_DISCOVER;
822 st->so_mtu = IP_MTU;
823 st->hdrlen = 28;
824 break;
825 case AF_INET6:
826 st->sol = IPPROTO_IPV6;
827 st->so_mtu_discover = IPV6_MTU_DISCOVER;
828 st->so_mtu = IPV6_MTU;
829 st->hdrlen = 48;
830 break;
831 default:
832 errno = EPFNOSUPPORT;
833 return (-1);
454f5a1a
MW
834 }
835
88510d86
MW
836 /* Snaffle the UDP socket. */
837 st->sk = sk;
838
839 /* Turn on kernel path-MTU discovery and force DF on. */
18d5f6eb 840 i = IP_PMTUDISC_PROBE;
10583b59 841 if (setsockopt(st->sk, st->sol, st->so_mtu_discover, &i, sizeof(i)))
88510d86
MW
842 return (-1);
843
844 /* Read the initial MTU guess back and report it. */
845 sz = sizeof(mtu);
10583b59 846 if (getsockopt(st->sk, st->sol, st->so_mtu, &mtu, &sz))
88510d86
MW
847 return (-1);
848
849 /* Done. */
850 return (mtu);
851}
852
/* Tear down the `linux' strategy state.  There's nothing to do. */
static void linux_finish(void *stv)
{
  (void)stv;
}
854
855static void linux_selprep(void *stv, int *maxfd, fd_set *fd_in)
856 { struct linux_state *st = stv; ADDFD(st->sk); }
857
858static int linux_xmit(void *stv, int mtu)
859{
860 struct linux_state *st = stv;
861
862 /* Write the packet. */
10583b59 863 if (write(st->sk, buf, mtu - st->hdrlen) >= 0) return (RC_OK);
88510d86
MW
864 else if (errno == EMSGSIZE) return (RC_LOWER);
865 else return (RC_FAIL);
866}
867
868static int linux_selproc(void *stv, fd_set *fd_in, struct probestate *ps)
869{
870 struct linux_state *st = stv;
871 int mtu;
cb160b86 872 socklen_t sz;
88510d86
MW
873 ssize_t n;
874 unsigned char b[65536];
875
876 /* Read an answer. If it looks like the right kind of error then report a
877 * success. This is potentially wrong, since we can't tell whether an
878 * error was delayed from an earlier probe. However, we never return
879 * RC_LOWER from this method, so the packet sizes ought to be monotonically
880 * decreasing and this won't cause trouble. Otherwise update from the
881 * kernel's idea of the right MTU.
882 */
883 if (FD_ISSET(st->sk, fd_in)) {
884 n = read(st->sk, &buf, sizeof(buf));
885 if (n >= 0 ?
886 mypacketp(ps, b, n) :
887 errno == ECONNREFUSED || errno == EHOSTUNREACH)
888 return (RC_HIGHER);
889 sz = sizeof(mtu);
10583b59 890 if (getsockopt(st->sk, st->sol, st->so_mtu, &mtu, &sz))
88510d86
MW
891 return (RC_FAIL);
892 return (mtu);
893 }
894 return (RC_OK);
895}
896
897static const struct probe_ops linux_ops = {
898 "linux", OPS_CHAIN, sizeof(struct linux_state),
899 linux_setup, linux_finish,
900 linux_selprep, linux_xmit, linux_selproc
901};
c64d8cd5 902
88510d86
MW
903#undef OPS_CHAIN
904#define OPS_CHAIN &linux_ops
c64d8cd5
MW
905
906#endif
907
908/*----- Help options ------------------------------------------------------*/
909
88510d86
MW
910static const struct probe_ops *probe_ops = OPS_CHAIN;
911
c64d8cd5
MW
912static void version(FILE *fp)
913 { pquis(fp, "$, TrIPE version " VERSION "\n"); }
914
/* Write a brief usage message to FP. */
static void usage(FILE *fp)
{
  pquis(fp,
        "Usage: $ [-46v] [-H HEADER] [-m METHOD]\n"
        " [-r SECS] [-g FACTOR] [-t SECS] HOST [PORT]\n");
}
c64d8cd5
MW
920
921static void help(FILE *fp)
922{
88510d86
MW
923 const struct probe_ops *ops;
924
c64d8cd5
MW
925 version(fp);
926 fputc('\n', fp);
927 usage(fp);
928 fputs("\
929\n\
930Options in full:\n\
931\n\
932-h, --help Show this help text.\n\
b13c3272 933-V, --version Show version number.\n\
c64d8cd5
MW
934-u, --usage Show brief usage message.\n\
935\n\
22062fb6
MW
936-4, --ipv4 Restrict to IPv4 only.\n\
937-6, --ipv6 Restrict to IPv6 only.\n\
88510d86
MW
938-g, --growth=FACTOR Growth factor for retransmit interval.\n\
939-m, --method=METHOD Use METHOD to probe for MTU.\n\
940-r, --retransmit=SECS Retransmit if no reply after SEC.\n\
941-t, --timeout=SECS Give up expecting a reply after SECS.\n\
a8f70fe1 942-v, --verbose Write a running commentary to stderr.\n\
c64d8cd5 943-H, --header=HEX Packet header, in hexadecimal.\n\
88510d86
MW
944\n\
945Probe methods:\n\
c64d8cd5 946", fp);
88510d86
MW
947 for (ops = probe_ops; ops; ops = ops->next)
948 printf("\t%s\n", ops->name);
c64d8cd5
MW
949}
950
951/*----- Main code ---------------------------------------------------------*/
952
/* Main program: grab the raw sockets and drop privileges, parse the command
 * line, resolve the target, run the discovery, and print the result on
 * standard output.
 */
int main(int argc, char *argv[])
{
  struct param pp = { 0, 0.333, 3.0, 8.0, 0, OPS_CHAIN };
  hex_ctx hc;
  dstr d = DSTR_INIT;
  size_t sz;
  int i, err;
  struct addrinfo aihint = { 0 }, *ailist, *ai;
  const char *host, *svc = "7";
  unsigned f = 0;

#define f_bogus 1u

  /* Open the raw sockets first, while we may still have the privilege to
   * do so, and then give any such privilege up.  Failure here isn't fatal:
   * just remember the error in case the sockets turn out to be wanted.
   */
  if ((rawicmp = socket(PF_INET, SOCK_RAW, IPPROTO_ICMP)) < 0 ||
      (rawudp = socket(PF_INET, SOCK_RAW, IPPROTO_UDP)) < 0)
    rawerr = errno;
  if ((rawicmp6 = socket(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6)) < 0 ||
      (rawudp6 = socket(PF_INET6, SOCK_RAW, IPPROTO_RAW)) < 0)
    rawerr6 = errno;
  if (setuid(getuid()))
    abort();

  ego(argv[0]);
  fillbuffer(buf, sizeof(buf));

  /* Defaults for name resolution: any family, UDP only. */
  aihint.ai_family = AF_UNSPEC;
  aihint.ai_protocol = IPPROTO_UDP;
  aihint.ai_socktype = SOCK_DGRAM;
  aihint.ai_flags = AI_ADDRCONFIG;

  /* Parse the command-line options. */
  for (;;) {
    static const struct option opts[] = {
      { "help",         0,              0,      'h' },
      { "version",      0,              0,      'V' },
      { "usage",        0,              0,      'u' },
      { "ipv4",         0,              0,      '4' },
      { "ipv6",         0,              0,      '6' },
      { "header",       OPTF_ARGREQ,    0,      'H' },
      { "growth",       OPTF_ARGREQ,    0,      'g' },
      { "method",       OPTF_ARGREQ,    0,      'm' },
      { "retransmit",   OPTF_ARGREQ,    0,      'r' },
      { "timeout",      OPTF_ARGREQ,    0,      't' },
      { "verbose",      0,              0,      'v' },
      { 0,              0,              0,      0 }
    };

    i = mdwopt(argc, argv, "hVu" "46H:g:m:r:t:v", opts, 0, 0, 0);
    if (i < 0) break;
    switch (i) {
      case 'h': help(stdout); exit(0);
      case 'V': version(stdout); exit(0);
      case 'u': usage(stdout); exit(0);

      case 'H':
        /* Decode the header over the front of the probe payload; the
         * sequence number goes straight after it.  Overlong headers are
         * truncated.
         */
        DRESET(&d);
        hex_init(&hc);
        hex_decode(&hc, optarg, strlen(optarg), &d);
        hex_decode(&hc, 0, 0, &d);
        sz = d.len;
        if (sz > 532) sz = 532;
        memcpy(buf, d.buf, sz);
        pp.seqoff = sz;
        break;

      case '4': aihint.ai_family = AF_INET; break;
      case '6': aihint.ai_family = AF_INET6; break;
      case 'g': pp.regr = s2f(optarg, "retransmit growth factor"); break;
      case 'r': pp.retx = s2f(optarg, "retransmit interval"); break;
      case 't': pp.timeout = s2f(optarg, "timeout"); break;

      case 'm':
        /* Search the chain of strategies for one with the right name. */
        for (pp.pops = OPS_CHAIN; pp.pops; pp.pops = pp.pops->next)
          if (strcmp(pp.pops->name, optarg) == 0) break;
        if (!pp.pops)
          die(EXIT_FAILURE, "unknown probe algorithm `%s'", optarg);
        break;

      case 'v': pp.f |= F_VERBOSE; break;

      default:
        f |= f_bogus;
        break;
    }
  }
  argv += optind; argc -= optind;
  if ((f & f_bogus) || argc < 1 || argc > 2) {
    usage(stderr);
    exit(EXIT_FAILURE);
  }

  /* Resolve the host and service names, and take the first address of a
   * family we can cope with.
   */
  host = argv[0];
  if (argv[1]) svc = argv[1];
  if ((err = getaddrinfo(host, svc, &aihint, &ailist)) != 0) {
    die(EXIT_FAILURE, "unknown host `%s' or service `%s': %s",
        host, svc, gai_strerror(err));
  }
  ai = ailist;
  while (ai && !addrfamok(ai->ai_family)) ai = ai->ai_next;
  if (!ai) die(EXIT_FAILURE, "no supported address families for `%s'", host);
  assert(ai->ai_addrlen <= sizeof(pp.a));
  memcpy(&pp.a, ai->ai_addr, ai->ai_addrlen);

  /* Do the job and report the answer. */
  i = pathmtu(&pp);
  if (i < 0)
    die(EXIT_FAILURE, "failed to discover MTU: %s", strerror(errno));
  printf("%d\n", i);
  if (ferror(stdout) || fflush(stdout) || fclose(stdout))
    die(EXIT_FAILURE, "failed to write result: %s", strerror(errno));
  return (0);
}
1061
1062/*----- That's all, folks -------------------------------------------------*/