X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~ianmdlvl/git?p=secnet.git;a=blobdiff_plain;f=netlink.c;h=601686425124cb7d7bb46cb03500f9df75e7c909;hp=0cbb27d10a09b025e7341cd2c65709b0b1477321;hb=46d06c3918b0080af7e87db986612e23bde4a341;hpb=baa06aeb963965b4b6a8a8051ec15b72372080dd diff --git a/netlink.c b/netlink.c index 0cbb27d..6016864 100644 --- a/netlink.c +++ b/netlink.c @@ -1,80 +1,132 @@ /* User-kernel network link */ -/* We support a variety of methods for extracting packets from the - kernel: userv-ipif, ipif on its own (when we run as root), the - kernel TUN driver. Possible future methods: SLIP to a pty, an - external netlink daemon. There is a performance/security - tradeoff. */ +/* See RFCs 791, 792, 1123 and 1812 */ -/* When dealing with SLIP (to a pty, or ipif) we have separate rx, tx - and client buffers. When receiving we may read() any amount, not - just whole packets. When transmitting we need to bytestuff anyway, - and may be part-way through receiving. */ +/* The netlink device is actually a router. Tunnels are unnumbered + point-to-point lines (RFC1812 section 2.2.7); the router has a + single address (the 'router-id'). */ -/* Each netlink device is actually a router, with its own IP address. - We do things like decreasing the TTL and recalculating the header - checksum, generating ICMP, responding to pings, etc. */ +/* This is where we currently have the anti-spoofing paranoia - before + sending a packet to the kernel we check that the tunnel it came + over could reasonably have produced it. */ -/* This is where we have the anti-spoofing paranoia - before sending a - packet to the kernel we check that the tunnel it came over could - reasonably have produced it. */ -#include "secnet.h" -#include -#include -#include -#include -#include -#include "util.h" +/* Points to note from RFC1812 (which may require changes in this + file): -#ifdef HAVE_LINUX_IF_H -#include -#include -#endif +3.3.4 Maximum Transmission Unit - MTU -/* XXX where do we find if_tun on other platforms? */ + The MTU of each logical interface MUST be configurable within the + range of legal MTUs for the interface. -#define DEFAULT_BUFSIZE 2048 -#define DEFAULT_MTU 1000 -#define ICMP_BUFSIZE 1024 + Many Link Layer protocols define a maximum frame size that may be + sent. In such cases, a router MUST NOT allow an MTU to be set which + would allow sending of frames larger than those allowed by the Link + Layer protocol. However, a router SHOULD be willing to receive a + packet as large as the maximum frame size even if that is larger than + the MTU. -#define SLIP_END 192 -#define SLIP_ESC 219 -#define SLIP_ESCEND 220 -#define SLIP_ESCESC 221 +4.2.1 A router SHOULD count datagrams discarded. -struct netlink_client { - struct subnet_list *networks; - netlink_deliver_fn *deliver; - void *dst; - string_t name; - bool_t can_deliver; - struct netlink_client *next; -}; +4.2.2.1 Source route options - we probably should implement processing +of source routes, even though mostly the security policy will prevent +their use. -/* Netlink provides one function to the device driver, to call to deliver - a packet from the device. The device driver provides one function to - netlink, for it to call to deliver a packet to the device. */ +5.3.13.4 Source Route Options -struct netlink { - closure_t cl; - struct netlink_if ops; - void *dst; /* Pointer to host interface state */ - string_t name; - uint32_t max_start_pad; - uint32_t max_end_pad; - struct subnet_list networks; - struct subnet_list exclude_remote_networks; - uint32_t local_address; /* host interface address */ - uint32_t secnet_address; /* our own address */ - uint32_t mtu; - struct netlink_client *clients; - netlink_deliver_fn *deliver_to_host; /* Provided by driver */ - struct buffer_if icmp; /* Buffer for assembly of outgoing ICMP */ -}; + A router MUST implement support for source route options in forwarded + packets. A router MAY implement a configuration option that, when + enabled, causes all source-routed packets to be discarded. However, + such an option MUST NOT be enabled by default. + +5.3.13.5 Record Route Option + + Routers MUST support the Record Route option in forwarded packets. + + A router MAY provide a configuration option that, if enabled, will + cause the router to ignore (i.e., pass through unchanged) Record + Route options in forwarded packets. If provided, such an option MUST + default to enabling the record-route. This option should not affect + the processing of Record Route options in datagrams received by the + router itself (in particular, Record Route options in ICMP echo + requests will still be processed according to Section [4.3.3.6]). + +5.3.13.6 Timestamp Option + + Routers MUST support the timestamp option in forwarded packets. A + timestamp value MUST follow the rules given [INTRO:2]. + + If the flags field = 3 (timestamp and prespecified address), the + router MUST add its timestamp if the next prespecified address + matches any of the router's IP addresses. It is not necessary that + the prespecified address be either the address of the interface on + which the packet arrived or the address of the interface over which + it will be sent. + + +4.2.2.7 Fragmentation: RFC 791 Section 3.2 + + Fragmentation, as described in [INTERNET:1], MUST be supported by a + router. + +4.2.2.8 Reassembly: RFC 791 Section 3.2 + + As specified in the corresponding section of [INTRO:2], a router MUST + support reassembly of datagrams that it delivers to itself. + +4.2.2.9 Time to Live: RFC 791 Section 3.2 + + Note in particular that a router MUST NOT check the TTL of a packet + except when forwarding it. + + A router MUST NOT discard a datagram just because it was received + with TTL equal to zero or one; if it is to the router and otherwise + valid, the router MUST attempt to receive it. + + On messages the router originates, the IP layer MUST provide a means + for the transport layer to set the TTL field of every datagram that + is sent. When a fixed TTL value is used, it MUST be configurable. + + +8.1 The Simple Network Management Protocol - SNMP +8.1.1 SNMP Protocol Elements + + Routers MUST be manageable by SNMP [MGT:3]. The SNMP MUST operate + using UDP/IP as its transport and network protocols. + + +*/ + +#include +#include +#include +#include "secnet.h" +#include "util.h" +#include "ipaddr.h" +#include "netlink.h" +#include "process.h" + +#ifdef NETLINK_DEBUG +#define MDEBUG(...) Message(M_DEBUG, __VA_ARGS__) +#else /* !NETLINK_DEBUG */ +#define MDEBUG(...) ((void)0) +#endif /* !NETLINK_DEBUG */ + +#define ICMP_TYPE_ECHO_REPLY 0 + +#define ICMP_TYPE_UNREACHABLE 3 +#define ICMP_CODE_NET_UNREACHABLE 0 +#define ICMP_CODE_PROTOCOL_UNREACHABLE 2 +#define ICMP_CODE_FRAGMENTATION_REQUIRED 4 +#define ICMP_CODE_NET_PROHIBITED 13 + +#define ICMP_TYPE_ECHO_REQUEST 8 + +#define ICMP_TYPE_TIME_EXCEEDED 11 +#define ICMP_CODE_TTL_EXCEEDED 0 /* Generic IP checksum routine */ -static inline uint16_t ip_csum(uint8_t *iph,uint32_t count) +static inline uint16_t ip_csum(const uint8_t *iph,int32_t count) { register uint32_t sum=0; @@ -98,38 +150,39 @@ static inline uint16_t ip_csum(uint8_t *iph,uint32_t count) * By Jorge Cwik , adapted for linux by * Arnt Gulbrandsen. */ -static inline uint16_t ip_fast_csum(uint8_t *iph, uint32_t ihl) { +static inline uint16_t ip_fast_csum(const uint8_t *iph, int32_t ihl) { uint32_t sum; - __asm__ __volatile__(" - movl (%1), %0 - subl $4, %2 - jbe 2f - addl 4(%1), %0 - adcl 8(%1), %0 - adcl 12(%1), %0 -1: adcl 16(%1), %0 - lea 4(%1), %1 - decl %2 - jne 1b - adcl $0, %0 - movl %0, %2 - shrl $16, %0 - addw %w2, %w0 - adcl $0, %0 - notl %0 -2: - " + __asm__ __volatile__( + "movl (%1), %0 ;\n" + "subl $4, %2 ;\n" + "jbe 2f ;\n" + "addl 4(%1), %0 ;\n" + "adcl 8(%1), %0 ;\n" + "adcl 12(%1), %0 ;\n" +"1: adcl 16(%1), %0 ;\n" + "lea 4(%1), %1 ;\n" + "decl %2 ;\n" + "jne 1b ;\n" + "adcl $0, %0 ;\n" + "movl %0, %2 ;\n" + "shrl $16, %0 ;\n" + "addw %w2, %w0 ;\n" + "adcl $0, %0 ;\n" + "notl %0 ;\n" +"2: ;\n" /* Since the input registers which are loaded with iph and ipl are modified, we must also specify them as outputs, or gcc will assume they contain their original values. */ : "=r" (sum), "=r" (iph), "=r" (ihl) - : "1" (iph), "2" (ihl)); + : "1" (iph), "2" (ihl) + : "memory"); return sum; } #else -static inline uint16_t ip_fast_csum(uint8_t *iph, uint32_t ihl) +static inline uint16_t ip_fast_csum(const uint8_t *iph, int32_t ihl) { + assert(ihl < INT_MAX/4); return ip_csum(iph,ihl*4); } #endif @@ -145,7 +198,11 @@ struct iphdr { uint8_t tos; uint16_t tot_len; uint16_t id; - uint16_t frag_off; + uint16_t frag; +#define IPHDR_FRAG_OFF ((uint16_t)0x1fff) +#define IPHDR_FRAG_MORE ((uint16_t)0x2000) +#define IPHDR_FRAG_DONT ((uint16_t)0x4000) +/* reserved 0x8000 */ uint8_t ttl; uint8_t protocol; uint16_t check; @@ -159,7 +216,7 @@ struct icmphdr { uint8_t type; uint8_t code; uint16_t check; - union { + union icmpinfofield { uint32_t unused; struct { uint8_t pointer; @@ -171,18 +228,48 @@ struct icmphdr { uint16_t id; uint16_t seq; } echo; + struct { + uint16_t unused; + uint16_t mtu; + } fragneeded; } d; }; + +static const union icmpinfofield icmp_noinfo; -static void netlink_packet_deliver(struct netlink *st, struct buffer_if *buf); +static void netlink_client_deliver(struct netlink *st, + struct netlink_client *client, + uint32_t source, uint32_t dest, + struct buffer_if *buf); +static void netlink_host_deliver(struct netlink *st, + struct netlink_client *sender, + uint32_t source, uint32_t dest, + struct buffer_if *buf); + +static const char *sender_name(struct netlink_client *sender /* or NULL */) +{ + return sender?sender->name:"(local)"; +} +static void netlink_packet_deliver(struct netlink *st, + struct netlink_client *client, + struct buffer_if *buf); + +/* XXX RFC1812 4.3.2.5: + All other ICMP error messages (Destination Unreachable, + Redirect, Time Exceeded, and Parameter Problem) SHOULD have their + precedence value set to 6 (INTERNETWORK CONTROL) or 7 (NETWORK + CONTROL). The IP Precedence value for these error messages MAY be + settable. + */ static struct icmphdr *netlink_icmp_tmpl(struct netlink *st, - uint32_t dest,uint16_t len) + uint32_t source, uint32_t dest, + uint16_t len) { struct icmphdr *h; BUF_ALLOC(&st->icmp,"netlink_icmp_tmpl"); - buffer_init(&st->icmp,st->max_start_pad); + buffer_init(&st->icmp,calculate_max_start_pad()); h=buf_append(&st->icmp,sizeof(*h)); h->iph.version=4; @@ -190,10 +277,10 @@ static struct icmphdr *netlink_icmp_tmpl(struct netlink *st, h->iph.tos=0; h->iph.tot_len=htons(len+(h->iph.ihl*4)+8); h->iph.id=0; - h->iph.frag_off=0; - h->iph.ttl=255; + h->iph.frag=0; + h->iph.ttl=255; /* XXX should be configurable */ h->iph.protocol=1; - h->iph.saddr=htonl(st->secnet_address); + h->iph.saddr=htonl(source); h->iph.daddr=htonl(dest); h->iph.check=0; h->iph.check=ip_fast_csum((uint8_t *)&h->iph,h->iph.ihl); @@ -206,7 +293,7 @@ static struct icmphdr *netlink_icmp_tmpl(struct netlink *st, /* Fill in the ICMP checksum field correctly */ static void netlink_icmp_csum(struct icmphdr *h) { - uint32_t len; + int32_t len; len=ntohs(h->iph.tot_len)-(4*h->iph.ihl); h->check=0; @@ -234,13 +321,30 @@ static void netlink_icmp_csum(struct icmphdr *h) static bool_t netlink_icmp_may_reply(struct buffer_if *buf) { struct iphdr *iph; + struct icmphdr *icmph; uint32_t source; + if (buf->size < (int)sizeof(struct icmphdr)) return False; iph=(struct iphdr *)buf->start; - if (iph->protocol==1) return False; /* Overly-broad; we may reply to - eg. icmp echo-request */ + icmph=(struct icmphdr *)buf->start; + if (iph->protocol==1) { + switch(icmph->type) { + /* Based on http://www.iana.org/assignments/icmp-parameters/icmp-parameters.xhtml#icmp-parameters-types + * as retrieved Thu, 20 Mar 2014 00:16:44 +0000. + * Deprecated, reserved, unassigned and experimental + * options are treated as not safe to reply to. + */ + case 0: /* Echo Reply */ + case 8: /* Echo */ + case 13: /* Timestamp */ + case 14: /* Timestamp Reply */ + return True; + default: + return False; + } + } /* How do we spot broadcast destination addresses? */ - if (ntohs(iph->frag_off)&0x1fff) return False; /* Non-initial fragment */ + if (ntohs(iph->frag)&IPHDR_FRAG_OFF) return False; source=ntohl(iph->saddr); if (source==0) return False; if ((source&0xff000000)==0x7f000000) return False; @@ -252,8 +356,29 @@ static bool_t netlink_icmp_may_reply(struct buffer_if *buf) /* How much of the original IP packet do we include in its ICMP response? The header plus up to 64 bits. */ + +/* XXX TODO RFC1812: +4.3.2.3 Original Message Header + + Historically, every ICMP error message has included the Internet + header and at least the first 8 data bytes of the datagram that + triggered the error. This is no longer adequate, due to the use of + IP-in-IP tunneling and other technologies. Therefore, the ICMP + datagram SHOULD contain as much of the original datagram as possible + without the length of the ICMP datagram exceeding 576 bytes. The + returned IP header (and user data) MUST be identical to that which + was received, except that the router is not required to undo any + modifications to the IP header that are normally performed in + forwarding that were performed before the error was detected (e.g., + decrementing the TTL, or updating options). Note that the + requirements of Section [4.3.3.5] supersede this requirement in some + cases (i.e., for a Parameter Problem message, if the problem is in a + modified field, the router must undo the modification). See Section + [4.3.3.5]). + */ static uint16_t netlink_icmp_reply_len(struct buffer_if *buf) { + if (buf->size < (int)sizeof(struct iphdr)) return 0; struct iphdr *iph=(struct iphdr *)buf->start; uint16_t hlen,plen; @@ -261,23 +386,60 @@ static uint16_t netlink_icmp_reply_len(struct buffer_if *buf) /* We include the first 8 bytes of the packet data, provided they exist */ hlen+=8; plen=ntohs(iph->tot_len); - return (hlen>plen?plen:hlen); + return MIN(hlen,plen); } -static void netlink_icmp_simple(struct netlink *st, struct buffer_if *buf, - uint8_t type, uint8_t code) +/* client indicates where the packet we're constructing a response to + comes from. NULL indicates the host. */ +static void netlink_icmp_simple(struct netlink *st, + struct netlink_client *origsender, + struct buffer_if *buf, + uint8_t type, uint8_t code, + union icmpinfofield info) { - struct iphdr *iph=(struct iphdr *)buf->start; struct icmphdr *h; uint16_t len; if (netlink_icmp_may_reply(buf)) { + struct iphdr *iph=(struct iphdr *)buf->start; + + uint32_t icmpdest = ntohl(iph->saddr); + uint32_t icmpsource; + const char *icmpsourcedebugprefix; + if (!st->ptp) { + icmpsource=st->secnet_address; + icmpsourcedebugprefix=""; + } else if (origsender) { + /* was from peer, send reply as if from host */ + icmpsource=st->local_address; + icmpsourcedebugprefix="L!"; + } else { + /* was from host, send reply as if from peer */ + icmpsource=st->secnet_address; /* actually, peer address */ + icmpsourcedebugprefix="P!"; + } + MDEBUG("%s: generating ICMP re %s[%s]->[%s]:" + " from %s%s type=%u code=%u\n", + st->name, sender_name(origsender), + ipaddr_to_string(ntohl(iph->saddr)), + ipaddr_to_string(ntohl(iph->daddr)), + icmpsourcedebugprefix, + ipaddr_to_string(icmpsource), + type, code); + len=netlink_icmp_reply_len(buf); - h=netlink_icmp_tmpl(st,ntohl(iph->saddr),len); - h->type=type; h->code=code; - memcpy(buf_append(&st->icmp,len),buf->start,len); + h=netlink_icmp_tmpl(st,icmpsource,icmpdest,len); + h->type=type; h->code=code; h->d=info; + BUF_ADD_BYTES(append,&st->icmp,buf->start,len); netlink_icmp_csum(h); - netlink_packet_deliver(st,&st->icmp); + + if (!st->ptp) { + netlink_packet_deliver(st,NULL,&st->icmp); + } else if (origsender) { + netlink_client_deliver(st,origsender,icmpsource,icmpdest,&st->icmp); + } else { + netlink_host_deliver(st,NULL,icmpsource,icmpdest,&st->icmp); + } BUF_ASSERT_FREE(&st->icmp); } } @@ -285,6 +447,7 @@ static void netlink_icmp_simple(struct netlink *st, struct buffer_if *buf, /* * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the * checksum. + * RFC1812: 4.2.2.5 MUST discard messages containing invalid checksums. * * Is the datagram acceptable? * @@ -293,73 +456,335 @@ static void netlink_icmp_simple(struct netlink *st, struct buffer_if *buf, * 3. Checksums correctly. * 4. Doesn't have a bogus length */ -static bool_t netlink_check(struct netlink *st, struct buffer_if *buf) +static bool_t netlink_check(struct netlink *st, struct buffer_if *buf, + char *errmsgbuf, int errmsgbuflen) { +#define BAD(...) do{ \ + snprintf(errmsgbuf,errmsgbuflen,__VA_ARGS__); \ + return False; \ + }while(0) + + if (buf->size < (int)sizeof(struct iphdr)) BAD("len %"PRIu32"",buf->size); struct iphdr *iph=(struct iphdr *)buf->start; - uint32_t len; + int32_t len; + + if (iph->ihl < 5) BAD("ihl %u",iph->ihl); + if (iph->version != 4) BAD("version %u",iph->version); + if (buf->size < iph->ihl*4) BAD("size %"PRId32"<%u*4",buf->size,iph->ihl); + if (ip_fast_csum((uint8_t *)iph, iph->ihl)!=0) BAD("csum"); + len=ntohs(iph->tot_len); + /* There should be no padding */ + if (buf->size!=len) BAD("len %"PRId32"!=%"PRId32,buf->size,len); + if (len<(iph->ihl<<2)) BAD("len %"PRId32"<(%u<<2)",len,iph->ihl); + /* XXX check that there's no source route specified */ + return True; + +#undef BAD +} - if (iph->ihl < 5 || iph->version != 4) { - printf("ihl/version check failed\n"); - return False; +static const char *fragment_filter_header(uint8_t *base, long *hlp) +{ + const int fixedhl = sizeof(struct iphdr); + long hl = *hlp; + const uint8_t *ipend = base + hl; + uint8_t *op = base + fixedhl; + const uint8_t *ip = op; + + while (ip < ipend) { + uint8_t opt = ip[0]; + int remain = ipend - ip; + if (opt == 0x00) /* End of Options List */ break; + if (opt == 0x01) /* No Operation */ continue; + if (remain < 2) return "IPv4 options truncated at length"; + int optlen = ip[1]; + if (remain < optlen) return "IPv4 options truncated in option"; + if (opt & 0x80) /* copy */ { + memmove(op, ip, optlen); + op += optlen; + } + ip += optlen; } - if (buf->size < iph->ihl*4) { - printf("buffer size check failed\n"); - return False; + while ((hl = (op - base)) & 0x3) + *op++ = 0x00 /* End of Option List */; + ((struct iphdr*)base)->ihl = hl >> 2; + *hlp = hl; + + return 0; +} + +/* Fragment or send ICMP Fragmentation Needed */ +static void netlink_maybe_fragment(struct netlink *st, + struct netlink_client *sender, + netlink_deliver_fn *deliver, + void *deliver_dst, + const char *delivery_name, + int32_t mtu, + uint32_t source, uint32_t dest, + struct buffer_if *buf) +{ + struct iphdr *iph=(struct iphdr*)buf->start; + long hl = iph->ihl*4; + const char *ssource = ipaddr_to_string(source); + + if (buf->size <= mtu) { + deliver(deliver_dst, buf); + return; } - if (ip_fast_csum((uint8_t *)iph, iph->ihl)!=0) { - printf("checksum failed\n"); - return False; + + MDEBUG("%s: fragmenting %s->%s org.size=%"PRId32"\n", + st->name, ssource, delivery_name, buf->size); + +#define BADFRAG(m, ...) \ + Message(M_WARNING, \ + "%s: fragmenting packet from source %s" \ + " for transmission via %s: " m "\n", \ + st->name, ssource, delivery_name, \ + ## __VA_ARGS__); + + unsigned orig_frag = ntohs(iph->frag); + + if (orig_frag&IPHDR_FRAG_DONT) { + union icmpinfofield info = + { .fragneeded = { .unused = 0, .mtu = htons(mtu) } }; + netlink_icmp_simple(st,sender,buf, + ICMP_TYPE_UNREACHABLE, + ICMP_CODE_FRAGMENTATION_REQUIRED, + info); + BUF_FREE(buf); + return; } - len=ntohs(iph->tot_len); - /* There should be no padding */ - if (buf->size!=len || len<(iph->ihl<<2)) { - printf("length check failed buf->size=%d len=%d\n",buf->size,len); - return False; + if (mtu < hl + 8) { + BADFRAG("mtu %"PRId32" too small", mtu); + BUF_FREE(buf); + return; } - /* XXX check that there's no source route specified */ - return True; + /* we (ab)use the icmp buffer to stash the original packet */ + struct buffer_if *orig = &st->icmp; + BUF_ALLOC(orig,"netlink_client_deliver fragment orig"); + buffer_copy(orig,buf); + BUF_FREE(buf); + + const uint8_t *startindata = orig->start + hl; + const uint8_t *indata = startindata; + const uint8_t *endindata = orig->start + orig->size; + _Bool filtered = 0; + + for (;;) { + /* compute our fragment offset */ + long dataoffset = indata - startindata + + (orig_frag & IPHDR_FRAG_OFF)*8; + assert(!(dataoffset & 7)); + if (dataoffset > IPHDR_FRAG_OFF*8) { + BADFRAG("ultimate fragment offset out of range"); + break; + } + + BUF_ALLOC(buf,"netlink_client_deliver fragment frag"); + buffer_init(buf,calculate_max_start_pad()); + + /* copy header (possibly filtered); will adjust in a bit */ + struct iphdr *fragh = buf_append(buf, hl); + memcpy(fragh, orig->start, hl); + + /* decide how much payload to copy and copy it */ + long avail = mtu - hl; + long remain = endindata - indata; + long use = avail < remain ? (avail & ~(long)7) : remain; + BUF_ADD_BYTES(append, buf, indata, use); + indata += use; + + _Bool last_frag = indata >= endindata; + + /* adjust the header */ + fragh->tot_len = htons(buf->size); + fragh->frag = + htons((orig_frag & ~IPHDR_FRAG_OFF) | + (last_frag ? 0 : IPHDR_FRAG_MORE) | + (dataoffset >> 3)); + fragh->check = 0; + fragh->check = ip_fast_csum((const void*)fragh, fragh->ihl); + + /* actually send it */ + deliver(deliver_dst, buf); + if (last_frag) + break; + + /* after copying the header for the first frag, + * we filter the header for the remaining frags */ + if (!filtered++) { + const char *bad = fragment_filter_header(orig->start, &hl); + if (bad) { BADFRAG("%s", bad); break; } + } + } + + BUF_FREE(orig); + +#undef BADFRAG } -static void netlink_packet_deliver(struct netlink *st, struct buffer_if *buf) +/* Deliver a packet _to_ client; used after we have decided + * what to do with it (and just to check that the client has + * actually registered a delivery function with us). */ +static void netlink_client_deliver(struct netlink *st, + struct netlink_client *client, + uint32_t source, uint32_t dest, + struct buffer_if *buf) { + if (!client->deliver) { + string_t s,d; + s=ipaddr_to_string(source); + d=ipaddr_to_string(dest); + Message(M_ERR,"%s: dropping %s->%s, client not registered\n", + st->name,s,d); + BUF_FREE(buf); + return; + } + netlink_maybe_fragment(st,NULL, client->deliver,client->dst,client->name, + client->mtu, source,dest,buf); + client->outcount++; +} + +/* Deliver a packet to the host; used after we have decided that that + * is what to do with it. */ +static void netlink_host_deliver(struct netlink *st, + struct netlink_client *sender, + uint32_t source, uint32_t dest, + struct buffer_if *buf) +{ + netlink_maybe_fragment(st,sender, st->deliver_to_host,st->dst,"(host)", + st->mtu, source,dest,buf); + st->outcount++; +} + +/* Deliver a packet. "sender"==NULL for packets from the host and packets + generated internally in secnet. */ +static void netlink_packet_deliver(struct netlink *st, + struct netlink_client *sender, + struct buffer_if *buf) +{ + if (buf->size < (int)sizeof(struct iphdr)) { + Message(M_ERR,"%s: trying to deliver a too-short packet" + " from %s!\n",st->name, sender_name(sender)); + BUF_FREE(buf); + return; + } + struct iphdr *iph=(struct iphdr *)buf->start; uint32_t dest=ntohl(iph->daddr); - struct netlink_client *c; + uint32_t source=ntohl(iph->saddr); + uint32_t best_quality; + bool_t allow_route=False; + bool_t found_allowed=False; + int best_match; + int i; BUF_ASSERT_USED(buf); if (dest==st->secnet_address) { - Message(M_ERROR,"%s: trying to deliver a packet to myself!\n"); + Message(M_ERR,"%s: trying to deliver a packet to myself!\n",st->name); BUF_FREE(buf); return; } - for (c=st->clients; c; c=c->next) { - if (subnet_match(c->networks,dest)) { - if (c->can_deliver) { - c->deliver(c->dst,c,buf); + /* Packets from the host (sender==NULL) may always be routed. Packets + from clients with the allow_route option will also be routed. */ + if (!sender || (sender && (sender->options & OPT_ALLOWROUTE))) + allow_route=True; + + /* If !allow_route, we check the routing table anyway, and if + there's a suitable route with OPT_ALLOWROUTE set we use it. If + there's a suitable route, but none with OPT_ALLOWROUTE set then + we generate ICMP 'communication with destination network + administratively prohibited'. */ + + best_quality=0; + best_match=-1; + for (i=0; in_clients; i++) { + if (st->routes[i]->up && + ipset_contains_addr(st->routes[i]->networks,dest)) { + /* It's an available route to the correct destination. But is + it better than the one we already have? */ + + /* If we have already found an allowed route then we don't + bother looking at routes we're not allowed to use. If + we don't yet have an allowed route we'll consider any. */ + if (!allow_route && found_allowed) { + if (!(st->routes[i]->options&OPT_ALLOWROUTE)) continue; + } + + if (st->routes[i]->link_quality>best_quality + || best_quality==0) { + best_quality=st->routes[i]->link_quality; + best_match=i; + if (st->routes[i]->options&OPT_ALLOWROUTE) + found_allowed=True; + /* If quality isn't perfect we may wish to + consider kicking the tunnel with a 0-length + packet to prompt it to perform a key setup. + Then it'll eventually decide it's up or + down. */ + /* If quality is perfect and we're allowed to use the + route we don't need to search any more. */ + if (best_quality>=MAXIMUM_LINK_QUALITY && + (allow_route || found_allowed)) break; + } + } + } + if (best_match==-1) { + /* The packet's not going down a tunnel. It might (ought to) + be for the host. */ + if (ipset_contains_addr(st->networks,dest)) { + netlink_host_deliver(st,sender,source,dest,buf); + BUF_ASSERT_FREE(buf); + } else { + string_t s,d; + s=ipaddr_to_string(source); + d=ipaddr_to_string(dest); + Message(M_DEBUG,"%s: don't know where to deliver packet " + "(s=%s, d=%s)\n", st->name, s, d); + netlink_icmp_simple(st,sender,buf,ICMP_TYPE_UNREACHABLE, + ICMP_CODE_NET_UNREACHABLE, icmp_noinfo); + BUF_FREE(buf); + } + } else { + if (!allow_route && + !(st->routes[best_match]->options&OPT_ALLOWROUTE)) { + string_t s,d; + s=ipaddr_to_string(source); + d=ipaddr_to_string(dest); + /* We have a usable route but aren't allowed to use it. + Generate ICMP destination unreachable: communication + with destination network administratively prohibited */ + Message(M_NOTICE,"%s: denied forwarding for packet (s=%s, d=%s)\n", + st->name,s,d); + + netlink_icmp_simple(st,sender,buf,ICMP_TYPE_UNREACHABLE, + ICMP_CODE_NET_PROHIBITED, icmp_noinfo); + BUF_FREE(buf); + } else { + if (best_quality>0) { + netlink_client_deliver(st,st->routes[best_match], + source,dest,buf); BUF_ASSERT_FREE(buf); } else { /* Generate ICMP destination unreachable */ - netlink_icmp_simple(st,buf,3,0); + netlink_icmp_simple(st,sender,buf, + ICMP_TYPE_UNREACHABLE, + ICMP_CODE_NET_UNREACHABLE, + icmp_noinfo); BUF_FREE(buf); } - return; } } - if (subnet_match(&st->networks,dest)) { - st->deliver_to_host(st->dst,NULL,buf); - BUF_ASSERT_FREE(buf); - return; - } - Message(M_ERROR,"%s: failed to deliver a packet (bad destination address)" - "\nXXX make this message clearer\n"); - BUF_FREE(buf); + BUF_ASSERT_FREE(buf); } -static void netlink_packet_forward(struct netlink *st, struct buffer_if *buf) +static void netlink_packet_forward(struct netlink *st, + struct netlink_client *sender, + struct buffer_if *buf) { + if (buf->size < (int)sizeof(struct iphdr)) return; struct iphdr *iph=(struct iphdr *)buf->start; BUF_ASSERT_USED(buf); @@ -367,7 +792,8 @@ static void netlink_packet_forward(struct netlink *st, struct buffer_if *buf) /* Packet has already been checked */ if (iph->ttl<=1) { /* Generate ICMP time exceeded */ - netlink_icmp_simple(st,buf,11,0); + netlink_icmp_simple(st,sender,buf,ICMP_TYPE_TIME_EXCEEDED, + ICMP_CODE_TTL_EXCEEDED,icmp_noinfo); BUF_FREE(buf); return; } @@ -375,43 +801,57 @@ static void netlink_packet_forward(struct netlink *st, struct buffer_if *buf) iph->check=0; iph->check=ip_fast_csum((uint8_t *)iph,iph->ihl); - netlink_packet_deliver(st,buf); + netlink_packet_deliver(st,sender,buf); BUF_ASSERT_FREE(buf); } -/* Someone has been foolish enough to address a packet to us. I - suppose we should reply to it, just to be polite. */ -static void netlink_packet_local(struct netlink *st, struct buffer_if *buf) +/* Deal with packets addressed explicitly to us */ +static void netlink_packet_local(struct netlink *st, + struct netlink_client *sender, + struct buffer_if *buf) { struct icmphdr *h; + st->localcount++; + + if (buf->size < (int)sizeof(struct icmphdr)) { + Message(M_WARNING,"%s: short packet addressed to secnet; " + "ignoring it\n",st->name); + BUF_FREE(buf); + return; + } h=(struct icmphdr *)buf->start; - if ((ntohs(h->iph.frag_off)&0xbfff)!=0) { - Message(M_WARNING,"%s: fragmented packet addressed to us\n",st->name); + unsigned fraginfo = ntohs(h->iph.frag); + if ((fraginfo&(IPHDR_FRAG_OFF|IPHDR_FRAG_MORE))!=0) { + if (!(fraginfo & IPHDR_FRAG_OFF)) + /* report only for first fragment */ + Message(M_WARNING,"%s: fragmented packet addressed to secnet; " + "ignoring it\n",st->name); BUF_FREE(buf); return; } if (h->iph.protocol==1) { /* It's ICMP */ - if (h->type==8 && h->code==0) { + if (h->type==ICMP_TYPE_ECHO_REQUEST && h->code==0) { /* ICMP echo-request. Special case: we re-use the buffer to construct the reply. */ - h->type=0; + h->type=ICMP_TYPE_ECHO_REPLY; h->iph.daddr=h->iph.saddr; h->iph.saddr=htonl(st->secnet_address); - h->iph.ttl=255; /* Be nice and bump it up again... */ + h->iph.ttl=255; h->iph.check=0; h->iph.check=ip_fast_csum((uint8_t *)h,h->iph.ihl); netlink_icmp_csum(h); - netlink_packet_deliver(st,buf); + netlink_packet_deliver(st,NULL,buf); return; } Message(M_WARNING,"%s: unknown incoming ICMP\n",st->name); } else { /* Send ICMP protocol unreachable */ - netlink_icmp_simple(st,buf,3,2); + netlink_icmp_simple(st,sender,buf,ICMP_TYPE_UNREACHABLE, + ICMP_CODE_PROTOCOL_UNREACHABLE,icmp_noinfo); BUF_FREE(buf); return; } @@ -419,695 +859,420 @@ static void netlink_packet_local(struct netlink *st, struct buffer_if *buf) BUF_FREE(buf); } -/* Called by site code when remote packet is available */ -/* buf is allocated on entry and free on return */ -static void netlink_from_tunnel(void *sst, void *cst, struct buffer_if *buf) +/* If cid==NULL packet is from host, otherwise cid specifies which tunnel + it came from. */ +static void netlink_incoming(struct netlink *st, struct netlink_client *sender, + struct buffer_if *buf) { - struct netlink *st=sst; - struct netlink_client *client=cst; uint32_t source,dest; struct iphdr *iph; + char errmsgbuf[50]; + const char *sourcedesc=sender?sender->name:"host"; BUF_ASSERT_USED(buf); - if (!netlink_check(st,buf)) { - Message(M_WARNING,"%s: bad IP packet from tunnel %s\n", - st->name,client->name); + + if (!netlink_check(st,buf,errmsgbuf,sizeof(errmsgbuf))) { + Message(M_WARNING,"%s: bad IP packet from %s: %s\n", + st->name,sourcedesc, + errmsgbuf); BUF_FREE(buf); return; } + assert(buf->size >= (int)sizeof(struct iphdr)); iph=(struct iphdr *)buf->start; source=ntohl(iph->saddr); dest=ntohl(iph->daddr); - /* Check that the packet source is in 'nets' and its destination is - in client->networks */ - if (!subnet_match(client->networks,source)) { - string_t s,d; - s=ipaddr_to_string(source); - d=ipaddr_to_string(dest); - Message(M_WARNING,"%s: packet from tunnel %s with bad source address " - "(s=%s,d=%s)\n",st->name,client->name,s,d); - free(s); free(d); - BUF_FREE(buf); - return; - } - /* (st->secnet_address needs checking before matching against - st->networks because secnet's IP address may not be in the - range the host is willing to deal with) */ - if (dest==st->secnet_address) { - netlink_packet_local(st,buf); - BUF_ASSERT_FREE(buf); - return; - } - if (!subnet_match(&st->networks,dest)) { - string_t s,d; - s=ipaddr_to_string(source); - d=ipaddr_to_string(dest); - Message(M_WARNING,"%s: incoming packet from tunnel %s " - "with bad destination address " - "(s=%s,d=%s)\n",st->name,client->name,s,d); - free(s); free(d); - BUF_FREE(buf); - return; + /* Check source. If we don't like the source, there's no point + generating ICMP because we won't know how to get it to the + source of the packet. */ + if (sender) { + /* Check that the packet source is appropriate for the tunnel + it came down */ + if (!ipset_contains_addr(sender->networks,source)) { + string_t s,d; + s=ipaddr_to_string(source); + d=ipaddr_to_string(dest); + Message(M_WARNING,"%s: packet from tunnel %s with bad " + "source address (s=%s,d=%s)\n",st->name,sender->name,s,d); + BUF_FREE(buf); + return; + } + } else { + /* Check that the packet originates in our configured local + network, and hasn't been forwarded from elsewhere or + generated with the wrong source address */ + if (!ipset_contains_addr(st->networks,source)) { + string_t s,d; + s=ipaddr_to_string(source); + d=ipaddr_to_string(dest); + Message(M_WARNING,"%s: outgoing packet with bad source address " + "(s=%s,d=%s)\n",st->name,s,d); + BUF_FREE(buf); + return; + } } - netlink_packet_forward(st,buf); - - BUF_ASSERT_FREE(buf); -} - -/* Called by driver code when packet is received from kernel */ -/* cid should be NULL */ -/* buf should be allocated on entry, and is free on return */ -static void netlink_from_host(void *sst, void *cid, struct buffer_if *buf) -{ - struct netlink *st=sst; - uint32_t source,dest; - struct iphdr *iph; - - BUF_ASSERT_USED(buf); - if (!netlink_check(st,buf)) { - Message(M_WARNING,"%s: bad IP packet from host\n", - st->name); - BUF_FREE(buf); + /* If this is a point-to-point device we don't examine the + destination address at all; we blindly send it down our + one-and-only registered tunnel, or to the host, depending on + where it came from. It's up to external software to check + address validity and generate ICMP, etc. */ + if (st->ptp) { + if (sender) { + netlink_host_deliver(st,sender,source,dest,buf); + } else { + netlink_client_deliver(st,st->clients,source,dest,buf); + } + BUF_ASSERT_FREE(buf); return; } - iph=(struct iphdr *)buf->start; - source=ntohl(iph->saddr); - dest=ntohl(iph->daddr); - - if (!subnet_match(&st->networks,source)) { - string_t s,d; - s=ipaddr_to_string(source); - d=ipaddr_to_string(dest); - Message(M_WARNING,"%s: outgoing packet with bad source address " - "(s=%s,d=%s)\n",st->name,s,d); - free(s); free(d); - BUF_FREE(buf); - return; - } + /* st->secnet_address needs checking before matching destination + addresses */ if (dest==st->secnet_address) { - netlink_packet_local(st,buf); + netlink_packet_local(st,sender,buf); BUF_ASSERT_FREE(buf); return; } - netlink_packet_forward(st,buf); + netlink_packet_forward(st,sender,buf); BUF_ASSERT_FREE(buf); } -static void netlink_set_delivery(void *sst, void *cid, bool_t can_deliver) +static void netlink_inst_incoming(void *sst, struct buffer_if *buf) { - struct netlink_client *c=cid; + struct netlink_client *c=sst; + struct netlink *st=c->nst; - c->can_deliver=can_deliver; + netlink_incoming(st,c,buf); } -static void *netlink_regnets(void *sst, struct subnet_list *nets, - netlink_deliver_fn *deliver, void *dst, - uint32_t max_start_pad, uint32_t max_end_pad, - string_t client_name) +static void netlink_dev_incoming(void *sst, struct buffer_if *buf) { struct netlink *st=sst; - struct netlink_client *c; - - Message(M_DEBUG_CONFIG,"netlink_regnets: request for %d networks, " - "max_start_pad=%d, max_end_pad=%d\n", - nets->entries,max_start_pad,max_end_pad); - - /* Check that nets does not intersect with st->networks or - st->exclude_remote_networks; refuse to register if it does. */ - if (subnet_lists_intersect(&st->networks,nets)) { - Message(M_ERROR,"%s: site %s specifies networks that " - "intersect with our local networks\n",st->name,client_name); - return False; - } - if (subnet_lists_intersect(&st->exclude_remote_networks,nets)) { - Message(M_ERROR,"%s: site %s specifies networks that " - "intersect with the explicitly excluded remote networks\n", - st->name,client_name); - return False; - } - c=safe_malloc(sizeof(*c),"netlink_regnets"); - c->networks=nets; - c->deliver=deliver; - c->dst=dst; - c->name=client_name; /* XXX copy it? */ - c->can_deliver=False; - c->next=st->clients; - st->clients=c; - if (max_start_pad > st->max_start_pad) st->max_start_pad=max_start_pad; - if (max_end_pad > st->max_end_pad) st->max_end_pad=max_end_pad; - - return c; + netlink_incoming(st,NULL,buf); } -static netlink_deliver_fn *netlink_init(struct netlink *st, - void *dst, struct cloc loc, - dict_t *dict, string_t description, - netlink_deliver_fn *to_host) +static void netlink_set_quality(void *sst, uint32_t quality) { - st->dst=dst; - st->cl.description=description; - st->cl.type=CL_NETLINK; - st->cl.apply=NULL; - st->cl.interface=&st->ops; - st->ops.st=st; - st->ops.regnets=netlink_regnets; - st->ops.deliver=netlink_from_tunnel; - st->ops.set_delivery=netlink_set_delivery; - st->max_start_pad=0; - st->max_end_pad=0; - st->clients=NULL; - st->deliver_to_host=to_host; - - st->name=dict_read_string(dict,"name",False,"netlink",loc); - if (!st->name) st->name=description; - dict_read_subnet_list(dict, "networks", True, "netlink", loc, - &st->networks); - dict_read_subnet_list(dict, "exclude-remote-networks", False, "netlink", - loc, &st->exclude_remote_networks); - /* local-address and secnet-address do not have to be in local-networks; - however, they should be advertised in the 'sites' file for the - local site. */ - st->local_address=string_to_ipaddr( - dict_find_item(dict,"local-address", True, "netlink", loc),"netlink"); - st->secnet_address=string_to_ipaddr( - dict_find_item(dict,"secnet-address", True, "netlink", loc),"netlink"); - st->mtu=dict_read_number(dict, "mtu", False, "netlink", loc, DEFAULT_MTU); - buffer_new(&st->icmp,ICMP_BUFSIZE); + struct netlink_client *c=sst; + struct netlink *st=c->nst; - return netlink_from_host; + c->link_quality=quality; + c->up=(c->link_quality==LINK_QUALITY_DOWN)?False:True; + if (c->options&OPT_SOFTROUTE) { + st->set_routes(st->dst,c); + } } -/* Connection to the kernel through userv-ipif */ - -struct userv { - struct netlink nl; - int txfd; /* We transmit to userv */ - int rxfd; /* We receive from userv */ - string_t userv_path; - string_t service_user; - string_t service_name; - uint32_t txbuflen; - struct buffer_if *buff; /* We unstuff received packets into here - and send them to the site code. */ - bool_t pending_esc; - netlink_deliver_fn *netlink_to_tunnel; -}; - -static int userv_beforepoll(void *sst, struct pollfd *fds, int *nfds_io, - int *timeout_io, const struct timeval *tv_now, - uint64_t *now) +static void netlink_output_subnets(struct netlink *st, uint32_t loglevel, + struct subnet_list *snets) { - struct userv *st=sst; - *nfds_io=2; - fds[0].fd=st->txfd; - fds[0].events=POLLERR; /* Might want to pick up POLLOUT sometime */ - fds[1].fd=st->rxfd; - fds[1].events=POLLIN|POLLERR|POLLHUP; - return 0; + int32_t i; + string_t net; + + for (i=0; ientries; i++) { + net=subnet_to_string(snets->list[i]); + Message(loglevel,"%s ",net); + } } -static void userv_afterpoll(void *sst, struct pollfd *fds, int nfds, - const struct timeval *tv_now, uint64_t *now) +static void netlink_dump_routes(struct netlink *st, bool_t requested) { - struct userv *st=sst; - uint8_t rxbuf[DEFAULT_BUFSIZE]; - int l,i; - - if (fds[1].revents&POLLERR) { - printf("userv_afterpoll: hup!\n"); - } - if (fds[1].revents&POLLIN) { - l=read(st->rxfd,rxbuf,DEFAULT_BUFSIZE); - if (l<0) { - fatal_perror("userv_afterpoll: read(rxfd)"); - } - if (l==0) { - fatal("userv_afterpoll: read(rxfd)=0; userv gone away?\n"); + int i; + string_t net; + uint32_t c=M_INFO; + + if (requested) c=M_WARNING; + if (st->ptp) { + net=ipaddr_to_string(st->secnet_address); + Message(c,"%s: point-to-point (remote end is %s); routes: ", + st->name, net); + netlink_output_subnets(st,c,st->clients->subnets); + Message(c,"\n"); + } else { + Message(c,"%s: routing table:\n",st->name); + for (i=0; in_clients; i++) { + netlink_output_subnets(st,c,st->routes[i]->subnets); + Message(c,"-> tunnel %s (%s,mtu %d,%s routes,%s," + "quality %d,use %d,pri %lu)\n", + st->routes[i]->name, + st->routes[i]->up?"up":"down", + st->routes[i]->mtu, + st->routes[i]->options&OPT_SOFTROUTE?"soft":"hard", + st->routes[i]->options&OPT_ALLOWROUTE?"free":"restricted", + st->routes[i]->link_quality, + st->routes[i]->outcount, + (unsigned long)st->routes[i]->priority); } - /* XXX really crude unstuff code */ - /* XXX check for buffer overflow */ - BUF_ASSERT_USED(st->buff); - for (i=0; ipending_esc) { - st->pending_esc=False; - switch(rxbuf[i]) { - case SLIP_ESCEND: - *(uint8_t *)buf_append(st->buff,1)=SLIP_END; - break; - case SLIP_ESCESC: - *(uint8_t *)buf_append(st->buff,1)=SLIP_ESC; - break; - default: - fatal("userv_afterpoll: bad SLIP escape character\n"); - } - } else { - switch (rxbuf[i]) { - case SLIP_END: - if (st->buff->size>0) { - st->netlink_to_tunnel(&st->nl,NULL, - st->buff); - BUF_ALLOC(st->buff,"userv_afterpoll"); - } - buffer_init(st->buff,st->nl.max_start_pad); - break; - case SLIP_ESC: - st->pending_esc=True; - break; - default: - *(uint8_t *)buf_append(st->buff,1)=rxbuf[i]; - break; - } - } + net=ipaddr_to_string(st->secnet_address); + Message(c,"%s/32 -> netlink \"%s\" (use %d)\n", + net,st->name,st->localcount); + for (i=0; isubnets->entries; i++) { + net=subnet_to_string(st->subnets->list[i]); + Message(c,"%s ",net); } + if (i>0) + Message(c,"-> host (use %d)\n",st->outcount); } } -/* Send buf to the kernel. Free buf before returning. */ -static void userv_deliver_to_kernel(void *sst, void *cid, - struct buffer_if *buf) +/* ap is a pointer to a member of the routes array */ +static int netlink_compare_client_priority(const void *ap, const void *bp) { - struct userv *st=sst; - uint8_t txbuf[DEFAULT_BUFSIZE]; - uint8_t *i; - uint32_t j; - - BUF_ASSERT_USED(buf); + const struct netlink_client *const*a=ap; + const struct netlink_client *const*b=bp; - /* Spit the packet at userv-ipif: SLIP start marker, then - bytestuff the packet, then SLIP end marker */ - /* XXX crunchy bytestuff code */ - j=0; - txbuf[j++]=SLIP_END; - for (i=buf->start; i<(buf->start+buf->size); i++) { - switch (*i) { - case SLIP_END: - txbuf[j++]=SLIP_ESC; - txbuf[j++]=SLIP_ESCEND; - break; - case SLIP_ESC: - txbuf[j++]=SLIP_ESC; - txbuf[j++]=SLIP_ESCESC; - break; - default: - txbuf[j++]=*i; - break; - } - } - txbuf[j++]=SLIP_END; - if (write(st->txfd,txbuf,j)<0) { - fatal_perror("userv_deliver_to_kernel: write()"); - } - BUF_FREE(buf); + if ((*a)->priority==(*b)->priority) return 0; + if ((*a)->priority<(*b)->priority) return 1; + return -1; } -static void userv_phase_hook(void *sst, uint32_t newphase) +static void netlink_phase_hook(void *sst, uint32_t new_phase) { - struct userv *st=sst; - pid_t child; - int c_stdin[2]; - int c_stdout[2]; - string_t addrs; - string_t nets; - string_t s; + struct netlink *st=sst; struct netlink_client *c; - int i; - - /* This is where we actually invoke userv - all the networks we'll - be using should already have been registered. */ - - addrs=safe_malloc(512,"userv_phase_hook:addrs"); - snprintf(addrs,512,"%s,%s,%d,slip",ipaddr_to_string(st->nl.local_address), - ipaddr_to_string(st->nl.secnet_address),st->nl.mtu); - - nets=safe_malloc(1024,"userv_phase_hook:nets"); - *nets=0; - for (c=st->nl.clients; c; c=c->next) { - for (i=0; inetworks->entries; i++) { - s=subnet_to_string(&c->networks->list[i]); - strcat(nets,s); - strcat(nets,","); - free(s); - } - } - nets[strlen(nets)-1]=0; - - Message(M_INFO,"\nuserv_phase_hook: %s %s %s %s %s\n",st->userv_path, - st->service_user,st->service_name,addrs,nets); - - /* Allocate buffer, plus space for padding. Make sure we end up - with the start of the packet well-aligned. */ - /* ALIGN(st->max_start_pad,16); */ - /* ALIGN(st->max_end_pad,16); */ - - st->pending_esc=False; - - /* Invoke userv */ - if (pipe(c_stdin)!=0) { - fatal_perror("userv_phase_hook: pipe(c_stdin)"); - } - if (pipe(c_stdout)!=0) { - fatal_perror("userv_phase_hook: pipe(c_stdout)"); + int32_t i; + + /* All the networks serviced by the various tunnels should now + * have been registered. We build a routing table by sorting the + * clients by priority. */ + st->routes=safe_malloc_ary(sizeof(*st->routes),st->n_clients, + "netlink_phase_hook"); + /* Fill the table */ + i=0; + for (c=st->clients; c; c=c->next) { + assert(iroutes[i++]=c; } - st->txfd=c_stdin[1]; - st->rxfd=c_stdout[0]; + /* Sort the table in descending order of priority */ + qsort(st->routes,st->n_clients,sizeof(*st->routes), + netlink_compare_client_priority); - child=fork(); - if (child==-1) { - fatal_perror("userv_phase_hook: fork()"); - } - if (child==0) { - char **argv; - - /* We are the child. Modify our stdin and stdout, then exec userv */ - dup2(c_stdin[0],0); - dup2(c_stdout[1],1); - close(c_stdin[1]); - close(c_stdout[0]); - - /* The arguments are: - userv - service-user - service-name - local-addr,secnet-addr,mtu,protocol - route1,route2,... */ - argv=malloc(sizeof(*argv)*6); - argv[0]=st->userv_path; - argv[1]=st->service_user; - argv[2]=st->service_name; - argv[3]=addrs; - argv[4]=nets; - argv[5]=NULL; - execvp(st->userv_path,argv); - perror("netlink-userv-ipif: execvp"); - - exit(1); - } - /* We are the parent... */ - - /* Register for poll() */ - register_for_poll(st, userv_beforepoll, userv_afterpoll, 2, st->nl.name); + netlink_dump_routes(st,False); } -static list_t *userv_apply(closure_t *self, struct cloc loc, dict_t *context, - list_t *args) +static void netlink_signal_handler(void *sst, int signum) { - struct userv *st; - item_t *item; - dict_t *dict; - - st=safe_malloc(sizeof(*st),"userv_apply"); - - /* First parameter must be a dict */ - item=list_elem(args,0); - if (!item || item->type!=t_dict) - cfgfatal(loc,"userv-ipif","parameter must be a dictionary\n"); - - dict=item->data.dict; - - st->netlink_to_tunnel= - netlink_init(&st->nl,st,loc,dict, - "netlink-userv-ipif",userv_deliver_to_kernel); - - st->userv_path=dict_read_string(dict,"userv-path",False,"userv-netlink", - loc); - st->service_user=dict_read_string(dict,"service-user",False, - "userv-netlink",loc); - st->service_name=dict_read_string(dict,"service-name",False, - "userv-netlink",loc); - if (!st->userv_path) st->userv_path="userv"; - if (!st->service_user) st->service_user="root"; - if (!st->service_name) st->service_name="ipif"; - st->buff=find_cl_if(dict,"buffer",CL_BUFFER,True,"userv-netlink",loc); - BUF_ALLOC(st->buff,"netlink:userv_apply"); - - st->rxfd=-1; st->txfd=-1; - add_hook(PHASE_DROPPRIV,userv_phase_hook,st); - - return new_closure(&st->nl.cl); -} - -/* Connection to the kernel through the universal TUN/TAP driver */ - -struct tun { - struct netlink nl; - int fd; - string_t device_path; - string_t interface_name; - string_t ifconfig_path; - string_t route_path; - bool_t tun_old; - bool_t search_for_if; /* Applies to tun-old only */ - struct buffer_if *buff; /* We receive packets into here - and send them to the netlink code. */ - netlink_deliver_fn *netlink_to_tunnel; -}; - -static int tun_beforepoll(void *sst, struct pollfd *fds, int *nfds_io, - int *timeout_io, const struct timeval *tv_now, - uint64_t *now) -{ - struct tun *st=sst; - *nfds_io=1; - fds[0].fd=st->fd; - fds[0].events=POLLIN|POLLERR|POLLHUP; - return 0; + struct netlink *st=sst; + Message(M_INFO,"%s: route dump requested by SIGUSR1\n",st->name); + netlink_dump_routes(st,True); } -static void tun_afterpoll(void *sst, struct pollfd *fds, int nfds, - const struct timeval *tv_now, uint64_t *now) +static void netlink_inst_set_mtu(void *sst, int32_t new_mtu) { - struct tun *st=sst; - int l; + struct netlink_client *c=sst; - if (fds[0].revents&POLLERR) { - printf("tun_afterpoll: hup!\n"); - } - if (fds[0].revents&POLLIN) { - BUF_ALLOC(st->buff,"tun_afterpoll"); - buffer_init(st->buff,st->nl.max_start_pad); - l=read(st->fd,st->buff->start,st->buff->len-st->nl.max_start_pad); - if (l<0) { - fatal_perror("tun_afterpoll: read()"); - } - if (l==0) { - fatal("tun_afterpoll: read()=0; device gone away?\n"); - } - if (l>0) { - st->buff->size=l; - st->netlink_to_tunnel(&st->nl,NULL,st->buff); - BUF_ASSERT_FREE(st->buff); - } - } + c->mtu=new_mtu; } -static void tun_deliver_to_kernel(void *sst, void *cid, - struct buffer_if *buf) +static void netlink_inst_reg(void *sst, netlink_deliver_fn *deliver, + void *dst, uint32_t *localmtu_r) { - struct tun *st=sst; + struct netlink_client *c=sst; + struct netlink *st=c->nst; - BUF_ASSERT_USED(buf); + c->deliver=deliver; + c->dst=dst; - /* No error checking, because we'd just throw the packet away anyway */ - write(st->fd,buf->start,buf->size); - BUF_FREE(buf); + if (localmtu_r) + *localmtu_r=st->mtu; } -static void tun_phase_hook(void *sst, uint32_t newphase) +static struct flagstr netlink_option_table[]={ + { "soft", OPT_SOFTROUTE }, + { "allow-route", OPT_ALLOWROUTE }, + { NULL, 0} +}; +/* This is the routine that gets called when the closure that's + returned by an invocation of a netlink device closure (eg. tun, + userv-ipif) is invoked. It's used to create routes and pass in + information about them; the closure it returns is used by site + code. */ +static closure_t *netlink_inst_create(struct netlink *st, + struct cloc loc, dict_t *dict) { - struct tun *st=sst; - string_t hostaddr,secnetaddr; - uint8_t mtu[6]; - string_t network,mask; struct netlink_client *c; - int i; + string_t name; + struct ipset *networks; + uint32_t options,priority; + int32_t mtu; + list_t *l; + + name=dict_read_string(dict, "name", True, st->name, loc); + + l=dict_lookup(dict,"routes"); + if (!l) + cfgfatal(loc,st->name,"required parameter \"routes\" not found\n"); + networks=string_list_to_ipset(l,loc,st->name,"routes"); + options=string_list_to_word(dict_lookup(dict,"options"), + netlink_option_table,st->name); + + priority=dict_read_number(dict,"priority",False,st->name,loc,0); + mtu=dict_read_number(dict,"mtu",False,st->name,loc,0); + + if ((options&OPT_SOFTROUTE) && !st->set_routes) { + cfgfatal(loc,st->name,"this netlink device does not support " + "soft routes.\n"); + return NULL; + } - if (st->tun_old) { - if (st->search_for_if) { - string_t dname; - int i; - - /* ASSERT st->interface_name */ - dname=safe_malloc(strlen(st->device_path)+4,"tun_old_apply"); - st->interface_name=safe_malloc(8,"tun_phase_hook"); - - for (i=0; i<255; i++) { - sprintf(dname,"%s%d",st->device_path,i); - if ((st->fd=open(dname,O_RDWR))>0) { - sprintf(st->interface_name,"tun%d",i); - Message(M_INFO,"%s: allocated network interface %s " - "through %s\n",st->nl.name,st->interface_name, - dname); - break; - } - } - if (st->fd==-1) { - fatal("%s: unable to open any TUN device (%s...)\n", - st->nl.name,st->device_path); - } - } else { - st->fd=open(st->device_path,O_RDWR); - if (st->fd==-1) { - fatal_perror("%s: unable to open TUN device file %s", - st->nl.name,st->device_path); - } + if (options&OPT_SOFTROUTE) { + /* XXX for now we assume that soft routes require root privilege; + this may not always be true. The device driver can tell us. */ + require_root_privileges=True; + require_root_privileges_explanation="netlink: soft routes"; + if (st->ptp) { + cfgfatal(loc,st->name,"point-to-point netlinks do not support " + "soft routes.\n"); + return NULL; } - } else { -#ifdef HAVE_LINUX_IF_H - struct ifreq ifr; - - /* New TUN interface: open the device, then do ioctl TUNSETIFF - to set or find out the network interface name. */ - st->fd=open(st->device_path,O_RDWR); - if (st->fd==-1) { - fatal_perror("%s: can't open device file %s",st->nl.name, - st->device_path); - } - memset(&ifr,0,sizeof(ifr)); - ifr.ifr_flags = IFF_TUN | IFF_NO_PI; /* Just send/receive IP packets, - no extra headers */ - if (st->interface_name) - strncpy(ifr.ifr_name,st->interface_name,IFNAMSIZ); - if (ioctl(st->fd,TUNSETIFF,&ifr)<0) { - fatal_perror("%s: ioctl(TUNSETIFF)",st->nl.name); - } - if (!st->interface_name) { - st->interface_name=safe_malloc(strlen(ifr.ifr_name)+1,"tun_apply"); - strcpy(st->interface_name,ifr.ifr_name); - Message(M_INFO,"%s: allocated network interface %s\n",st->nl.name, - st->interface_name); - } -#else - fatal("netlink.c:tun_phase_hook:!tun_old unexpected\n"); -#endif /* HAVE_LINUX_IF_H */ } - /* All the networks we'll be using have been registered. Invoke ifconfig - to set the TUN device's address, and route to add routes to all - our networks. */ - - hostaddr=ipaddr_to_string(st->nl.local_address); - secnetaddr=ipaddr_to_string(st->nl.secnet_address); - snprintf(mtu,6,"%d",st->nl.mtu); - mtu[5]=0; - - sys_cmd(st->ifconfig_path,"ifconfig",st->interface_name, - hostaddr,"netmask","255.255.255.255","-broadcast", - "pointopoint",secnetaddr,"mtu",mtu,"up",(char *)0); - - for (c=st->nl.clients; c; c=c->next) { - for (i=0; inetworks->entries; i++) { - network=ipaddr_to_string(c->networks->list[i].prefix); - mask=ipaddr_to_string(c->networks->list[i].mask); - sys_cmd(st->route_path,"route","add","-net",network, - "netmask",mask,"gw",secnetaddr,(char *)0); - } + + /* Check that nets are a subset of st->remote_networks; + refuse to register if they are not. */ + if (!ipset_is_subset(st->remote_networks,networks)) { + cfgfatal(loc,st->name,"routes are not allowed\n"); + return NULL; } - /* Register for poll() */ - register_for_poll(st, tun_beforepoll, tun_afterpoll, 1, st->nl.name); + c=safe_malloc(sizeof(*c),"netlink_inst_create"); + c->cl.description=name; + c->cl.type=CL_NETLINK; + c->cl.apply=NULL; + c->cl.interface=&c->ops; + c->ops.st=c; + c->ops.reg=netlink_inst_reg; + c->ops.deliver=netlink_inst_incoming; + c->ops.set_quality=netlink_set_quality; + c->ops.set_mtu=netlink_inst_set_mtu; + c->nst=st; + + c->networks=networks; + c->subnets=ipset_to_subnet_list(networks); + c->priority=priority; + c->deliver=NULL; + c->dst=NULL; + c->name=name; + c->link_quality=LINK_QUALITY_UNUSED; + c->mtu=mtu?mtu:st->mtu; + c->options=options; + c->outcount=0; + c->up=False; + c->kup=False; + c->next=st->clients; + st->clients=c; + assert(st->n_clients < INT_MAX); + st->n_clients++; + + return &c->cl; } -#ifdef HAVE_LINUX_IF_H -static list_t *tun_apply(closure_t *self, struct cloc loc, dict_t *context, - list_t *args) +static list_t *netlink_inst_apply(closure_t *self, struct cloc loc, + dict_t *context, list_t *args) { - struct tun *st; - item_t *item; - dict_t *dict; + struct netlink *st=self->interface; - st=safe_malloc(sizeof(*st),"tun_apply"); + dict_t *dict; + item_t *item; + closure_t *cl; - /* First parameter must be a dict */ item=list_elem(args,0); - if (!item || item->type!=t_dict) - cfgfatal(loc,"tun","parameter must be a dictionary\n"); - + if (!item || item->type!=t_dict) { + cfgfatal(loc,st->name,"must have a dictionary argument\n"); + } dict=item->data.dict; - st->netlink_to_tunnel= - netlink_init(&st->nl,st,loc,dict, - "netlink-tun",tun_deliver_to_kernel); - - st->tun_old=False; - st->device_path=dict_read_string(dict,"device",False,"tun-netlink",loc); - st->interface_name=dict_read_string(dict,"interface",False, - "tun-netlink",loc); - st->ifconfig_path=dict_read_string(dict,"ifconfig-path", - False,"tun-netlink",loc); - st->route_path=dict_read_string(dict,"route-path", - False,"tun-netlink",loc); - - if (!st->device_path) st->device_path="/dev/net/tun"; - if (!st->ifconfig_path) st->ifconfig_path="ifconfig"; - if (!st->route_path) st->route_path="route"; - st->buff=find_cl_if(dict,"buffer",CL_BUFFER,True,"tun-netlink",loc); - - add_hook(PHASE_GETRESOURCES,tun_phase_hook,st); + cl=netlink_inst_create(st,loc,dict); - return new_closure(&st->nl.cl); + return new_closure(cl); } -#endif /* HAVE_LINUX_IF_H */ -static list_t *tun_old_apply(closure_t *self, struct cloc loc, dict_t *context, - list_t *args) +netlink_deliver_fn *netlink_init(struct netlink *st, + void *dst, struct cloc loc, + dict_t *dict, cstring_t description, + netlink_route_fn *set_routes, + netlink_deliver_fn *to_host) { - struct tun *st; - item_t *item; - dict_t *dict; - - st=safe_malloc(sizeof(*st),"tun_old_apply"); - - Message(M_WARNING,"the tun-old code has never been tested. Please report " - "success or failure to steve@greenend.org.uk\n"); + item_t *sa, *ptpa; + list_t *l; - /* First parameter must be a dict */ - item=list_elem(args,0); - if (!item || item->type!=t_dict) - cfgfatal(loc,"tun","parameter must be a dictionary\n"); - - dict=item->data.dict; + st->dst=dst; + st->cl.description=description; + st->cl.type=CL_PURE; + st->cl.apply=netlink_inst_apply; + st->cl.interface=st; + st->clients=NULL; + st->routes=NULL; + st->n_clients=0; + st->set_routes=set_routes; + st->deliver_to_host=to_host; - st->netlink_to_tunnel= - netlink_init(&st->nl,st,loc,dict, - "netlink-tun",tun_deliver_to_kernel); - - st->tun_old=True; - st->device_path=dict_read_string(dict,"device",False,"tun-netlink",loc); - st->interface_name=dict_read_string(dict,"interface",False, - "tun-netlink",loc); - st->search_for_if=dict_read_bool(dict,"interface-search",False, - "tun-netlink",loc,st->device_path==NULL); - st->ifconfig_path=dict_read_string(dict,"ifconfig-path",False, - "tun-netlink",loc); - st->route_path=dict_read_string(dict,"route-path",False,"tun-netlink",loc); - - if (!st->device_path) st->device_path="/dev/tun"; - if (!st->ifconfig_path) st->ifconfig_path="ifconfig"; - if (!st->route_path) st->route_path="route"; - st->buff=find_cl_if(dict,"buffer",CL_BUFFER,True,"tun-netlink",loc); - - /* Old TUN interface: the network interface name depends on which - /dev/tunX file we open. If 'interface-search' is set to true, treat - 'device' as the prefix and try numbers from 0--255. If it's set - to false, treat 'device' as the whole name, and require than an - appropriate interface name be specified. */ - if (st->search_for_if && st->interface_name) { - cfgfatal(loc,"tun-old","you may not specify an interface name " - "in interface-search mode\n"); + st->name=dict_read_string(dict,"name",False,description,loc); + if (!st->name) st->name=description; + l=dict_lookup(dict,"networks"); + if (l) + st->networks=string_list_to_ipset(l,loc,st->name,"networks"); + else { + struct ipset *empty; + empty=ipset_new(); + st->networks=ipset_complement(empty); + ipset_free(empty); } - if (!st->search_for_if && !st->interface_name) { - cfgfatal(loc,"tun-old","you must specify an interface name " - "when you explicitly specify a TUN device file\n"); + l=dict_lookup(dict,"remote-networks"); + if (l) { + st->remote_networks=string_list_to_ipset(l,loc,st->name, + "remote-networks"); + } else { + struct ipset *empty; + empty=ipset_new(); + st->remote_networks=ipset_complement(empty); + ipset_free(empty); } + st->local_address=string_item_to_ipaddr( + dict_find_item(dict,"local-address", True, "netlink", loc),"netlink"); - - add_hook(PHASE_GETRESOURCES,tun_phase_hook,st); - - return new_closure(&st->nl.cl); + sa=dict_find_item(dict,"secnet-address",False,"netlink",loc); + ptpa=dict_find_item(dict,"ptp-address",False,"netlink",loc); + if (sa && ptpa) { + cfgfatal(loc,st->name,"you may not specify secnet-address and " + "ptp-address in the same netlink device\n"); + } + if (!(sa || ptpa)) { + cfgfatal(loc,st->name,"you must specify secnet-address or " + "ptp-address for this netlink device\n"); + } + if (sa) { + st->secnet_address=string_item_to_ipaddr(sa,"netlink"); + st->ptp=False; + } else { + st->secnet_address=string_item_to_ipaddr(ptpa,"netlink"); + st->ptp=True; + } + /* To be strictly correct we could subtract secnet_address from + networks here. It shouldn't make any practical difference, + though, and will make the route dump look complicated... */ + st->subnets=ipset_to_subnet_list(st->networks); + st->mtu=dict_read_number(dict, "mtu", False, "netlink", loc, DEFAULT_MTU); + buffer_new(&st->icmp,MAX(ICMP_BUFSIZE,st->mtu)); + st->outcount=0; + st->localcount=0; + + add_hook(PHASE_SETUP,netlink_phase_hook,st); + request_signal_notification(SIGUSR1, netlink_signal_handler, st); + + /* If we're point-to-point then we return a CL_NETLINK directly, + rather than a CL_NETLINK_OLD or pure closure (depending on + compatibility). This CL_NETLINK is for our one and only + client. Our cl.apply function is NULL. */ + if (st->ptp) { + closure_t *cl; + cl=netlink_inst_create(st,loc,dict); + st->cl=*cl; + } + return netlink_dev_incoming; } /* No connection to the kernel at all... */ @@ -1116,7 +1281,21 @@ struct null { struct netlink nl; }; -static void null_deliver(void *sst, void *cid, struct buffer_if *buf) +static bool_t null_set_route(void *sst, struct netlink_client *routes) +{ + struct null *st=sst; + + if (routes->up!=routes->kup) { + Message(M_INFO,"%s: setting routes for tunnel %s to state %s\n", + st->nl.name,routes->name, + routes->up?"up":"down"); + routes->kup=routes->up; + return True; + } + return False; +} + +static void null_deliver(void *sst, struct buffer_if *buf) { return; } @@ -1136,23 +1315,13 @@ static list_t *null_apply(closure_t *self, struct cloc loc, dict_t *context, dict=item->data.dict; - netlink_init(&st->nl,st,loc,dict,"null-netlink",null_deliver); + netlink_init(&st->nl,st,loc,dict,"null-netlink",null_set_route, + null_deliver); return new_closure(&st->nl.cl); } -init_module netlink_module; void netlink_module(dict_t *dict) { - add_closure(dict,"userv-ipif",userv_apply); -#ifdef HAVE_LINUX_IF_H - add_closure(dict,"tun",tun_apply); -#endif - add_closure(dict,"tun-old",tun_old_apply); add_closure(dict,"null-netlink",null_apply); -#if 0 - /* TODO */ - add_closure(dict,"pty-slip",ptyslip_apply); - add_closure(dict,"slipd",slipd_apply); -#endif /* 0 */ }