chiark / gitweb /
Import release 0.1.14
[secnet.git] / netlink.c
index 3fbc3cfabde6a711d04fbc760ca912de8c2cacc4..8e843bc58172c459d977c414facc07b9aeea9b0e 100644 (file)
--- a/netlink.c
+++ b/netlink.c
 /* User-kernel network link */
 
-/* We will eventually support a variety of methods for extracting
-   packets from the kernel: userv-ipif, ipif on its own (when we run
-   as root), the kernel TUN driver, SLIP to a pty, an external netlink
-   daemon. There is a performance/security tradeoff. */
+/* See RFCs 791, 792, 1123 and 1812 */
 
-/* When dealing with SLIP (to a pty, or ipif) we have separate rx, tx
-   and client buffers. When receiving we may read() any amount, not
-   just whole packets. When transmitting we need to bytestuff anyway,
-   and may be part-way through receiving. */
+/* The netlink device is actually a router.  Tunnels are unnumbered
+   point-to-point lines (RFC1812 section 2.2.7); the router has a
+   single address (the 'router-id'). */
 
-/* Each netlink device is actually a router, with its own IP
-   address. We do things like decreasing the TTL and recalculating the
-   header checksum, generating ICMP, responding to pings, etc. */
+/* This is where we currently have the anti-spoofing paranoia - before
+   sending a packet to the kernel we check that the tunnel it came
+   over could reasonably have produced it. */
 
-/* This is where we have the anti-spoofing paranoia - before sending a
-   packet to the kernel we check that the tunnel it came over could
-   reasonably have produced it. */
 
-#include "secnet.h"
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/ioctl.h>
-#include "util.h"
+/* Points to note from RFC1812 (which may require changes in this
+   file):
 
-#ifdef HAVE_LINUX_IF_H
-#include <linux/if.h>
-#include <linux/if_tun.h>
-#endif
+3.3.4 Maximum Transmission Unit - MTU
 
-/* XXX where do we find if_tun on other architectures? */
+   The MTU of each logical interface MUST be configurable within the
+   range of legal MTUs for the interface.
 
-#define DEFAULT_BUFSIZE 2048
-#define DEFAULT_MTU 1000
-#define ICMP_BUFSIZE 1024
+   Many Link Layer protocols define a maximum frame size that may be
+   sent.  In such cases, a router MUST NOT allow an MTU to be set which
+   would allow sending of frames larger than those allowed by the Link
+   Layer protocol.  However, a router SHOULD be willing to receive a
+   packet as large as the maximum frame size even if that is larger than
+   the MTU.
 
-#define SLIP_END    192
-#define SLIP_ESC    219
-#define SLIP_ESCEND 220
-#define SLIP_ESCESC 221
+4.2.1  A router SHOULD count datagrams discarded.
 
-struct netlink_client {
-    struct subnet_list *networks;
-    netlink_deliver_fn *deliver;
-    void *dst;
-    string_t name;
-    bool_t can_deliver;
-    struct netlink_client *next;
-};
+4.2.2.1 Source route options - we probably should implement processing
+of source routes, even though mostly the security policy will prevent
+their use.
 
-/* Netlink provides one function to the device driver, to call to deliver
-   a packet from the device. The device driver provides one function to
-   netlink, for it to call to deliver a packet to the device. */
+5.3.13.4 Source Route Options
 
-struct netlink {
-    closure_t cl;
-    struct netlink_if ops;
-    void *dst; /* Pointer to host interface state */
-    string_t name;
-    uint32_t max_start_pad;
-    uint32_t max_end_pad;
-    struct subnet_list networks;
-    uint32_t local_address; /* host interface address */
-    uint32_t secnet_address; /* our own address */
-    uint32_t mtu;
-    struct netlink_client *clients;
-    netlink_deliver_fn *deliver_to_host; /* Provided by driver */
-    struct buffer_if icmp; /* Buffer for assembly of outgoing ICMP */
-};
+   A router MUST implement support for source route options in forwarded
+   packets.  A router MAY implement a configuration option that, when
+   enabled, causes all source-routed packets to be discarded.  However,
+   such an option MUST NOT be enabled by default.
+
+5.3.13.5 Record Route Option
+
+   Routers MUST support the Record Route option in forwarded packets.
+
+   A router MAY provide a configuration option that, if enabled, will
+   cause the router to ignore (i.e., pass through unchanged) Record
+   Route options in forwarded packets.  If provided, such an option MUST
+   default to enabling the record-route.  This option should not affect
+   the processing of Record Route options in datagrams received by the
+   router itself (in particular, Record Route options in ICMP echo
+   requests will still be processed according to Section [4.3.3.6]).
+
+5.3.13.6 Timestamp Option
+
+   Routers MUST support the timestamp option in forwarded packets.  A
+   timestamp value MUST follow the rules given [INTRO:2].
+
+   If the flags field = 3 (timestamp and prespecified address), the
+   router MUST add its timestamp if the next prespecified address
+   matches any of the router's IP addresses.  It is not necessary that
+   the prespecified address be either the address of the interface on
+   which the packet arrived or the address of the interface over which
+   it will be sent.
+
+
+4.2.2.7 Fragmentation: RFC 791 Section 3.2
+
+   Fragmentation, as described in [INTERNET:1], MUST be supported by a
+   router.
+
+4.2.2.8 Reassembly: RFC 791 Section 3.2
+
+   As specified in the corresponding section of [INTRO:2], a router MUST
+   support reassembly of datagrams that it delivers to itself.
+
+4.2.2.9 Time to Live: RFC 791 Section 3.2
+
+   Note in particular that a router MUST NOT check the TTL of a packet
+   except when forwarding it.
+
+   A router MUST NOT discard a datagram just because it was received
+   with TTL equal to zero or one; if it is to the router and otherwise
+   valid, the router MUST attempt to receive it.
+
+   On messages the router originates, the IP layer MUST provide a means
+   for the transport layer to set the TTL field of every datagram that
+   is sent.  When a fixed TTL value is used, it MUST be configurable.
+
+
+8.1 The Simple Network Management Protocol - SNMP
+8.1.1 SNMP Protocol Elements
+
+   Routers MUST be manageable by SNMP [MGT:3].  The SNMP MUST operate
+   using UDP/IP as its transport and network protocols.
+
+
+*/
+
+#include "secnet.h"
+#include "util.h"
+#include "ipaddr.h"
+#include "netlink.h"
+#include "process.h"
+
+/* Option flags, tested against the "options" field of a
+   struct netlink_client */
+#define OPT_SOFTROUTE   1
+#define OPT_ALLOWROUTE  2
+
+/* ICMP message types, each followed by the codes this file uses with it */
+#define ICMP_TYPE_ECHO_REPLY             0
+
+#define ICMP_TYPE_UNREACHABLE            3
+#define ICMP_CODE_NET_UNREACHABLE        0
+#define ICMP_CODE_PROTOCOL_UNREACHABLE   2
+#define ICMP_CODE_FRAGMENTATION_REQUIRED 4
+#define ICMP_CODE_NET_PROHIBITED        13
+
+#define ICMP_TYPE_ECHO_REQUEST           8
+
+#define ICMP_TYPE_TIME_EXCEEDED         11
+#define ICMP_CODE_TTL_EXCEEDED           0
 
 /* Generic IP checksum routine */
 static inline uint16_t ip_csum(uint8_t *iph,uint32_t count)
@@ -172,8 +220,17 @@ struct icmphdr {
     } d;
 };
     
-static void netlink_packet_deliver(struct netlink *st, struct buffer_if *buf);
-
+static void netlink_packet_deliver(struct netlink *st,
+                                  struct netlink_client *client,
+                                  struct buffer_if *buf);
+
+/* XXX RFC1812 4.3.2.5:
+   All other ICMP error messages (Destination Unreachable,
+   Redirect, Time Exceeded, and Parameter Problem) SHOULD have their
+   precedence value set to 6 (INTERNETWORK CONTROL) or 7 (NETWORK
+   CONTROL).  The IP Precedence value for these error messages MAY be
+   settable.
+   */
 static struct icmphdr *netlink_icmp_tmpl(struct netlink *st,
                                         uint32_t dest,uint16_t len)
 {
@@ -189,7 +246,7 @@ static struct icmphdr *netlink_icmp_tmpl(struct netlink *st,
     h->iph.tot_len=htons(len+(h->iph.ihl*4)+8);
     h->iph.id=0;
     h->iph.frag_off=0;
-    h->iph.ttl=255;
+    h->iph.ttl=255; /* XXX should be configurable */
     h->iph.protocol=1;
     h->iph.saddr=htonl(st->secnet_address);
     h->iph.daddr=htonl(dest);
@@ -232,11 +289,19 @@ static void netlink_icmp_csum(struct icmphdr *h)
 static bool_t netlink_icmp_may_reply(struct buffer_if *buf)
 {
     struct iphdr *iph;
+    struct icmphdr *icmph;
     uint32_t source;
 
     iph=(struct iphdr *)buf->start;
-    if (iph->protocol==1) return False; /* Overly-broad; we may reply to
-                                          eg. icmp echo-request */
+    icmph=(struct icmphdr *)buf->start;
+    if (iph->protocol==1) {
+       switch(icmph->type) {
+       case 3: /* Destination unreachable */
+       case 11: /* Time Exceeded */
+       case 12: /* Parameter Problem */
+           return False;
+       }
+    }
     /* How do we spot broadcast destination addresses? */
     if (ntohs(iph->frag_off)&0x1fff) return False; /* Non-initial fragment */
     source=ntohl(iph->saddr);
@@ -250,6 +315,26 @@ static bool_t netlink_icmp_may_reply(struct buffer_if *buf)
 
 /* How much of the original IP packet do we include in its ICMP
    response? The header plus up to 64 bits. */
+
+/* XXX TODO RFC1812:
+4.3.2.3 Original Message Header
+
+   Historically, every ICMP error message has included the Internet
+   header and at least the first 8 data bytes of the datagram that
+   triggered the error.  This is no longer adequate, due to the use of
+   IP-in-IP tunneling and other technologies.  Therefore, the ICMP
+   datagram SHOULD contain as much of the original datagram as possible
+   without the length of the ICMP datagram exceeding 576 bytes.  The
+   returned IP header (and user data) MUST be identical to that which
+   was received, except that the router is not required to undo any
+   modifications to the IP header that are normally performed in
+   forwarding that were performed before the error was detected (e.g.,
+   decrementing the TTL, or updating options).  Note that the
+   requirements of Section [4.3.3.5] supersede this requirement in some
+   cases (i.e., for a Parameter Problem message, if the problem is in a
+   modified field, the router must undo the modification).  See Section
+   [4.3.3.5]).
+   */
 static uint16_t netlink_icmp_reply_len(struct buffer_if *buf)
 {
     struct iphdr *iph=(struct iphdr *)buf->start;
@@ -262,7 +347,10 @@ static uint16_t netlink_icmp_reply_len(struct buffer_if *buf)
     return (hlen>plen?plen:hlen);
 }
 
+/* client indicates where the packet we're constructing a response to
+   comes from. NULL indicates the host. */
 static void netlink_icmp_simple(struct netlink *st, struct buffer_if *buf,
+                               struct netlink_client *client,
                                uint8_t type, uint8_t code)
 {
     struct iphdr *iph=(struct iphdr *)buf->start;
@@ -275,7 +363,7 @@ static void netlink_icmp_simple(struct netlink *st, struct buffer_if *buf,
        h->type=type; h->code=code;
        memcpy(buf_append(&st->icmp,len),buf->start,len);
        netlink_icmp_csum(h);
-       netlink_packet_deliver(st,&st->icmp);
+       netlink_packet_deliver(st,NULL,&st->icmp);
        BUF_ASSERT_FREE(&st->icmp);
     }
 }
@@ -283,6 +371,7 @@ static void netlink_icmp_simple(struct netlink *st, struct buffer_if *buf,
 /*
  * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the
  * checksum.
+ * RFC1812: 4.2.2.5 MUST discard messages containing invalid checksums.
  *
  * Is the datagram acceptable?
  *
@@ -296,67 +385,138 @@ static bool_t netlink_check(struct netlink *st, struct buffer_if *buf)
     struct iphdr *iph=(struct iphdr *)buf->start;
     uint32_t len;
 
-    if (iph->ihl < 5 || iph->version != 4) {
-       printf("ihl/version check failed\n");
-       return False;
-    }
-    if (buf->size < iph->ihl*4) {
-       printf("buffer size check failed\n");
-       return False;
-    }
-    if (ip_fast_csum((uint8_t *)iph, iph->ihl)!=0) {
-       printf("checksum failed\n");
-       return False;
-    }
+    if (iph->ihl < 5 || iph->version != 4) return False;
+    if (buf->size < iph->ihl*4) return False;
+    if (ip_fast_csum((uint8_t *)iph, iph->ihl)!=0) return False;
     len=ntohs(iph->tot_len);
     /* There should be no padding */
-    if (buf->size!=len || len<(iph->ihl<<2)) {
-       printf("length check failed buf->size=%d len=%d\n",buf->size,len);
-       return False;
-    }
-
+    if (buf->size!=len || len<(iph->ihl<<2)) return False;
     /* XXX check that there's no source route specified */
     return True;
 }
 
-static void netlink_packet_deliver(struct netlink *st, struct buffer_if *buf)
+/* Deliver a packet; buf is used on entry and free on return. "client"
+   is the _origin_ of the packet, not its destination, and is NULL for
+   packets from the host and packets generated internally in secnet.  */
+static void netlink_packet_deliver(struct netlink *st,
+                                  struct netlink_client *client,
+                                  struct buffer_if *buf)
 {
     struct iphdr *iph=(struct iphdr *)buf->start;
     uint32_t dest=ntohl(iph->daddr);
-    struct netlink_client *c;
+    uint32_t source=ntohl(iph->saddr);
+    uint32_t best_quality;
+    bool_t allow_route=False;
+    bool_t found_allowed=False;
+    int best_match;
+    int i;
 
     BUF_ASSERT_USED(buf);
 
     if (dest==st->secnet_address) {
-       Message(M_ERROR,"%s: trying to deliver a packet to myself!\n");
+       Message(M_ERR,"%s: trying to deliver a packet to myself!\n",st->name);
        BUF_FREE(buf);
        return;
     }
     
-    for (c=st->clients; c; c=c->next) {
-       if (subnet_match(c->networks,dest)) {
-           if (c->can_deliver) {
-               c->deliver(c->dst,c,buf);
-               BUF_ASSERT_FREE(buf);
-           } else {
-               /* Generate ICMP destination unreachable */
-               netlink_icmp_simple(st,buf,3,0);
-               BUF_FREE(buf);
+    /* Packets from the host (client==NULL) may always be routed.  Packets
+       from clients with the allow_route option will also be routed. */
+    if (!client || (client && (client->options & OPT_ALLOWROUTE)))
+       allow_route=True;
+
+    /* If !allow_route, we check the routing table anyway, and if
+       there's a suitable route with OPT_ALLOWROUTE set we use it.  If
+       there's a suitable route, but none with OPT_ALLOWROUTE set then
+       we generate ICMP 'communication with destination network
+       administratively prohibited'. */
+
+    best_quality=0;
+    best_match=-1;
+    for (i=0; i<st->n_clients; i++) {
+       if (st->routes[i]->up &&
+           ipset_contains_addr(st->routes[i]->networks,dest)) {
+           /* It's an available route to the correct destination. But is
+              it better than the one we already have? */
+
+           /* If we have already found an allowed route then we don't
+              bother looking at routes we're not allowed to use.  If
+              we don't yet have an allowed route we'll consider any.  */
+           if (!allow_route && found_allowed) {
+               if (!(st->routes[i]->options&OPT_ALLOWROUTE)) continue;
+           }
+           
+           if (st->routes[i]->link_quality>best_quality
+               || best_quality==0) {
+               best_quality=st->routes[i]->link_quality;
+               best_match=i;
+               if (st->routes[i]->options&OPT_ALLOWROUTE)
+                   found_allowed=True;
+               /* If quality isn't perfect we may wish to
+                  consider kicking the tunnel with a 0-length
+                  packet to prompt it to perform a key setup.
+                  Then it'll eventually decide it's up or
+                  down. */
+               /* If quality is perfect and we're allowed to use the
+                  route we don't need to search any more. */
+               if (best_quality>=MAXIMUM_LINK_QUALITY && 
+                   (allow_route || found_allowed)) break;
            }
-           return;
        }
     }
-    if (subnet_match(&st->networks,dest)) {
-       st->deliver_to_host(st->dst,NULL,buf);
-       BUF_ASSERT_FREE(buf);
-       return;
+    if (best_match==-1) {
+       /* The packet's not going down a tunnel.  It might (ought to)
+          be for the host.   */
+       if (ipset_contains_addr(st->networks,dest)) {
+           st->deliver_to_host(st->dst,buf);
+           st->outcount++;
+           BUF_ASSERT_FREE(buf);
+       } else {
+           string_t s,d;
+           s=ipaddr_to_string(source);
+           d=ipaddr_to_string(dest);
+           Message(M_DEBUG,"%s: don't know where to deliver packet "
+                   "(s=%s, d=%s)\n", st->name, s, d);
+           free(s); free(d);
+           netlink_icmp_simple(st,buf,client,ICMP_TYPE_UNREACHABLE,
+                               ICMP_CODE_NET_UNREACHABLE);
+           BUF_FREE(buf);
+       }
+    } else {
+       if (!allow_route &&
+           !(st->routes[best_match]->options&OPT_ALLOWROUTE)) {
+           string_t s,d;
+           s=ipaddr_to_string(source);
+           d=ipaddr_to_string(dest);
+           /* We have a usable route but aren't allowed to use it.
+              Generate ICMP destination unreachable: communication
+              with destination network administratively prohibited,
+              then drop the packet; it must not reach the code below. */
+           Message(M_NOTICE,"%s: denied forwarding for packet (s=%s, d=%s)\n",
+                   st->name,s,d);
+           free(s); free(d);
+                   
+           netlink_icmp_simple(st,buf,client,ICMP_TYPE_UNREACHABLE,
+                               ICMP_CODE_NET_PROHIBITED);
+           BUF_FREE(buf);
+       } else if (best_quality>0) {
+           /* XXX Fragment if required */
+           st->routes[best_match]->deliver(
+               st->routes[best_match]->dst, buf);
+           st->routes[best_match]->outcount++;
+           BUF_ASSERT_FREE(buf);
+       } else {
+           /* Generate ICMP destination unreachable */
+           netlink_icmp_simple(st,buf,client,ICMP_TYPE_UNREACHABLE,
+                               ICMP_CODE_NET_UNREACHABLE); /* client==NULL */
+           BUF_FREE(buf);
+       }
     }
-    Message(M_ERROR,"%s: failed to deliver a packet (bad destination address)"
-           "\nXXX make this message clearer\n");
-    BUF_FREE(buf);
+    BUF_ASSERT_FREE(buf);
 }
 
-static void netlink_packet_forward(struct netlink *st, struct buffer_if *buf)
+static void netlink_packet_forward(struct netlink *st, 
+                                  struct netlink_client *client,
+                                  struct buffer_if *buf)
 {
     struct iphdr *iph=(struct iphdr *)buf->start;
     
@@ -365,7 +525,8 @@ static void netlink_packet_forward(struct netlink *st, struct buffer_if *buf)
     /* Packet has already been checked */
     if (iph->ttl<=1) {
        /* Generate ICMP time exceeded */
-       netlink_icmp_simple(st,buf,11,0);
+       netlink_icmp_simple(st,buf,client,ICMP_TYPE_TIME_EXCEEDED,
+                           ICMP_CODE_TTL_EXCEEDED);
        BUF_FREE(buf);
        return;
     }
@@ -373,43 +534,48 @@ static void netlink_packet_forward(struct netlink *st, struct buffer_if *buf)
     iph->check=0;
     iph->check=ip_fast_csum((uint8_t *)iph,iph->ihl);
 
-    netlink_packet_deliver(st,buf);
+    netlink_packet_deliver(st,client,buf);
     BUF_ASSERT_FREE(buf);
 }
 
-/* Someone has been foolish enough to address a packet to us. I
-   suppose we should reply to it, just to be polite. */
-static void netlink_packet_local(struct netlink *st, struct buffer_if *buf)
+/* Deal with packets addressed explicitly to us */
+static void netlink_packet_local(struct netlink *st,
+                                struct netlink_client *client,
+                                struct buffer_if *buf)
 {
     struct icmphdr *h;
 
+    st->localcount++;
+
     h=(struct icmphdr *)buf->start;
 
     if ((ntohs(h->iph.frag_off)&0xbfff)!=0) {
-       Message(M_WARNING,"%s: fragmented packet addressed to us\n",st->name);
+       Message(M_WARNING,"%s: fragmented packet addressed to secnet; "
+               "ignoring it\n",st->name);
        BUF_FREE(buf);
        return;
     }
 
     if (h->iph.protocol==1) {
        /* It's ICMP */
-       if (h->type==8 && h->code==0) {
+       if (h->type==ICMP_TYPE_ECHO_REQUEST && h->code==0) {
            /* ICMP echo-request. Special case: we re-use the buffer
               to construct the reply. */
-           h->type=0;
+           h->type=ICMP_TYPE_ECHO_REPLY;
            h->iph.daddr=h->iph.saddr;
            h->iph.saddr=htonl(st->secnet_address);
-           h->iph.ttl=255; /* Be nice and bump it up again... */
+           h->iph.ttl=255;
            h->iph.check=0;
            h->iph.check=ip_fast_csum((uint8_t *)h,h->iph.ihl);
            netlink_icmp_csum(h);
-           netlink_packet_deliver(st,buf);
+           netlink_packet_deliver(st,NULL,buf);
            return;
        }
        Message(M_WARNING,"%s: unknown incoming ICMP\n",st->name);
     } else {
        /* Send ICMP protocol unreachable */
-       netlink_icmp_simple(st,buf,3,2);
+       netlink_icmp_simple(st,buf,client,ICMP_TYPE_UNREACHABLE,
+                           ICMP_CODE_PROTOCOL_UNREACHABLE);
        BUF_FREE(buf);
        return;
     }
@@ -417,19 +583,18 @@ static void netlink_packet_local(struct netlink *st, struct buffer_if *buf)
     BUF_FREE(buf);
 }
 
-/* Called by site code when remote packet is available */
-/* buf is allocated on entry and free on return */
-static void netlink_from_tunnel(void *sst, void *cst, struct buffer_if *buf)
+/* If client==NULL the packet is from the host; otherwise client
+   specifies which tunnel it came from. */
+static void netlink_incoming(struct netlink *st, struct netlink_client *client,
+                            struct buffer_if *buf)
 {
-    struct netlink *st=sst;
-    struct netlink_client *client=cst;
     uint32_t source,dest;
     struct iphdr *iph;
 
     BUF_ASSERT_USED(buf);
     if (!netlink_check(st,buf)) {
-       Message(M_WARNING,"%s: bad IP packet from tunnel %s\n",
-               st->name,client->name);
+       Message(M_WARNING,"%s: bad IP packet from %s\n",
+               st->name,client?client->name:"host");
        BUF_FREE(buf);
        return;
     }
@@ -438,645 +603,419 @@ static void netlink_from_tunnel(void *sst, void *cst, struct buffer_if *buf)
     source=ntohl(iph->saddr);
     dest=ntohl(iph->daddr);
 
-    /* Check that the packet source is in 'nets' and its destination is
-       in client->networks */
-    if (!subnet_match(client->networks,source)) {
-       string_t s,d;
-       s=ipaddr_to_string(source);
-       d=ipaddr_to_string(dest);
-       Message(M_WARNING,"%s: packet from tunnel %s with bad source address "
-               "(s=%s,d=%s)\n",st->name,client->name,s,d);
-       free(s); free(d);
-       BUF_FREE(buf);
-       return;
-    }
-    /* (st->secnet_address needs checking before matching against
-       st->networks because secnet's IP address may not be in the
-       range the host is willing to deal with) */
-    if (dest==st->secnet_address) {
-        netlink_packet_local(st,buf);
-       BUF_ASSERT_FREE(buf);
-       return;
-    }
-    if (!subnet_match(&st->networks,dest)) {
-       string_t s,d;
-       s=ipaddr_to_string(source);
-       d=ipaddr_to_string(dest);
-       Message(M_WARNING,"%s: incoming packet from tunnel %s "
-               "with bad destination address "
-               "(s=%s,d=%s)\n",st->name,client->name,s,d);
-       free(s); free(d);
-       BUF_FREE(buf);
-       return;
+    /* Check source. If we don't like the source, there's no point
+       generating ICMP because we won't know how to get it to the
+       source of the packet. */
+    if (client) {
+       /* Check that the packet source is appropriate for the tunnel
+          it came down */
+       if (!ipset_contains_addr(client->networks,source)) {
+           string_t s,d;
+           s=ipaddr_to_string(source);
+           d=ipaddr_to_string(dest);
+           Message(M_WARNING,"%s: packet from tunnel %s with bad "
+                   "source address (s=%s,d=%s)\n",st->name,client->name,s,d);
+           free(s); free(d);
+           BUF_FREE(buf);
+           return;
+       }
+    } else {
+       /* Check that the packet originates in our configured local
+          network, and hasn't been forwarded from elsewhere or
+          generated with the wrong source address */
+       if (!ipset_contains_addr(st->networks,source)) {
+           string_t s,d;
+           s=ipaddr_to_string(source);
+           d=ipaddr_to_string(dest);
+           Message(M_WARNING,"%s: outgoing packet with bad source address "
+                   "(s=%s,d=%s)\n",st->name,s,d);
+           free(s); free(d);
+           BUF_FREE(buf);
+           return;
+       }
     }
 
-    netlink_packet_forward(st,buf);
-
-    BUF_ASSERT_FREE(buf);
-}
-
-/* Called by driver code when packet is received from kernel */
-/* cid should be NULL */
-/* buf should be allocated on entry, and is free on return */
-static void netlink_from_host(void *sst, void *cid, struct buffer_if *buf)
-{
-    struct netlink *st=sst;
-    uint32_t source,dest;
-    struct iphdr *iph;
-
-    BUF_ASSERT_USED(buf);
-    if (!netlink_check(st,buf)) {
-       Message(M_WARNING,"%s: bad IP packet from host\n",
-               st->name);
-       BUF_FREE(buf);
+    /* If this is a point-to-point device we don't examine the
+       destination address at all; we blindly send it down our
+       one-and-only registered tunnel, or to the host, depending on
+       where it came from.  It's up to external software to check
+       address validity and generate ICMP, etc. */
+    if (st->ptp) {
+       if (client) {
+           st->deliver_to_host(st->dst,buf);
+       } else {
+           st->clients->deliver(st->clients->dst,buf);
+       }
+       BUF_ASSERT_FREE(buf);
        return;
     }
-    iph=(struct iphdr *)buf->start;
 
-    source=ntohl(iph->saddr);
-    dest=ntohl(iph->daddr);
-
-    if (!subnet_match(&st->networks,source)) {
-       string_t s,d;
-       s=ipaddr_to_string(source);
-       d=ipaddr_to_string(dest);
-       Message(M_WARNING,"%s: outgoing packet with bad source address "
-               "(s=%s,d=%s)\n",st->name,s,d);
-       free(s); free(d);
-       BUF_FREE(buf);
-       return;
-    }
+    /* st->secnet_address needs checking before matching destination
+       addresses */
     if (dest==st->secnet_address) {
-       netlink_packet_local(st,buf);
+       netlink_packet_local(st,client,buf);
        BUF_ASSERT_FREE(buf);
        return;
     }
-    netlink_packet_forward(st,buf);
+    netlink_packet_forward(st,client,buf);
     BUF_ASSERT_FREE(buf);
 }
 
-static void netlink_set_delivery(void *sst, void *cid, bool_t can_deliver)
+/* Packet-arrival entry point for a tunnel.  sst is the struct
+   netlink_client for that tunnel; the packet is handed to
+   netlink_incoming with that client recorded as its origin. */
+static void netlink_inst_incoming(void *sst, struct buffer_if *buf)
 {
-    struct netlink_client *c=cid;
+    struct netlink_client *c=sst;
+    struct netlink *st=c->nst;
 
-    c->can_deliver=can_deliver;
+    netlink_incoming(st,c,buf);
 }
 
-static void *netlink_regnets(void *sst, struct subnet_list *nets,
-                            netlink_deliver_fn *deliver, void *dst,
-                            uint32_t max_start_pad, uint32_t max_end_pad,
-                            string_t client_name)
+/* Packet-arrival entry point for the host side.  sst is the struct
+   netlink itself; the packet is handed to netlink_incoming with a NULL
+   client, which marks it as coming from the host. */
+static void netlink_dev_incoming(void *sst, struct buffer_if *buf)
 {
     struct netlink *st=sst;
-    struct netlink_client *c;
-
-    Message(M_DEBUG_CONFIG,"netlink_regnets: request for %d networks, "
-           "max_start_pad=%d, max_end_pad=%d\n",
-           nets->entries,max_start_pad,max_end_pad);
-
-    c=safe_malloc(sizeof(*c),"netlink_regnets");
-    c->networks=nets;
-    c->deliver=deliver;
-    c->dst=dst;
-    c->name=client_name; /* XXX copy it? */
-    c->can_deliver=False;
-    c->next=st->clients;
-    st->clients=c;
-    if (max_start_pad > st->max_start_pad) st->max_start_pad=max_start_pad;
-    if (max_end_pad > st->max_end_pad) st->max_end_pad=max_end_pad;
 
-    return c;
+    netlink_incoming(st,NULL,buf);
 }
 
-static netlink_deliver_fn *netlink_init(struct netlink *st,
-                                       void *dst, struct cloc loc,
-                                       dict_t *dict, string_t description,
-                                       netlink_deliver_fn *to_host)
+/* Record a new link quality for a client.  sst is the struct
+   netlink_client concerned.  The client is marked down if quality is
+   LINK_QUALITY_DOWN and up otherwise.  For clients with OPT_SOFTROUTE
+   set, st->set_routes is then called (presumably so the device can
+   add or remove that client's routes - confirm against the driver). */
+static void netlink_set_quality(void *sst, uint32_t quality)
 {
-    st->dst=dst;
-    st->cl.description=description;
-    st->cl.type=CL_NETLINK;
-    st->cl.apply=NULL;
-    st->cl.interface=&st->ops;
-    st->ops.st=st;
-    st->ops.regnets=netlink_regnets;
-    st->ops.deliver=netlink_from_tunnel;
-    st->ops.set_delivery=netlink_set_delivery;
-    st->max_start_pad=0;
-    st->max_end_pad=0;
-    st->clients=NULL;
-    st->deliver_to_host=to_host;
+    struct netlink_client *c=sst;
+    struct netlink *st=c->nst;
 
-    st->name=dict_read_string(dict,"name",False,"netlink",loc);
-    if (!st->name) st->name=description;
-    dict_read_subnet_list(dict, "networks", True, "netlink", loc,
-                         &st->networks);
-    st->local_address=string_to_ipaddr(
-       dict_find_item(dict,"local-address", True, "netlink", loc),"netlink");
-    st->secnet_address=string_to_ipaddr(
-       dict_find_item(dict,"secnet-address", True, "netlink", loc),"netlink");
-    if (!subnet_match(&st->networks,st->local_address)) {
-       cfgfatal(loc,"netlink","local-address must be in local networks\n");
+    c->link_quality=quality;
+    c->up=(c->link_quality==LINK_QUALITY_DOWN)?False:True;
+    if (c->options&OPT_SOFTROUTE) {
+       st->set_routes(st->dst,c);
     }
-    st->mtu=dict_read_number(dict, "mtu", False, "netlink", loc, DEFAULT_MTU);
-    buffer_new(&st->icmp,ICMP_BUFSIZE);
-
-    return netlink_from_host;
 }
 
-/* Connection to the kernel through userv-ipif */
-
-struct userv {
-    struct netlink nl;
-    int txfd; /* We transmit to userv */
-    int rxfd; /* We receive from userv */
-    string_t userv_path;
-    string_t service_user;
-    string_t service_name;
-    uint32_t txbuflen;
-    struct buffer_if *buff; /* We unstuff received packets into here
-                              and send them to the site code. */
-    bool_t pending_esc;
-    netlink_deliver_fn *netlink_to_tunnel;
-};
-
-static int userv_beforepoll(void *sst, struct pollfd *fds, int *nfds_io,
-                           int *timeout_io, const struct timeval *tv_now,
-                           uint64_t *now)
+/* Log every subnet in snets at the given loglevel, each followed by a
+   single space.  No newline is emitted; that is left to the caller.
+   st is not used. */
+static void netlink_output_subnets(struct netlink *st, uint32_t loglevel,
+                                  struct subnet_list *snets)
 {
-    struct userv *st=sst;
-    *nfds_io=2;
-    fds[0].fd=st->txfd;
-    fds[0].events=POLLERR; /* Might want to pick up POLLOUT sometime */
-    fds[1].fd=st->rxfd;
-    fds[1].events=POLLIN|POLLERR|POLLHUP;
-    return 0;
+    uint32_t i;
+    string_t net;
+
+    for (i=0; i<snets->entries; i++) {
+       /* subnet_to_string's result is freed here after logging */
+       net=subnet_to_string(snets->list[i]);
+       Message(loglevel,"%s ",net);
+       free(net);
+    }
 }
 
-static void userv_afterpoll(void *sst, struct pollfd *fds, int nfds,
-                           const struct timeval *tv_now, uint64_t *now)
+static void netlink_dump_routes(struct netlink *st, bool_t requested)
 {
-    struct userv *st=sst;
-    uint8_t rxbuf[DEFAULT_BUFSIZE];
-    int l,i;
-
-    if (fds[1].revents&POLLERR) {
-       printf("userv_afterpoll: hup!\n");
-    }
-    if (fds[1].revents&POLLIN) {
-       l=read(st->rxfd,rxbuf,DEFAULT_BUFSIZE);
-       if (l<0) {
-           fatal_perror("userv_afterpoll: read(rxfd)");
-       }
-       if (l==0) {
-           fatal("userv_afterpoll: read(rxfd)=0; userv gone away?\n");
+    int i;
+    string_t net;
+    uint32_t c=M_INFO;
+
+    if (requested) c=M_WARNING;
+    if (st->ptp) {
+       net=ipaddr_to_string(st->secnet_address);
+       Message(c,"%s: point-to-point (remote end is %s); routes:\n",
+               st->name, net);
+       free(net);
+       netlink_output_subnets(st,c,st->clients->subnets);
+       Message(c,"\n");
+    } else {
+       Message(c,"%s: routing table:\n",st->name);
+       for (i=0; i<st->n_clients; i++) {
+           netlink_output_subnets(st,c,st->routes[i]->subnets);
+           Message(c,"-> tunnel %s (%s,mtu %d,%s routes,%s,"
+                   "quality %d,use %d)\n",
+                   st->routes[i]->name,
+                   st->routes[i]->up?"up":"down",
+                   st->routes[i]->mtu,
+                   st->routes[i]->options&OPT_SOFTROUTE?"soft":"hard",
+                   st->routes[i]->options&OPT_ALLOWROUTE?"free":"restricted",
+                   st->routes[i]->link_quality,
+                   st->routes[i]->outcount);
        }
-       /* XXX really crude unstuff code */
-       /* XXX check for buffer overflow */
-       BUF_ASSERT_USED(st->buff);
-       for (i=0; i<l; i++) {
-           if (st->pending_esc) {
-               st->pending_esc=False;
-               switch(rxbuf[i]) {
-               case SLIP_ESCEND:
-                   *(uint8_t *)buf_append(st->buff,1)=SLIP_END;
-                   break;
-               case SLIP_ESCESC:
-                   *(uint8_t *)buf_append(st->buff,1)=SLIP_ESC;
-                   break;
-               default:
-                   fatal("userv_afterpoll: bad SLIP escape character\n");
-               }
-           } else {
-               switch (rxbuf[i]) {
-               case SLIP_END:
-                   if (st->buff->size>0) {
-                       st->netlink_to_tunnel(&st->nl,NULL,
-                                             st->buff);
-                       BUF_ALLOC(st->buff,"userv_afterpoll");
-                   }
-                   buffer_init(st->buff,st->nl.max_start_pad);
-                   break;
-               case SLIP_ESC:
-                   st->pending_esc=True;
-                   break;
-               default:
-                   *(uint8_t *)buf_append(st->buff,1)=rxbuf[i];
-                   break;
-               }
-           }
+       net=ipaddr_to_string(st->secnet_address);
+       Message(c,"%s/32 -> netlink \"%s\" (use %d)\n",
+               net,st->name,st->localcount);
+       free(net);
+       for (i=0; i<st->subnets->entries; i++) {
+           net=subnet_to_string(st->subnets->list[i]);
+           Message(c,"%s ",net);
+           free(net);
        }
+       if (i>0)
+           Message(c,"-> host (use %d)\n",st->outcount);
     }
 }
 
-/* Send buf to the kernel. Free buf before returning. */
-static void userv_deliver_to_kernel(void *sst, void *cid,
-                                   struct buffer_if *buf)
+/* ap is a pointer to a member of the routes array */
+static int netlink_compare_client_priority(const void *ap, const void *bp)
 {
-    struct userv *st=sst;
-    uint8_t txbuf[DEFAULT_BUFSIZE];
-    uint8_t *i;
-    uint32_t j;
+    const struct netlink_client *const*a=ap;
+    const struct netlink_client *const*b=bp;
 
-    BUF_ASSERT_USED(buf);
-
-    /* Spit the packet at userv-ipif: SLIP start marker, then
-       bytestuff the packet, then SLIP end marker */
-    /* XXX crunchy bytestuff code */
-    j=0;
-    txbuf[j++]=SLIP_END;
-    for (i=buf->start; i<(buf->start+buf->size); i++) {
-       switch (*i) {
-       case SLIP_END:
-           txbuf[j++]=SLIP_ESC;
-           txbuf[j++]=SLIP_ESCEND;
-           break;
-       case SLIP_ESC:
-           txbuf[j++]=SLIP_ESC;
-           txbuf[j++]=SLIP_ESCESC;
-           break;
-       default:
-           txbuf[j++]=*i;
-           break;
-       }
-    }
-    txbuf[j++]=SLIP_END;
-    if (write(st->txfd,txbuf,j)<0) {
-       fatal_perror("userv_deliver_to_kernel: write()");
-    }
-    BUF_FREE(buf);
+    if ((*a)->priority==(*b)->priority) return 0;
+    if ((*a)->priority<(*b)->priority) return 1;
+    return -1;
 }
 
-static void userv_phase_hook(void *sst, uint32_t newphase)
+static void netlink_phase_hook(void *sst, uint32_t new_phase)
 {
-    struct userv *st=sst;
-    pid_t child;
-    int c_stdin[2];
-    int c_stdout[2];
-    string_t addrs;
-    string_t nets;
-    string_t s;
+    struct netlink *st=sst;
     struct netlink_client *c;
-    int i;
-
-    /* This is where we actually invoke userv - all the networks we'll
-       be using should already have been registered. */
-
-    addrs=safe_malloc(512,"userv_phase_hook:addrs");
-    snprintf(addrs,512,"%s,%s,%d,slip",ipaddr_to_string(st->nl.local_address),
-            ipaddr_to_string(st->nl.secnet_address),st->nl.mtu);
-
-    nets=safe_malloc(1024,"userv_phase_hook:nets");
-    *nets=0;
-    for (c=st->nl.clients; c; c=c->next) {
-       for (i=0; i<c->networks->entries; i++) {
-           s=subnet_to_string(&c->networks->list[i]);
-           strcat(nets,s);
-           strcat(nets,",");
-           free(s);
-       }
-    }
-    nets[strlen(nets)-1]=0;
-
-    Message(M_INFO,"\nuserv_phase_hook: %s %s %s %s %s\n",st->userv_path,
-          st->service_user,st->service_name,addrs,nets);
-
-    /* Allocate buffer, plus space for padding. Make sure we end up
-       with the start of the packet well-aligned. */
-    /* ALIGN(st->max_start_pad,16); */
-    /* ALIGN(st->max_end_pad,16); */
-
-    st->pending_esc=False;
-
-    /* Invoke userv */
-    if (pipe(c_stdin)!=0) {
-       fatal_perror("userv_phase_hook: pipe(c_stdin)");
-    }
-    if (pipe(c_stdout)!=0) {
-       fatal_perror("userv_phase_hook: pipe(c_stdout)");
-    }
-    st->txfd=c_stdin[1];
-    st->rxfd=c_stdout[0];
-
-    child=fork();
-    if (child==-1) {
-       fatal_perror("userv_phase_hook: fork()");
-    }
-    if (child==0) {
-       char **argv;
-
-       /* We are the child. Modify our stdin and stdout, then exec userv */
-       dup2(c_stdin[0],0);
-       dup2(c_stdout[1],1);
-       close(c_stdin[1]);
-       close(c_stdout[0]);
-
-       /* The arguments are:
-          userv
-          service-user
-          service-name
-          local-addr,secnet-addr,mtu,protocol
-          route1,route2,... */
-       argv=malloc(sizeof(*argv)*6);
-       argv[0]=st->userv_path;
-       argv[1]=st->service_user;
-       argv[2]=st->service_name;
-       argv[3]=addrs;
-       argv[4]=nets;
-       argv[5]=NULL;
-       execvp(st->userv_path,argv);
-       perror("netlink-userv-ipif: execvp");
-
-       exit(1);
-    }
-    /* We are the parent... */
-          
-    /* Register for poll() */
-    register_for_poll(st, userv_beforepoll, userv_afterpoll, 2, st->nl.name);
+    uint32_t i;
+
+    /* All the networks serviced by the various tunnels should now
+     * have been registered.  We build a routing table by sorting the
+     * clients by priority.  */
+    st->routes=safe_malloc(st->n_clients*sizeof(*st->routes),
+                          "netlink_phase_hook");
+    /* Fill the table */
+    i=0;
+    for (c=st->clients; c; c=c->next)
+       st->routes[i++]=c;
+    /* Sort the table in descending order of priority */
+    qsort(st->routes,st->n_clients,sizeof(*st->routes),
+         netlink_compare_client_priority);
+
+    netlink_dump_routes(st,False);
 }
 
-static list_t *userv_apply(closure_t *self, struct cloc loc, dict_t *context,
-                          list_t *args)
+static void netlink_signal_handler(void *sst, int signum)
 {
-    struct userv *st;
-    item_t *item;
-    dict_t *dict;
-
-    st=safe_malloc(sizeof(*st),"userv_apply");
-
-    /* First parameter must be a dict */
-    item=list_elem(args,0);
-    if (!item || item->type!=t_dict)
-       cfgfatal(loc,"userv-ipif","parameter must be a dictionary\n");
-    
-    dict=item->data.dict;
-
-    st->netlink_to_tunnel=
-       netlink_init(&st->nl,st,loc,dict,
-                    "netlink-userv-ipif",userv_deliver_to_kernel);
-
-    st->userv_path=dict_read_string(dict,"userv-path",False,"userv-netlink",
-                                   loc);
-    st->service_user=dict_read_string(dict,"service-user",False,
-                                     "userv-netlink",loc);
-    st->service_name=dict_read_string(dict,"service-name",False,
-                                     "userv-netlink",loc);
-    if (!st->userv_path) st->userv_path="userv";
-    if (!st->service_user) st->service_user="root";
-    if (!st->service_name) st->service_name="ipif";
-    st->buff=find_cl_if(dict,"buffer",CL_BUFFER,True,"userv-netlink",loc);
-    BUF_ALLOC(st->buff,"netlink:userv_apply");
-
-    st->rxfd=-1; st->txfd=-1;
-    add_hook(PHASE_DROPPRIV,userv_phase_hook,st);
-
-    return new_closure(&st->nl.cl);
+    struct netlink *st=sst;
+    Message(M_INFO,"%s: route dump requested by SIGUSR1\n",st->name);
+    netlink_dump_routes(st,True);
 }
 
-/* Connection to the kernel through the universal TUN/TAP driver */
+static void netlink_inst_output_config(void *sst, struct buffer_if *buf)
+{
+/*    struct netlink_client *c=sst; */
+/*    struct netlink *st=c->nst; */
 
-struct tun {
-    struct netlink nl;
-    int fd;
-    string_t device_path;
-    string_t interface_name;
-    string_t ifconfig_path;
-    string_t route_path;
-    struct buffer_if *buff; /* We receive packets into here
-                              and send them to the netlink code. */
-    netlink_deliver_fn *netlink_to_tunnel;
-};
+    /* For now we don't output anything */
+    BUF_ASSERT_USED(buf);
+}
 
-static int tun_beforepoll(void *sst, struct pollfd *fds, int *nfds_io,
-                         int *timeout_io, const struct timeval *tv_now,
-                         uint64_t *now)
+static bool_t netlink_inst_check_config(void *sst, struct buffer_if *buf)
 {
-    struct tun *st=sst;
-    *nfds_io=1;
-    fds[0].fd=st->fd;
-    fds[0].events=POLLIN|POLLERR|POLLHUP;
-    return 0;
+/*    struct netlink_client *c=sst; */
+/*    struct netlink *st=c->nst; */
+
+    BUF_ASSERT_USED(buf);
+    /* We need to eat all of the configuration information from the buffer
+       for backward compatibility. */
+    buf->size=0;
+    return True;
 }
 
-static void tun_afterpoll(void *sst, struct pollfd *fds, int nfds,
-                           const struct timeval *tv_now, uint64_t *now)
+static void netlink_inst_set_mtu(void *sst, uint32_t new_mtu)
 {
-    struct tun *st=sst;
-    int l;
+    struct netlink_client *c=sst;
 
-    if (fds[0].revents&POLLERR) {
-       printf("tun_afterpoll: hup!\n");
-    }
-    if (fds[0].revents&POLLIN) {
-       BUF_ALLOC(st->buff,"tun_afterpoll");
-       buffer_init(st->buff,st->nl.max_start_pad);
-       l=read(st->fd,st->buff->start,st->buff->len-st->nl.max_start_pad);
-       if (l<0) {
-           fatal_perror("tun_afterpoll: read()");
-       }
-       if (l==0) {
-           fatal("tun_afterpoll: read()=0; device gone away?\n");
-       }
-       if (l>0) {
-           st->buff->size=l;
-           st->netlink_to_tunnel(&st->nl,NULL,st->buff);
-           BUF_ASSERT_FREE(st->buff);
-       }
-    }
+    c->mtu=new_mtu;
 }
 
-static void tun_deliver_to_kernel(void *sst, void *cid,
-                                 struct buffer_if *buf)
+static void netlink_inst_reg(void *sst, netlink_deliver_fn *deliver, 
+                            void *dst, uint32_t max_start_pad,
+                            uint32_t max_end_pad)
 {
-    struct tun *st=sst;
-
-    BUF_ASSERT_USED(buf);
+    struct netlink_client *c=sst;
+    struct netlink *st=c->nst;
 
-    /* No error checking, because we'd just throw the packet away anyway */
-    write(st->fd,buf->start,buf->size);
-    BUF_FREE(buf);
+    if (max_start_pad > st->max_start_pad) st->max_start_pad=max_start_pad;
+    if (max_end_pad > st->max_end_pad) st->max_end_pad=max_end_pad;
+    c->deliver=deliver;
+    c->dst=dst;
 }
 
-static void tun_phase_hook(void *sst, uint32_t newphase)
+static struct flagstr netlink_option_table[]={
+    { "soft", OPT_SOFTROUTE },
+    { "allow-route", OPT_ALLOWROUTE },
+    { NULL, 0}
+};
+/* This is the routine that gets called when the closure that's
+   returned by an invocation of a netlink device closure (eg. tun,
+   userv-ipif) is invoked.  It's used to create routes and pass in
+   information about them; the closure it returns is used by site
+   code.  */
+static closure_t *netlink_inst_create(struct netlink *st,
+                                     struct cloc loc, dict_t *dict)
 {
-    struct tun *st=sst;
-    string_t hostaddr,secnetaddr;
-    uint8_t mtu[6];
-    string_t network,mask;
     struct netlink_client *c;
-    int i;
+    string_t name;
+    struct ipset *networks;
+    uint32_t options,priority,mtu;
+    list_t *l;
+
+    name=dict_read_string(dict, "name", True, st->name, loc);
+
+    l=dict_lookup(dict,"routes");
+    if (!l)
+       cfgfatal(loc,st->name,"required parameter \"routes\" not found\n");
+    networks=string_list_to_ipset(l,loc,st->name,"routes");
+    options=string_list_to_word(dict_lookup(dict,"options"),
+                               netlink_option_table,st->name);
+
+    priority=dict_read_number(dict,"priority",False,st->name,loc,0);
+    mtu=dict_read_number(dict,"mtu",False,st->name,loc,0);
+
+    if ((options&OPT_SOFTROUTE) && !st->set_routes) {
+       cfgfatal(loc,st->name,"this netlink device does not support "
+                "soft routes.\n");
+       return NULL;
+    }
 
-    /* All the networks we'll be using have been registered. Invoke ifconfig
-       to set the TUN device's address, and route to add routes to all
-       our networks. */
-
-    hostaddr=ipaddr_to_string(st->nl.local_address);
-    secnetaddr=ipaddr_to_string(st->nl.secnet_address);
-    snprintf(mtu,6,"%d",st->nl.mtu);
-    mtu[5]=0;
-
-    sys_cmd(st->ifconfig_path,"ifconfig",st->interface_name,
-           hostaddr,"netmask","255.255.255.255","-broadcast",
-           "pointopoint",secnetaddr,"mtu",mtu,"up",(char *)0);
-
-    for (c=st->nl.clients; c; c=c->next) {
-       for (i=0; i<c->networks->entries; i++) {
-           network=ipaddr_to_string(c->networks->list[i].prefix);
-           mask=ipaddr_to_string(c->networks->list[i].mask);
-           sys_cmd(st->route_path,"route","add","-net",network,
-                   "netmask",mask,"gw",secnetaddr,(char *)0);
+    if (options&OPT_SOFTROUTE) {
+       /* XXX for now we assume that soft routes require root privilege;
+          this may not always be true. The device driver can tell us. */
+       require_root_privileges=True;
+       require_root_privileges_explanation="netlink: soft routes";
+       if (st->ptp) {
+           cfgfatal(loc,st->name,"point-to-point netlinks do not support "
+                    "soft routes.\n");
+           return NULL;
        }
     }
 
-    /* Register for poll() */
-    register_for_poll(st, tun_beforepoll, tun_afterpoll, 1, st->nl.name);
+    /* Check that nets are a subset of st->remote_networks;
+       refuse to register if they are not. */
+    if (!ipset_is_subset(st->remote_networks,networks)) {
+       cfgfatal(loc,st->name,"routes are not allowed\n");
+       return NULL;
+    }
+
+    c=safe_malloc(sizeof(*c),"netlink_inst_create");
+    c->cl.description=name;
+    c->cl.type=CL_NETLINK;
+    c->cl.apply=NULL;
+    c->cl.interface=&c->ops;
+    c->ops.st=c;
+    c->ops.reg=netlink_inst_reg;
+    c->ops.deliver=netlink_inst_incoming;
+    c->ops.set_quality=netlink_set_quality;
+    c->ops.output_config=netlink_inst_output_config;
+    c->ops.check_config=netlink_inst_check_config;
+    c->ops.set_mtu=netlink_inst_set_mtu;
+    c->nst=st;
+
+    c->networks=networks;
+    c->subnets=ipset_to_subnet_list(networks);
+    c->priority=priority;
+    c->deliver=NULL;
+    c->dst=NULL;
+    c->name=name;
+    c->link_quality=LINK_QUALITY_DOWN;
+    c->mtu=mtu?mtu:st->mtu;
+    c->options=options;
+    c->outcount=0;
+    c->up=False;
+    c->kup=False;
+    c->next=st->clients;
+    st->clients=c;
+    st->n_clients++;
+
+    return &c->cl;
 }
 
-#ifdef HAVE_LINUX_IF_H
-static list_t *tun_apply(closure_t *self, struct cloc loc, dict_t *context,
-                        list_t *args)
+static list_t *netlink_inst_apply(closure_t *self, struct cloc loc,
+                                 dict_t *context, list_t *args)
 {
-    struct tun *st;
-    item_t *item;
-    dict_t *dict;
-    struct ifreq ifr;
+    struct netlink *st=self->interface;
 
-    st=safe_malloc(sizeof(*st),"tun_apply");
+    dict_t *dict;
+    item_t *item;
+    closure_t *cl;
 
-    /* First parameter must be a dict */
     item=list_elem(args,0);
-    if (!item || item->type!=t_dict)
-       cfgfatal(loc,"tun","parameter must be a dictionary\n");
-    
-    dict=item->data.dict;
-
-    st->netlink_to_tunnel=
-       netlink_init(&st->nl,st,loc,dict,
-                    "netlink-tun",tun_deliver_to_kernel);
-
-    st->device_path=dict_read_string(dict,"device",False,"tun-netlink",loc);
-    st->interface_name=dict_read_string(dict,"interface",False,
-                                       "tun-netlink",loc);
-    st->ifconfig_path=dict_read_string(dict,"ifconfig-path",
-                                      False,"tun-netlink",loc);
-    st->route_path=dict_read_string(dict,"route-path",
-                                   False,"tun-netlink",loc);
-
-    if (!st->device_path) st->device_path="/dev/net/tun";
-    if (!st->ifconfig_path) st->ifconfig_path="ifconfig";
-    if (!st->route_path) st->route_path="route";
-    st->buff=find_cl_if(dict,"buffer",CL_BUFFER,True,"tun-netlink",loc);
-
-    /* New TUN interface: open the device, then do ioctl TUNSETIFF
-       to set or find out the network interface name. */
-    st->fd=open(st->device_path,O_RDWR);
-    if (st->fd==-1) {
-       fatal_perror("%s: can't open device file %s",st->nl.name,
-                    st->device_path);
-    }
-    memset(&ifr,0,sizeof(ifr));
-    ifr.ifr_flags = IFF_TUN | IFF_NO_PI; /* Just send/receive IP packets,
-                                           no extra headers */
-    if (st->interface_name)
-       strncpy(ifr.ifr_name,st->interface_name,IFNAMSIZ);
-    if (ioctl(st->fd,TUNSETIFF,&ifr)<0) {
-       fatal_perror("%s: ioctl(TUNSETIFF)",st->nl.name);
-    }
-    if (!st->interface_name) {
-       st->interface_name=safe_malloc(strlen(ifr.ifr_name)+1,"tun_apply");
-       strcpy(st->interface_name,ifr.ifr_name);
-       Message(M_INFO,"%s: allocated network interface %s\n",st->nl.name,
-               st->interface_name);
+    if (!item || item->type!=t_dict) {
+       cfgfatal(loc,st->name,"must have a dictionary argument\n");
     }
+    dict=item->data.dict;
 
-    add_hook(PHASE_DROPPRIV,tun_phase_hook,st);
+    cl=netlink_inst_create(st,loc,dict);
 
-    return new_closure(&st->nl.cl);
+    return new_closure(cl);
 }
-#endif /* HAVE_LINUX_IF_H */
 
-static list_t *tun_old_apply(closure_t *self, struct cloc loc, dict_t *context,
-                            list_t *args)
+netlink_deliver_fn *netlink_init(struct netlink *st,
+                                void *dst, struct cloc loc,
+                                dict_t *dict, string_t description,
+                                netlink_route_fn *set_routes,
+                                netlink_deliver_fn *to_host)
 {
-    struct tun *st;
-    item_t *item;
-    dict_t *dict;
-    bool_t search_for_if;
-
-    st=safe_malloc(sizeof(*st),"tun_old_apply");
+    item_t *sa, *ptpa;
+    list_t *l;
 
-    Message(M_WARNING,"the tun-old code has never been tested. Please report "
-           "success or failure to steve@greenend.org.uk\n");
-
-    /* First parameter must be a dict */
-    item=list_elem(args,0);
-    if (!item || item->type!=t_dict)
-       cfgfatal(loc,"tun","parameter must be a dictionary\n");
-    
-    dict=item->data.dict;
+    st->dst=dst;
+    st->cl.description=description;
+    st->cl.type=CL_PURE;
+    st->cl.apply=netlink_inst_apply;
+    st->cl.interface=st;
+    st->max_start_pad=0;
+    st->max_end_pad=0;
+    st->clients=NULL;
+    st->routes=NULL;
+    st->n_clients=0;
+    st->set_routes=set_routes;
+    st->deliver_to_host=to_host;
 
-    st->netlink_to_tunnel=
-       netlink_init(&st->nl,st,loc,dict,
-                    "netlink-tun",tun_deliver_to_kernel);
-
-    st->device_path=dict_read_string(dict,"device",False,"tun-netlink",loc);
-    st->interface_name=dict_read_string(dict,"interface",False,
-                                       "tun-netlink",loc);
-    search_for_if=dict_read_bool(dict,"interface-search",False,"tun-netlink",
-                                loc,st->device_path==NULL);
-    st->ifconfig_path=dict_read_string(dict,"ifconfig-path",False,
-                                      "tun-netlink",loc);
-    st->route_path=dict_read_string(dict,"route-path",False,"tun-netlink",loc);
-
-    if (!st->device_path) st->device_path="/dev/tun";
-    if (!st->ifconfig_path) st->ifconfig_path="ifconfig";
-    if (!st->route_path) st->route_path="route";
-    st->buff=find_cl_if(dict,"buffer",CL_BUFFER,True,"tun-netlink",loc);
-
-    /* Old TUN interface: the network interface name depends on which
-       /dev/tunX file we open. If 'interface-search' is set to true, treat
-       'device' as the prefix and try numbers from 0--255. If it's set
-       to false, treat 'device' as the whole name, and require than an
-       appropriate interface name be specified. */
-    if (search_for_if) {
-       string_t dname;
-       int i;
-
-       if (st->interface_name) {
-           cfgfatal(loc,"tun-old","you may not specify an interface name "
-                    "in interface-search mode\n");
-       }
-       dname=safe_malloc(strlen(st->device_path)+4,"tun_old_apply");
-       st->interface_name=safe_malloc(8,"tun_old_apply");
-       
-       for (i=0; i<255; i++) {
-           sprintf(dname,"%s%d",st->device_path,i);
-           if ((st->fd=open(dname,O_RDWR))>0) {
-               sprintf(st->interface_name,"tun%d",i);
-               Message(M_INFO,"%s: allocated network interface %s "
-                       "through %s\n",st->nl.name,st->interface_name,dname);
-               break;
-           }
-       }
-       if (st->fd==-1) {
-           fatal("%s: unable to open any TUN device (%s...)\n",
-                 st->nl.name,st->device_path);
-       }
+    st->name=dict_read_string(dict,"name",False,description,loc);
+    if (!st->name) st->name=description;
+    l=dict_lookup(dict,"networks");
+    if (l) 
+       st->networks=string_list_to_ipset(l,loc,st->name,"networks");
+    else {
+       struct ipset *empty;
+       empty=ipset_new();
+       st->networks=ipset_complement(empty);
+       ipset_free(empty);
+    }
+    l=dict_lookup(dict,"remote-networks");
+    if (l) {
+       st->remote_networks=string_list_to_ipset(l,loc,st->name,
+                                                "remote-networks");
     } else {
-       if (!st->interface_name) {
-           cfgfatal(loc,"tun-old","you must specify an interface name "
-                    "when you explicitly specify a TUN device file\n");
-       }
-       st->fd=open(st->device_path,O_RDWR);
-       if (st->fd==-1) {
-           fatal_perror("%s: unable to open TUN device file %s",
-                        st->nl.name,st->device_path);
-       }
+       struct ipset *empty;
+       empty=ipset_new();
+       st->remote_networks=ipset_complement(empty);
+       ipset_free(empty);
     }
 
-    add_hook(PHASE_DROPPRIV,tun_phase_hook,st);
-
-    return new_closure(&st->nl.cl);
+    sa=dict_find_item(dict,"secnet-address",False,"netlink",loc);
+    ptpa=dict_find_item(dict,"ptp-address",False,"netlink",loc);
+    if (sa && ptpa) {
+       cfgfatal(loc,st->name,"you may not specify secnet-address and "
+                "ptp-address in the same netlink device\n");
+    }
+    if (!(sa || ptpa)) {
+       cfgfatal(loc,st->name,"you must specify secnet-address or "
+                "ptp-address for this netlink device\n");
+    }
+    if (sa) {
+       st->secnet_address=string_item_to_ipaddr(sa,"netlink");
+       st->ptp=False;
+    } else {
+       st->secnet_address=string_item_to_ipaddr(ptpa,"netlink");
+       st->ptp=True;
+    }
+    /* To be strictly correct we could subtract secnet_address from
+       networks here.  It shouldn't make any practical difference,
+       though, and will make the route dump look complicated... */
+    st->subnets=ipset_to_subnet_list(st->networks);
+    st->mtu=dict_read_number(dict, "mtu", False, "netlink", loc, DEFAULT_MTU);
+    buffer_new(&st->icmp,ICMP_BUFSIZE);
+    st->outcount=0;
+    st->localcount=0;
+
+    add_hook(PHASE_SETUP,netlink_phase_hook,st);
+    request_signal_notification(SIGUSR1, netlink_signal_handler, st);
+
+    /* If we're point-to-point then we return a CL_NETLINK directly,
+       rather than a CL_NETLINK_OLD or pure closure (depending on
+       compatibility).  This CL_NETLINK is for our one and only
+       client.  Our cl.apply function is NULL. */
+    if (st->ptp) {
+       closure_t *cl;
+       cl=netlink_inst_create(st,loc,dict);
+       st->cl=*cl;
+    }
+    return netlink_dev_incoming;
 }
 
 /* No connection to the kernel at all... */
@@ -1085,7 +1024,21 @@ struct null {
     struct netlink nl;
 };
 
-static void null_deliver(void *sst, void *cid, struct buffer_if *buf)
+static bool_t null_set_route(void *sst, struct netlink_client *routes)
+{
+    struct null *st=sst;
+
+    if (routes->up!=routes->kup) {
+       Message(M_INFO,"%s: setting routes for tunnel %s to state %s\n",
+               st->nl.name,routes->name,
+               routes->up?"up":"down");
+       routes->kup=routes->up;
+       return True;
+    }
+    return False;
+}
+           
+static void null_deliver(void *sst, struct buffer_if *buf)
 {
     return;
 }
@@ -1105,7 +1058,8 @@ static list_t *null_apply(closure_t *self, struct cloc loc, dict_t *context,
     
     dict=item->data.dict;
 
-    netlink_init(&st->nl,st,loc,dict,"null-netlink",null_deliver);
+    netlink_init(&st->nl,st,loc,dict,"null-netlink",null_set_route,
+                null_deliver);
 
     return new_closure(&st->nl.cl);
 }
@@ -1113,15 +1067,5 @@ static list_t *null_apply(closure_t *self, struct cloc loc, dict_t *context,
 init_module netlink_module;
 void netlink_module(dict_t *dict)
 {
-    add_closure(dict,"userv-ipif",userv_apply);
-#ifdef HAVE_LINUX_IF_H
-    add_closure(dict,"tun",tun_apply);
-#endif
-    add_closure(dict,"tun-old",tun_old_apply);
     add_closure(dict,"null-netlink",null_apply);
-#if 0
-    /* TODO */
-    add_closure(dict,"pty-slip",ptyslip_apply);
-    add_closure(dict,"slipd",slipd_apply);
-#endif /* 0 */
 }