chiark / gitweb /
Import release 0.1.5
[secnet.git] / netlink.c
index 7db3559515f411eca917588d7fb69298765fa48a..022ddf1c00ae7a7c440c6245aca9494cf096fa46 100644 (file)
--- a/netlink.c
+++ b/netlink.c
 /* User-kernel network link */
 
-/* We support a variety of methods: userv-ipif, ipif on its own (when
-   we run as root), SLIP to a pty, an external netlink daemon. There
-   is a performance/security tradeoff. */
-
-/* When dealing with SLIP (to a pty, or ipif) we have separate rx, tx
-   and client buffers. When receiving we may read() any amount, not
-   just whole packets. When transmitting we need to bytestuff anyway,
-   and may be part-way through receiving. */
-
-/* Each netlink device is actually a router, with its own IP
-   address. We should eventually do things like decreasing the TTL and
-   recalculating the header checksum, generating ICMP, responding to
-   pings, etc. but for now we can get away without them. We should
-   implement this stuff no matter how we get the packets to/from the
-   kernel. */
+/* Each netlink device is actually a router, with its own IP address.
+   We do things like decreasing the TTL and recalculating the header
+   checksum, generating ICMP, responding to pings, etc. */
 
 /* This is where we have the anti-spoofing paranoia - before sending a
    packet to the kernel we check that the tunnel it came over could
    reasonably have produced it. */
 
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-
 #include "secnet.h"
 #include "util.h"
+#include "netlink.h"
 
-#define DEFAULT_BUFSIZE 2048
+/* Generic IP checksum routine (RFC 1071 one's-complement sum).
+ *
+ * iph:   start of the data to checksum; no particular alignment is needed.
+ * count: number of bytes to checksum.
+ *
+ * Returns the complemented 16-bit sum in network byte order, ready to be
+ * stored directly in a checksum field.  Running it over data that already
+ * contains a correct checksum yields 0. */
+static inline uint16_t ip_csum(uint8_t *iph,uint32_t count)
+{
+    register uint32_t sum=0;
 
-#define SLIP_END    192
-#define SLIP_ESC    219
-#define SLIP_ESCEND 220
-#define SLIP_ESCESC 221
+    /* Assemble each 16-bit word from its two bytes in network order; this
+       avoids a misaligned uint16_t load when iph is odd. */
+    while (count>1) {
+       sum+=((uint32_t)iph[0]<<8)|iph[1];
+       iph+=2;
+       count-=2;
+    }
+    /* A trailing odd byte is padded with a zero byte on its right, so it
+       is the high-order byte of the final word. */
+    if(count>0)
+       sum+=(uint32_t)iph[0]<<8;
+    /* Fold the carries back into the low 16 bits */
+    while (sum>>16)
+       sum=(sum&0xffff)+(sum>>16);
+    return htons(~sum);
+}
 
-struct netlink_client {
-    struct subnet_list *networks;
-    netlink_deliver_fn *deliver;
-    void *dst;
-    struct netlink_client *next;
+#ifdef i386
+/*
+ *      This is a version of ip_compute_csum() optimized for IP headers,
+ *      which always checksum on 4 octet boundaries.
+ *
+ *      By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
+ *      Arnt Gulbrandsen.
+ *
+ *      iph: start of the IP header.
+ *      ihl: header length in 32-bit words.
+ *
+ *      The assembler adds the header up 32 bits at a time with carry,
+ *      folds the result into 16 bits and complements it.  The 32-bit
+ *      result is truncated to uint16_t on return.
+ *
+ *      Note that when ihl<=4 the "jbe 2f" jumps straight to the end, so
+ *      only the first word has been loaded and nothing is folded or
+ *      complemented.
+ */
+static inline uint16_t ip_fast_csum(uint8_t *iph, uint32_t ihl) {
+    uint32_t sum;
+
+    __asm__ __volatile__("
+            movl (%1), %0
+            subl $4, %2
+            jbe 2f
+            addl 4(%1), %0
+            adcl 8(%1), %0
+            adcl 12(%1), %0
+1:          adcl 16(%1), %0
+            lea 4(%1), %1
+            decl %2
+            jne 1b
+            adcl $0, %0
+            movl %0, %2
+            shrl $16, %0
+            addw %w2, %w0
+            adcl $0, %0
+            notl %0
+2:
+            "
+        /* Since the input registers which are loaded with iph and ihl
+           are modified, we must also specify them as outputs, or gcc
+           will assume they contain their original values. */
+        : "=r" (sum), "=r" (iph), "=r" (ihl)
+        : "1" (iph), "2" (ihl));
+    return sum;
+}
+#else
+/* Portable fallback: checksum an IP header that is ihl 32-bit words long
+   by passing its length in bytes to ip_csum(). */
+static inline uint16_t ip_fast_csum(uint8_t *iph, uint32_t ihl)
+{
+    uint32_t nbytes=ihl*4;
+
+    return ip_csum(iph,nbytes);
+}
+#endif
+
+/* Fixed part of an IPv4 header, overlaid directly on packet buffers.
+   The code in this file converts the multi-byte fields with
+   htons/ntohs/htonl/ntohl, i.e. they are held in network byte order.
+   The order of the two 4-bit fields depends on WORDS_BIGENDIAN. */
+struct iphdr {
+#if defined (WORDS_BIGENDIAN)
+    uint8_t    version:4,
+              ihl:4;
+#else
+    uint8_t    ihl:4,
+              version:4;
+#endif
+    /* ihl is the header length in 32-bit words (the code uses ihl*4 as
+       a byte count); version must be 4 for netlink_check() to accept
+       the packet */
+    uint8_t    tos;
+    uint16_t   tot_len;   /* length of the whole packet in bytes */
+    uint16_t   id;
+    uint16_t   frag_off;  /* low 13 bits (0x1fff): fragment offset */
+    uint8_t    ttl;
+    uint8_t    protocol;  /* 1 is ICMP */
+    uint16_t   check;     /* header checksum */
+    uint32_t   saddr;     /* source address */
+    uint32_t   daddr;     /* destination address */
+    /* The options start here. */
 };
 
-struct userv {
-    closure_t cl;
-    struct netlink_if ops;
-    uint32_t max_start_pad;
-    uint32_t max_end_pad;
-    int txfd; /* We transmit to userv */
-    int rxfd; /* We receive from userv */
-    struct netlink_client *clients;
-    string_t name;
-    string_t userv_path;
-    string_t service_user;
-    string_t service_name;
-    struct subnet_list networks;
-    uint32_t local_address;
-    uint32_t secnet_address;
-    uint32_t mtu;
-    uint32_t txbuflen;
-    struct buffer_if *buff; /* We unstuff received packets into here
-                              and send them to the site code. */
-    bool_t pending_esc;
+/* An ICMP message together with the IP header that carries it.
+   struct iphdr has no room for IP options, so type/code/check only line
+   up with the packet contents when the IP header is exactly 20 bytes
+   (ihl==5).
+   Of the union members, only d.unused is written by the code visible in
+   this file; the others presumably mirror the per-type ICMP fields
+   (parameter-problem pointer, redirect gateway address, echo id and
+   sequence number) -- TODO confirm against their users. */
+struct icmphdr {
+    struct iphdr iph;
+    uint8_t type;
+    uint8_t code;
+    uint16_t check;       /* filled in by netlink_icmp_csum() */
+    union {
+       uint32_t unused;
+       struct {
+           uint8_t pointer;
+           uint8_t unused1;
+           uint16_t unused2;
+       } pprob;
+       uint32_t gwaddr;
+       struct {
+           uint16_t id;
+           uint16_t seq;
+       } echo;
+    } d;
 };
+    
+static void netlink_packet_deliver(struct netlink *st,
+                                  struct netlink_client *client,
+                                  struct buffer_if *buf);
 
-static int userv_beforepoll(void *sst, struct pollfd *fds, int *nfds_io,
-                           int *timeout_io, const struct timeval *tv_now,
-                           uint64_t *now)
+/* Start a new ICMP packet in st->icmp and return a pointer to its header.
+ *
+ * dest: destination address, host byte order.
+ * len:  number of bytes the caller will append after the ICMP header.
+ *
+ * The IP header is filled in completely (version 4, no options, ttl 255,
+ * protocol 1, source st->secnet_address) and its checksum is computed.
+ * The ICMP checksum and d.unused are zeroed; type and code are not set
+ * here and must be written by the caller. */
+static struct icmphdr *netlink_icmp_tmpl(struct netlink *st,
+                                        uint32_t dest,uint16_t len)
 {
-    struct userv *st=sst;
-    *nfds_io=2;
-    fds[0].fd=st->txfd;
-    fds[0].events=POLLERR; /* Might want to pick up POLLOUT sometime */
-    fds[1].fd=st->rxfd;
-    fds[1].events=POLLIN|POLLERR|POLLHUP;
-    return 0;
+    struct icmphdr *h;
+
+    BUF_ALLOC(&st->icmp,"netlink_icmp_tmpl");
+    /* presumably reserves st->max_start_pad bytes of headroom in front
+       of the packet -- confirm against buffer_init() */
+    buffer_init(&st->icmp,st->max_start_pad);
+    h=buf_append(&st->icmp,sizeof(*h));
+
+    h->iph.version=4;
+    h->iph.ihl=5;
+    h->iph.tos=0;
+    /* total length: payload + IP header + 8 bytes of ICMP header */
+    h->iph.tot_len=htons(len+(h->iph.ihl*4)+8);
+    h->iph.id=0;
+    h->iph.frag_off=0;
+    h->iph.ttl=255;
+    h->iph.protocol=1;
+    h->iph.saddr=htonl(st->secnet_address);
+    h->iph.daddr=htonl(dest);
+    /* The checksum field must be zero while the checksum is computed */
+    h->iph.check=0;
+    h->iph.check=ip_fast_csum((uint8_t *)&h->iph,h->iph.ihl);
+    h->check=0;
+    h->d.unused=0;
+
+    return h;
 }
 
-static void process_local_packet(struct userv *st)
+/* Fill in the ICMP checksum field correctly.
+   The checksum covers everything after the IP header: the length is
+   taken as iph.tot_len minus 4*iph.ihl, and the sum starts at h->type.
+   The complete ICMP payload must already be in place behind h. */
+static void netlink_icmp_csum(struct icmphdr *h)
 {
-    uint32_t source,dest;
-    struct netlink_client *c;
+    uint32_t len;
 
-    source=ntohl(*(uint32_t *)(st->buff->start+12));
-    dest=ntohl(*(uint32_t *)(st->buff->start+16));
-
-/*    printf("process_local_packet source=%s dest=%s len=%d\n",
-      ipaddr_to_string(source),ipaddr_to_string(dest),
-      st->buff->size); */
-    if (!subnet_match(&st->networks,source)) {
-       string_t s,d;
-       s=ipaddr_to_string(source);
-       d=ipaddr_to_string(dest);
-       Message(M_WARNING,"%s: outgoing packet with bad source address "
-               "(s=%s,d=%s)\n",st->name,s,d);
-       free(s); free(d);
-       return;
-    }
-    for (c=st->clients; c; c=c->next) {
-       if (subnet_match(c->networks,dest)) {
-           c->deliver(c->dst,c,st->buff);
-           BUF_ALLOC(st->buff,"netlink:process_local_packet");
-           return;
-       }
-    }
-    if (dest==st->secnet_address) {
-       printf("%s: secnet received packet of len %d from %s\n",st->name,
-              st->buff->size,ipaddr_to_string(source));
-       return;
-    }
-    {
-       string_t s,d;
-       s=ipaddr_to_string(source);
-       d=ipaddr_to_string(dest);
-       Message(M_WARNING,"%s: outgoing packet with bad destination address "
-                         "(s=%s,d=%s)\n",st->name,s,d);
-       free(s); free(d);
-       return;
+    len=ntohs(h->iph.tot_len)-(4*h->iph.ihl);
+    /* The checksum field must be zero while the checksum is computed */
+    h->check=0;
+    h->check=ip_csum(&h->type,len);
+}
+
+/* RFC1122:
+ *       An ICMP error message MUST NOT be sent as the result of
+ *       receiving:
+ *
+ *       *    an ICMP error message, or
+ *
+ *       *    a datagram destined to an IP broadcast or IP multicast
+ *            address, or
+ *
+ *       *    a datagram sent as a link-layer broadcast, or
+ *
+ *       *    a non-initial fragment, or
+ *
+ *       *    a datagram whose source address does not define a single
+ *            host -- e.g., a zero address, a loopback address, a
+ *            broadcast address, a multicast address, or a Class E
+ *            address.
+ */
+/* Decide whether an ICMP error may be generated in response to the
+   packet in buf.  Returns False for every ICMP packet, for non-initial
+   fragments, and for zero, 127.x.x.x, multicast and Class E source
+   addresses; True otherwise.  Broadcast addresses are not detected. */
+static bool_t netlink_icmp_may_reply(struct buffer_if *buf)
+{
+    struct iphdr *hdr=(struct iphdr *)buf->start;
+    uint32_t src=ntohl(hdr->saddr);
+    uint32_t top_nibble=src&0xf0000000;
+
+    /* Overly-broad: this also refuses ICMP packets that are not errors
+       themselves, eg. icmp echo-request */
+    if (hdr->protocol==1) return False;
+    /* Non-initial fragment: the fragment offset is non-zero */
+    if ((ntohs(hdr->frag_off)&0x1fff)!=0) return False;
+    /* How do we spot broadcast source or destination addresses? */
+    if (src==0) return False;
+    if ((src&0xff000000)==0x7f000000) return False; /* Loopback */
+    if (top_nibble==0xe0000000) return False; /* Multicast */
+    if (top_nibble==0xf0000000) return False; /* Class E */
+    return True;
+}
+
+/* Number of bytes of the original packet to quote in its ICMP response:
+   the IP header plus up to 64 bits (8 bytes) of data, limited to the
+   packet's own total length. */
+static uint16_t netlink_icmp_reply_len(struct buffer_if *buf)
+{
+    struct iphdr *hdr=(struct iphdr *)buf->start;
+    uint16_t want=hdr->ihl*4;
+    uint16_t have=ntohs(hdr->tot_len);
+
+    /* Header plus the first 8 bytes of data, provided they exist */
+    want+=8;
+    if (want>have) return have;
+    return want;
+}
+
+/* Send an ICMP message of the given type and code about the packet in
+   buf, back to that packet's source address, provided
+   netlink_icmp_may_reply() permits it.
+   client indicates where the packet we're constructing a response to
+   comes from. NULL indicates the host.  (client is not used at present:
+   the response is always handed to netlink_packet_deliver() with a NULL
+   origin.)
+   buf is only read; it is not freed here. */
+static void netlink_icmp_simple(struct netlink *st, struct buffer_if *buf,
+                               struct netlink_client *client,
+                               uint8_t type, uint8_t code)
+{
+    struct iphdr *iph=(struct iphdr *)buf->start;
+    struct icmphdr *h;
+    uint16_t len;
+
+    if (netlink_icmp_may_reply(buf)) {
+       len=netlink_icmp_reply_len(buf);
+       h=netlink_icmp_tmpl(st,ntohl(iph->saddr),len);
+       h->type=type; h->code=code;
+       /* Quote the start of the offending packet after the ICMP header */
+       memcpy(buf_append(&st->icmp,len),buf->start,len);
+       netlink_icmp_csum(h);
+       netlink_packet_deliver(st,NULL,&st->icmp);
+       BUF_ASSERT_FREE(&st->icmp);
     }
 }
 
-static void userv_afterpoll(void *sst, struct pollfd *fds, int nfds,
-                           const struct timeval *tv_now, uint64_t *now)
+/*
+ * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the
+ * checksum.
+ *
+ * Is the datagram acceptable?
+ *
+ * 1. Length at least the size of an ip header
+ * 2. Version of 4
+ * 3. Checksums correctly.
+ * 4. Doesn't have a bogus length
+ *
+ * Returns True if the packet in buf passes all of these, False otherwise.
+ * Nothing is logged or freed here, and st is not used.
+ */
+static bool_t netlink_check(struct netlink *st, struct buffer_if *buf)
 {
-    struct userv *st=sst;
-    uint8_t rxbuf[DEFAULT_BUFSIZE];
-    int l,i;
+    struct iphdr *iph=(struct iphdr *)buf->start;
+    uint32_t len;
+
+    /* ihl counts 32-bit words, so 5 is the smallest possible header */
+    if (iph->ihl < 5 || iph->version != 4) return False;
+    /* The buffer must hold the whole header, options included */
+    if (buf->size < iph->ihl*4) return False;
+    /* Summing a header that includes a correct checksum gives zero */
+    if (ip_fast_csum((uint8_t *)iph, iph->ihl)!=0) return False;
+    len=ntohs(iph->tot_len);
+    /* There should be no padding */
+    if (buf->size!=len || len<(iph->ihl<<2)) return False;
+    /* XXX check that there's no source route specified */
+    return True;
+}
+
+/* Deliver a packet. "client" points to the _origin_ of the packet, not
+   its destination. (May be used when sending ICMP response - avoid
+   asymmetric routing.)
+
+   client==NULL (packet from the host or from secnet itself): the routes
+   that are up and whose network matches the destination are searched
+   for the one with the best link quality, and the packet is handed to
+   that client's deliver function.  If the best quality found is 0 an
+   ICMP destination-unreachable is generated instead.  If no route
+   matches, packets whose source is st->secnet_address go to the host;
+   anything else is logged and dropped.
+
+   client!=NULL (packet from a tunnel): the packet goes to the host if
+   its destination is in st->networks; otherwise it is logged and an
+   ICMP destination-unreachable is generated.
+
+   buf must be in use on entry and is always free on return. */
+static void netlink_packet_deliver(struct netlink *st,
+                                  struct netlink_client *client,
+                                  struct buffer_if *buf)
 {
+    struct iphdr *iph=(struct iphdr *)buf->start;
+    uint32_t dest=ntohl(iph->daddr);
+    uint32_t source=ntohl(iph->saddr);
+    uint32_t best_quality;
+    int best_match;
+    int i;
 
-    if (fds[1].revents&POLLERR) {
-       printf("userv_afterpoll: hup!\n");
+    BUF_ASSERT_USED(buf);
+
+    if (dest==st->secnet_address) {
+       /* Packets for ourselves should already have been handled by
+          netlink_packet_local() */
+       Message(M_ERROR,"%s: trying to deliver a packet to myself!\n",
+               st->name);
+       BUF_FREE(buf);
+       return;
     }
-    if (fds[1].revents&POLLIN) {
-       l=read(st->rxfd,rxbuf,DEFAULT_BUFSIZE);
-       if (l<0) {
-           fatal_perror("userv_afterpoll: read(rxfd)");
-       }
-       if (l==0) {
-           fatal("userv_afterpoll: read(rxfd)=0; userv gone away?\n");
-       }
-       /* XXX really crude unstuff code */
-       /* XXX check for buffer overflow */
-       for (i=0; i<l; i++) {
-           if (st->pending_esc) {
-               st->pending_esc=False;
-               switch(rxbuf[i]) {
-               case SLIP_ESCEND:
-                   *(uint8_t *)buf_append(st->buff,1)=SLIP_END;
-                   break;
-               case SLIP_ESCESC:
-                   *(uint8_t *)buf_append(st->buff,1)=SLIP_ESC;
-                   break;
-               default:
-                   fatal("userv_afterpoll: bad SLIP escape character\n");
+    
+    /* XXX we're going to need an extra value 'allow_route' for the
+       source of the packet. It's always True for packets from the
+       host. For packets from tunnels, we consult the client
+       options. If !allow_route and the destination is a tunnel that
+       also doesn't allow routing, we must reject the packet with an
+       'administratively prohibited' or something similar ICMP. */
+    if (!client) {
+       /* Origin of packet is host or secnet. Might be for a tunnel. */
+       best_quality=0;
+       best_match=-1;
+       for (i=0; i<st->n_routes; i++) {
+           if (st->routes[i].up && subnet_match(&st->routes[i].net,dest)) {
+               if (st->routes[i].c->link_quality>best_quality
+                   || best_quality==0) {
+                   best_quality=st->routes[i].c->link_quality;
+                   best_match=i;
+                   /* If quality isn't perfect we may wish to
+                      consider kicking the tunnel with a 0-length
+                      packet to prompt it to perform a key setup.
+                      Then it'll eventually decide it's up or
+                      down. */
+                   /* If quality is perfect we don't need to search
+                       any more. */
+                   if (best_quality>=MAXIMUM_LINK_QUALITY) break;
                 }
+           }
+       }
+       if (best_match==-1) {
+           /* Not going down a tunnel. Might be for the host. 
+              XXX think about this - only situation should be if we're
+              sending ICMP. */
+           if (source!=st->secnet_address) {
+               Message(M_ERROR,"netlink_packet_deliver: outgoing packet "
+                       "from host that won't fit down any of our tunnels!\n");
+               /* XXX I think this could also occur if a soft tunnel just
+                  went down, but still had packets queued in the kernel. */
+               BUF_FREE(buf);
            } else {
-               switch (rxbuf[i]) {
-               case SLIP_END:
-                   if (st->buff->size>0) process_local_packet(st);
-                   BUF_ASSERT_USED(st->buff);
-                   buffer_init(st->buff,st->max_start_pad);
-                   break;
-               case SLIP_ESC:
-                   st->pending_esc=True;
-                   break;
-               default:
-                   *(uint8_t *)buf_append(st->buff,1)=rxbuf[i];
-                   break;
-               }
+               st->deliver_to_host(st->dst,NULL,buf);
+               BUF_ASSERT_FREE(buf);
+           }
+       } else {
+           if (best_quality>0) {
+               st->routes[best_match].c->deliver(
+                   st->routes[best_match].c->dst,
+                   st->routes[best_match].c, buf);
+               BUF_ASSERT_FREE(buf);
+           } else {
+               /* Generate ICMP destination unreachable */
+               netlink_icmp_simple(st,buf,client,3,0); /* client==NULL */
+               BUF_FREE(buf);
            }
        }
+    } else { /* client is set */
+       /* We know the origin is a tunnel - packet must be for the host */
+       /* XXX THIS IS NOT NECESSARILY TRUE, AND NEEDS FIXING */
+       /* THIS FUNCTION MUST JUST DELIVER THE PACKET: IT MUST ASSUME
+          THE PACKET HAS ALREADY BEEN CHECKED */
+       if (subnet_matches_list(&st->networks,dest)) {
+           st->deliver_to_host(st->dst,NULL,buf);
+           BUF_ASSERT_FREE(buf);
+       } else {
+           Message(M_ERROR,"%s: packet from tunnel %s can't be delivered "
+                   "to the host\n",st->name,client->name);
+           netlink_icmp_simple(st,buf,client,3,0);
+           BUF_FREE(buf);
+       }
     }
-    return;
+    BUF_ASSERT_FREE(buf);
 }
 
-static void userv_phase_hook(void *sst, uint32_t newphase)
+/* Forward a packet that is not addressed to us: decrement its TTL,
+   recompute the IP header checksum and pass it to
+   netlink_packet_deliver().  If the TTL is already 1 or less, an ICMP
+   time-exceeded (type 11, code 0) is generated and the packet is dropped.
+   buf must be in use on entry and is free on return. */
+static void netlink_packet_forward(struct netlink *st, 
+                                  struct netlink_client *client,
+                                  struct buffer_if *buf)
 {
-    struct userv *st=sst;
-    pid_t child;
-    int c_stdin[2];
-    int c_stdout[2];
-    string_t addrs;
-    string_t nets;
-    string_t s;
-    struct netlink_client *c;
-    int i;
+    struct iphdr *iph=(struct iphdr *)buf->start;
+    
+    BUF_ASSERT_USED(buf);
 
-    /* This is where we actually invoke userv - all the networks we'll
-       be using should already have been registered. */
+    /* Packet has already been checked */
+    if (iph->ttl<=1) {
+       /* Generate ICMP time exceeded */
+       netlink_icmp_simple(st,buf,client,11,0);
+       BUF_FREE(buf);
+       return;
+    }
+    iph->ttl--;
+    /* The header changed, so its checksum must be recomputed; the field
+       has to be zero while that is done */
+    iph->check=0;
+    iph->check=ip_fast_csum((uint8_t *)iph,iph->ihl);
 
-    addrs=safe_malloc(512,"userv_phase_hook:addrs");
-    snprintf(addrs,512,"%s,%s,%d,slip",ipaddr_to_string(st->local_address),
-            ipaddr_to_string(st->secnet_address),st->mtu);
+    netlink_packet_deliver(st,client,buf);
+    BUF_ASSERT_FREE(buf);
+}
 
-    nets=safe_malloc(1024,"userv_phase_hook:nets");
-    *nets=0;
-    for (c=st->clients; c; c=c->next) {
-       for (i=0; i<c->networks->entries; i++) {
-           s=subnet_to_string(&c->networks->list[i]);
-           strcat(nets,s);
-           strcat(nets,",");
-           free(s);
-       }
+/* Deal with packets addressed explicitly to us.
+   - Fragmented packets are logged and dropped.
+   - An ICMP echo-request (type 8, code 0) is turned into an echo-reply in
+     place and sent back to its source.
+   - Any other ICMP is logged and dropped.
+   - Any other protocol is answered with ICMP protocol-unreachable
+     (type 3, code 2).
+   buf is consumed on every path. */
+static void netlink_packet_local(struct netlink *st,
+                                struct netlink_client *client,
+                                struct buffer_if *buf)
+{
+    struct icmphdr *h;
+
+    h=(struct icmphdr *)buf->start;
+
+    /* The mask 0xbfff ignores only bit 0x4000 of frag_off; any other
+       bit being set is treated as "this is a fragment" */
+    if ((ntohs(h->iph.frag_off)&0xbfff)!=0) {
+       Message(M_WARNING,"%s: fragmented packet addressed to secnet; "
+               "ignoring it\n",st->name);
+       BUF_FREE(buf);
+       return;
     }
-    nets[strlen(nets)-1]=0;
 
-    Message(M_INFO,"\nuserv_phase_hook: %s %s %s %s %s\n",st->userv_path,
-          st->service_user,st->service_name,addrs,nets);
+    if (h->iph.protocol==1) {
+       /* It's ICMP */
+       if (h->type==8 && h->code==0) {
+           /* ICMP echo-request. Special case: we re-use the buffer
+              to construct the reply. */
+           h->type=0;
+           h->iph.daddr=h->iph.saddr;
+           h->iph.saddr=htonl(st->secnet_address);
+           h->iph.ttl=255; /* Be nice and bump it up again... */
+           h->iph.check=0;
+           h->iph.check=ip_fast_csum((uint8_t *)h,h->iph.ihl);
+           netlink_icmp_csum(h);
+           /* The reply originates from secnet itself, hence NULL */
+           netlink_packet_deliver(st,NULL,buf);
+           return;
+       }
+       Message(M_WARNING,"%s: unknown incoming ICMP\n",st->name);
+    } else {
+       /* Send ICMP protocol unreachable */
+       netlink_icmp_simple(st,buf,client,3,2);
+       BUF_FREE(buf);
+       return;
+    }
 
-    /* Allocate buffer, plus space for padding. Make sure we end up
-       with the start of the packet well-aligned. */
-    /* ALIGN(st->max_start_pad,16); */
-    /* ALIGN(st->max_end_pad,16); */
+    BUF_FREE(buf);
+}
 
-    st->pending_esc=False;
+/* Entry point for every packet arriving at the netlink device.
+   If cid==NULL packet is from host, otherwise cid specifies which tunnel 
+   it came from.
+
+   The packet is dropped with a warning if it fails netlink_check(), or
+   if its source address is not one that its origin may use (the
+   tunnel's networks for a tunnel, st->networks for the host).  Packets
+   addressed to st->secnet_address go to netlink_packet_local().  Packets
+   from a tunnel whose destination is outside st->networks are dropped
+   with a warning.  Everything else is forwarded.
+   buf is consumed on every path. */
+static void netlink_incoming(void *sst, void *cid, struct buffer_if *buf)
+{
+    struct netlink *st=sst;
+    struct netlink_client *client=cid;
+    uint32_t source,dest;
+    struct iphdr *iph;
 
-    /* Invoke userv */
-    if (pipe(c_stdin)!=0) {
-       fatal_perror("userv_phase_hook: pipe(c_stdin)");
+    BUF_ASSERT_USED(buf);
+    if (!netlink_check(st,buf)) {
+       Message(M_WARNING,"%s: bad IP packet from %s\n",
+               st->name,client?client->name:"host");
+       BUF_FREE(buf);
+       return;
     }
-    if (pipe(c_stdout)!=0) {
-       fatal_perror("userv_phase_hook: pipe(c_stdout)");
+    iph=(struct iphdr *)buf->start;
+
+    source=ntohl(iph->saddr);
+    dest=ntohl(iph->daddr);
+
+    /* Check source */
+    if (client) {
+       /* Check that the packet source is in 'nets' and its destination is
+          in st->networks */
+       if (!subnet_matches_list(client->networks,source)) {
+           string_t s,d;
+           s=ipaddr_to_string(source);
+           d=ipaddr_to_string(dest);
+           Message(M_WARNING,"%s: packet from tunnel %s with bad "
+                   "source address (s=%s,d=%s)\n",st->name,client->name,s,d);
+           free(s); free(d);
+           BUF_FREE(buf);
+           return;
+       }
+    } else {
+       if (!subnet_matches_list(&st->networks,source)) {
+           string_t s,d;
+           s=ipaddr_to_string(source);
+           d=ipaddr_to_string(dest);
+           Message(M_WARNING,"%s: outgoing packet with bad source address "
+                   "(s=%s,d=%s)\n",st->name,s,d);
+           free(s); free(d);
+           BUF_FREE(buf);
+           return;
+       }
     }
-    st->txfd=c_stdin[1];
-    st->rxfd=c_stdout[0];
+    /* (st->secnet_address needs checking before matching destination
+       addresses) */
+    if (dest==st->secnet_address) {
+       netlink_packet_local(st,client,buf);
+       BUF_ASSERT_FREE(buf);
+       return;
+    }
+    if (client) {
+       /* Check for free routing */
+       if (!subnet_matches_list(&st->networks,dest)) {
+           string_t s,d;
+           s=ipaddr_to_string(source);
+           d=ipaddr_to_string(dest);
+           Message(M_WARNING,"%s: incoming packet from tunnel %s "
+                   "with bad destination address "
+                   "(s=%s,d=%s)\n",st->name,client->name,s,d);
+           free(s); free(d);
+           BUF_FREE(buf);
+           return;
+       }
+    }
+    netlink_packet_forward(st,client,buf);
+    BUF_ASSERT_FREE(buf);
+}
+
+/* Mark every soft (non-hard) route belonging to client c as up or down,
+   and report each changed entry through st->set_route.  Does nothing if
+   the routing table has not been built yet.
+   st->set_route is called without a NULL check here; netlink_regnets()
+   refuses soft-route clients when it is NULL. */
+static void netlink_set_softlinks(struct netlink *st, struct netlink_client *c,
+                                 bool_t up)
+{
+    uint32_t i;
 
-    child=fork();
-    if (child==-1) {
-       fatal_perror("userv_phase_hook: fork()");
+    if (!st->routes) return; /* Table has not yet been created */
+    for (i=0; i<st->n_routes; i++) {
+       if (!st->routes[i].hard && st->routes[i].c==c) {
+           st->routes[i].up=up;
+           st->set_route(st->dst,&st->routes[i]);
+       }
     }
-    if (child==0) {
-       char **argv;
-
-       /* We are the child. Modify our stdin and stdout, then exec userv */
-       dup2(c_stdin[0],0);
-       dup2(c_stdout[1],1);
-       close(c_stdin[1]);
-       close(c_stdout[0]);
-
-       /* The arguments are:
-          userv
-          service-user
-          service-name
-          local-addr,secnet-addr,mtu,protocol
-          route1,route2,... */
-       argv=malloc(sizeof(*argv)*6);
-       argv[0]=st->userv_path;
-       argv[1]=st->service_user;
-       argv[2]=st->service_name;
-       argv[3]=addrs;
-       argv[4]=nets;
-       argv[5]=NULL;
-       execvp(st->userv_path,argv);
-       perror("netlink-userv-ipif: execvp");
-
-       exit(1);
+}
+
+/* Record a new link quality for the client cid (a struct netlink_client
+   belonging to the netlink device sst), and bring that client's soft
+   routes down if the quality is LINK_QUALITY_DOWN, or up otherwise. */
+static void netlink_set_quality(void *sst, void *cid, uint32_t quality)
+{
+    struct netlink *st=sst;
+    struct netlink_client *c=cid;
+
+    c->link_quality=quality;
+    if (c->link_quality==LINK_QUALITY_DOWN) {
+       netlink_set_softlinks(st,c,False);
+    } else {
+       netlink_set_softlinks(st,c,True);
     }
-    /* We are the parent... */
-          
-    /* Register for poll() */
-    register_for_poll(st, userv_beforepoll, userv_afterpoll, 2, "netlink");
 }
 
-static void *userv_regnets(void *sst, struct subnet_list *nets,
-                          netlink_deliver_fn *deliver, void *dst,
-                          uint32_t max_start_pad, uint32_t max_end_pad)
+/* Register a client (tunnel) and the networks reachable through it.
+ *
+ * nets:          networks served by the client; the pointer is stored,
+ *                not copied.
+ * deliver, dst:  function and closure used to hand packets to the client.
+ * max_start_pad, max_end_pad: padding the client needs; the device keeps
+ *                the largest values seen.
+ * options:       NETLINK_OPTION_* flags.
+ * client_name:   name used in log messages; the pointer is stored, not
+ *                copied.
+ *
+ * Returns the new struct netlink_client, or NULL if registration is
+ * refused: either soft routes were requested but the device has no
+ * set_route function, or nets intersects st->exclude_remote_networks. */
+static void *netlink_regnets(void *sst, struct subnet_list *nets,
+                            netlink_deliver_fn *deliver, void *dst,
+                            uint32_t max_start_pad, uint32_t max_end_pad,
+                            uint32_t options, string_t client_name)
 {
-    struct userv *st=sst;
+    struct netlink *st=sst;
     struct netlink_client *c;
 
-    Message(M_DEBUG_CONFIG,"userv_regnets: request for %d networks, "
+    Message(M_DEBUG_CONFIG,"netlink_regnets: request for %d networks, "
            "max_start_pad=%d, max_end_pad=%d\n",
            nets->entries,max_start_pad,max_end_pad);
 
-    c=safe_malloc(sizeof(*c),"userv_regnets");
+    if ((options&NETLINK_OPTION_SOFTROUTE) && !st->set_route) {
+       Message(M_ERROR,"%s: this netlink device does not support "
+               "soft routes.\n",st->name);
+       return NULL;
+    }
+
+    if (options&NETLINK_OPTION_SOFTROUTE) {
+       /* XXX for now we assume that soft routes require root privilege;
+          this may not always be true. The device driver can tell us. */
+       require_root_privileges=True;
+       require_root_privileges_explanation="netlink: soft routes";
+    }
+
+    /* Check that nets do not intersect st->exclude_remote_networks;
+       refuse to register if they do. */
+    if (subnet_lists_intersect(&st->exclude_remote_networks,nets)) {
+       Message(M_ERROR,"%s: site %s specifies networks that "
+               "intersect with the explicitly excluded remote networks\n",
+               st->name,client_name);
+       return NULL;
+    }
+
+    c=safe_malloc(sizeof(*c),"netlink_regnets");
     c->networks=nets;
     c->deliver=deliver;
     c->dst=dst;
+    c->name=client_name; /* XXX copy it? */
+    c->options=options;
+    c->link_quality=LINK_QUALITY_DOWN;
     c->next=st->clients;
     st->clients=c;
     if (max_start_pad > st->max_start_pad) st->max_start_pad=max_start_pad;
     if (max_end_pad > st->max_end_pad) st->max_end_pad=max_end_pad;
+    /* One routing table entry will be needed per registered network */
+    st->n_routes+=nets->entries;
 
     return c;
 }
 
-static void userv_deliver(void *sst, void *cid, struct buffer_if *buf)
+/* Log the routing table at M_INFO: one line per tunnel route (network,
+   client name, hard/soft, free/restricted, up/down), then our own
+   address, then the networks that are delivered to the host.
+   Every string obtained from subnet_to_string()/ipaddr_to_string() is
+   freed after it has been logged. */
+static void netlink_dump_routes(struct netlink *st)
 {
-    struct userv *st=sst;
-    struct netlink_client *client=cid;
-    uint8_t txbuf[DEFAULT_BUFSIZE];
-
-    uint32_t source,dest;
-    uint8_t *i;
-    uint32_t j;
-
-    source=ntohl(*(uint32_t *)(buf->start+12));
-    dest=ntohl(*(uint32_t *)(buf->start+16));
-
-    /* Check that the packet source is in 'nets' and its destination is
-       in client->networks */
-    if (!subnet_match(client->networks,source)) {
-       string_t s,d;
-       s=ipaddr_to_string(source);
-       d=ipaddr_to_string(dest);
-       Message(M_WARNING,"%s: incoming packet with bad source address "
-               "(s=%s,d=%s)\n",st->name,s,d);
-       free(s); free(d);
-       return;
+    int i;
+    string_t net;
+
+    Message(M_INFO,"%s: routing table:\n",st->name);
+    for (i=0; i<st->n_routes; i++) {
+       net=subnet_to_string(&st->routes[i].net);
+       Message(M_INFO,"%s -> tunnel %s (%s,%s route,%s)\n",net,
+               st->routes[i].c->name,
+               st->routes[i].hard?"hard":"soft",
+               st->routes[i].allow_route?"free":"restricted",
+               st->routes[i].up?"up":"down");
+       free(net);
     }
-    if (!subnet_match(&st->networks,dest)) {
-       string_t s,d;
-       s=ipaddr_to_string(source);
-       d=ipaddr_to_string(dest);
-       Message(M_WARNING,"%s: incoming packet with bad destination address "
-               "(s=%s,d=%s)\n",st->name,s,d);
-       free(s); free(d);
-       return;
+    /* The address string is allocated by ipaddr_to_string() and has to
+       be released like the subnet strings */
+    net=ipaddr_to_string(st->secnet_address);
+    Message(M_INFO,"%s/32 -> netlink \"%s\"\n",net,st->name);
+    free(net);
+    for (i=0; i<st->networks.entries; i++) {
+       net=subnet_to_string(&st->networks.list[i]);
+       Message(M_INFO,"%s -> host\n",net);
+       free(net);
     }
+}
 
-    /* Really we should decrease TTL, check it's above zero, and
-       recalculate header checksum here. If it gets down to zero,
-       generate an ICMP time-exceeded and send the new packet back to
-       the originating tunnel. XXX check buffer usage! */
-
-    /* (Basically do full IP packet forwarding stuff. Except that we
-       know any packet passed in here is destined for the local
-       machine; only exception is if it's destined for us.) */
+/* qsort() comparison function for struct netlink_route: a route with a
+   larger net.len sorts before one with a smaller net.len, and routes
+   with equal net.len compare equal.  (net.len is presumably the prefix
+   length, which gives most-specific-first order -- confirm against the
+   subnet type.) */
+static int netlink_compare_route_specificity(const void *ap, const void *bp)
+{
+    const struct netlink_route *lhs=ap;
+    const struct netlink_route *rhs=bp;
 
-    if (dest==st->secnet_address) {
-       printf("%s: incoming tunneled packet for secnet!\n",st->name);
-       return;
-    }
+    if (lhs->net.len>rhs->net.len) return -1;
+    if (lhs->net.len<rhs->net.len) return 1;
+    return 0;
+}
 
-    /* Now spit the packet at userv-ipif: SLIP start marker, then
-       bytestuff the packet, then SLIP end marker */
-    /* XXX crunchy bytestuff code */
-    j=0;
-    txbuf[j++]=SLIP_END;
-    for (i=buf->start; i<(buf->start+buf->size); i++) {
-       switch (*i) {
-       case SLIP_END:
-           txbuf[j++]=SLIP_ESC;
-           txbuf[j++]=SLIP_ESCEND;
-           break;
-       case SLIP_ESC:
-           txbuf[j++]=SLIP_ESC;
-           txbuf[j++]=SLIP_ESCESC;
-           break;
-       default:
-           txbuf[j++]=*i;
-           break;
+/* Phase hook: build the routing table st->routes from the registered
+   clients, sort it most-specific-first and log it.  new_phase is not
+   examined.  Calls fatal() if the number of entries filled in differs
+   from st->n_routes. */
+static void netlink_phase_hook(void *sst, uint32_t new_phase)
+{
+    struct netlink *st=sst;
+    struct netlink_client *c;
+    uint32_t i,j;
+
+    /* All the networks serviced by the various tunnels should now
+     * have been registered.  We build a routing table by sorting the
+     * routes into most-specific-first order.  */
+    st->routes=safe_malloc(st->n_routes*sizeof(*st->routes),
+                          "netlink_phase_hook");
+    /* Fill the table */
+    i=0;
+    for (c=st->clients; c; c=c->next) {
+       for (j=0; j<c->networks->entries; j++) {
+           st->routes[i].net=c->networks->list[j];
+           st->routes[i].c=c;
+           /* Hard routes are always up;
+              soft routes default to down */
+           st->routes[i].up=c->options&NETLINK_OPTION_SOFTROUTE?False:True;
+           st->routes[i].kup=False;
+           st->routes[i].hard=c->options&NETLINK_OPTION_SOFTROUTE?False:True;
+           st->routes[i].allow_route=c->options&NETLINK_OPTION_ALLOW_ROUTE?
+               True:False;
+           i++;
        }
     }
-    txbuf[j++]=SLIP_END;
-    if (write(st->txfd,txbuf,j)<0) {
-       fatal_perror("userv_deliver: write()");
+    /* ASSERT i==st->n_routes */
+    if (i!=st->n_routes) {
+       fatal("netlink: route count error: expected %d got %d\n",
+             st->n_routes,i);
     }
+    /* Sort the table in descending order of specificity */
+    qsort(st->routes,st->n_routes,sizeof(*st->routes),
+         netlink_compare_route_specificity);
 
-    return;
+    netlink_dump_routes(st);
 }
 
-static list_t *userv_apply(closure_t *self, struct cloc loc, dict_t *context,
-                          list_t *args)
+netlink_deliver_fn *netlink_init(struct netlink *st, /* Common netlink setup; fills in *st */
+                                void *dst, struct cloc loc, /* dst: kept in st->dst */
+                                dict_t *dict, string_t description, /* description: closure description and fallback name */
+                                netlink_route_fn *set_route, /* kept in st->set_route */
+                                netlink_deliver_fn *to_host) /* kept in st->deliver_to_host */
 {
-    struct userv *st;
-    item_t *item;
-    dict_t *dict;
-
-    st=safe_malloc(sizeof(*st),"userv_apply (netlink)");
-    st->cl.description="userv-netlink";
+    st->dst=dst;
+    st->cl.description=description;
     st->cl.type=CL_NETLINK;
     st->cl.apply=NULL;
     st->cl.interface=&st->ops;
     st->ops.st=st;
-    st->ops.regnets=userv_regnets;
-    st->ops.deliver=userv_deliver;
+    st->ops.regnets=netlink_regnets;
+    st->ops.deliver=netlink_incoming; /* the same function is returned to the caller below */
+    st->ops.set_quality=netlink_set_quality;
     st->max_start_pad=0;
     st->max_end_pad=0;
-    st->rxfd=-1; st->txfd=-1;
     st->clients=NULL;
+    st->set_route=set_route;
+    st->deliver_to_host=to_host;
 
-    /* First parameter must be a dict */
-    item=list_elem(args,0);
-    if (!item || item->type!=t_dict)
-       cfgfatal(loc,"userv-ipif","parameter must be a dictionary\n");
-    
-    dict=item->data.dict;
-    st->name=dict_read_string(dict,"name",False,"userv-netlink",loc);
-    st->userv_path=dict_read_string(dict,"userv-path",False,"userv-netlink",
-                                   loc);
-    st->service_user=dict_read_string(dict,"service-user",False,
-                                     "userv-netlink",loc);
-    st->service_name=dict_read_string(dict,"service-name",False,
-                                     "userv-netlink",loc);
-    if (!st->name) st->name="netlink-userv-ipif";
-    if (!st->userv_path) st->userv_path="userv";
-    if (!st->service_user) st->service_user="root";
-    if (!st->service_name) st->service_name="ipif";
-    dict_read_subnet_list(dict, "networks", True, "userv-netlink", loc,
+    st->name=dict_read_string(dict,"name",False,"netlink",loc);
+    if (!st->name) st->name=description; /* no "name" was read: fall back to the description */
+    dict_read_subnet_list(dict, "networks", True, "netlink", loc,
                          &st->networks);
-    st->local_address=string_to_ipaddr(
-       dict_find_item(dict,"local-address", True, "userv-netlink", loc),
-       "userv-netlink");
+    dict_read_subnet_list(dict, "exclude-remote-networks", False, "netlink",
+                         loc, &st->exclude_remote_networks);
+    /* secnet-address does not have to be in local-networks;
+       however, it should be advertised in the 'sites' file for the
+       local site. */
     st->secnet_address=string_to_ipaddr(
-       dict_find_item(dict,"secnet-address", True, "userv-netlink", loc),
-       "userv-netlink");
-    if (!subnet_match(&st->networks,st->local_address)) {
-       cfgfatal(loc,"netlink-userv-ipif","local-address must be in "
-             "local networks\n");
-    }
-    st->mtu=dict_read_number(dict, "mtu", False, "userv-netlink", loc, 1000);
-    st->buff=find_cl_if(dict,"buffer",CL_BUFFER,True,"userv-netlink",loc);
-    BUF_ALLOC(st->buff,"netlink:userv_apply");
+       dict_find_item(dict,"secnet-address", True, "netlink", loc),"netlink");
+    st->mtu=dict_read_number(dict, "mtu", False, "netlink", loc, DEFAULT_MTU);
+    buffer_new(&st->icmp,ICMP_BUFSIZE);
+    st->n_routes=0; /* no routes counted yet */
+    st->routes=NULL; /* table not allocated here; NOTE(review): presumably built by netlink_phase_hook - confirm */
 
-    add_hook(PHASE_DROPPRIV,userv_phase_hook,st);
+    add_hook(PHASE_SETUP,netlink_phase_hook,st); /* registers netlink_phase_hook, with st, for PHASE_SETUP */
 
-    return new_closure(&st->cl);
+    return netlink_incoming; /* also installed as st->ops.deliver above */
 }
 
+/* No connection to the kernel at all... */
+
 struct null {
-    closure_t cl;
-    struct netlink_if ops;
+    struct netlink nl; /* generic netlink state; null_apply() passes &st->nl to netlink_init() */
 };
 
-static void *null_regnets(void *sst, struct subnet_list *nets,
-                         netlink_deliver_fn *deliver, void *dst,
-                         uint32_t max_start_pad, uint32_t max_end_pad)
+static bool_t null_set_route(void *sst, struct netlink_route *route) /* returns True iff route->kup was changed */
 {
-    Message(M_DEBUG_CONFIG,"null_regnets: request for %d networks, "
-           "max_start_pad=%d, max_end_pad=%d\n",
-           nets->entries,max_start_pad,max_end_pad);
-    return NULL;
+    struct null *st=sst; /* sst is the struct null this route belongs to; used for its name in the log line */
+    string_t t;
+    /* Act only when route->up and route->kup differ: log the new state, then copy up into kup */
+    if (route->up!=route->kup) {
+       t=subnet_to_string(&route->net);
+       Message(M_INFO,"%s: setting route %s to state %s\n",st->nl.name,
+               t, route->up?"up":"down");
+       free(t); /* the string returned by subnet_to_string() is released here */
+       route->kup=route->up; /* nothing else is updated: only the new state is recorded */
+       return True;
+    }
+    return False; /* up and kup already agree: nothing was done */
 }
-
+           
 static void null_deliver(void *sst, void *cid, struct buffer_if *buf)
 {
     return;
@@ -442,26 +705,25 @@ static list_t *null_apply(closure_t *self, struct cloc loc, dict_t *context,
                           list_t *args)
 {
     struct null *st;
+    item_t *item;
+    dict_t *dict;
 
-    st=safe_malloc(sizeof(*st),"null_apply (netlink)");
-    st->cl.description="null-netlink";
-    st->cl.type=CL_NETLINK;
-    st->cl.apply=NULL;
-    st->cl.interface=&st->ops;
-    st->ops.st=st;
-    st->ops.regnets=null_regnets;
-    st->ops.deliver=null_deliver;
+    st=safe_malloc(sizeof(*st),"null_apply"); /* state for this null netlink; also passed as dst below */
+    /* The first argument must be a dictionary; otherwise cfgfatal() is called */
+    item=list_elem(args,0);
+    if (!item || item->type!=t_dict)
+       cfgfatal(loc,"null-netlink","parameter must be a dictionary\n");
+    
+    dict=item->data.dict;
+    /* Shared setup via netlink_init(); null_set_route and null_deliver are this device's callbacks */
+    netlink_init(&st->nl,st,loc,dict,"null-netlink",null_set_route,
+                null_deliver);
 
-    return new_closure(&st->cl);
+    return new_closure(&st->nl.cl); /* the closure is the one embedded in st->nl */
 }
 
 init_module netlink_module;
 void netlink_module(dict_t *dict)
 {
-    add_closure(dict,"userv-ipif",userv_apply);
-#if 0
-    add_closure(dict,"pty-slip",ptyslip_apply);
-    add_closure(dict,"slipd",slipd_apply);
-#endif /* 0 */
     add_closure(dict,"null-netlink",null_apply);
 }