chiark / gitweb /
dea05f2e912f0294189244603897b25037d1abee
[secnet.git] / netlink.c
1 /* User-kernel network link */
2
3 /* We support a variety of methods for extracting packets from the
4    kernel: userv-ipif, ipif on its own (when we run as root), the
5    kernel TUN driver.  Possible future methods: SLIP to a pty, an
6    external netlink daemon.  There is a performance/security
7    tradeoff. */
8
9 /* When dealing with SLIP (to a pty, or ipif) we have separate rx, tx
10    and client buffers.  When receiving we may read() any amount, not
11    just whole packets.  When transmitting we need to bytestuff anyway,
12    and may be part-way through receiving. */
13
14 /* Each netlink device is actually a router, with its own IP address.
15    We do things like decreasing the TTL and recalculating the header
16    checksum, generating ICMP, responding to pings, etc. */
17
18 /* This is where we have the anti-spoofing paranoia - before sending a
19    packet to the kernel we check that the tunnel it came over could
20    reasonably have produced it. */
21
22 #include "secnet.h"
23 #include <stdio.h>
24 #include <string.h>
25 #include <unistd.h>
26 #include <fcntl.h>
27 #include <sys/ioctl.h>
28 #include "util.h"
29
30 #ifdef HAVE_LINUX_IF_H
31 #include <linux/if.h>
32 #include <linux/if_tun.h>
33 #endif
34
35 /* XXX where do we find if_tun on other platforms? */
36
/* Size in bytes of the on-stack SLIP receive and transmit buffers */
#define DEFAULT_BUFSIZE 2048
/* MTU used when the configuration has no "mtu" entry */
#define DEFAULT_MTU 1000
/* Size in bytes of the buffer used to assemble outgoing ICMP messages */
#define ICMP_BUFSIZE 1024

/* SLIP framing bytes */
#define SLIP_END    0xc0 /* frame delimiter */
#define SLIP_ESC    0xdb /* introduces a two-byte escape sequence */
#define SLIP_ESCEND 0xdc /* after SLIP_ESC: a literal SLIP_END byte */
#define SLIP_ESCESC 0xdd /* after SLIP_ESC: a literal SLIP_ESC byte */
45
46 struct netlink_client {
47     struct subnet_list *networks;
48     netlink_deliver_fn *deliver;
49     void *dst;
50     string_t name;
51     uint32_t link_quality;
52     struct netlink_client *next;
53 };
54
55 struct netlink_route {
56     struct subnet net;
57     struct netlink_client *c;
58 };
59
60 /* Netlink provides one function to the device driver, to call to deliver
61    a packet from the device. The device driver provides one function to
62    netlink, for it to call to deliver a packet to the device. */
63
64 struct netlink {
65     closure_t cl;
66     struct netlink_if ops;
67     void *dst; /* Pointer to host interface state */
68     string_t name;
69     uint32_t max_start_pad;
70     uint32_t max_end_pad;
71     struct subnet_list networks;
72     struct subnet_list exclude_remote_networks;
73     uint32_t local_address; /* host interface address */
74     uint32_t secnet_address; /* our own address */
75     uint32_t mtu;
76     struct netlink_client *clients;
77     netlink_deliver_fn *deliver_to_host; /* Provided by driver */
78     struct buffer_if icmp; /* Buffer for assembly of outgoing ICMP */
79     uint32_t n_routes; /* How many routes do we know about? */
80     struct netlink_route *routes;
81 };
82
/* Generic IP checksum routine (one's-complement sum, RFC 1071).
 *
 * iph   - first byte of the data to be summed
 * count - number of bytes to sum
 *
 * Returns the complement of the folded 16-bit sum, passed through
 * htons() so that it can be stored directly in a header checksum
 * field.  Summing data that already contains a correct checksum
 * returns 0.
 */
static inline uint16_t ip_csum(uint8_t *iph,uint32_t count)
{
    uint32_t sum=0;

    /* Build each 16-bit big-endian word from its two bytes; this puts
       no alignment requirement on iph and avoids reading the data
       through a uint16_t pointer. */
    while (count>1) {
        sum+=((uint32_t)iph[0]<<8)|iph[1];
        iph+=2;
        count-=2;
    }
    /* A trailing odd byte is the high-order half of a final word that
       is padded with a zero byte. */
    if (count>0)
        sum+=(uint32_t)iph[0]<<8;
    /* Fold the carries back into the low 16 bits */
    while (sum>>16)
        sum=(sum&0xffff)+(sum>>16);
    return htons((uint16_t)~sum);
}
99
#ifdef i386
/*
 *      This is a version of ip_compute_csum() optimized for IP headers,
 *      which always checksum on 4 octet boundaries.
 *
 *      By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
 *      Arnt Gulbrandsen.
 *
 *      iph - start of the IP header
 *      ihl - header length in 32-bit words
 *
 *      The assembler template is written as adjacent string literals
 *      because a string literal may not contain a raw newline.
 */
static inline uint16_t ip_fast_csum(uint8_t *iph, uint32_t ihl) {
    uint32_t sum;

    __asm__ __volatile__(
        "movl (%1), %0\n\t"
        "subl $4, %2\n\t"
        "jbe 2f\n\t"
        "addl 4(%1), %0\n\t"
        "adcl 8(%1), %0\n\t"
        "adcl 12(%1), %0\n"
        "1:\t"
        "adcl 16(%1), %0\n\t"
        "lea 4(%1), %1\n\t"
        "decl %2\n\t"
        "jne 1b\n\t"
        "adcl $0, %0\n\t"
        "movl %0, %2\n\t"
        "shrl $16, %0\n\t"
        "addw %w2, %w0\n\t"
        "adcl $0, %0\n\t"
        "notl %0\n"
        "2:\n"
        /* Since the input registers which are loaded with iph and ihl
           are modified, we must also specify them as outputs, or gcc
           will assume they contain their original values. */
        : "=r" (sum), "=r" (iph), "=r" (ihl)
        : "1" (iph), "2" (ihl)
        /* The header is read through the pointer, so the compiler must
           not reorder or drop earlier stores to it (for example the
           caller zeroing the checksum field first). */
        : "memory");
    return sum;
}
#else
/* Portable fallback: checksum ihl 32-bit words using ip_csum() */
static inline uint16_t ip_fast_csum(uint8_t *iph, uint32_t ihl)
{
    return ip_csum(iph,ihl*4);
}
#endif
143
/* IPv4 header without options.  Multi-byte fields hold the values as
   they appear in the packet; the code converts them with ntohs()/
   ntohl() when reading and htons()/htonl() when writing. */
struct iphdr {
    /* The first octet holds the version and the header length; the
       order of the two bit-fields depends on the target's endianness */
#if defined (WORDS_BIGENDIAN)
    uint8_t    version:4;
    uint8_t    ihl:4;
#else
    uint8_t    ihl:4;
    uint8_t    version:4;
#endif
    uint8_t    tos;
    uint16_t   tot_len;  /* length of header plus data, in bytes */
    uint16_t   id;
    uint16_t   frag_off; /* flag bits and fragment offset */
    uint8_t    ttl;
    uint8_t    protocol; /* 1 is ICMP */
    uint16_t   check;    /* header checksum */
    uint32_t   saddr;
    uint32_t   daddr;
    /* The options start here. */
};
163
164 struct icmphdr {
165     struct iphdr iph;
166     uint8_t type;
167     uint8_t code;
168     uint16_t check;
169     union {
170         uint32_t unused;
171         struct {
172             uint8_t pointer;
173             uint8_t unused1;
174             uint16_t unused2;
175         } pprob;
176         uint32_t gwaddr;
177         struct {
178             uint16_t id;
179             uint16_t seq;
180         } echo;
181     } d;
182 };
183     
/* Declared early because the ICMP helpers that follow send their
   replies through it; the definition comes later in this file. */
static void netlink_packet_deliver(struct netlink *st,
    struct netlink_client *client, struct buffer_if *buf);
187
188 static struct icmphdr *netlink_icmp_tmpl(struct netlink *st,
189                                          uint32_t dest,uint16_t len)
190 {
191     struct icmphdr *h;
192
193     BUF_ALLOC(&st->icmp,"netlink_icmp_tmpl");
194     buffer_init(&st->icmp,st->max_start_pad);
195     h=buf_append(&st->icmp,sizeof(*h));
196
197     h->iph.version=4;
198     h->iph.ihl=5;
199     h->iph.tos=0;
200     h->iph.tot_len=htons(len+(h->iph.ihl*4)+8);
201     h->iph.id=0;
202     h->iph.frag_off=0;
203     h->iph.ttl=255;
204     h->iph.protocol=1;
205     h->iph.saddr=htonl(st->secnet_address);
206     h->iph.daddr=htonl(dest);
207     h->iph.check=0;
208     h->iph.check=ip_fast_csum((uint8_t *)&h->iph,h->iph.ihl);
209     h->check=0;
210     h->d.unused=0;
211
212     return h;
213 }
214
215 /* Fill in the ICMP checksum field correctly */
216 static void netlink_icmp_csum(struct icmphdr *h)
217 {
218     uint32_t len;
219
220     len=ntohs(h->iph.tot_len)-(4*h->iph.ihl);
221     h->check=0;
222     h->check=ip_csum(&h->type,len);
223 }
224
225 /* RFC1122:
226  *       An ICMP error message MUST NOT be sent as the result of
227  *       receiving:
228  *
229  *       *    an ICMP error message, or
230  *
231  *       *    a datagram destined to an IP broadcast or IP multicast
232  *            address, or
233  *
234  *       *    a datagram sent as a link-layer broadcast, or
235  *
236  *       *    a non-initial fragment, or
237  *
238  *       *    a datagram whose source address does not define a single
239  *            host -- e.g., a zero address, a loopback address, a
240  *            broadcast address, a multicast address, or a Class E
241  *            address.
242  */
243 static bool_t netlink_icmp_may_reply(struct buffer_if *buf)
244 {
245     struct iphdr *iph;
246     uint32_t source;
247
248     iph=(struct iphdr *)buf->start;
249     if (iph->protocol==1) return False; /* Overly-broad; we may reply to
250                                            eg. icmp echo-request */
251     /* How do we spot broadcast destination addresses? */
252     if (ntohs(iph->frag_off)&0x1fff) return False; /* Non-initial fragment */
253     source=ntohl(iph->saddr);
254     if (source==0) return False;
255     if ((source&0xff000000)==0x7f000000) return False;
256     /* How do we spot broadcast source addresses? */
257     if ((source&0xf0000000)==0xe0000000) return False; /* Multicast */
258     if ((source&0xf0000000)==0xf0000000) return False; /* Class E */
259     return True;
260 }
261
262 /* How much of the original IP packet do we include in its ICMP
263    response? The header plus up to 64 bits. */
264 static uint16_t netlink_icmp_reply_len(struct buffer_if *buf)
265 {
266     struct iphdr *iph=(struct iphdr *)buf->start;
267     uint16_t hlen,plen;
268
269     hlen=iph->ihl*4;
270     /* We include the first 8 bytes of the packet data, provided they exist */
271     hlen+=8;
272     plen=ntohs(iph->tot_len);
273     return (hlen>plen?plen:hlen);
274 }
275
276 /* client indicates where the packet we're constructing a response to
277    comes from. NULL indicates the host. */
278 static void netlink_icmp_simple(struct netlink *st, struct buffer_if *buf,
279                                 struct netlink_client *client,
280                                 uint8_t type, uint8_t code)
281 {
282     struct iphdr *iph=(struct iphdr *)buf->start;
283     struct icmphdr *h;
284     uint16_t len;
285
286     if (netlink_icmp_may_reply(buf)) {
287         len=netlink_icmp_reply_len(buf);
288         h=netlink_icmp_tmpl(st,ntohl(iph->saddr),len);
289         h->type=type; h->code=code;
290         memcpy(buf_append(&st->icmp,len),buf->start,len);
291         netlink_icmp_csum(h);
292         netlink_packet_deliver(st,NULL,&st->icmp);
293         BUF_ASSERT_FREE(&st->icmp);
294     }
295 }
296
297 /*
298  * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the
299  * checksum.
300  *
301  * Is the datagram acceptable?
302  *
303  * 1. Length at least the size of an ip header
304  * 2. Version of 4
305  * 3. Checksums correctly.
306  * 4. Doesn't have a bogus length
307  */
308 static bool_t netlink_check(struct netlink *st, struct buffer_if *buf)
309 {
310     struct iphdr *iph=(struct iphdr *)buf->start;
311     uint32_t len;
312
313     if (iph->ihl < 5 || iph->version != 4) {
314         printf("ihl/version check failed\n");
315         return False;
316     }
317     if (buf->size < iph->ihl*4) {
318         printf("buffer size check failed\n");
319         return False;
320     }
321     if (ip_fast_csum((uint8_t *)iph, iph->ihl)!=0) {
322         printf("checksum failed\n");
323         return False;
324     }
325     len=ntohs(iph->tot_len);
326     /* There should be no padding */
327     if (buf->size!=len || len<(iph->ihl<<2)) {
328         printf("length check failed buf->size=%d len=%d\n",buf->size,len);
329         return False;
330     }
331
332     /* XXX check that there's no source route specified */
333     return True;
334 }
335
336 /* Deliver a packet. "client" points to the _origin_ of the packet, not
337    its destination. (May be used when sending ICMP response - avoid
338    asymmetric routing.) */
339 static void netlink_packet_deliver(struct netlink *st,
340                                    struct netlink_client *client,
341                                    struct buffer_if *buf)
342 {
343     struct iphdr *iph=(struct iphdr *)buf->start;
344     uint32_t dest=ntohl(iph->daddr);
345     uint32_t source=ntohl(iph->saddr);
346     uint32_t best_quality;
347     int best_match;
348     int i;
349
350     BUF_ASSERT_USED(buf);
351
352     if (dest==st->secnet_address) {
353         Message(M_ERROR,"%s: trying to deliver a packet to myself!\n");
354         BUF_FREE(buf);
355         return;
356     }
357     
358     if (!client) {
359         /* Origin of packet is host or secnet. Might be for a tunnel. */
360         best_quality=0;
361         best_match=-1;
362         for (i=0; i<st->n_routes; i++) {
363             if (subnet_match(&st->routes[i].net,dest)) {
364                 if (st->routes[i].c->link_quality>best_quality
365                     || best_quality==0) {
366                     best_quality=st->routes[i].c->link_quality;
367                     best_match=i;
368                     /* If quality isn't perfect we may wish to
369                        consider kicking the tunnel with a 0-length
370                        packet to prompt it to perform a key setup.
371                        Then it'll eventually decide it's up or
372                        down. */
373                     /* If quality is perfect we don't need to search
374                        any more. */
375                     if (best_quality>=MAXIMUM_LINK_QUALITY) break;
376                 }
377             }
378         }
379         if (best_match==-1) {
380             /* Not going down a tunnel. Might be for the host. 
381                XXX think about this - only situation should be if we're
382                sending ICMP. */
383             if (source!=st->secnet_address) {
384                 Message(M_ERROR,"netlink_packet_deliver: outgoing packet "
385                         "from host that won't fit down any of our tunnels!\n");
386                 BUF_FREE(buf);
387             } else {
388                 st->deliver_to_host(st->dst,NULL,buf);
389                 BUF_ASSERT_FREE(buf);
390             }
391         } else {
392             if (best_quality>0) {
393                 st->routes[best_match].c->deliver(
394                     st->routes[best_match].c->dst,
395                     st->routes[best_match].c, buf);
396                 BUF_ASSERT_FREE(buf);
397             } else {
398                 /* Generate ICMP destination unreachable */
399                 netlink_icmp_simple(st,buf,client,3,0); /* client==NULL */
400                 BUF_FREE(buf);
401             }
402         }
403     } else { /* client is set */
404         /* We know the origin is a tunnel - packet must be for the host */
405         if (subnet_matches_list(&st->networks,dest)) {
406             st->deliver_to_host(st->dst,NULL,buf);
407             BUF_ASSERT_FREE(buf);
408         } else {
409             Message(M_ERROR,"%s: packet from tunnel %s can't be delivered "
410                     "to the host\n",st->name,client->name);
411             netlink_icmp_simple(st,buf,client,3,0);
412             BUF_FREE(buf);
413         }
414     }
415     BUF_ASSERT_FREE(buf);
416 }
417
418 static void netlink_packet_forward(struct netlink *st, 
419                                    struct netlink_client *client,
420                                    struct buffer_if *buf)
421 {
422     struct iphdr *iph=(struct iphdr *)buf->start;
423     
424     BUF_ASSERT_USED(buf);
425
426     /* Packet has already been checked */
427     if (iph->ttl<=1) {
428         /* Generate ICMP time exceeded */
429         netlink_icmp_simple(st,buf,client,11,0);
430         BUF_FREE(buf);
431         return;
432     }
433     iph->ttl--;
434     iph->check=0;
435     iph->check=ip_fast_csum((uint8_t *)iph,iph->ihl);
436
437     netlink_packet_deliver(st,client,buf);
438     BUF_ASSERT_FREE(buf);
439 }
440
441 /* Someone has been foolish enough to address a packet to us. I
442    suppose we should reply to it, just to be polite. */
443 static void netlink_packet_local(struct netlink *st,
444                                  struct netlink_client *client,
445                                  struct buffer_if *buf)
446 {
447     struct icmphdr *h;
448
449     h=(struct icmphdr *)buf->start;
450
451     if ((ntohs(h->iph.frag_off)&0xbfff)!=0) {
452         Message(M_WARNING,"%s: fragmented packet addressed to us\n",st->name);
453         BUF_FREE(buf);
454         return;
455     }
456
457     if (h->iph.protocol==1) {
458         /* It's ICMP */
459         if (h->type==8 && h->code==0) {
460             /* ICMP echo-request. Special case: we re-use the buffer
461                to construct the reply. */
462             h->type=0;
463             h->iph.daddr=h->iph.saddr;
464             h->iph.saddr=htonl(st->secnet_address);
465             h->iph.ttl=255; /* Be nice and bump it up again... */
466             h->iph.check=0;
467             h->iph.check=ip_fast_csum((uint8_t *)h,h->iph.ihl);
468             netlink_icmp_csum(h);
469             netlink_packet_deliver(st,NULL,buf);
470             return;
471         }
472         Message(M_WARNING,"%s: unknown incoming ICMP\n",st->name);
473     } else {
474         /* Send ICMP protocol unreachable */
475         netlink_icmp_simple(st,buf,client,3,2);
476         BUF_FREE(buf);
477         return;
478     }
479
480     BUF_FREE(buf);
481 }
482
483 /* Called by site code when remote packet is available */
484 /* buf is allocated on entry and free on return */
485 static void netlink_from_tunnel(void *sst, void *cst, struct buffer_if *buf)
486 {
487     struct netlink *st=sst;
488     struct netlink_client *client=cst;
489     uint32_t source,dest;
490     struct iphdr *iph;
491
492     BUF_ASSERT_USED(buf);
493     if (!netlink_check(st,buf)) {
494         Message(M_WARNING,"%s: bad IP packet from tunnel %s\n",
495                 st->name,client->name);
496         BUF_FREE(buf);
497         return;
498     }
499     iph=(struct iphdr *)buf->start;
500
501     source=ntohl(iph->saddr);
502     dest=ntohl(iph->daddr);
503
504     /* Check that the packet source is in 'nets' and its destination is
505        in st->networks */
506     if (!subnet_matches_list(client->networks,source)) {
507         string_t s,d;
508         s=ipaddr_to_string(source);
509         d=ipaddr_to_string(dest);
510         Message(M_WARNING,"%s: packet from tunnel %s with bad source address "
511                 "(s=%s,d=%s)\n",st->name,client->name,s,d);
512         free(s); free(d);
513         BUF_FREE(buf);
514         return;
515     }
516     /* (st->secnet_address needs checking before matching against
517        st->networks because secnet's IP address may not be in the
518        range the host is willing to deal with) */
519     if (dest==st->secnet_address) {
520         netlink_packet_local(st,client,buf);
521         BUF_ASSERT_FREE(buf);
522         return;
523     }
524     if (!subnet_matches_list(&st->networks,dest)) {
525         string_t s,d;
526         s=ipaddr_to_string(source);
527         d=ipaddr_to_string(dest);
528         Message(M_WARNING,"%s: incoming packet from tunnel %s "
529                 "with bad destination address "
530                 "(s=%s,d=%s)\n",st->name,client->name,s,d);
531         free(s); free(d);
532         BUF_FREE(buf);
533         return;
534     }
535
536     netlink_packet_forward(st,client,buf);
537
538     BUF_ASSERT_FREE(buf);
539 }
540
541 /* Called by driver code when packet is received from kernel */
542 /* cid should be NULL */
543 /* buf should be allocated on entry, and is free on return */
544 static void netlink_from_host(void *sst, void *cid, struct buffer_if *buf)
545 {
546     struct netlink *st=sst;
547     uint32_t source,dest;
548     struct iphdr *iph;
549
550     BUF_ASSERT_USED(buf);
551     if (!netlink_check(st,buf)) {
552         Message(M_WARNING,"%s: bad IP packet from host\n",
553                 st->name);
554         BUF_FREE(buf);
555         return;
556     }
557     iph=(struct iphdr *)buf->start;
558
559     source=ntohl(iph->saddr);
560     dest=ntohl(iph->daddr);
561
562     if (!subnet_matches_list(&st->networks,source)) {
563         string_t s,d;
564         s=ipaddr_to_string(source);
565         d=ipaddr_to_string(dest);
566         Message(M_WARNING,"%s: outgoing packet with bad source address "
567                 "(s=%s,d=%s)\n",st->name,s,d);
568         free(s); free(d);
569         BUF_FREE(buf);
570         return;
571     }
572     if (dest==st->secnet_address) {
573         netlink_packet_local(st,NULL,buf);
574         BUF_ASSERT_FREE(buf);
575         return;
576     }
577     netlink_packet_forward(st,NULL,buf);
578     BUF_ASSERT_FREE(buf);
579 }
580
581 static void netlink_set_quality(void *sst, void *cid, uint32_t quality)
582 {
583     struct netlink_client *c=cid;
584
585     c->link_quality=quality;
586 }
587
588 static void *netlink_regnets(void *sst, struct subnet_list *nets,
589                              netlink_deliver_fn *deliver, void *dst,
590                              uint32_t max_start_pad, uint32_t max_end_pad,
591                              string_t client_name)
592 {
593     struct netlink *st=sst;
594     struct netlink_client *c;
595
596     Message(M_DEBUG_CONFIG,"netlink_regnets: request for %d networks, "
597             "max_start_pad=%d, max_end_pad=%d\n",
598             nets->entries,max_start_pad,max_end_pad);
599
600     /* Check that nets does not intersect with st->networks or
601        st->exclude_remote_networks; refuse to register if it does. */
602     if (subnet_lists_intersect(&st->networks,nets)) {
603         Message(M_ERROR,"%s: site %s specifies networks that "
604                 "intersect with our local networks\n",st->name,client_name);
605         return False;
606     }
607     if (subnet_lists_intersect(&st->exclude_remote_networks,nets)) {
608         Message(M_ERROR,"%s: site %s specifies networks that "
609                 "intersect with the explicitly excluded remote networks\n",
610                 st->name,client_name);
611         return False;
612     }
613
614     c=safe_malloc(sizeof(*c),"netlink_regnets");
615     c->networks=nets;
616     c->deliver=deliver;
617     c->dst=dst;
618     c->name=client_name; /* XXX copy it? */
619     c->link_quality=LINK_QUALITY_DOWN;
620     c->next=st->clients;
621     st->clients=c;
622     if (max_start_pad > st->max_start_pad) st->max_start_pad=max_start_pad;
623     if (max_end_pad > st->max_end_pad) st->max_end_pad=max_end_pad;
624     st->n_routes+=nets->entries;
625
626     return c;
627 }
628
629 static int netlink_compare_route_specificity(const void *ap, const void *bp)
630 {
631     const struct netlink_route *a=ap;
632     const struct netlink_route *b=bp;
633
634     if (a->net.len==b->net.len) return 0;
635     if (a->net.len<b->net.len) return 1;
636     return -1;
637 }
638
639 static void netlink_phase_hook(void *sst, uint32_t new_phase)
640 {
641     struct netlink *st=sst;
642     struct netlink_client *c;
643     uint32_t i,j;
644
645     /* All the networks serviced by the various tunnels should now
646      * have been registered.  We build a routing table by sorting the
647      * routes into most-specific-first order.  */
648     st->routes=safe_malloc(st->n_routes*sizeof(*st->routes),
649                            "netlink_phase_hook");
650     /* Fill the table */
651     i=0;
652     for (c=st->clients; c; c=c->next) {
653         for (j=0; j<c->networks->entries; j++) {
654             st->routes[i].net=c->networks->list[j];
655             st->routes[i].c=c;
656             i++;
657         }
658     }
659     /* ASSERT i==st->n_routes */
660     if (i!=st->n_routes) {
661         fatal("netlink: route count error: expected %d got %d\n",
662               st->n_routes,i);
663     }
664     /* Sort the table in descending order of specificity */
665     qsort(st->routes,st->n_routes,sizeof(*st->routes),
666           netlink_compare_route_specificity);
667     Message(M_INFO,"%s: routing table:\n",st->name);
668     for (i=0; i<st->n_routes; i++) {
669         string_t net;
670         net=subnet_to_string(&st->routes[i].net);
671         Message(M_INFO,"%s -> tunnel %s\n",net,st->routes[i].c->name);
672         free(net);
673     }
674     Message(M_INFO,"%s/32 -> netlink \"%s\"\n",
675             ipaddr_to_string(st->secnet_address),st->name);
676     for (i=0; i<st->networks.entries; i++) {
677         string_t net;
678         net=subnet_to_string(&st->networks.list[i]);
679         Message(M_INFO,"%s -> host\n",net);
680         free(net);
681     }
682 }
683
684 static netlink_deliver_fn *netlink_init(struct netlink *st,
685                                         void *dst, struct cloc loc,
686                                         dict_t *dict, string_t description,
687                                         netlink_deliver_fn *to_host)
688 {
689     st->dst=dst;
690     st->cl.description=description;
691     st->cl.type=CL_NETLINK;
692     st->cl.apply=NULL;
693     st->cl.interface=&st->ops;
694     st->ops.st=st;
695     st->ops.regnets=netlink_regnets;
696     st->ops.deliver=netlink_from_tunnel;
697     st->ops.set_quality=netlink_set_quality;
698     st->max_start_pad=0;
699     st->max_end_pad=0;
700     st->clients=NULL;
701     st->deliver_to_host=to_host;
702
703     st->name=dict_read_string(dict,"name",False,"netlink",loc);
704     if (!st->name) st->name=description;
705     dict_read_subnet_list(dict, "networks", True, "netlink", loc,
706                           &st->networks);
707     dict_read_subnet_list(dict, "exclude-remote-networks", False, "netlink",
708                           loc, &st->exclude_remote_networks);
709     /* local-address and secnet-address do not have to be in local-networks;
710        however, they should be advertised in the 'sites' file for the
711        local site. */
712     st->local_address=string_to_ipaddr(
713         dict_find_item(dict,"local-address", True, "netlink", loc),"netlink");
714     st->secnet_address=string_to_ipaddr(
715         dict_find_item(dict,"secnet-address", True, "netlink", loc),"netlink");
716     st->mtu=dict_read_number(dict, "mtu", False, "netlink", loc, DEFAULT_MTU);
717     buffer_new(&st->icmp,ICMP_BUFSIZE);
718     st->n_routes=0;
719     st->routes=NULL;
720
721     add_hook(PHASE_SETUP,netlink_phase_hook,st);
722
723     return netlink_from_host;
724 }
725
726 /* Connection to the kernel through userv-ipif */
727
728 struct userv {
729     struct netlink nl;
730     int txfd; /* We transmit to userv */
731     int rxfd; /* We receive from userv */
732     string_t userv_path;
733     string_t service_user;
734     string_t service_name;
735     uint32_t txbuflen;
736     struct buffer_if *buff; /* We unstuff received packets into here
737                                and send them to the site code. */
738     bool_t pending_esc;
739     netlink_deliver_fn *netlink_to_tunnel;
740 };
741
742 static int userv_beforepoll(void *sst, struct pollfd *fds, int *nfds_io,
743                             int *timeout_io, const struct timeval *tv_now,
744                             uint64_t *now)
745 {
746     struct userv *st=sst;
747     *nfds_io=2;
748     fds[0].fd=st->txfd;
749     fds[0].events=POLLERR; /* Might want to pick up POLLOUT sometime */
750     fds[1].fd=st->rxfd;
751     fds[1].events=POLLIN|POLLERR|POLLHUP;
752     return 0;
753 }
754
755 static void userv_afterpoll(void *sst, struct pollfd *fds, int nfds,
756                             const struct timeval *tv_now, uint64_t *now)
757 {
758     struct userv *st=sst;
759     uint8_t rxbuf[DEFAULT_BUFSIZE];
760     int l,i;
761
762     if (fds[1].revents&POLLERR) {
763         printf("userv_afterpoll: hup!\n");
764     }
765     if (fds[1].revents&POLLIN) {
766         l=read(st->rxfd,rxbuf,DEFAULT_BUFSIZE);
767         if (l<0) {
768             fatal_perror("userv_afterpoll: read(rxfd)");
769         }
770         if (l==0) {
771             fatal("userv_afterpoll: read(rxfd)=0; userv gone away?\n");
772         }
773         /* XXX really crude unstuff code */
774         /* XXX check for buffer overflow */
775         BUF_ASSERT_USED(st->buff);
776         for (i=0; i<l; i++) {
777             if (st->pending_esc) {
778                 st->pending_esc=False;
779                 switch(rxbuf[i]) {
780                 case SLIP_ESCEND:
781                     *(uint8_t *)buf_append(st->buff,1)=SLIP_END;
782                     break;
783                 case SLIP_ESCESC:
784                     *(uint8_t *)buf_append(st->buff,1)=SLIP_ESC;
785                     break;
786                 default:
787                     fatal("userv_afterpoll: bad SLIP escape character\n");
788                 }
789             } else {
790                 switch (rxbuf[i]) {
791                 case SLIP_END:
792                     if (st->buff->size>0) {
793                         st->netlink_to_tunnel(&st->nl,NULL,
794                                               st->buff);
795                         BUF_ALLOC(st->buff,"userv_afterpoll");
796                     }
797                     buffer_init(st->buff,st->nl.max_start_pad);
798                     break;
799                 case SLIP_ESC:
800                     st->pending_esc=True;
801                     break;
802                 default:
803                     *(uint8_t *)buf_append(st->buff,1)=rxbuf[i];
804                     break;
805                 }
806             }
807         }
808     }
809 }
810
811 /* Send buf to the kernel. Free buf before returning. */
812 static void userv_deliver_to_kernel(void *sst, void *cid,
813                                     struct buffer_if *buf)
814 {
815     struct userv *st=sst;
816     uint8_t txbuf[DEFAULT_BUFSIZE];
817     uint8_t *i;
818     uint32_t j;
819
820     BUF_ASSERT_USED(buf);
821
822     /* Spit the packet at userv-ipif: SLIP start marker, then
823        bytestuff the packet, then SLIP end marker */
824     /* XXX crunchy bytestuff code */
825     j=0;
826     txbuf[j++]=SLIP_END;
827     for (i=buf->start; i<(buf->start+buf->size); i++) {
828         switch (*i) {
829         case SLIP_END:
830             txbuf[j++]=SLIP_ESC;
831             txbuf[j++]=SLIP_ESCEND;
832             break;
833         case SLIP_ESC:
834             txbuf[j++]=SLIP_ESC;
835             txbuf[j++]=SLIP_ESCESC;
836             break;
837         default:
838             txbuf[j++]=*i;
839             break;
840         }
841     }
842     txbuf[j++]=SLIP_END;
843     if (write(st->txfd,txbuf,j)<0) {
844         fatal_perror("userv_deliver_to_kernel: write()");
845     }
846     BUF_FREE(buf);
847 }
848
849 static void userv_phase_hook(void *sst, uint32_t newphase)
850 {
851     struct userv *st=sst;
852     pid_t child;
853     int c_stdin[2];
854     int c_stdout[2];
855     string_t addrs;
856     string_t nets;
857     string_t s;
858     struct netlink_client *c;
859     int i;
860
861     /* This is where we actually invoke userv - all the networks we'll
862        be using should already have been registered. */
863
864     addrs=safe_malloc(512,"userv_phase_hook:addrs");
865     snprintf(addrs,512,"%s,%s,%d,slip",ipaddr_to_string(st->nl.local_address),
866              ipaddr_to_string(st->nl.secnet_address),st->nl.mtu);
867
868     nets=safe_malloc(1024,"userv_phase_hook:nets");
869     *nets=0;
870     for (c=st->nl.clients; c; c=c->next) {
871         for (i=0; i<c->networks->entries; i++) {
872             s=subnet_to_string(&c->networks->list[i]);
873             strcat(nets,s);
874             strcat(nets,",");
875             free(s);
876         }
877     }
878     nets[strlen(nets)-1]=0;
879
880     Message(M_INFO,"\nuserv_phase_hook: %s %s %s %s %s\n",st->userv_path,
881            st->service_user,st->service_name,addrs,nets);
882
883     /* Allocate buffer, plus space for padding. Make sure we end up
884        with the start of the packet well-aligned. */
885     /* ALIGN(st->max_start_pad,16); */
886     /* ALIGN(st->max_end_pad,16); */
887
888     st->pending_esc=False;
889
890     /* Invoke userv */
891     if (pipe(c_stdin)!=0) {
892         fatal_perror("userv_phase_hook: pipe(c_stdin)");
893     }
894     if (pipe(c_stdout)!=0) {
895         fatal_perror("userv_phase_hook: pipe(c_stdout)");
896     }
897     st->txfd=c_stdin[1];
898     st->rxfd=c_stdout[0];
899
900     child=fork();
901     if (child==-1) {
902         fatal_perror("userv_phase_hook: fork()");
903     }
904     if (child==0) {
905         char **argv;
906
907         /* We are the child. Modify our stdin and stdout, then exec userv */
908         dup2(c_stdin[0],0);
909         dup2(c_stdout[1],1);
910         close(c_stdin[1]);
911         close(c_stdout[0]);
912
913         /* The arguments are:
914            userv
915            service-user
916            service-name
917            local-addr,secnet-addr,mtu,protocol
918            route1,route2,... */
919         argv=malloc(sizeof(*argv)*6);
920         argv[0]=st->userv_path;
921         argv[1]=st->service_user;
922         argv[2]=st->service_name;
923         argv[3]=addrs;
924         argv[4]=nets;
925         argv[5]=NULL;
926         execvp(st->userv_path,argv);
927         perror("netlink-userv-ipif: execvp");
928
929         exit(1);
930     }
931     /* We are the parent... */
932            
933     /* Register for poll() */
934     register_for_poll(st, userv_beforepoll, userv_afterpoll, 2, st->nl.name);
935 }
936
937 static list_t *userv_apply(closure_t *self, struct cloc loc, dict_t *context,
938                            list_t *args)
939 {
940     struct userv *st;
941     item_t *item;
942     dict_t *dict;
943
944     st=safe_malloc(sizeof(*st),"userv_apply");
945
946     /* First parameter must be a dict */
947     item=list_elem(args,0);
948     if (!item || item->type!=t_dict)
949         cfgfatal(loc,"userv-ipif","parameter must be a dictionary\n");
950     
951     dict=item->data.dict;
952
953     st->netlink_to_tunnel=
954         netlink_init(&st->nl,st,loc,dict,
955                      "netlink-userv-ipif",userv_deliver_to_kernel);
956
957     st->userv_path=dict_read_string(dict,"userv-path",False,"userv-netlink",
958                                     loc);
959     st->service_user=dict_read_string(dict,"service-user",False,
960                                       "userv-netlink",loc);
961     st->service_name=dict_read_string(dict,"service-name",False,
962                                       "userv-netlink",loc);
963     if (!st->userv_path) st->userv_path="userv";
964     if (!st->service_user) st->service_user="root";
965     if (!st->service_name) st->service_name="ipif";
966     st->buff=find_cl_if(dict,"buffer",CL_BUFFER,True,"userv-netlink",loc);
967     BUF_ALLOC(st->buff,"netlink:userv_apply");
968
969     st->rxfd=-1; st->txfd=-1;
970     add_hook(PHASE_DROPPRIV,userv_phase_hook,st);
971
972     return new_closure(&st->nl.cl);
973 }
974
975 /* Connection to the kernel through the universal TUN/TAP driver */
976
/* State for one netlink device attached to the kernel through a TUN
   device file.  Shared by the "tun" and "tun-old" closures; tun_old
   records which of the two created it. */
struct tun {
    struct netlink nl; /* Generic netlink state, set up by netlink_init() */
    int fd; /* Open TUN device; assigned in tun_phase_hook */
    string_t device_path; /* "device" config key, or a built-in default */
    string_t interface_name; /* "interface" config key, or the name
                                found out when the device is opened */
    string_t ifconfig_path; /* Program run to configure the interface */
    string_t route_path; /* Program run to add routes */
    bool_t tun_old; /* True when created by tun_old_apply */
    bool_t search_for_if; /* Applies to tun-old only */
    struct buffer_if *buff; /* We receive packets into here
                               and send them to the netlink code. */
    netlink_deliver_fn *netlink_to_tunnel; /* Returned by netlink_init();
                                              called with each packet read
                                              from the device */
};
990
991 static int tun_beforepoll(void *sst, struct pollfd *fds, int *nfds_io,
992                           int *timeout_io, const struct timeval *tv_now,
993                           uint64_t *now)
994 {
995     struct tun *st=sst;
996     *nfds_io=1;
997     fds[0].fd=st->fd;
998     fds[0].events=POLLIN|POLLERR|POLLHUP;
999     return 0;
1000 }
1001
1002 static void tun_afterpoll(void *sst, struct pollfd *fds, int nfds,
1003                             const struct timeval *tv_now, uint64_t *now)
1004 {
1005     struct tun *st=sst;
1006     int l;
1007
1008     if (fds[0].revents&POLLERR) {
1009         printf("tun_afterpoll: hup!\n");
1010     }
1011     if (fds[0].revents&POLLIN) {
1012         BUF_ALLOC(st->buff,"tun_afterpoll");
1013         buffer_init(st->buff,st->nl.max_start_pad);
1014         l=read(st->fd,st->buff->start,st->buff->len-st->nl.max_start_pad);
1015         if (l<0) {
1016             fatal_perror("tun_afterpoll: read()");
1017         }
1018         if (l==0) {
1019             fatal("tun_afterpoll: read()=0; device gone away?\n");
1020         }
1021         if (l>0) {
1022             st->buff->size=l;
1023             st->netlink_to_tunnel(&st->nl,NULL,st->buff);
1024             BUF_ASSERT_FREE(st->buff);
1025         }
1026     }
1027 }
1028
1029 static void tun_deliver_to_kernel(void *sst, void *cid,
1030                                   struct buffer_if *buf)
1031 {
1032     struct tun *st=sst;
1033
1034     BUF_ASSERT_USED(buf);
1035
1036     /* No error checking, because we'd just throw the packet away anyway */
1037     write(st->fd,buf->start,buf->size);
1038     BUF_FREE(buf);
1039 }
1040
1041 static void tun_phase_hook(void *sst, uint32_t newphase)
1042 {
1043     struct tun *st=sst;
1044     string_t hostaddr,secnetaddr;
1045     uint8_t mtu[6];
1046     string_t network,mask;
1047     struct netlink_client *c;
1048     int i;
1049
1050     if (st->tun_old) {
1051         if (st->search_for_if) {
1052             string_t dname;
1053             int i;
1054
1055             /* ASSERT st->interface_name */
1056             dname=safe_malloc(strlen(st->device_path)+4,"tun_old_apply");
1057             st->interface_name=safe_malloc(8,"tun_phase_hook");
1058         
1059             for (i=0; i<255; i++) {
1060                 sprintf(dname,"%s%d",st->device_path,i);
1061                 if ((st->fd=open(dname,O_RDWR))>0) {
1062                     sprintf(st->interface_name,"tun%d",i);
1063                     Message(M_INFO,"%s: allocated network interface %s "
1064                             "through %s\n",st->nl.name,st->interface_name,
1065                             dname);
1066                     break;
1067                 }
1068             }
1069             if (st->fd==-1) {
1070                 fatal("%s: unable to open any TUN device (%s...)\n",
1071                       st->nl.name,st->device_path);
1072             }
1073         } else {
1074             st->fd=open(st->device_path,O_RDWR);
1075             if (st->fd==-1) {
1076                 fatal_perror("%s: unable to open TUN device file %s",
1077                              st->nl.name,st->device_path);
1078             }
1079         }
1080     } else {
1081 #ifdef HAVE_LINUX_IF_H
1082         struct ifreq ifr;
1083
1084         /* New TUN interface: open the device, then do ioctl TUNSETIFF
1085            to set or find out the network interface name. */
1086         st->fd=open(st->device_path,O_RDWR);
1087         if (st->fd==-1) {
1088             fatal_perror("%s: can't open device file %s",st->nl.name,
1089                          st->device_path);
1090         }
1091         memset(&ifr,0,sizeof(ifr));
1092         ifr.ifr_flags = IFF_TUN | IFF_NO_PI; /* Just send/receive IP packets,
1093                                                 no extra headers */
1094         if (st->interface_name)
1095             strncpy(ifr.ifr_name,st->interface_name,IFNAMSIZ);
1096         Message(M_INFO,"%s: about to ioctl(TUNSETIFF)...\n",st->nl.name);
1097         if (ioctl(st->fd,TUNSETIFF,&ifr)<0) {
1098             fatal_perror("%s: ioctl(TUNSETIFF)",st->nl.name);
1099         }
1100         if (!st->interface_name) {
1101             st->interface_name=safe_malloc(strlen(ifr.ifr_name)+1,"tun_apply");
1102             strcpy(st->interface_name,ifr.ifr_name);
1103             Message(M_INFO,"%s: allocated network interface %s\n",st->nl.name,
1104                     st->interface_name);
1105         }
1106 #else
1107         fatal("netlink.c:tun_phase_hook:!tun_old unexpected\n");
1108 #endif /* HAVE_LINUX_IF_H */
1109     }
1110     /* All the networks we'll be using have been registered. Invoke ifconfig
1111        to set the TUN device's address, and route to add routes to all
1112        our networks. */
1113
1114     hostaddr=ipaddr_to_string(st->nl.local_address);
1115     secnetaddr=ipaddr_to_string(st->nl.secnet_address);
1116     snprintf(mtu,6,"%d",st->nl.mtu);
1117     mtu[5]=0;
1118
1119     sys_cmd(st->ifconfig_path,"ifconfig",st->interface_name,
1120             hostaddr,"netmask","255.255.255.255","-broadcast",
1121             "pointopoint",secnetaddr,"mtu",mtu,"up",(char *)0);
1122
1123     for (c=st->nl.clients; c; c=c->next) {
1124         for (i=0; i<c->networks->entries; i++) {
1125             network=ipaddr_to_string(c->networks->list[i].prefix);
1126             mask=ipaddr_to_string(c->networks->list[i].mask);
1127             sys_cmd(st->route_path,"route","add","-net",network,
1128                     "netmask",mask,"gw",secnetaddr,(char *)0);
1129         }
1130     }
1131
1132     /* Register for poll() */
1133     register_for_poll(st, tun_beforepoll, tun_afterpoll, 1, st->nl.name);
1134 }
1135
1136 #ifdef HAVE_LINUX_IF_H
1137 static list_t *tun_apply(closure_t *self, struct cloc loc, dict_t *context,
1138                          list_t *args)
1139 {
1140     struct tun *st;
1141     item_t *item;
1142     dict_t *dict;
1143
1144     st=safe_malloc(sizeof(*st),"tun_apply");
1145
1146     /* First parameter must be a dict */
1147     item=list_elem(args,0);
1148     if (!item || item->type!=t_dict)
1149         cfgfatal(loc,"tun","parameter must be a dictionary\n");
1150     
1151     dict=item->data.dict;
1152
1153     st->netlink_to_tunnel=
1154         netlink_init(&st->nl,st,loc,dict,
1155                      "netlink-tun",tun_deliver_to_kernel);
1156
1157     st->tun_old=False;
1158     st->device_path=dict_read_string(dict,"device",False,"tun-netlink",loc);
1159     st->interface_name=dict_read_string(dict,"interface",False,
1160                                         "tun-netlink",loc);
1161     st->ifconfig_path=dict_read_string(dict,"ifconfig-path",
1162                                        False,"tun-netlink",loc);
1163     st->route_path=dict_read_string(dict,"route-path",
1164                                     False,"tun-netlink",loc);
1165
1166     if (!st->device_path) st->device_path="/dev/net/tun";
1167     if (!st->ifconfig_path) st->ifconfig_path="ifconfig";
1168     if (!st->route_path) st->route_path="route";
1169     st->buff=find_cl_if(dict,"buffer",CL_BUFFER,True,"tun-netlink",loc);
1170
1171     add_hook(PHASE_GETRESOURCES,tun_phase_hook,st);
1172
1173     return new_closure(&st->nl.cl);
1174 }
1175 #endif /* HAVE_LINUX_IF_H */
1176
1177 static list_t *tun_old_apply(closure_t *self, struct cloc loc, dict_t *context,
1178                              list_t *args)
1179 {
1180     struct tun *st;
1181     item_t *item;
1182     dict_t *dict;
1183
1184     st=safe_malloc(sizeof(*st),"tun_old_apply");
1185
1186     Message(M_WARNING,"the tun-old code has never been tested. Please report "
1187             "success or failure to steve@greenend.org.uk\n");
1188
1189     /* First parameter must be a dict */
1190     item=list_elem(args,0);
1191     if (!item || item->type!=t_dict)
1192         cfgfatal(loc,"tun","parameter must be a dictionary\n");
1193     
1194     dict=item->data.dict;
1195
1196     st->netlink_to_tunnel=
1197         netlink_init(&st->nl,st,loc,dict,
1198                      "netlink-tun",tun_deliver_to_kernel);
1199
1200     st->tun_old=True;
1201     st->device_path=dict_read_string(dict,"device",False,"tun-netlink",loc);
1202     st->interface_name=dict_read_string(dict,"interface",False,
1203                                         "tun-netlink",loc);
1204     st->search_for_if=dict_read_bool(dict,"interface-search",False,
1205                                      "tun-netlink",loc,st->device_path==NULL);
1206     st->ifconfig_path=dict_read_string(dict,"ifconfig-path",False,
1207                                        "tun-netlink",loc);
1208     st->route_path=dict_read_string(dict,"route-path",False,"tun-netlink",loc);
1209
1210     if (!st->device_path) st->device_path="/dev/tun";
1211     if (!st->ifconfig_path) st->ifconfig_path="ifconfig";
1212     if (!st->route_path) st->route_path="route";
1213     st->buff=find_cl_if(dict,"buffer",CL_BUFFER,True,"tun-netlink",loc);
1214
1215     /* Old TUN interface: the network interface name depends on which
1216        /dev/tunX file we open. If 'interface-search' is set to true, treat
1217        'device' as the prefix and try numbers from 0--255. If it's set
1218        to false, treat 'device' as the whole name, and require than an
1219        appropriate interface name be specified. */
1220     if (st->search_for_if && st->interface_name) {
1221         cfgfatal(loc,"tun-old","you may not specify an interface name "
1222                  "in interface-search mode\n");
1223     }
1224     if (!st->search_for_if && !st->interface_name) {
1225         cfgfatal(loc,"tun-old","you must specify an interface name "
1226                  "when you explicitly specify a TUN device file\n");
1227     }
1228
1229
1230     add_hook(PHASE_GETRESOURCES,tun_phase_hook,st);
1231
1232     return new_closure(&st->nl.cl);
1233 }
1234
1235 /* No connection to the kernel at all... */
1236
/* State for a netlink with no kernel device behind it: just the
   generic netlink state set up by netlink_init(). */
struct null {
    struct netlink nl;
};
1240
/* Delivery function passed to netlink_init() for the null netlink:
   there is no kernel device, so the packet goes nowhere.  None of the
   parameters is touched.

   NOTE(review): unlike tun_deliver_to_kernel, this does not release
   buf - confirm that callers do not expect the buffer to be freed. */
static void null_deliver(void *sst, void *cid, struct buffer_if *buf)
{
    /* Intentionally empty */
}
1245
1246 static list_t *null_apply(closure_t *self, struct cloc loc, dict_t *context,
1247                           list_t *args)
1248 {
1249     struct null *st;
1250     item_t *item;
1251     dict_t *dict;
1252
1253     st=safe_malloc(sizeof(*st),"null_apply");
1254
1255     item=list_elem(args,0);
1256     if (!item || item->type!=t_dict)
1257         cfgfatal(loc,"null-netlink","parameter must be a dictionary\n");
1258     
1259     dict=item->data.dict;
1260
1261     netlink_init(&st->nl,st,loc,dict,"null-netlink",null_deliver);
1262
1263     return new_closure(&st->nl.cl);
1264 }
1265
/* Module entry point: adds every netlink closure implemented in this
   file to dict, keyed by the name used to refer to it ("userv-ipif",
   "tun", "tun-old", "null-netlink"). */
init_module netlink_module;
void netlink_module(dict_t *dict)
{
    add_closure(dict,"userv-ipif",userv_apply);
#ifdef HAVE_LINUX_IF_H
    /* tun_apply is only compiled when HAVE_LINUX_IF_H is defined */
    add_closure(dict,"tun",tun_apply);
#endif
    add_closure(dict,"tun-old",tun_old_apply);
    add_closure(dict,"null-netlink",null_apply);
#if 0
    /* TODO */
    add_closure(dict,"pty-slip",ptyslip_apply);
    add_closure(dict,"slipd",slipd_apply);
#endif /* 0 */
}