chiark / gitweb /
sd-event: permit a USEC_INFINITY timeout as an alternative to disabling an event...
[elogind.git] / src / libelogind / sd-event / sd-event.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2013 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/epoll.h>
23 #include <sys/timerfd.h>
24 #include <sys/wait.h>
25
26 #include "sd-daemon.h"
27 #include "sd-event.h"
28 #include "sd-id128.h"
29
30 #include "alloc-util.h"
31 #include "fd-util.h"
32 #include "hashmap.h"
33 #include "list.h"
34 #include "macro.h"
35 #include "missing.h"
36 #include "prioq.h"
37 #include "process-util.h"
38 #include "set.h"
39 #include "signal-util.h"
40 #include "string-table.h"
41 #include "string-util.h"
42 #include "time-util.h"
43 #include "util.h"
44
/* Default scheduling slack granted to timer sources that do not specify an accuracy: 250ms */
#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
46
/* Discriminator for the different kinds of event sources an sd_event_source can be. */
typedef enum EventSourceType {
        SOURCE_IO,
        SOURCE_TIME_REALTIME,
        SOURCE_TIME_BOOTTIME,
        SOURCE_TIME_MONOTONIC,
        SOURCE_TIME_REALTIME_ALARM,
        SOURCE_TIME_BOOTTIME_ALARM,
        SOURCE_SIGNAL,
        SOURCE_CHILD,
        SOURCE_DEFER,
        SOURCE_POST,
        SOURCE_EXIT,
        SOURCE_WATCHDOG,
        _SOURCE_EVENT_SOURCE_TYPE_MAX,
        _SOURCE_EVENT_SOURCE_TYPE_INVALID = -1
} EventSourceType;
63
64 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
65         [SOURCE_IO] = "io",
66         [SOURCE_TIME_REALTIME] = "realtime",
67         [SOURCE_TIME_BOOTTIME] = "bootime",
68         [SOURCE_TIME_MONOTONIC] = "monotonic",
69         [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
70         [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
71         [SOURCE_SIGNAL] = "signal",
72         [SOURCE_CHILD] = "child",
73         [SOURCE_DEFER] = "defer",
74         [SOURCE_POST] = "post",
75         [SOURCE_EXIT] = "exit",
76         [SOURCE_WATCHDOG] = "watchdog",
77 };
78
79 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
80
81 /* All objects we use in epoll events start with this value, so that
82  * we know how to dispatch it */
typedef enum WakeupType {
        WAKEUP_NONE,
        WAKEUP_EVENT_SOURCE, /* epoll data.ptr points at an sd_event_source */
        WAKEUP_CLOCK_DATA,   /* epoll data.ptr points at a struct clock_data */
        WAKEUP_SIGNAL_DATA,  /* epoll data.ptr points at a struct signal_data */
        _WAKEUP_TYPE_MAX,
        _WAKEUP_TYPE_INVALID = -1,
} WakeupType;

/* True for the five timer-based event source types */
#define EVENT_SOURCE_IS_TIME(t) IN_SET((t), SOURCE_TIME_REALTIME, SOURCE_TIME_BOOTTIME, SOURCE_TIME_MONOTONIC, SOURCE_TIME_REALTIME_ALARM, SOURCE_TIME_BOOTTIME_ALARM)
93
struct sd_event_source {
        WakeupType wakeup; /* must come first: epoll wakeups are dispatched on this field */

        unsigned n_ref;

        sd_event *event; /* the loop this source belongs to; NULL once disconnected */
        void *userdata;
        sd_event_handler_t prepare; /* optional callback invoked before polling */

        char *description; /* free-form name, used in log messages */

        EventSourceType type:5;
        int enabled:3;      /* SD_EVENT_OFF, SD_EVENT_ON or SD_EVENT_ONESHOT */
        bool pending:1;     /* queued in the loop's pending prioq */
        bool dispatching:1; /* currently inside its own callback */
        bool floating:1;    /* if set the source holds a ref on the loop, not vice versa */

        int64_t priority;
        unsigned pending_index; /* position in event->pending, PRIOQ_IDX_NULL if absent */
        unsigned prepare_index; /* position in event->prepare, PRIOQ_IDX_NULL if absent */
        unsigned pending_iteration; /* loop iteration the source became pending in */
        unsigned prepare_iteration; /* loop iteration the source was last prepared in */

        LIST_FIELDS(sd_event_source, sources);

        /* Type-specific state; which member is valid follows from 'type' */
        union {
                struct {
                        sd_event_io_handler_t callback;
                        int fd;
                        uint32_t events;  /* EPOLL* mask we are interested in */
                        uint32_t revents; /* EPOLL* mask of the latest wakeup */
                        bool registered:1; /* fd is currently added to the epoll instance */
                } io;
                struct {
                        sd_event_time_handler_t callback;
                        usec_t next, accuracy; /* earliest dispatch time + allowed slack */
                        unsigned earliest_index; /* position in clock_data->earliest */
                        unsigned latest_index;   /* position in clock_data->latest */
                } time;
                struct {
                        sd_event_signal_handler_t callback;
                        struct signalfd_siginfo siginfo;
                        int sig;
                } signal;
                struct {
                        sd_event_child_handler_t callback;
                        siginfo_t siginfo;
                        pid_t pid;
                        int options; /* presumably waitid() WEXITED/WSTOPPED flags — confirm at the add_child call site */
                } child;
                struct {
                        sd_event_handler_t callback;
                } defer;
                struct {
                        sd_event_handler_t callback;
                } post;
                struct {
                        sd_event_handler_t callback;
                        unsigned prioq_index; /* position in event->exit */
                } exit;
        };
};
156
struct clock_data {
        WakeupType wakeup; /* always WAKEUP_CLOCK_DATA */
        int fd;            /* timerfd for this clock, -1 until lazily created */

        /* For all clocks we maintain two priority queues each, one
         * ordered for the earliest times the events may be
         * dispatched, and one ordered by the latest times they must
         * have been dispatched. The range between the top entries in
         * the two prioqs is the time window we can freely schedule
         * wakeups in */

        Prioq *earliest;
        Prioq *latest;
        usec_t next; /* time the timerfd is currently armed for, USEC_INFINITY if unarmed */

        bool needs_rearm:1; /* set when the prioqs changed and the timerfd must be reprogrammed */
};
174
struct signal_data {
        WakeupType wakeup; /* always WAKEUP_SIGNAL_DATA */

        /* For each priority we maintain one signal fd, so that we
         * only have to dequeue a single event per priority at a
         * time. */

        int fd;
        int64_t priority; /* hashmap key: the priority this signalfd serves */
        sigset_t sigset;  /* the signals currently routed through this fd */
        sd_event_source *current; /* signal source currently being dispatched, if any */
};
187
struct sd_event {
        unsigned n_ref;

        int epoll_fd;    /* the main poll fd all wakeup sources are registered with */
        int watchdog_fd; /* timerfd used for watchdog keep-alive pings, or -1 */

        Prioq *pending; /* sources ready for dispatch, see pending_prioq_compare() */
        Prioq *prepare; /* sources with a prepare callback, see prepare_prioq_compare() */

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb; /* boot-ID-derived wakeup offset, computed lazily by initialize_perturb() */

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;
        unsigned n_enabled_child_sources;

        Set *post_sources;

        Prioq *exit; /* exit sources, ordered by priority */

        pid_t original_pid; /* PID the loop was created in, to detect use across fork() */

        unsigned iteration;
        dual_timestamp timestamp;
        usec_t timestamp_boottime;
        int state;

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1; /* $SD_EVENT_PROFILE_DELAYS latency histogram enabled */

        int exit_code;

        pid_t tid;
        sd_event **default_event_ptr; /* cleared on free, see event_free() */

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;

        LIST_HEAD(sd_event_source, sources);

        usec_t last_run, last_log;
        unsigned delays[sizeof(usec_t) * 8]; /* log2 histogram of loop latencies (2^0..2^63 us) */
};
244
245 static void source_disconnect(sd_event_source *s);
246
247 static int pending_prioq_compare(const void *a, const void *b) {
248         const sd_event_source *x = a, *y = b;
249
250         assert(x->pending);
251         assert(y->pending);
252
253         /* Enabled ones first */
254         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
255                 return -1;
256         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
257                 return 1;
258
259         /* Lower priority values first */
260         if (x->priority < y->priority)
261                 return -1;
262         if (x->priority > y->priority)
263                 return 1;
264
265         /* Older entries first */
266         if (x->pending_iteration < y->pending_iteration)
267                 return -1;
268         if (x->pending_iteration > y->pending_iteration)
269                 return 1;
270
271         return 0;
272 }
273
274 static int prepare_prioq_compare(const void *a, const void *b) {
275         const sd_event_source *x = a, *y = b;
276
277         assert(x->prepare);
278         assert(y->prepare);
279
280         /* Enabled ones first */
281         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
282                 return -1;
283         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
284                 return 1;
285
286         /* Move most recently prepared ones last, so that we can stop
287          * preparing as soon as we hit one that has already been
288          * prepared in the current iteration */
289         if (x->prepare_iteration < y->prepare_iteration)
290                 return -1;
291         if (x->prepare_iteration > y->prepare_iteration)
292                 return 1;
293
294         /* Lower priority values first */
295         if (x->priority < y->priority)
296                 return -1;
297         if (x->priority > y->priority)
298                 return 1;
299
300         return 0;
301 }
302
303 static int earliest_time_prioq_compare(const void *a, const void *b) {
304         const sd_event_source *x = a, *y = b;
305
306         assert(EVENT_SOURCE_IS_TIME(x->type));
307         assert(x->type == y->type);
308
309         /* Enabled ones first */
310         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
311                 return -1;
312         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
313                 return 1;
314
315         /* Move the pending ones to the end */
316         if (!x->pending && y->pending)
317                 return -1;
318         if (x->pending && !y->pending)
319                 return 1;
320
321         /* Order by time */
322         if (x->time.next < y->time.next)
323                 return -1;
324         if (x->time.next > y->time.next)
325                 return 1;
326
327         return 0;
328 }
329
/* Latest admissible dispatch time of a timer source: its trigger time
 * plus the granted accuracy slack (saturating addition via usec_add). */
static usec_t time_event_source_latest(const sd_event_source *s) {
        return usec_add(s->time.next, s->time.accuracy);
}
333
334 static int latest_time_prioq_compare(const void *a, const void *b) {
335         const sd_event_source *x = a, *y = b;
336
337         assert(EVENT_SOURCE_IS_TIME(x->type));
338         assert(x->type == y->type);
339
340         /* Enabled ones first */
341         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
342                 return -1;
343         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
344                 return 1;
345
346         /* Move the pending ones to the end */
347         if (!x->pending && y->pending)
348                 return -1;
349         if (x->pending && !y->pending)
350                 return 1;
351
352         /* Order by time */
353         if (time_event_source_latest(x) < time_event_source_latest(y))
354                 return -1;
355         if (time_event_source_latest(x) > time_event_source_latest(y))
356                 return 1;
357
358         return 0;
359 }
360
361 static int exit_prioq_compare(const void *a, const void *b) {
362         const sd_event_source *x = a, *y = b;
363
364         assert(x->type == SOURCE_EXIT);
365         assert(y->type == SOURCE_EXIT);
366
367         /* Enabled ones first */
368         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
369                 return -1;
370         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
371                 return 1;
372
373         /* Lower priority values first */
374         if (x->priority < y->priority)
375                 return -1;
376         if (x->priority > y->priority)
377                 return 1;
378
379         return 0;
380 }
381
/* Releases the resources held by a clock_data member (timerfd and both
 * prioqs). The struct itself is embedded in sd_event and not freed here. */
static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}
390
/* Tears down the whole event loop object. At this point only floating
 * sources may still be connected (non-floating ones would hold a ref on
 * us); each is disconnected and dropped before the loop's own state goes. */
static void event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        /* If this was the default event loop, clear the cached pointer */
        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);
        free(e);
}
427
/* Allocates a new event loop object with all fds marked as "not opened"
 * and all timer deadlines set to "never". Returns 0 on success or a
 * negative errno-style error (-ENOMEM, or epoll_create1() failure). */
_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new0(sd_event, 1);
        if (!e)
                return -ENOMEM;

        e->n_ref = 1;
        /* Initialize every fd to -1 so event_free() can safely close them */
        e->watchdog_fd = e->epoll_fd = e->realtime.fd = e->boottime.fd = e->monotonic.fd = e->realtime_alarm.fd = e->boottime_alarm.fd = -1;
        e->realtime.next = e->boottime.next = e->monotonic.next = e->realtime_alarm.next = e->boottime_alarm.next = USEC_INFINITY;
        e->realtime.wakeup = e->boottime.wakeup = e->monotonic.wakeup = e->realtime_alarm.wakeup = e->boottime_alarm.wakeup = WAKEUP_CLOCK_DATA;
        e->original_pid = getpid(); /* to detect use across fork(), see event_pid_changed() */
        e->perturb = USEC_INFINITY; /* computed lazily by initialize_perturb() */

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 ... 2^63 us will be logged every 5s.");
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}
467
/* Takes an additional reference on the event loop. Returns the loop
 * itself, or NULL if e is NULL. */
_public_ sd_event* sd_event_ref(sd_event *e) {
        assert_return(e, NULL);

        assert(e->n_ref >= 1);
        e->n_ref++;

        return e;
}
476
477 _public_ sd_event* sd_event_unref(sd_event *e) {
478
479         if (!e)
480                 return NULL;
481
482         assert(e->n_ref >= 1);
483         e->n_ref--;
484
485         if (e->n_ref <= 0)
486                 event_free(e);
487
488         return NULL;
489 }
490
491 static bool event_pid_changed(sd_event *e) {
492         assert(e);
493
494         /* We don't support people creating an event loop and keeping
495          * it around over a fork(). Let's complain. */
496
497         return e->original_pid != getpid();
498 }
499
/* Removes an I/O source's fd from the epoll instance. epoll_ctl()
 * failure is only logged (the fd may already have been closed behind
 * our back). No-op after fork() or if the source was never registered. */
static void source_io_unregister(sd_event_source *s) {
        int r;

        assert(s);
        assert(s->type == SOURCE_IO);

        if (event_pid_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
        if (r < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}
519
/* Registers (or re-registers) an I/O source's fd with the epoll
 * instance for the given event mask. SD_EVENT_ONESHOT is mapped to
 * EPOLLONESHOT so the kernel disarms the fd after one event. Returns 0
 * on success, negative errno on epoll_ctl() failure. */
static int source_io_register(
                sd_event_source *s,
                int enabled,
                uint32_t events) {

        struct epoll_event ev = {};
        int r;

        assert(s);
        assert(s->type == SOURCE_IO);
        assert(enabled != SD_EVENT_OFF);

        ev.events = events;
        ev.data.ptr = s;

        if (enabled == SD_EVENT_ONESHOT)
                ev.events |= EPOLLONESHOT;

        /* MOD if already in the epoll set, ADD otherwise */
        if (s->io.registered)
                r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
        else
                r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
        if (r < 0)
                return -errno;

        s->io.registered = true;

        return 0;
}
549
550 #if 0 /// UNNEEDED by elogind
/* Inverse of clock_to_event_source_type(): maps a timer source type
 * back to its POSIX clock id, or (clockid_t) -1 for non-timer types. */
static clockid_t event_source_type_to_clock(EventSourceType t) {

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return CLOCK_REALTIME;

        case SOURCE_TIME_BOOTTIME:
                return CLOCK_BOOTTIME;

        case SOURCE_TIME_MONOTONIC:
                return CLOCK_MONOTONIC;

        case SOURCE_TIME_REALTIME_ALARM:
                return CLOCK_REALTIME_ALARM;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return CLOCK_BOOTTIME_ALARM;

        default:
                return (clockid_t) -1;
        }
}
574 #endif // 0
575
576 static EventSourceType clock_to_event_source_type(clockid_t clock) {
577
578         switch (clock) {
579
580         case CLOCK_REALTIME:
581                 return SOURCE_TIME_REALTIME;
582
583         case CLOCK_BOOTTIME:
584                 return SOURCE_TIME_BOOTTIME;
585
586         case CLOCK_MONOTONIC:
587                 return SOURCE_TIME_MONOTONIC;
588
589         case CLOCK_REALTIME_ALARM:
590                 return SOURCE_TIME_REALTIME_ALARM;
591
592         case CLOCK_BOOTTIME_ALARM:
593                 return SOURCE_TIME_BOOTTIME_ALARM;
594
595         default:
596                 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
597         }
598 }
599
600 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
601         assert(e);
602
603         switch (t) {
604
605         case SOURCE_TIME_REALTIME:
606                 return &e->realtime;
607
608         case SOURCE_TIME_BOOTTIME:
609                 return &e->boottime;
610
611         case SOURCE_TIME_MONOTONIC:
612                 return &e->monotonic;
613
614         case SOURCE_TIME_REALTIME_ALARM:
615                 return &e->realtime_alarm;
616
617         case SOURCE_TIME_BOOTTIME_ALARM:
618                 return &e->boottime_alarm;
619
620         default:
621                 return NULL;
622         }
623 }
624
625 static int event_make_signal_data(
626                 sd_event *e,
627                 int sig,
628                 struct signal_data **ret) {
629
630         struct epoll_event ev = {};
631         struct signal_data *d;
632         bool added = false;
633         sigset_t ss_copy;
634         int64_t priority;
635         int r;
636
637         assert(e);
638
639         if (event_pid_changed(e))
640                 return -ECHILD;
641
642         if (e->signal_sources && e->signal_sources[sig])
643                 priority = e->signal_sources[sig]->priority;
644         else
645                 priority = 0;
646
647         d = hashmap_get(e->signal_data, &priority);
648         if (d) {
649                 if (sigismember(&d->sigset, sig) > 0) {
650                         if (ret)
651                                 *ret = d;
652                 return 0;
653                 }
654         } else {
655                 r = hashmap_ensure_allocated(&e->signal_data, &uint64_hash_ops);
656                 if (r < 0)
657                         return r;
658
659                 d = new0(struct signal_data, 1);
660                 if (!d)
661                         return -ENOMEM;
662
663                 d->wakeup = WAKEUP_SIGNAL_DATA;
664                 d->fd  = -1;
665                 d->priority = priority;
666
667                 r = hashmap_put(e->signal_data, &d->priority, d);
668         if (r < 0)
669                         return r;
670
671                 added = true;
672         }
673
674         ss_copy = d->sigset;
675         assert_se(sigaddset(&ss_copy, sig) >= 0);
676
677         r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
678         if (r < 0) {
679                 r = -errno;
680                 goto fail;
681         }
682
683         d->sigset = ss_copy;
684
685         if (d->fd >= 0) {
686                 if (ret)
687                         *ret = d;
688                 return 0;
689         }
690
691         d->fd = r;
692
693         ev.events = EPOLLIN;
694         ev.data.ptr = d;
695
696         r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev);
697         if (r < 0) {
698                 r = -errno;
699                 goto fail;
700         }
701
702         if (ret)
703                 *ret = d;
704
705         return 0;
706
707 fail:
708         if (added) {
709                 d->fd = safe_close(d->fd);
710                 hashmap_remove(e->signal_data, &d->priority);
711                 free(d);
712         }
713
714         return r;
715 }
716
/* Turns off the specified signal in the signal_data object. If the
 * object's signal mask becomes empty as a result, the object (and its
 * signalfd) is removed and freed. */
static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        if (sigismember(&d->sigset, sig) == 0)
                return; /* signal not routed through this fd */

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {

                /* If the mask is now all-zero we can get rid of the structure */
                hashmap_remove(e->signal_data, &d->priority);
                assert(!d->current);
                safe_close(d->fd);
                free(d);
                return;
        }

        assert(d->fd >= 0);

        /* Shrink the kernel-side mask of the existing signalfd */
        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}
745
/* Rechecks if the specified signal is still something we are interested
 * in. If not, unmasks it from every signal_data object it might be
 * routed through, possibly dropping the signalfd(s) entirely. */
static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0; /* bucket used by SIGCHLD/child sources */

        assert(e);

        /* Still needed for child process handling? */
        if (sig == SIGCHLD &&
            e->n_enabled_child_sources > 0)
                return;

        /* Still needed by an enabled signal source? */
        if (e->signal_sources &&
            e->signal_sources[sig] &&
            e->signal_sources[sig]->enabled != SD_EVENT_OFF)
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}
791
/* Detaches a source from its event loop: unregisters it from the
 * type-specific data structures, drops it from the pending/prepare
 * queues and the global source list. The source object itself stays
 * allocated — the caller's reference remains valid. Idempotent. */
static void source_disconnect(sd_event_source *s) {
        sd_event *event;

        assert(s);

        if (!s->event)
                return; /* already disconnected */

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM: {
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_remove(d->earliest, s, &s->time.earliest_index);
                prioq_remove(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true; /* the timerfd may now be armed for a stale deadline */
                break;
        }

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        /* Possibly drop the per-priority signalfd for this signal */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                }

                break;

        case SOURCE_CHILD:
                if (s->child.pid > 0) {
                        if (s->enabled != SD_EVENT_OFF) {
                                assert(s->event->n_enabled_child_sources > 0);
                                s->event->n_enabled_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                        /* SIGCHLD may no longer be needed by anyone */
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                }

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        default:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        event = s->event;

        s->type = _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        s->event = NULL;
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Non-floating sources held a ref on the loop; give it back */
        if (!s->floating)
                sd_event_unref(event);
}
882
/* Disconnects the source from its event loop and frees it. Called when
 * the last reference is dropped. */
static void source_free(sd_event_source *s) {
        assert(s);

        source_disconnect(s);
        free(s->description);
        free(s);
}
890
/* Marks a source as pending (ready for dispatch) or not, keeping the
 * loop's pending prioq, the per-clock prioqs and the signal dispatch
 * bookkeeping in sync. Returns 0 on success, negative errno on OOM. */
static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT); /* exit sources use their own prioq instead */

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false; /* roll back on failure */
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type)) {
                struct clock_data *d;

                /* Pending state changes a timer source's sort position in
                 * the clock prioqs, hence reshuffle and force a rearm */
                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                prioq_reshuffle(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
        }

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                /* No longer pending: drop the "currently dispatched" marker */
                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        return 0;
}
934
/* Allocates a new event source of the given type attached to loop e.
 * A non-floating source takes a reference on the loop; a floating one
 * is instead owned by the loop (see event_free()). Returns NULL on OOM. */
static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
        sd_event_source *s;

        assert(e);

        s = new0(sd_event_source, 1);
        if (!s)
                return NULL;

        s->n_ref = 1;
        s->event = e;
        s->floating = floating;
        s->type = type;
        s->pending_index = s->prepare_index = PRIOQ_IDX_NULL; /* not queued yet */

        if (!floating)
                sd_event_ref(e);

        LIST_PREPEND(sources, e->sources, s);
        e->n_sources ++;

        return s;
}
958
/* Adds an I/O event source watching fd for the given EPOLL* event mask
 * and registers it with epoll immediately, enabled as SD_EVENT_ON. If
 * ret is NULL the source is created floating (owned by the loop).
 * Returns 0 on success, negative errno on failure. */
_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0) {
                source_free(s); /* undoes the source_new() above */
                return r;
        }

        if (ret)
                *ret = s;

        return 0;
}
999
static void initialize_perturb(sd_event *e) {
        sd_id128_t bootid = {};

        /* When we sleep for longer, we try to realign the wakeup to
           the same time within each minute/second/250ms, so that
           events all across the system can be coalesced into a single
           CPU wakeup. However, let's take some system-specific
           randomness for this value, so that in a network of systems
           with synced clocks timer events are distributed a
           bit. Here, we calculate a perturbation usec offset from the
           boot ID. */

        if (_likely_(e->perturb != USEC_INFINITY))
                return; /* already computed */

        /* On failure e->perturb stays USEC_INFINITY and we retry next time */
        if (sd_id128_get_boot(&bootid) >= 0)
                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
}
1018
1019 static int event_setup_timer_fd(
1020                 sd_event *e,
1021                 struct clock_data *d,
1022                 clockid_t clock) {
1023
1024         struct epoll_event ev = {};
1025         int r, fd;
1026
1027         assert(e);
1028         assert(d);
1029
1030         if (_likely_(d->fd >= 0))
1031                 return 0;
1032
1033         fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1034         if (fd < 0)
1035                 return -errno;
1036
1037         ev.events = EPOLLIN;
1038         ev.data.ptr = d;
1039
1040         r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
1041         if (r < 0) {
1042                 safe_close(fd);
1043                 return -errno;
1044         }
1045
1046         d->fd = fd;
1047         return 0;
1048 }
1049
/* Default handler installed by sd_event_add_time() when the caller
 * passes a NULL callback: request loop termination, with the userdata
 * pointer carrying the exit code. */
static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}
1055
/* Adds a timer event source on the specified clock, firing at absolute
 * time "usec" (in that clock's timeline), with a coalescing window of
 * "accuracy" microseconds (0 selects the 250ms default). A NULL
 * callback makes the timer exit the loop when it fires, using userdata
 * as the exit code. The source defaults to one-shot mode. */
_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        sd_event_source *s;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(accuracy != (uint64_t) -1, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = time_exit_callback;

        /* Rejects clocks we have no event source type for. */
        type = clock_to_event_source_type(clock);
        assert_return(type >= 0, -EOPNOTSUPP);

        d = event_get_clock_data(e, type);
        assert(d);

        /* Each clock keeps two priority queues: "earliest" ordered by
         * the trigger time, "latest" by trigger time plus accuracy —
         * both are needed to pick a coalesced wakeup time. */
        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        /* Lazily create the timerfd backing this clock. */
        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* The timerfd must be reprogrammed before the next poll. */
        d->needs_rearm = true;

        r = prioq_put(d->earliest, s, &s->time.earliest_index);
        if (r < 0)
                goto fail;

        r = prioq_put(d->latest, s, &s->time.latest_index);
        if (r < 0)
                goto fail;

        if (ret)
                *ret = s;

        return 0;

fail:
        /* source_free() also removes any prioq entries made so far. */
        source_free(s);
        return r;
}
1128
/* Default handler installed by sd_event_add_signal() when the caller
 * passes a NULL callback: request loop termination, with the userdata
 * pointer carrying the exit code. */
static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}
1134
1135 _public_ int sd_event_add_signal(
1136                 sd_event *e,
1137                 sd_event_source **ret,
1138                 int sig,
1139                 sd_event_signal_handler_t callback,
1140                 void *userdata) {
1141
1142         sd_event_source *s;
1143         struct signal_data *d;
1144         sigset_t ss;
1145         int r;
1146
1147         assert_return(e, -EINVAL);
1148         assert_return(sig > 0, -EINVAL);
1149         assert_return(sig < _NSIG, -EINVAL);
1150         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1151         assert_return(!event_pid_changed(e), -ECHILD);
1152
1153         if (!callback)
1154                 callback = signal_exit_callback;
1155
1156         r = pthread_sigmask(SIG_SETMASK, NULL, &ss);
1157         if (r != 0)
1158                 return -r;
1159
1160         if (!sigismember(&ss, sig))
1161                 return -EBUSY;
1162
1163         if (!e->signal_sources) {
1164                 e->signal_sources = new0(sd_event_source*, _NSIG);
1165                 if (!e->signal_sources)
1166                         return -ENOMEM;
1167         } else if (e->signal_sources[sig])
1168                 return -EBUSY;
1169
1170         s = source_new(e, !ret, SOURCE_SIGNAL);
1171         if (!s)
1172                 return -ENOMEM;
1173
1174         s->signal.sig = sig;
1175         s->signal.callback = callback;
1176         s->userdata = userdata;
1177         s->enabled = SD_EVENT_ON;
1178
1179         e->signal_sources[sig] = s;
1180
1181         r = event_make_signal_data(e, sig, &d);
1182                 if (r < 0) {
1183                         source_free(s);
1184                         return r;
1185                 }
1186
1187         /* Use the signal name as description for the event source by default */
1188         (void) sd_event_source_set_description(s, signal_to_string(sig));
1189
1190         if (ret)
1191                 *ret = s;
1192
1193         return 0;
1194 }
1195
1196 #if 0 /// UNNEEDED by elogind
1197 _public_ int sd_event_add_child(
1198                 sd_event *e,
1199                 sd_event_source **ret,
1200                 pid_t pid,
1201                 int options,
1202                 sd_event_child_handler_t callback,
1203                 void *userdata) {
1204
1205         sd_event_source *s;
1206         int r;
1207
1208         assert_return(e, -EINVAL);
1209         assert_return(pid > 1, -EINVAL);
1210         assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1211         assert_return(options != 0, -EINVAL);
1212         assert_return(callback, -EINVAL);
1213         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1214         assert_return(!event_pid_changed(e), -ECHILD);
1215
1216         r = hashmap_ensure_allocated(&e->child_sources, NULL);
1217         if (r < 0)
1218                 return r;
1219
1220         if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1221                 return -EBUSY;
1222
1223         s = source_new(e, !ret, SOURCE_CHILD);
1224         if (!s)
1225                 return -ENOMEM;
1226
1227         s->child.pid = pid;
1228         s->child.options = options;
1229         s->child.callback = callback;
1230         s->userdata = userdata;
1231         s->enabled = SD_EVENT_ONESHOT;
1232
1233         r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1234         if (r < 0) {
1235                 source_free(s);
1236                 return r;
1237         }
1238
1239         e->n_enabled_child_sources ++;
1240
1241         r = event_make_signal_data(e, SIGCHLD, NULL);
1242                 if (r < 0) {
1243                 e->n_enabled_child_sources--;
1244                         source_free(s);
1245                         return r;
1246                 }
1247
1248         e->need_process_child = true;
1249
1250         if (ret)
1251                 *ret = s;
1252
1253         return 0;
1254 }
1255
1256 _public_ int sd_event_add_defer(
1257                 sd_event *e,
1258                 sd_event_source **ret,
1259                 sd_event_handler_t callback,
1260                 void *userdata) {
1261
1262         sd_event_source *s;
1263         int r;
1264
1265         assert_return(e, -EINVAL);
1266         assert_return(callback, -EINVAL);
1267         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1268         assert_return(!event_pid_changed(e), -ECHILD);
1269
1270         s = source_new(e, !ret, SOURCE_DEFER);
1271         if (!s)
1272                 return -ENOMEM;
1273
1274         s->defer.callback = callback;
1275         s->userdata = userdata;
1276         s->enabled = SD_EVENT_ONESHOT;
1277
1278         r = source_set_pending(s, true);
1279         if (r < 0) {
1280                 source_free(s);
1281                 return r;
1282         }
1283
1284         if (ret)
1285                 *ret = s;
1286
1287         return 0;
1288 }
1289 #endif // 0
1290
1291 _public_ int sd_event_add_post(
1292                 sd_event *e,
1293                 sd_event_source **ret,
1294                 sd_event_handler_t callback,
1295                 void *userdata) {
1296
1297         sd_event_source *s;
1298         int r;
1299
1300         assert_return(e, -EINVAL);
1301         assert_return(callback, -EINVAL);
1302         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1303         assert_return(!event_pid_changed(e), -ECHILD);
1304
1305         r = set_ensure_allocated(&e->post_sources, NULL);
1306         if (r < 0)
1307                 return r;
1308
1309         s = source_new(e, !ret, SOURCE_POST);
1310         if (!s)
1311                 return -ENOMEM;
1312
1313         s->post.callback = callback;
1314         s->userdata = userdata;
1315         s->enabled = SD_EVENT_ON;
1316
1317         r = set_put(e->post_sources, s);
1318         if (r < 0) {
1319                 source_free(s);
1320                 return r;
1321         }
1322
1323         if (ret)
1324                 *ret = s;
1325
1326         return 0;
1327 }
1328
1329 _public_ int sd_event_add_exit(
1330                 sd_event *e,
1331                 sd_event_source **ret,
1332                 sd_event_handler_t callback,
1333                 void *userdata) {
1334
1335         sd_event_source *s;
1336         int r;
1337
1338         assert_return(e, -EINVAL);
1339         assert_return(callback, -EINVAL);
1340         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1341         assert_return(!event_pid_changed(e), -ECHILD);
1342
1343         r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1344         if (r < 0)
1345                 return r;
1346
1347         s = source_new(e, !ret, SOURCE_EXIT);
1348         if (!s)
1349                 return -ENOMEM;
1350
1351         s->exit.callback = callback;
1352         s->userdata = userdata;
1353         s->exit.prioq_index = PRIOQ_IDX_NULL;
1354         s->enabled = SD_EVENT_ONESHOT;
1355
1356         r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1357         if (r < 0) {
1358                 source_free(s);
1359                 return r;
1360         }
1361
1362         if (ret)
1363                 *ret = s;
1364
1365         return 0;
1366 }
1367
1368 #if 0 /// UNNEEDED by elogind
/* Takes an additional reference on the event source and returns it;
 * returns NULL if s is NULL. */
_public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
        assert_return(s, NULL);

        assert(s->n_ref >= 1);
        s->n_ref++;

        return s;
}
1377 #endif // 0
1378
/* Drops one reference on the event source (NULL-safe). Always returns
 * NULL so callers can write "s = sd_event_source_unref(s);". */
_public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {

        if (!s)
                return NULL;

        assert(s->n_ref >= 1);
        s->n_ref--;

        if (s->n_ref <= 0) {
                /* Here's a special hack: when we are called from a
                 * dispatch handler we won't free the event source
                 * immediately, but we will detach the fd from the
                 * epoll. This way it is safe for the caller to unref
                 * the event source and immediately close the fd, but
                 * we still retain a valid event source object after
                 * the callback. */

                if (s->dispatching) {
                        if (s->type == SOURCE_IO)
                                source_io_unregister(s);

                        /* Detach from the loop; the object itself is
                         * freed once dispatching finishes. */
                        source_disconnect(s);
                } else
                        source_free(s);
        }

        return NULL;
}
1407
/* Sets (or, with NULL, clears) the free-form description string of the
 * event source, used in logging/debugging. The string is copied. */
_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
        assert_return(s, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return free_and_strdup(&s->description, description);
}
1414
1415 #if 0 /// UNNEEDED by elogind
/* Retrieves the description string previously set on the source;
 * returns -ENXIO if none was set. The returned pointer stays owned by
 * the event source. */
_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
        assert_return(s, -EINVAL);
        assert_return(description, -EINVAL);
        assert_return(s->description, -ENXIO);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *description = s->description;
        return 0;
}
1425 #endif // 0
1426
/* Returns the event loop the source is attached to (borrowed
 * reference), or NULL if s is NULL. */
_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
        assert_return(s, NULL);

        return s->event;
}
1432
1433 #if 0 /// UNNEEDED by elogind
/* Returns whether the source is currently queued for dispatch (1 or
 * 0). Exit sources have no pending state, hence -EDOM for those. */
_public_ int sd_event_source_get_pending(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->pending;
}
1442
/* Returns the file descriptor an I/O event source watches; -EDOM for
 * any other source type. */
_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->io.fd;
}
1450 #endif // 0
1451
1452 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
1453         int r;
1454
1455         assert_return(s, -EINVAL);
1456         assert_return(fd >= 0, -EBADF);
1457         assert_return(s->type == SOURCE_IO, -EDOM);
1458         assert_return(!event_pid_changed(s->event), -ECHILD);
1459
1460         if (s->io.fd == fd)
1461                 return 0;
1462
1463         if (s->enabled == SD_EVENT_OFF) {
1464                 s->io.fd = fd;
1465                 s->io.registered = false;
1466         } else {
1467                 int saved_fd;
1468
1469                 saved_fd = s->io.fd;
1470                 assert(s->io.registered);
1471
1472                 s->io.fd = fd;
1473                 s->io.registered = false;
1474
1475                 r = source_io_register(s, s->enabled, s->io.events);
1476                 if (r < 0) {
1477                         s->io.fd = saved_fd;
1478                         s->io.registered = true;
1479                         return r;
1480                 }
1481
1482                 epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
1483         }
1484
1485         return 0;
1486 }
1487
1488 #if 0 /// UNNEEDED by elogind
/* Retrieves the epoll event mask an I/O event source is configured
 * with; -EDOM for any other source type. */
_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
        assert_return(s, -EINVAL);
        assert_return(events, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *events = s->io.events;
        return 0;
}
1498 #endif // 0
1499
1500 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
1501         int r;
1502
1503         assert_return(s, -EINVAL);
1504         assert_return(s->type == SOURCE_IO, -EDOM);
1505         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1506         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1507         assert_return(!event_pid_changed(s->event), -ECHILD);
1508
1509         /* edge-triggered updates are never skipped, so we can reset edges */
1510         if (s->io.events == events && !(events & EPOLLET))
1511                 return 0;
1512
1513         if (s->enabled != SD_EVENT_OFF) {
1514                 r = source_io_register(s, s->enabled, events);
1515                 if (r < 0)
1516                         return r;
1517         }
1518
1519         s->io.events = events;
1520         source_set_pending(s, false);
1521
1522         return 0;
1523 }
1524
1525 #if 0 /// UNNEEDED by elogind
/* Retrieves the epoll events that actually fired for a pending I/O
 * event source; -ENODATA if the source is not currently pending. */
_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
        assert_return(s, -EINVAL);
        assert_return(revents, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(s->pending, -ENODATA);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *revents = s->io.revents;
        return 0;
}
1536
/* Returns the signal number a signal event source watches; -EDOM for
 * any other source type. */
_public_ int sd_event_source_get_signal(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->signal.sig;
}
1544
1545 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
1546         assert_return(s, -EINVAL);
1547         assert_return(!event_pid_changed(s->event), -ECHILD);
1548
1549         return s->priority;
1550 }
1551 #endif // 0
1552
1553 _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
1554         int r;
1555
1556         assert_return(s, -EINVAL);
1557         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1558         assert_return(!event_pid_changed(s->event), -ECHILD);
1559
1560         if (s->priority == priority)
1561                 return 0;
1562
1563         if (s->type == SOURCE_SIGNAL && s->enabled != SD_EVENT_OFF) {
1564                 struct signal_data *old, *d;
1565
1566                 /* Move us from the signalfd belonging to the old
1567                  * priority to the signalfd of the new priority */
1568
1569                 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
1570
1571                 s->priority = priority;
1572
1573                 r = event_make_signal_data(s->event, s->signal.sig, &d);
1574                 if (r < 0) {
1575                         s->priority = old->priority;
1576                         return r;
1577                 }
1578
1579                 event_unmask_signal_data(s->event, old, s->signal.sig);
1580         } else
1581         s->priority = priority;
1582
1583         if (s->pending)
1584                 prioq_reshuffle(s->event->pending, s, &s->pending_index);
1585
1586         if (s->prepare)
1587                 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
1588
1589         if (s->type == SOURCE_EXIT)
1590                 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1591
1592         return 0;
1593 }
1594
1595 #if 0 /// UNNEEDED by elogind
/* Retrieves the enablement state (SD_EVENT_OFF, SD_EVENT_ON or
 * SD_EVENT_ONESHOT) of the event source into *m. */
_public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
        assert_return(s, -EINVAL);
        assert_return(m, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *m = s->enabled;
        return 0;
}
1604 #endif // 0
1605
1606 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
1607         int r;
1608
1609         assert_return(s, -EINVAL);
1610         assert_return(m == SD_EVENT_OFF || m == SD_EVENT_ON || m == SD_EVENT_ONESHOT, -EINVAL);
1611         assert_return(!event_pid_changed(s->event), -ECHILD);
1612
1613         /* If we are dead anyway, we are fine with turning off
1614          * sources, but everything else needs to fail. */
1615         if (s->event->state == SD_EVENT_FINISHED)
1616                 return m == SD_EVENT_OFF ? 0 : -ESTALE;
1617
1618         if (s->enabled == m)
1619                 return 0;
1620
1621         if (m == SD_EVENT_OFF) {
1622
1623                 switch (s->type) {
1624
1625                 case SOURCE_IO:
1626                         source_io_unregister(s);
1627                         s->enabled = m;
1628                         break;
1629
1630                 case SOURCE_TIME_REALTIME:
1631                 case SOURCE_TIME_BOOTTIME:
1632                 case SOURCE_TIME_MONOTONIC:
1633                 case SOURCE_TIME_REALTIME_ALARM:
1634                 case SOURCE_TIME_BOOTTIME_ALARM: {
1635                         struct clock_data *d;
1636
1637                         s->enabled = m;
1638                         d = event_get_clock_data(s->event, s->type);
1639                         assert(d);
1640
1641                         prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
1642                         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1643                         d->needs_rearm = true;
1644                         break;
1645                 }
1646
1647                 case SOURCE_SIGNAL:
1648                         s->enabled = m;
1649
1650                         event_gc_signal_data(s->event, &s->priority, s->signal.sig);
1651                         break;
1652
1653                 case SOURCE_CHILD:
1654                         s->enabled = m;
1655
1656                         assert(s->event->n_enabled_child_sources > 0);
1657                         s->event->n_enabled_child_sources--;
1658
1659                         event_gc_signal_data(s->event, &s->priority, SIGCHLD);
1660                         break;
1661
1662                 case SOURCE_EXIT:
1663                         s->enabled = m;
1664                         prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1665                         break;
1666
1667                 case SOURCE_DEFER:
1668                 case SOURCE_POST:
1669                         s->enabled = m;
1670                         break;
1671
1672                 default:
1673                         assert_not_reached("Wut? I shouldn't exist.");
1674                 }
1675
1676         } else {
1677                 switch (s->type) {
1678
1679                 case SOURCE_IO:
1680                         r = source_io_register(s, m, s->io.events);
1681                         if (r < 0)
1682                                 return r;
1683
1684                         s->enabled = m;
1685                         break;
1686
1687                 case SOURCE_TIME_REALTIME:
1688                 case SOURCE_TIME_BOOTTIME:
1689                 case SOURCE_TIME_MONOTONIC:
1690                 case SOURCE_TIME_REALTIME_ALARM:
1691                 case SOURCE_TIME_BOOTTIME_ALARM: {
1692                         struct clock_data *d;
1693
1694                         s->enabled = m;
1695                         d = event_get_clock_data(s->event, s->type);
1696                         assert(d);
1697
1698                         prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
1699                         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1700                         d->needs_rearm = true;
1701                         break;
1702                 }
1703
1704                 case SOURCE_SIGNAL:
1705
1706                         s->enabled = m;
1707
1708                         r = event_make_signal_data(s->event, s->signal.sig, NULL);
1709                                 if (r < 0) {
1710                                         s->enabled = SD_EVENT_OFF;
1711                                 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
1712                                         return r;
1713                                 }
1714
1715                         break;
1716
1717                 case SOURCE_CHILD:
1718
1719                         if (s->enabled == SD_EVENT_OFF)
1720                                 s->event->n_enabled_child_sources++;
1721
1722                         s->enabled = m;
1723
1724                         r = event_make_signal_data(s->event, SIGCHLD, NULL);
1725                                         if (r < 0) {
1726                                                 s->enabled = SD_EVENT_OFF;
1727                                 s->event->n_enabled_child_sources--;
1728                                 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
1729                                                 return r;
1730                                         }
1731
1732                         break;
1733
1734                 case SOURCE_EXIT:
1735                         s->enabled = m;
1736                         prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1737                         break;
1738
1739                 case SOURCE_DEFER:
1740                 case SOURCE_POST:
1741                         s->enabled = m;
1742                         break;
1743
1744                 default:
1745                         assert_not_reached("Wut? I shouldn't exist.");
1746                 }
1747         }
1748
1749         if (s->pending)
1750                 prioq_reshuffle(s->event->pending, s, &s->pending_index);
1751
1752         if (s->prepare)
1753                 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
1754
1755         return 0;
1756 }
1757
/* Retrieves the absolute trigger time of a timer event source into
 * *usec; -EDOM for non-timer sources. */
_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.next;
        return 0;
}
1767
1768 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
1769         struct clock_data *d;
1770
1771         assert_return(s, -EINVAL);
1772         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
1773         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1774         assert_return(!event_pid_changed(s->event), -ECHILD);
1775
1776         s->time.next = usec;
1777
1778         source_set_pending(s, false);
1779
1780         d = event_get_clock_data(s->event, s->type);
1781         assert(d);
1782
1783         prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
1784         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1785         d->needs_rearm = true;
1786
1787         return 0;
1788 }
1789
1790 #if 0 /// UNNEEDED by elogind
/* Retrieves the coalescing accuracy window (in usec) of a timer event
 * source into *usec; -EDOM for non-timer sources. */
_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.accuracy;
        return 0;
}
1800
1801 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
1802         struct clock_data *d;
1803
1804         assert_return(s, -EINVAL);
1805         assert_return(usec != (uint64_t) -1, -EINVAL);
1806         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
1807         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1808         assert_return(!event_pid_changed(s->event), -ECHILD);
1809
1810         if (usec == 0)
1811                 usec = DEFAULT_ACCURACY_USEC;
1812
1813         s->time.accuracy = usec;
1814
1815         source_set_pending(s, false);
1816
1817         d = event_get_clock_data(s->event, s->type);
1818         assert(d);
1819
1820         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1821         d->needs_rearm = true;
1822
1823         return 0;
1824 }
1825
/* Retrieves the clockid_t a timer event source is based on into
 * *clock; -EDOM for non-timer sources. */
_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
        assert_return(s, -EINVAL);
        assert_return(clock, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *clock = event_source_type_to_clock(s->type);
        return 0;
}
1835
/* Retrieves the PID a child event source watches into *pid; -EDOM for
 * any other source type. */
_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
        assert_return(s, -EINVAL);
        assert_return(pid, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *pid = s->child.pid;
        return 0;
}
1845 #endif // 0
1846
1847 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
1848         int r;
1849
1850         assert_return(s, -EINVAL);
1851         assert_return(s->type != SOURCE_EXIT, -EDOM);
1852         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1853         assert_return(!event_pid_changed(s->event), -ECHILD);
1854
1855         if (s->prepare == callback)
1856                 return 0;
1857
1858         if (callback && s->prepare) {
1859                 s->prepare = callback;
1860                 return 0;
1861         }
1862
1863         r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
1864         if (r < 0)
1865                 return r;
1866
1867         s->prepare = callback;
1868
1869         if (callback) {
1870                 r = prioq_put(s->event->prepare, s, &s->prepare_index);
1871                 if (r < 0)
1872                         return r;
1873         } else
1874                 prioq_remove(s->event->prepare, s, &s->prepare_index);
1875
1876         return 0;
1877 }
1878
1879 #if 0 /// UNNEEDED by elogind
/* Returns the opaque userdata pointer attached to the event source, or
 * NULL if s is NULL. */
_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
        assert_return(s, NULL);

        return s->userdata;
}
1885
1886 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
1887         void *ret;
1888
1889         assert_return(s, NULL);
1890
1891         ret = s->userdata;
1892         s->userdata = userdata;
1893
1894         return ret;
1895 }
1896 #endif // 0
1897
/* Picks a wakeup time in [a, b] (earliest deadline, latest acceptable
 * time), coalescing onto perturbed minute/10s/1s/250ms boundaries from
 * the coarsest granularity that still fits in the window. */
static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
        usec_t c;
        assert(e);
        assert(a <= b);

        if (a <= 0)
                return 0;
        if (a >= USEC_INFINITY)
                return USEC_INFINITY;

        /* Degenerate window: no room to coalesce, wake at the deadline. */
        if (b <= a + 1)
                return a;

        initialize_perturb(e);

        /*
          Find a good time to wake up again between times a and b. We
          have two goals here:

          a) We want to wake up as seldom as possible, hence prefer
             later times over earlier times.

          b) But if we have to wake up, then let's make sure to
             dispatch as much as possible on the entire system.

          We implement this by waking up everywhere at the same time
          within any given minute if we can, synchronised via the
          perturbation value determined from the boot ID. If we can't,
          then we try to find the same spot in every 10s, then 1s and
          then 250ms step. Otherwise, we pick the last possible time
          to wake up.
        */

        /* Each stanza: place c on the perturbed boundary at or below b
         * (stepping one granularity back if it overshoots, guarding
         * against underflow), and accept it if it is still >= a. */
        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MINUTE))
                        return b;

                c -= USEC_PER_MINUTE;
        }

        if (c >= a)
                return c;

        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC*10))
                        return b;

                c -= USEC_PER_SEC*10;
        }

        if (c >= a)
                return c;

        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC))
                        return b;

                c -= USEC_PER_SEC;
        }

        if (c >= a)
                return c;

        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MSEC*250))
                        return b;

                c -= USEC_PER_MSEC*250;
        }

        if (c >= a)
                return c;

        /* No coalescing boundary fits; wake at the latest possible time. */
        return b;
}
1977
1978 static int event_arm_timer(
1979                 sd_event *e,
1980                 struct clock_data *d) {
1981
1982         struct itimerspec its = {};
1983         sd_event_source *a, *b;
1984         usec_t t;
1985         int r;
1986
1987         assert(e);
1988         assert(d);
1989
1990         if (!d->needs_rearm)
1991                 return 0;
1992         else
1993                 d->needs_rearm = false;
1994
1995         a = prioq_peek(d->earliest);
1996         if (!a || a->enabled == SD_EVENT_OFF || a->time.next == USEC_INFINITY) {
1997
1998                 if (d->fd < 0)
1999                         return 0;
2000
2001                 if (d->next == USEC_INFINITY)
2002                         return 0;
2003
2004                 /* disarm */
2005                 r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2006                 if (r < 0)
2007                         return r;
2008
2009                 d->next = USEC_INFINITY;
2010                 return 0;
2011         }
2012
2013         b = prioq_peek(d->latest);
2014         assert_se(b && b->enabled != SD_EVENT_OFF);
2015
2016         t = sleep_between(e, a->time.next, time_event_source_latest(b));
2017         if (d->next == t)
2018                 return 0;
2019
2020         assert_se(d->fd >= 0);
2021
2022         if (t == 0) {
2023                 /* We don' want to disarm here, just mean some time looooong ago. */
2024                 its.it_value.tv_sec = 0;
2025                 its.it_value.tv_nsec = 1;
2026         } else
2027                 timespec_store(&its.it_value, t);
2028
2029         r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2030         if (r < 0)
2031                 return -errno;
2032
2033         d->next = t;
2034         return 0;
2035 }
2036
2037 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
2038         assert(e);
2039         assert(s);
2040         assert(s->type == SOURCE_IO);
2041
2042         /* If the event source was already pending, we just OR in the
2043          * new revents, otherwise we reset the value. The ORing is
2044          * necessary to handle EPOLLONESHOT events properly where
2045          * readability might happen independently of writability, and
2046          * we need to keep track of both */
2047
2048         if (s->pending)
2049                 s->io.revents |= revents;
2050         else
2051                 s->io.revents = revents;
2052
2053         return source_set_pending(s, true);
2054 }
2055
2056 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
2057         uint64_t x;
2058         ssize_t ss;
2059
2060         assert(e);
2061         assert(fd >= 0);
2062
2063         assert_return(events == EPOLLIN, -EIO);
2064
2065         ss = read(fd, &x, sizeof(x));
2066         if (ss < 0) {
2067                 if (errno == EAGAIN || errno == EINTR)
2068                         return 0;
2069
2070                 return -errno;
2071         }
2072
2073         if (_unlikely_(ss != sizeof(x)))
2074                 return -EIO;
2075
2076         if (next)
2077                 *next = USEC_INFINITY;
2078
2079         return 0;
2080 }
2081
2082 static int process_timer(
2083                 sd_event *e,
2084                 usec_t n,
2085                 struct clock_data *d) {
2086
2087         sd_event_source *s;
2088         int r;
2089
2090         assert(e);
2091         assert(d);
2092
2093         for (;;) {
2094                 s = prioq_peek(d->earliest);
2095                 if (!s ||
2096                     s->time.next > n ||
2097                     s->enabled == SD_EVENT_OFF ||
2098                     s->pending)
2099                         break;
2100
2101                 r = source_set_pending(s, true);
2102                 if (r < 0)
2103                         return r;
2104
2105                 prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
2106                 prioq_reshuffle(d->latest, s, &s->time.latest_index);
2107                 d->needs_rearm = true;
2108         }
2109
2110         return 0;
2111 }
2112
/* Poll each watched child process with waitid(WNOHANG) and mark the
 * corresponding event sources pending when there is a state change to
 * report. Returns 0 on success, negative errno-style error otherwise. */
static int process_child(sd_event *e) {
        sd_event_source *s;
        Iterator i;
        int r;

        assert(e);

        e->need_process_child = false;

        /*
           So, this is ugly. We iteratively invoke waitid() with P_PID
           + WNOHANG for each PID we wait for, instead of using
           P_ALL. This is because we only want to get child
           information of very specific child processes, and not all
           of them. We might not have processed the SIGCHLD even of a
           previous invocation and we don't want to maintain a
           unbounded *per-child* event queue, hence we really don't
           want anything flushed out of the kernel's queue that we
           don't care about. Since this is O(n) this means that if you
           have a lot of processes you probably want to handle SIGCHLD
           yourself.

           We do not reap the children here (by using WNOWAIT), this
           is only done after the event source is dispatched so that
           the callback still sees the process as a zombie.
        */

        HASHMAP_FOREACH(s, e->child_sources, i) {
                assert(s->type == SOURCE_CHILD);

                /* Already queued for dispatch, don't poll again. */
                if (s->pending)
                        continue;

                if (s->enabled == SD_EVENT_OFF)
                        continue;

                zero(s->child.siginfo);
                /* WNOWAIT (only when WEXITED is watched) keeps the
                 * zombie around so the callback can still inspect it. */
                r = waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
                if (r < 0)
                        return -errno;

                /* si_pid != 0 means waitid() actually found a state change. */
                if (s->child.siginfo.si_pid != 0) {
                        bool zombie =
                                s->child.siginfo.si_code == CLD_EXITED ||
                                s->child.siginfo.si_code == CLD_KILLED ||
                                s->child.siginfo.si_code == CLD_DUMPED;

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's
                                 * immediately remove the state change
                                 * from the queue, since there's no
                                 * benefit in leaving it queued */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
                        if (r < 0)
                                return r;
                }
        }

        return 0;
}
2179
/* Dequeue at most one signal from this priority's signalfd and mark
 * the matching signal source pending. Returns 1 if a source was
 * queued, 0 if nothing (relevant) was read, negative on error. */
static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
        bool read_one = false;
        int r;

        assert(e);
        assert_return(events == EPOLLIN, -EIO);

        /* If there's a signal queued on this priority and SIGCHLD is
           on this priority too, then make sure to recheck the
           children we watch. This is because we only ever dequeue
           the first signal per priority, and if we dequeue one, and
           SIGCHLD might be enqueued later we wouldn't know, but we
           might have higher priority children we care about hence we
           need to check that explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

        /* If there's already an event source pending for this
         * priority we don't read another */
        if (d->current)
                return 0;

        for (;;) {
                struct signalfd_siginfo si;
                ssize_t n;
                sd_event_source *s = NULL;

                n = read(d->fd, &si, sizeof(si));
                if (n < 0) {
                        /* Queue drained (or interrupted): report
                         * whether we dequeued anything at all. */
                        if (errno == EAGAIN || errno == EINTR)
                                return read_one;

                        return -errno;
                }

                if (_unlikely_(n != sizeof(si)))
                        return -EIO;

                assert(si.ssi_signo < _NSIG);

                read_one = true;

                /* Skip signals nobody subscribed to, and sources that
                 * are already pending; keep reading in that case. */
                if (e->signal_sources)
                        s = e->signal_sources[si.ssi_signo];
                if (!s)
                        continue;
                if (s->pending)
                        continue;

                /* Stash the siginfo for the callback and remember this
                 * source as the one currently owning the priority. */
                s->signal.siginfo = si;
                d->current = s;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                return 1;
        }
}
2240
/* Invoke the user callback of a pending event source, handling the
 * per-type callback signatures, oneshot auto-disable, post-source
 * triggering, and cleanup of sources released during dispatch.
 * Returns 1 on success, negative on (internal) error; callback errors
 * disable the source but still return 1. */
static int source_dispatch(sd_event_source *s) {
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        /* Defer and exit sources stay "pending" across dispatches;
         * everything else is dequeued first. */
        if (s->type != SOURCE_DEFER && s->type != SOURCE_EXIT) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        if (s->type != SOURCE_POST) {
                sd_event_source *z;
                Iterator i;

                /* If we execute a non-post source, let's mark all
                 * post sources as pending */

                SET_FOREACH(z, s->event->post_sources, i) {
                        if (z->enabled == SD_EVENT_OFF)
                                continue;

                        r = source_set_pending(z, true);
                        if (r < 0)
                                return r;
                }
        }

        /* Oneshot sources are turned off before the callback runs, so
         * the callback may re-enable them if it wants to. */
        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        s->dispatching = true;

        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                /* Decide before the callback whether the child is dead;
                 * the callback may overwrite the cached siginfo. */
                zombie = s->child.siginfo.si_code == CLD_EXITED ||
                         s->child.siginfo.si_code == CLD_KILLED ||
                         s->child.siginfo.si_code == CLD_DUMPED;

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie)
                        waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_POST:
                r = s->post.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        s->dispatching = false;

        if (r < 0)
                log_debug_errno(r, "Event source %s (type %s) returned error, disabling: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        /* If the callback dropped the last reference we deferred the
         * free while dispatching; do it now. Otherwise, a failing
         * callback gets its source disabled. */
        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                sd_event_source_set_enabled(s, SD_EVENT_OFF);

        return 1;
}
2343
/* Run the prepare callback of every source that registered one, at
 * most once per loop iteration. Sources are kept in a prioq keyed
 * (among others) on prepare_iteration, so already-prepared sources
 * sink to the bottom and the loop terminates. Returns 0 on success. */
static int event_prepare(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
                        break;

                /* Mark as prepared for this iteration and reorder
                 * before invoking the callback. */
                s->prepare_iteration = e->iteration;
                r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;

                assert(s->prepare);

                s->dispatching = true;
                r = s->prepare(s, s->userdata);
                s->dispatching = false;

                if (r < 0)
                        log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, disabling: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

                /* The callback may have dropped the last reference;
                 * free the source now that dispatching is over. */
                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        sd_event_source_set_enabled(s, SD_EVENT_OFF);
        }

        return 0;
}
2379
2380 static int dispatch_exit(sd_event *e) {
2381         sd_event_source *p;
2382         int r;
2383
2384         assert(e);
2385
2386         p = prioq_peek(e->exit);
2387         if (!p || p->enabled == SD_EVENT_OFF) {
2388                 e->state = SD_EVENT_FINISHED;
2389                 return 0;
2390         }
2391
2392         sd_event_ref(e);
2393         e->iteration++;
2394         e->state = SD_EVENT_EXITING;
2395
2396         r = source_dispatch(p);
2397
2398         e->state = SD_EVENT_INITIAL;
2399         sd_event_unref(e);
2400
2401         return r;
2402 }
2403
2404 static sd_event_source* event_next_pending(sd_event *e) {
2405         sd_event_source *p;
2406
2407         assert(e);
2408
2409         p = prioq_peek(e->pending);
2410         if (!p)
2411                 return NULL;
2412
2413         if (p->enabled == SD_EVENT_OFF)
2414                 return NULL;
2415
2416         return p;
2417 }
2418
2419 static int arm_watchdog(sd_event *e) {
2420         struct itimerspec its = {};
2421         usec_t t;
2422         int r;
2423
2424         assert(e);
2425         assert(e->watchdog_fd >= 0);
2426
2427         t = sleep_between(e,
2428                           e->watchdog_last + (e->watchdog_period / 2),
2429                           e->watchdog_last + (e->watchdog_period * 3 / 4));
2430
2431         timespec_store(&its.it_value, t);
2432
2433         /* Make sure we never set the watchdog to 0, which tells the
2434          * kernel to disable it. */
2435         if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
2436                 its.it_value.tv_nsec = 1;
2437
2438         r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
2439         if (r < 0)
2440                 return -errno;
2441
2442         return 0;
2443 }
2444
2445 static int process_watchdog(sd_event *e) {
2446         assert(e);
2447
2448         if (!e->watchdog)
2449                 return 0;
2450
2451         /* Don't notify watchdog too often */
2452         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
2453                 return 0;
2454
2455         sd_notify(false, "WATCHDOG=1");
2456         e->watchdog_last = e->timestamp.monotonic;
2457
2458         return arm_watchdog(e);
2459 }
2460
/* First stage of one loop iteration: run prepare callbacks, rearm all
 * clock timerfds, and determine whether anything is already pending.
 * Leaves the loop in SD_EVENT_ARMED (nothing pending yet, caller
 * should sd_event_wait()) or SD_EVENT_PENDING (returns > 0). */
_public_ int sd_event_prepare(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        /* Exit was requested: skip preparing, report pending right away. */
        if (e->exit_requested)
                goto pending;

        e->iteration++;

        e->state = SD_EVENT_PREPARING;
        r = event_prepare(e);
        e->state = SD_EVENT_INITIAL;
        if (r < 0)
                return r;

        /* Reprogram the timerfd of every clock whose queue changed. */
        r = event_arm_timer(e, &e->realtime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->monotonic);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime_alarm);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime_alarm);
        if (r < 0)
                return r;

        if (event_next_pending(e) || e->need_process_child)
                goto pending;

        e->state = SD_EVENT_ARMED;

        return 0;

pending:
        /* Something is pending already: do a zero-timeout wait so the
         * state machine advances to SD_EVENT_PENDING consistently. */
        e->state = SD_EVENT_ARMED;
        r = sd_event_wait(e, 0);
        if (r == 0)
                e->state = SD_EVENT_ARMED;

        return r;
}
2515
2516 _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
2517         struct epoll_event *ev_queue;
2518         unsigned ev_queue_max;
2519         int r, m, i;
2520
2521         assert_return(e, -EINVAL);
2522         assert_return(!event_pid_changed(e), -ECHILD);
2523         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2524         assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
2525
2526         if (e->exit_requested) {
2527                 e->state = SD_EVENT_PENDING;
2528                 return 1;
2529         }
2530
2531         ev_queue_max = MAX(e->n_sources, 1u);
2532         ev_queue = newa(struct epoll_event, ev_queue_max);
2533
2534         m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max,
2535                        timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
2536         if (m < 0) {
2537                 if (errno == EINTR) {
2538                         e->state = SD_EVENT_PENDING;
2539                         return 1;
2540                 }
2541
2542                 r = -errno;
2543                 goto finish;
2544         }
2545
2546         dual_timestamp_get(&e->timestamp);
2547         e->timestamp_boottime = now(CLOCK_BOOTTIME);
2548
2549         for (i = 0; i < m; i++) {
2550
2551                 if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
2552                         r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
2553                 else {
2554                         WakeupType *t = ev_queue[i].data.ptr;
2555
2556                         switch (*t) {
2557
2558                         case WAKEUP_EVENT_SOURCE:
2559                         r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
2560                                 break;
2561
2562                         case WAKEUP_CLOCK_DATA: {
2563                                 struct clock_data *d = ev_queue[i].data.ptr;
2564                                 r = flush_timer(e, d->fd, ev_queue[i].events, &d->next);
2565                                 break;
2566                         }
2567
2568                         case WAKEUP_SIGNAL_DATA:
2569                                 r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events);
2570                                 break;
2571
2572                         default:
2573                                 assert_not_reached("Invalid wake-up pointer");
2574                         }
2575                 }
2576                 if (r < 0)
2577                         goto finish;
2578         }
2579
2580         r = process_watchdog(e);
2581         if (r < 0)
2582                 goto finish;
2583
2584         r = process_timer(e, e->timestamp.realtime, &e->realtime);
2585         if (r < 0)
2586                 goto finish;
2587
2588         r = process_timer(e, e->timestamp_boottime, &e->boottime);
2589         if (r < 0)
2590                 goto finish;
2591
2592         r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
2593         if (r < 0)
2594                 goto finish;
2595
2596         r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
2597         if (r < 0)
2598                 goto finish;
2599
2600         r = process_timer(e, e->timestamp_boottime, &e->boottime_alarm);
2601         if (r < 0)
2602                 goto finish;
2603
2604         if (e->need_process_child) {
2605                 r = process_child(e);
2606                 if (r < 0)
2607                         goto finish;
2608         }
2609
2610         if (event_next_pending(e)) {
2611                 e->state = SD_EVENT_PENDING;
2612
2613                 return 1;
2614         }
2615
2616         r = 0;
2617
2618 finish:
2619         e->state = SD_EVENT_INITIAL;
2620
2621         return r;
2622 }
2623
2624 _public_ int sd_event_dispatch(sd_event *e) {
2625         sd_event_source *p;
2626         int r;
2627
2628         assert_return(e, -EINVAL);
2629         assert_return(!event_pid_changed(e), -ECHILD);
2630         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2631         assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
2632
2633         if (e->exit_requested)
2634                 return dispatch_exit(e);
2635
2636         p = event_next_pending(e);
2637         if (p) {
2638                 sd_event_ref(e);
2639
2640                 e->state = SD_EVENT_RUNNING;
2641                 r = source_dispatch(p);
2642                 e->state = SD_EVENT_INITIAL;
2643
2644                 sd_event_unref(e);
2645
2646                 return r;
2647         }
2648
2649         e->state = SD_EVENT_INITIAL;
2650
2651         return 1;
2652 }
2653
2654 static void event_log_delays(sd_event *e) {
2655         char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1];
2656         unsigned i;
2657         int o;
2658
2659         for (i = o = 0; i < ELEMENTSOF(e->delays); i++) {
2660                 o += snprintf(&b[o], sizeof(b) - o, "%u ", e->delays[i]);
2661                 e->delays[i] = 0;
2662         }
2663         log_debug("Event loop iterations: %.*s", o, b);
2664 }
2665
2666 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
2667         int r;
2668
2669         assert_return(e, -EINVAL);
2670         assert_return(!event_pid_changed(e), -ECHILD);
2671         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2672         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2673
2674         if (e->profile_delays && e->last_run) {
2675                 usec_t this_run;
2676                 unsigned l;
2677
2678                 this_run = now(CLOCK_MONOTONIC);
2679
2680                 l = u64log2(this_run - e->last_run);
2681                 assert(l < sizeof(e->delays));
2682                 e->delays[l]++;
2683
2684                 if (this_run - e->last_log >= 5*USEC_PER_SEC) {
2685                         event_log_delays(e);
2686                         e->last_log = this_run;
2687                 }
2688         }
2689
2690         r = sd_event_prepare(e);
2691         if (r == 0)
2692                 /* There was nothing? Then wait... */
2693                 r = sd_event_wait(e, timeout);
2694
2695         if (e->profile_delays)
2696                 e->last_run = now(CLOCK_MONOTONIC);
2697
2698         if (r > 0) {
2699                 /* There's something now, then let's dispatch it */
2700                 r = sd_event_dispatch(e);
2701                 if (r < 0)
2702                         return r;
2703
2704                 return 1;
2705         }
2706
2707         return r;
2708 }
2709
2710 #if 0 /// UNNEEDED by elogind
2711 _public_ int sd_event_loop(sd_event *e) {
2712         int r;
2713
2714         assert_return(e, -EINVAL);
2715         assert_return(!event_pid_changed(e), -ECHILD);
2716         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2717
2718         sd_event_ref(e);
2719
2720         while (e->state != SD_EVENT_FINISHED) {
2721                 r = sd_event_run(e, (uint64_t) -1);
2722                 if (r < 0)
2723                         goto finish;
2724         }
2725
2726         r = e->exit_code;
2727
2728 finish:
2729         sd_event_unref(e);
2730         return r;
2731 }
2732
/* Return the epoll fd backing this event loop, so callers can embed
 * it into an outer poll loop. */
_public_ int sd_event_get_fd(sd_event *e) {

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->epoll_fd;
}
2740 #endif // 0
2741
/* Return the loop's current state (SD_EVENT_INITIAL, _ARMED,
 * _PENDING, _RUNNING, _EXITING or _FINISHED). */
_public_ int sd_event_get_state(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->state;
}
2748
2749 #if 0 /// UNNEEDED by elogind
2750 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
2751         assert_return(e, -EINVAL);
2752         assert_return(code, -EINVAL);
2753         assert_return(!event_pid_changed(e), -ECHILD);
2754
2755         if (!e->exit_requested)
2756                 return -ENODATA;
2757
2758         *code = e->exit_code;
2759         return 0;
2760 }
2761 #endif // 0
2762
2763 _public_ int sd_event_exit(sd_event *e, int code) {
2764         assert_return(e, -EINVAL);
2765         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2766         assert_return(!event_pid_changed(e), -ECHILD);
2767
2768         e->exit_requested = true;
2769         e->exit_code = code;
2770
2771         return 0;
2772 }
2773
2774 #if 0 /// UNNEEDED by elogind
2775 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
2776         assert_return(e, -EINVAL);
2777         assert_return(usec, -EINVAL);
2778         assert_return(!event_pid_changed(e), -ECHILD);
2779         assert_return(IN_SET(clock,
2780                              CLOCK_REALTIME,
2781                              CLOCK_REALTIME_ALARM,
2782                              CLOCK_MONOTONIC,
2783                              CLOCK_BOOTTIME,
2784                              CLOCK_BOOTTIME_ALARM), -EOPNOTSUPP);
2785
2786         if (!dual_timestamp_is_set(&e->timestamp)) {
2787                 /* Implicitly fall back to now() if we never ran
2788                  * before and thus have no cached time. */
2789                 *usec = now(clock);
2790                 return 1;
2791         }
2792
2793         switch (clock) {
2794
2795         case CLOCK_REALTIME:
2796         case CLOCK_REALTIME_ALARM:
2797                 *usec = e->timestamp.realtime;
2798                 break;
2799
2800         case CLOCK_MONOTONIC:
2801                 *usec = e->timestamp.monotonic;
2802                 break;
2803
2804         default:
2805                 *usec = e->timestamp_boottime;
2806                 break;
2807         }
2808
2809         return 0;
2810 }
2811 #endif // 0
2812
/* Return (a new reference to) this thread's default event loop,
 * creating it on first use. With a NULL `ret`, merely report whether
 * a default loop exists. Returns 1 if a loop was created, 0 if an
 * existing one was handed out. */
_public_ int sd_event_default(sd_event **ret) {

        static thread_local sd_event *default_event = NULL;
        sd_event *e = NULL;
        int r;

        if (!ret)
                return !!default_event;

        if (default_event) {
                *ret = sd_event_ref(default_event);
                return 0;
        }

        r = sd_event_new(&e);
        if (r < 0)
                return r;

        /* Remember where we are registered, so the loop can
         * unregister itself on destruction. */
        e->default_event_ptr = &default_event;
        e->tid = gettid();
        default_event = e;

        *ret = e;
        return 1;
}
2838
2839 #if 0 /// UNNEEDED by elogind
2840 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
2841         assert_return(e, -EINVAL);
2842         assert_return(tid, -EINVAL);
2843         assert_return(!event_pid_changed(e), -ECHILD);
2844
2845         if (e->tid != 0) {
2846                 *tid = e->tid;
2847                 return 0;
2848         }
2849
2850         return -ENXIO;
2851 }
2852 #endif // 0
2853
/* Enable or disable sd_notify()-based watchdog pinging for this loop,
 * driven by the WATCHDOG_USEC environment the service manager set.
 * Returns the new watchdog state (0/1), or negative on error. */
_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Already in the requested state. */
        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                struct epoll_event ev = {};

                /* r == 0 means the manager requested no watchdog. */
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                /* Hook the timerfd into the loop's epoll; its data.ptr
                 * sentinel marks it as the watchdog wake-up source. */
                ev.events = EPOLLIN;
                ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG);

                r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
                if (r < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                /* Disable: unhook and close the timerfd if present. */
                if (e->watchdog_fd >= 0) {
                        epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        /* Roll back the partially-enabled state. */
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}
2905
2906 #if 0 /// UNNEEDED by elogind
/* Report whether watchdog pinging is currently enabled (see
 * sd_event_set_watchdog()). */
_public_ int sd_event_get_watchdog(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->watchdog;
}
2913 #endif // 0