chiark / gitweb /
tree-wide: drop license boilerplate
[elogind.git] / src / libelogind / sd-event / sd-event.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3   This file is part of systemd.
4
5   Copyright 2013 Lennart Poettering
6 ***/
7
8 #include <sys/epoll.h>
9 #include <sys/timerfd.h>
10 #include <sys/wait.h>
11
12 #include "sd-daemon.h"
13 #include "sd-event.h"
14 #include "sd-id128.h"
15
16 #include "alloc-util.h"
17 #include "fd-util.h"
18 #include "hashmap.h"
19 #include "list.h"
20 #include "macro.h"
21 #include "missing.h"
22 #include "prioq.h"
23 #include "process-util.h"
24 #include "set.h"
25 #include "signal-util.h"
26 #include "string-table.h"
27 #include "string-util.h"
28 #include "time-util.h"
29 #include "util.h"
30
/* Default dispatch accuracy (slack) for timer event sources: 250 ms */
#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
32
/* Discriminator for the per-type union inside sd_event_source: what kind
 * of event the source waits for. The five SOURCE_TIME_* values map 1:1 to
 * the clocks handled by clock_to_event_source_type() below. */
typedef enum EventSourceType {
        SOURCE_IO,
        SOURCE_TIME_REALTIME,
        SOURCE_TIME_BOOTTIME,
        SOURCE_TIME_MONOTONIC,
        SOURCE_TIME_REALTIME_ALARM,
        SOURCE_TIME_BOOTTIME_ALARM,
        SOURCE_SIGNAL,
        SOURCE_CHILD,
        SOURCE_DEFER,
        SOURCE_POST,
        SOURCE_EXIT,
        SOURCE_WATCHDOG,
        _SOURCE_EVENT_SOURCE_TYPE_MAX,
        _SOURCE_EVENT_SOURCE_TYPE_INVALID = -1
} EventSourceType;
49
50 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
51         [SOURCE_IO] = "io",
52         [SOURCE_TIME_REALTIME] = "realtime",
53         [SOURCE_TIME_BOOTTIME] = "bootime",
54         [SOURCE_TIME_MONOTONIC] = "monotonic",
55         [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
56         [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
57         [SOURCE_SIGNAL] = "signal",
58         [SOURCE_CHILD] = "child",
59         [SOURCE_DEFER] = "defer",
60         [SOURCE_POST] = "post",
61         [SOURCE_EXIT] = "exit",
62         [SOURCE_WATCHDOG] = "watchdog",
63 };
64
65 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
66
/* All objects we use in epoll events start with this value, so that
 * we know how to dispatch it */
typedef enum WakeupType {
        WAKEUP_NONE,
        WAKEUP_EVENT_SOURCE,  /* epoll data.ptr is an sd_event_source */
        WAKEUP_CLOCK_DATA,    /* epoll data.ptr is a struct clock_data */
        WAKEUP_SIGNAL_DATA,   /* epoll data.ptr is a struct signal_data */
        _WAKEUP_TYPE_MAX,
        _WAKEUP_TYPE_INVALID = -1,
} WakeupType;

/* True for the five SOURCE_TIME_* members of EventSourceType */
#define EVENT_SOURCE_IS_TIME(t) IN_SET((t), SOURCE_TIME_REALTIME, SOURCE_TIME_BOOTTIME, SOURCE_TIME_MONOTONIC, SOURCE_TIME_REALTIME_ALARM, SOURCE_TIME_BOOTTIME_ALARM)
79
/* One registered event source: a callback plus the bookkeeping needed to
 * queue, prepare and dispatch it. Allocated by source_new(), torn down by
 * source_disconnect()/source_free(). */
struct sd_event_source {
        WakeupType wakeup;  /* must be first: epoll data.ptr is dispatched on this field */

        unsigned n_ref;

        sd_event *event;    /* owning loop; NULL once disconnected */
        void *userdata;
        sd_event_handler_t prepare;  /* optional pre-poll callback; also used as "is in prepare prioq" flag */

        char *description;  /* free()d in source_free() */

        EventSourceType type:5;
        int enabled:3;      /* SD_EVENT_OFF/ON/ONESHOT */
        bool pending:1;     /* queued in event->pending prioq */
        bool dispatching:1;
        bool floating:1;    /* if set, the source does NOT hold a ref on the loop (see source_new()) */

        int64_t priority;
        unsigned pending_index;     /* index within event->pending prioq */
        unsigned prepare_index;     /* index within event->prepare prioq */
        uint64_t pending_iteration; /* loop iteration when the source became pending */
        uint64_t prepare_iteration; /* loop iteration when prepare() last ran */

        LIST_FIELDS(sd_event_source, sources);  /* linkage into event->sources */

        /* Per-type state, selected by 'type' above */
        union {
                struct {
                        sd_event_io_handler_t callback;
                        int fd;
                        uint32_t events;    /* epoll events we asked for */
                        uint32_t revents;   /* epoll events we got */
                        bool registered:1;  /* currently added to the epoll fd */
                        bool owned:1;       /* close fd when the source is freed */
                } io;
                struct {
                        sd_event_time_handler_t callback;
                        usec_t next, accuracy;
                        unsigned earliest_index;  /* index within clock_data->earliest prioq */
                        unsigned latest_index;    /* index within clock_data->latest prioq */
                } time;
                struct {
                        sd_event_signal_handler_t callback;
                        struct signalfd_siginfo siginfo;
                        int sig;
                } signal;
                struct {
                        sd_event_child_handler_t callback;
                        siginfo_t siginfo;
                        pid_t pid;
                        int options;  /* waitid() options */
                } child;
                struct {
                        sd_event_handler_t callback;
                } defer;
                struct {
                        sd_event_handler_t callback;
                } post;
                struct {
                        sd_event_handler_t callback;
                        unsigned prioq_index;  /* index within event->exit prioq */
                } exit;
        };
};
143
/* Per-clock state: one timerfd plus the two scheduling prioqs. */
struct clock_data {
        WakeupType wakeup;  /* always WAKEUP_CLOCK_DATA; must be first for epoll dispatch */
        int fd;             /* timerfd for this clock, or -1 if not created yet */

        /* For all clocks we maintain two priority queues each, one
         * ordered for the earliest times the events may be
         * dispatched, and one ordered by the latest times they must
         * have been dispatched. The range between the top entries in
         * the two prioqs is the time window we can freely schedule
         * wakeups in */

        Prioq *earliest;
        Prioq *latest;
        usec_t next;        /* time the timerfd is currently armed for */

        bool needs_rearm:1; /* set whenever the prioqs changed and the timerfd must be reprogrammed */
};
161
/* Per-priority signalfd state, kept in sd_event->signal_data keyed by priority. */
struct signal_data {
        WakeupType wakeup;  /* always WAKEUP_SIGNAL_DATA; must be first for epoll dispatch */

        /* For each priority we maintain one signal fd, so that we
         * only have to dequeue a single event per priority at a
         * time. */

        int fd;                    /* the signalfd, or -1 before creation */
        int64_t priority;          /* hashmap key */
        sigset_t sigset;           /* signals currently covered by this fd */
        sd_event_source *current;  /* source whose signal is currently being dispatched */
};
174
/* The event loop object itself. */
struct sd_event {
        unsigned n_ref;

        int epoll_fd;
        int watchdog_fd;  /* timerfd used for sd_watchdog keep-alive pings */

        Prioq *pending;   /* sources with an undispatched event, ordered by pending_prioq_compare() */
        Prioq *prepare;   /* sources with a prepare() callback, ordered by prepare_prioq_compare() */

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;  /* boot-id derived wakeup offset, see initialize_perturb() */

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;             /* indexed by PID */
        unsigned n_enabled_child_sources;   /* keeps SIGCHLD masked while > 0, see event_gc_signal_data() */

        Set *post_sources;

        Prioq *exit;  /* SOURCE_EXIT sources, ordered by exit_prioq_compare() */

        pid_t original_pid;  /* PID at creation; used to detect use across fork() */

        uint64_t iteration;
        triple_timestamp timestamp;
        int state;

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1;  /* set via SD_EVENT_PROFILE_DELAYS env var */

        int exit_code;

        pid_t tid;
        sd_event **default_event_ptr;  /* points at the thread's default_event slot, cleared in event_free() */

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;

        LIST_HEAD(sd_event_source, sources);  /* all sources attached to this loop */

        usec_t last_run, last_log;
        unsigned delays[sizeof(usec_t) * 8];  /* log2 histogram of loop latencies for profiling */
};
230
/* The calling thread's default event loop, if one was created. */
static thread_local sd_event *default_event = NULL;

static void source_disconnect(sd_event_source *s);

/* Map the SD_EVENT_DEFAULT sentinel to the thread's default loop;
 * any other pointer is passed through unchanged. */
static sd_event *event_resolve(sd_event *e) {
        return e == SD_EVENT_DEFAULT ? default_event : e;
}
238
239 static int pending_prioq_compare(const void *a, const void *b) {
240         const sd_event_source *x = a, *y = b;
241
242         assert(x->pending);
243         assert(y->pending);
244
245         /* Enabled ones first */
246         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
247                 return -1;
248         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
249                 return 1;
250
251         /* Lower priority values first */
252         if (x->priority < y->priority)
253                 return -1;
254         if (x->priority > y->priority)
255                 return 1;
256
257         /* Older entries first */
258         if (x->pending_iteration < y->pending_iteration)
259                 return -1;
260         if (x->pending_iteration > y->pending_iteration)
261                 return 1;
262
263         return 0;
264 }
265
266 static int prepare_prioq_compare(const void *a, const void *b) {
267         const sd_event_source *x = a, *y = b;
268
269         assert(x->prepare);
270         assert(y->prepare);
271
272         /* Enabled ones first */
273         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
274                 return -1;
275         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
276                 return 1;
277
278         /* Move most recently prepared ones last, so that we can stop
279          * preparing as soon as we hit one that has already been
280          * prepared in the current iteration */
281         if (x->prepare_iteration < y->prepare_iteration)
282                 return -1;
283         if (x->prepare_iteration > y->prepare_iteration)
284                 return 1;
285
286         /* Lower priority values first */
287         if (x->priority < y->priority)
288                 return -1;
289         if (x->priority > y->priority)
290                 return 1;
291
292         return 0;
293 }
294
295 static int earliest_time_prioq_compare(const void *a, const void *b) {
296         const sd_event_source *x = a, *y = b;
297
298         assert(EVENT_SOURCE_IS_TIME(x->type));
299         assert(x->type == y->type);
300
301         /* Enabled ones first */
302         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
303                 return -1;
304         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
305                 return 1;
306
307         /* Move the pending ones to the end */
308         if (!x->pending && y->pending)
309                 return -1;
310         if (x->pending && !y->pending)
311                 return 1;
312
313         /* Order by time */
314         if (x->time.next < y->time.next)
315                 return -1;
316         if (x->time.next > y->time.next)
317                 return 1;
318
319         return 0;
320 }
321
/* Latest permissible dispatch time: the configured trigger time plus the
 * allowed accuracy slack (saturating addition via usec_add()). */
static usec_t time_event_source_latest(const sd_event_source *s) {
        return usec_add(s->time.next, s->time.accuracy);
}
325
326 static int latest_time_prioq_compare(const void *a, const void *b) {
327         const sd_event_source *x = a, *y = b;
328
329         assert(EVENT_SOURCE_IS_TIME(x->type));
330         assert(x->type == y->type);
331
332         /* Enabled ones first */
333         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
334                 return -1;
335         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
336                 return 1;
337
338         /* Move the pending ones to the end */
339         if (!x->pending && y->pending)
340                 return -1;
341         if (x->pending && !y->pending)
342                 return 1;
343
344         /* Order by time */
345         if (time_event_source_latest(x) < time_event_source_latest(y))
346                 return -1;
347         if (time_event_source_latest(x) > time_event_source_latest(y))
348                 return 1;
349
350         return 0;
351 }
352
353 static int exit_prioq_compare(const void *a, const void *b) {
354         const sd_event_source *x = a, *y = b;
355
356         assert(x->type == SOURCE_EXIT);
357         assert(y->type == SOURCE_EXIT);
358
359         /* Enabled ones first */
360         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
361                 return -1;
362         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
363                 return 1;
364
365         /* Lower priority values first */
366         if (x->priority < y->priority)
367                 return -1;
368         if (x->priority > y->priority)
369                 return 1;
370
371         return 0;
372 }
373
/* Releases the resources of one clock_data: closes the timerfd and frees
 * both scheduling prioqs. Does not free d itself (it is embedded in sd_event). */
static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}
382
/* Destroys an event loop once its reference count reaches zero. */
static void event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        /* Detach any remaining sources. Non-floating sources hold a
         * reference on the loop, so only floating ones can be left here. */
        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        /* If we are the thread's default loop, clear the cached pointer */
        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);
        free(e);
}
419
/* Allocates a new event loop with a single reference and stores it in
 * *ret. Returns 0 on success, -ENOMEM/-errno on failure. */
_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new0(sd_event, 1);
        if (!e)
                return -ENOMEM;

        /* Mark all fds as "not created yet" and all clocks as unarmed */
        e->n_ref = 1;
        e->watchdog_fd = e->epoll_fd = e->realtime.fd = e->boottime.fd = e->monotonic.fd = e->realtime_alarm.fd = e->boottime_alarm.fd = -1;
        e->realtime.next = e->boottime.next = e->monotonic.next = e->realtime_alarm.next = e->boottime_alarm.next = USEC_INFINITY;
        e->realtime.wakeup = e->boottime.wakeup = e->monotonic.wakeup = e->realtime_alarm.wakeup = e->boottime_alarm.wakeup = WAKEUP_CLOCK_DATA;
        e->original_pid = getpid_cached();
        e->perturb = USEC_INFINITY; /* computed lazily in initialize_perturb() */

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        /* Keep fds 0-2 free so accidental stdio writes can't hit us */
        e->epoll_fd = fd_move_above_stdio(e->epoll_fd);

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 ... 2^63 us will be logged every 5s.");
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}
461
462 _public_ sd_event* sd_event_ref(sd_event *e) {
463
464         if (!e)
465                 return NULL;
466
467         assert(e->n_ref >= 1);
468         e->n_ref++;
469
470         return e;
471 }
472
473 _public_ sd_event* sd_event_unref(sd_event *e) {
474
475         if (!e)
476                 return NULL;
477
478         assert(e->n_ref >= 1);
479         e->n_ref--;
480
481         if (e->n_ref <= 0)
482                 event_free(e);
483
484         return NULL;
485 }
486
487 static bool event_pid_changed(sd_event *e) {
488         assert(e);
489
490         /* We don't support people creating an event loop and keeping
491          * it around over a fork(). Let's complain. */
492
493         return e->original_pid != getpid_cached();
494 }
495
/* Removes an IO source's fd from the epoll set, if it is registered.
 * Failures are only logged: this runs on teardown paths where there is
 * nothing better to do. */
static void source_io_unregister(sd_event_source *s) {
        int r;

        assert(s);
        assert(s->type == SOURCE_IO);

        /* After fork() the epoll fd belongs to the parent; don't touch it */
        if (event_pid_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
        if (r < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}
515
516 static int source_io_register(
517                 sd_event_source *s,
518                 int enabled,
519                 uint32_t events) {
520
521         struct epoll_event ev = {};
522         int r;
523
524         assert(s);
525         assert(s->type == SOURCE_IO);
526         assert(enabled != SD_EVENT_OFF);
527
528         ev.events = events;
529         ev.data.ptr = s;
530
531         if (enabled == SD_EVENT_ONESHOT)
532                 ev.events |= EPOLLONESHOT;
533
534         if (s->io.registered)
535                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
536         else
537                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
538         if (r < 0)
539                 return -errno;
540
541         s->io.registered = true;
542
543         return 0;
544 }
545
546 static clockid_t event_source_type_to_clock(EventSourceType t) {
547
548         switch (t) {
549
550         case SOURCE_TIME_REALTIME:
551                 return CLOCK_REALTIME;
552
553         case SOURCE_TIME_BOOTTIME:
554                 return CLOCK_BOOTTIME;
555
556         case SOURCE_TIME_MONOTONIC:
557                 return CLOCK_MONOTONIC;
558
559         case SOURCE_TIME_REALTIME_ALARM:
560                 return CLOCK_REALTIME_ALARM;
561
562         case SOURCE_TIME_BOOTTIME_ALARM:
563                 return CLOCK_BOOTTIME_ALARM;
564
565         default:
566                 return (clockid_t) -1;
567         }
568 }
569
570 static EventSourceType clock_to_event_source_type(clockid_t clock) {
571
572         switch (clock) {
573
574         case CLOCK_REALTIME:
575                 return SOURCE_TIME_REALTIME;
576
577         case CLOCK_BOOTTIME:
578                 return SOURCE_TIME_BOOTTIME;
579
580         case CLOCK_MONOTONIC:
581                 return SOURCE_TIME_MONOTONIC;
582
583         case CLOCK_REALTIME_ALARM:
584                 return SOURCE_TIME_REALTIME_ALARM;
585
586         case CLOCK_BOOTTIME_ALARM:
587                 return SOURCE_TIME_BOOTTIME_ALARM;
588
589         default:
590                 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
591         }
592 }
593
594 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
595         assert(e);
596
597         switch (t) {
598
599         case SOURCE_TIME_REALTIME:
600                 return &e->realtime;
601
602         case SOURCE_TIME_BOOTTIME:
603                 return &e->boottime;
604
605         case SOURCE_TIME_MONOTONIC:
606                 return &e->monotonic;
607
608         case SOURCE_TIME_REALTIME_ALARM:
609                 return &e->realtime_alarm;
610
611         case SOURCE_TIME_BOOTTIME_ALARM:
612                 return &e->boottime_alarm;
613
614         default:
615                 return NULL;
616         }
617 }
618
/* Looks up — or allocates — the per-priority signal_data object (and its
 * signalfd) appropriate for 'sig', and makes sure 'sig' is included in
 * its mask. On success stores the object in *ret (if non-NULL) and
 * returns 0; returns a negative errno-style code on failure. */
static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct epoll_event ev = {};
        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_pid_changed(e))
                return -ECHILD;

        /* Use the priority of the signal's own event source if one exists,
         * otherwise fall back to priority 0 (also used for SIGCHLD). */
        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = 0;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                /* Already have a signalfd for this priority that covers sig? */
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                r = hashmap_ensure_allocated(&e->signal_data, &uint64_hash_ops);
                if (r < 0)
                        return r;

                d = new0(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                d->wakeup = WAKEUP_SIGNAL_DATA;
                d->fd  = -1; /* actual signalfd is created below */
                d->priority = priority;

                /* The hashmap key points into d itself, so d must stay allocated
                 * as long as it is in the map */
                r = hashmap_put(e->signal_data, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        /* Extend the mask on a copy first, so a signalfd() failure leaves
         * d->sigset untouched */
        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        /* If d->fd was already valid, signalfd() updated it in place and it
         * is already registered with epoll — we are done */
        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        /* Otherwise signalfd() returned a brand-new fd; hook it up to epoll */
        d->fd = fd_move_above_stdio(r);

        ev.events = EPOLLIN;
        ev.data.ptr = d;

        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev);
        if (r < 0)  {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        /* Only tear the object down if we created it in this call */
        if (added) {
                d->fd = safe_close(d->fd);
                hashmap_remove(e->signal_data, &d->priority);
                free(d);
        }

        return r;
}
712
static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty that
         * way removes it. */

        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {

                /* If all the mask is all-zero we can get rid of the structure */
                hashmap_remove(e->signal_data, &d->priority);
                safe_close(d->fd);
                free(d);
                return;
        }

        assert(d->fd >= 0);

        /* Re-program the existing signalfd with the reduced mask;
         * best-effort, a failure only costs us spurious wakeups */
        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}
740
static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are
         * interested in. If not, we'll unmask it, and possibly drop
         * the signalfd for it. */

        /* SIGCHLD must stay masked while any child source is enabled */
        if (sig == SIGCHLD &&
            e->n_enabled_child_sources > 0)
                return;

        /* Still an enabled signal source for this signal? Keep it masked. */
        if (e->signal_sources &&
            e->signal_sources[sig] &&
            e->signal_sources[sig]->enabled != SD_EVENT_OFF)
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}
786
/* Detaches a source from its event loop: per-type deregistration, removal
 * from the pending/prepare queues and the source list, and dropping the
 * loop reference for non-floating sources. Safe to call more than once:
 * a second call returns early because s->event is already NULL. */
static void source_disconnect(sd_event_source *s) {
        sd_event *event;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        /* Per-type teardown first, while s->event is still valid */
        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM: {
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_remove(d->earliest, s, &s->time.earliest_index);
                prioq_remove(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true; /* the timerfd may now be armed too early/late */
                break;
        }

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        /* Possibly unmask the signal / drop its signalfd */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                }

                break;

        case SOURCE_CHILD:
                if (s->child.pid > 0) {
                        if (s->enabled != SD_EVENT_OFF) {
                                assert(s->event->n_enabled_child_sources > 0);
                                s->event->n_enabled_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                        /* SIGCHLD may no longer be needed */
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                }

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        default:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        /* Finally sever the link between source and loop */
        event = s->event;

        s->type = _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        s->event = NULL;
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Non-floating sources pinned the loop; release that reference last,
         * as it may destroy the loop */
        if (!s->floating)
                sd_event_unref(event);
}
877
/* Frees a source: disconnect from the loop first (which needs s->io.fd
 * still open for epoll removal), then close an owned fd, then free. */
static void source_free(sd_event_source *s) {
        assert(s);

        source_disconnect(s);

        /* Close the fd only if the caller handed ownership to us */
        if (s->type == SOURCE_IO && s->io.owned)
                safe_close(s->io.fd);

        free(s->description);
        free(s);
}
889
/* Marks a source as (not) having an undispatched event, maintaining the
 * pending prioq and the per-clock/per-signal bookkeeping that depends on
 * the pending state. Returns 0 on success or if nothing changed, negative
 * errno-style code on prioq insertion failure. */
static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT); /* exit sources use their own prioq */

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false; /* roll back on failure */
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        /* Pending state is part of the time comparators' sort key, so the
         * clock prioqs must be reshuffled and the timerfd rearmed */
        if (EVENT_SOURCE_IS_TIME(s->type)) {
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                prioq_reshuffle(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
        }

        /* A signal source that is no longer pending must not stay marked as
         * the one currently being dispatched from its signalfd */
        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        return 0;
}
933
/* Allocates a new source of the given type, links it into the loop's
 * source list and — unless floating — takes a reference on the loop.
 * Returns NULL on allocation failure. */
static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
        sd_event_source *s;

        assert(e);

        s = new0(sd_event_source, 1);
        if (!s)
                return NULL;

        s->n_ref = 1;
        s->event = e;
        s->floating = floating;
        s->type = type;
        s->pending_index = s->prepare_index = PRIOQ_IDX_NULL; /* not queued yet */

        /* Floating sources are owned by the loop; everything else pins the loop */
        if (!floating)
                sd_event_ref(e);

        LIST_PREPEND(sources, e->sources, s);
        e->n_sources++;

        return s;
}
957
/* Adds an IO event source watching 'fd' for 'events' (epoll flags) to the
 * loop. The source is created enabled (SD_EVENT_ON). If ret is NULL the
 * source is created "floating", i.e. owned by the loop itself. Returns 0
 * on success, negative errno-style code on failure. */
_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0) {
                source_free(s);
                return r;
        }

        if (ret)
                *ret = s;

        return 0;
}
999
static void initialize_perturb(sd_event *e) {
        sd_id128_t bootid = {};

        /* When we sleep for longer, we try to realign the wakeup to
           the same time within each minute/second/250ms, so that
           events all across the system can be coalesced into a single
           CPU wakeup. However, let's take some system-specific
           randomness for this value, so that in a network of systems
           with synced clocks timer events are distributed a
           bit. Here, we calculate a perturbation usec offset from the
           boot ID. */

        /* Compute once; USEC_INFINITY marks "not yet initialized" */
        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        /* Best effort: if the boot ID is unavailable, perturb stays INFINITY */
        if (sd_id128_get_boot(&bootid) >= 0)
                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
}
1018
/* Lazily creates the timerfd for the given clock_data and registers it
 * with the loop's epoll. No-op if the fd already exists. Returns 0 on
 * success, -errno on failure. */
static int event_setup_timer_fd(
                sd_event *e,
                struct clock_data *d,
                clockid_t clock) {

        struct epoll_event ev = {};
        int r, fd;

        assert(e);
        assert(d);

        if (_likely_(d->fd >= 0))
                return 0;

        fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
        if (fd < 0)
                return -errno;

        /* Keep fds 0-2 free so accidental stdio writes can't hit us */
        fd = fd_move_above_stdio(fd);

        ev.events = EPOLLIN;
        ev.data.ptr = d; /* dispatched via d->wakeup == WAKEUP_CLOCK_DATA */

        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
        if (r < 0) {
                safe_close(fd);
                return -errno;
        }

        d->fd = fd;
        return 0;
}
1051
1052 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1053         assert(s);
1054
1055         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1056 }
1057
_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        sd_event_source *s;
        struct clock_data *d;
        int r;

        /* Adds a one-shot timer event source on the specified clock, firing at
         * absolute time 'usec' with the given coalescing accuracy. Returns 0 on
         * success or a negative errno-style code; the new source is stored in
         * *ret if ret is non-NULL. */

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(accuracy != (uint64_t) -1, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
                return -EOPNOTSUPP;

        type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
        if (type < 0)
                return -EOPNOTSUPP;

        /* With a NULL callback the timer requests event loop exit, passing
         * userdata as the exit code (see time_exit_callback()). */
        if (!callback)
                callback = time_exit_callback;

        d = event_get_clock_data(e, type);
        assert(d);

        /* Make sure both priority queues exist before creating the source, so
         * that the prioq_put() calls below can only fail with -ENOMEM. */
        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        /* Allocate the per-clock timerfd lazily, on first use of the clock. */
        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        /* accuracy == 0 selects the default coalescing window. */
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* The timerfd must be re-armed since the set of pending deadlines changed. */
        d->needs_rearm = true;

        r = prioq_put(d->earliest, s, &s->time.earliest_index);
        if (r < 0)
                goto fail;

        r = prioq_put(d->latest, s, &s->time.latest_index);
        if (r < 0)
                goto fail;

        if (ret)
                *ret = s;

        return 0;

fail:
        source_free(s);
        return r;
}
1135
1136 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1137         assert(s);
1138
1139         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1140 }
1141
_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        struct signal_data *d;
        sigset_t ss;
        int r;

        /* Adds an event source for the specified signal. The signal must
         * already be blocked in the calling thread, and at most one source
         * per signal may exist per event loop (-EBUSY otherwise). A NULL
         * callback requests loop exit with userdata as exit code. */

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(SIGNAL_VALID(sig), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = signal_exit_callback;

        /* pthread_sigmask() returns a positive errno value, not -1/errno. */
        r = pthread_sigmask(SIG_SETMASK, NULL, &ss);
        if (r != 0)
                return -r;

        /* The signal must be blocked, or the signalfd could never see it. */
        if (!sigismember(&ss, sig))
                return -EBUSY;

        /* One flat array slot per signal number; allocated on first use. */
        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        /* Hook the signal up with the signalfd of the source's priority;
         * source_free() undoes the e->signal_sources[] registration above. */
        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                source_free(s);
                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;

        return 0;
}
1202
_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        /* Adds a one-shot event source watching the specified child process
         * via SIGCHLD. 'options' is a non-empty subset of the waitid() flags
         * WEXITED|WSTOPPED|WCONTINUED. Only one source per PID is allowed
         * (-EBUSY otherwise). */

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0) {
                source_free(s);
                return r;
        }

        e->n_enabled_child_sources++;

        /* Child sources are delivered through the SIGCHLD signalfd; roll the
         * counter increment back if that registration fails. */
        r = event_make_signal_data(e, SIGCHLD, NULL);
        if (r < 0) {
                e->n_enabled_child_sources--;
                source_free(s);
                return r;
        }

        /* The child might already have exited; check on next loop iteration. */
        e->need_process_child = true;

        if (ret)
                *ret = s;

        return 0;
}
1262
1263 _public_ int sd_event_add_defer(
1264                 sd_event *e,
1265                 sd_event_source **ret,
1266                 sd_event_handler_t callback,
1267                 void *userdata) {
1268
1269         sd_event_source *s;
1270         int r;
1271
1272         assert_return(e, -EINVAL);
1273         assert_return(e = event_resolve(e), -ENOPKG);
1274         assert_return(callback, -EINVAL);
1275         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1276         assert_return(!event_pid_changed(e), -ECHILD);
1277
1278         s = source_new(e, !ret, SOURCE_DEFER);
1279         if (!s)
1280                 return -ENOMEM;
1281
1282         s->defer.callback = callback;
1283         s->userdata = userdata;
1284         s->enabled = SD_EVENT_ONESHOT;
1285
1286         r = source_set_pending(s, true);
1287         if (r < 0) {
1288                 source_free(s);
1289                 return r;
1290         }
1291
1292         if (ret)
1293                 *ret = s;
1294
1295         return 0;
1296 }
1297
1298 _public_ int sd_event_add_post(
1299                 sd_event *e,
1300                 sd_event_source **ret,
1301                 sd_event_handler_t callback,
1302                 void *userdata) {
1303
1304         sd_event_source *s;
1305         int r;
1306
1307         assert_return(e, -EINVAL);
1308         assert_return(e = event_resolve(e), -ENOPKG);
1309         assert_return(callback, -EINVAL);
1310         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1311         assert_return(!event_pid_changed(e), -ECHILD);
1312
1313         r = set_ensure_allocated(&e->post_sources, NULL);
1314         if (r < 0)
1315                 return r;
1316
1317         s = source_new(e, !ret, SOURCE_POST);
1318         if (!s)
1319                 return -ENOMEM;
1320
1321         s->post.callback = callback;
1322         s->userdata = userdata;
1323         s->enabled = SD_EVENT_ON;
1324
1325         r = set_put(e->post_sources, s);
1326         if (r < 0) {
1327                 source_free(s);
1328                 return r;
1329         }
1330
1331         if (ret)
1332                 *ret = s;
1333
1334         return 0;
1335 }
1336
1337 _public_ int sd_event_add_exit(
1338                 sd_event *e,
1339                 sd_event_source **ret,
1340                 sd_event_handler_t callback,
1341                 void *userdata) {
1342
1343         sd_event_source *s;
1344         int r;
1345
1346         assert_return(e, -EINVAL);
1347         assert_return(e = event_resolve(e), -ENOPKG);
1348         assert_return(callback, -EINVAL);
1349         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1350         assert_return(!event_pid_changed(e), -ECHILD);
1351
1352         r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1353         if (r < 0)
1354                 return r;
1355
1356         s = source_new(e, !ret, SOURCE_EXIT);
1357         if (!s)
1358                 return -ENOMEM;
1359
1360         s->exit.callback = callback;
1361         s->userdata = userdata;
1362         s->exit.prioq_index = PRIOQ_IDX_NULL;
1363         s->enabled = SD_EVENT_ONESHOT;
1364
1365         r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1366         if (r < 0) {
1367                 source_free(s);
1368                 return r;
1369         }
1370
1371         if (ret)
1372                 *ret = s;
1373
1374         return 0;
1375 }
1376
1377 _public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
1378
1379         if (!s)
1380                 return NULL;
1381
1382         assert(s->n_ref >= 1);
1383         s->n_ref++;
1384
1385         return s;
1386 }
1387
_public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {

        /* Drops a reference; when the last one is gone the source is freed
         * (or merely detached if we are inside its own dispatch callback).
         * Always returns NULL so callers can write s = sd_event_source_unref(s). */

        if (!s)
                return NULL;

        assert(s->n_ref >= 1);
        s->n_ref--;

        if (s->n_ref <= 0) {
                /* Here's a special hack: when we are called from a
                 * dispatch handler we won't free the event source
                 * immediately, but we will detach the fd from the
                 * epoll. This way it is safe for the caller to unref
                 * the event source and immediately close the fd, but
                 * we still retain a valid event source object after
                 * the callback. */

                if (s->dispatching) {
                        if (s->type == SOURCE_IO)
                                source_io_unregister(s);

                        /* The dispatcher notices n_ref == 0 afterwards and
                         * performs the final free once the callback returned. */
                        source_disconnect(s);
                } else
                        source_free(s);
        }

        return NULL;
}
1416
_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
        assert_return(s, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* Replaces the source's description; a NULL argument clears it. */
        return free_and_strdup(&s->description, description);
}
1423
_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
        assert_return(s, -EINVAL);
        assert_return(description, -EINVAL);
        assert_return(s->description, -ENXIO);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* Returns a pointer owned by the source; valid until the description
         * is changed or the source is freed. -ENXIO if none is set. */
        *description = s->description;
        return 0;
}
1433
_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
        assert_return(s, NULL);

        /* Returns the event loop this source is attached to (borrowed, not ref'ed). */
        return s->event;
}
1439
_public_ int sd_event_source_get_pending(sd_event_source *s) {
        assert_return(s, -EINVAL);
        /* Exit sources have no meaningful pending state. */
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* Returns > 0 if the source has an event queued for dispatch. */
        return s->pending;
}
1448
_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* Returns the fd this I/O source watches. */
        return s->io.fd;
}
1456
1457 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
1458         int r;
1459
1460         assert_return(s, -EINVAL);
1461         assert_return(fd >= 0, -EBADF);
1462         assert_return(s->type == SOURCE_IO, -EDOM);
1463         assert_return(!event_pid_changed(s->event), -ECHILD);
1464
1465         if (s->io.fd == fd)
1466                 return 0;
1467
1468         if (s->enabled == SD_EVENT_OFF) {
1469                 s->io.fd = fd;
1470                 s->io.registered = false;
1471         } else {
1472                 int saved_fd;
1473
1474                 saved_fd = s->io.fd;
1475                 assert(s->io.registered);
1476
1477                 s->io.fd = fd;
1478                 s->io.registered = false;
1479
1480                 r = source_io_register(s, s->enabled, s->io.events);
1481                 if (r < 0) {
1482                         s->io.fd = saved_fd;
1483                         s->io.registered = true;
1484                         return r;
1485                 }
1486
1487                 epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
1488         }
1489
1490         return 0;
1491 }
1492
_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);

        /* Returns whether the source owns (and hence will close) its fd. */
        return s->io.owned;
}
1499
_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);

        /* Controls whether the fd is closed when the source is freed. */
        s->io.owned = own;
        return 0;
}
1507
_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
        assert_return(s, -EINVAL);
        assert_return(events, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* Returns the EPOLL* event mask this source is subscribed to. */
        *events = s->io.events;
        return 0;
}
1517
1518 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
1519         int r;
1520
1521         assert_return(s, -EINVAL);
1522         assert_return(s->type == SOURCE_IO, -EDOM);
1523         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1524         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1525         assert_return(!event_pid_changed(s->event), -ECHILD);
1526
1527         /* edge-triggered updates are never skipped, so we can reset edges */
1528         if (s->io.events == events && !(events & EPOLLET))
1529                 return 0;
1530
1531         if (s->enabled != SD_EVENT_OFF) {
1532                 r = source_io_register(s, s->enabled, events);
1533                 if (r < 0)
1534                         return r;
1535         }
1536
1537         s->io.events = events;
1538         source_set_pending(s, false);
1539
1540         return 0;
1541 }
1542
_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
        assert_return(s, -EINVAL);
        assert_return(revents, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        /* Triggered events are only meaningful while the source is pending. */
        assert_return(s->pending, -ENODATA);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *revents = s->io.revents;
        return 0;
}
1553
_public_ int sd_event_source_get_signal(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* Returns the signal number this source watches. */
        return s->signal.sig;
}
1561
1562 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
1563         assert_return(s, -EINVAL);
1564         assert_return(!event_pid_changed(s->event), -ECHILD);
1565
1566         *priority = s->priority;
1567         return 0;
1568 }
1569
_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
        int r;

        /* Changes the dispatch priority of a source. Signal sources are keyed
         * by priority into per-priority signalfds, so enabled ones must be
         * migrated between signalfds; all priority queues the source sits in
         * are reshuffled afterwards. */

        assert_return(s, -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->priority == priority)
                return 0;

        if (s->type == SOURCE_SIGNAL && s->enabled != SD_EVENT_OFF) {
                struct signal_data *old, *d;

                /* Move us from the signalfd belonging to the old
                 * priority to the signalfd of the new priority */

                assert_se(old = hashmap_get(s->event->signal_data, &s->priority));

                /* event_make_signal_data() keys off s->priority, so set it first. */
                s->priority = priority;

                r = event_make_signal_data(s->event, s->signal.sig, &d);
                if (r < 0) {
                        /* Roll back to the priority recorded in the old signal data. */
                        s->priority = old->priority;
                        return r;
                }

                /* Stop listening for this signal on the old-priority signalfd. */
                event_unmask_signal_data(s->event, old, s->signal.sig);
        } else
                s->priority = priority;

        /* Keep the priority-ordered queues consistent with the new value. */
        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);

        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        return 0;
}
1611
_public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
        assert_return(s, -EINVAL);
        assert_return(m, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* Returns SD_EVENT_OFF, SD_EVENT_ON or SD_EVENT_ONESHOT in *m. */
        *m = s->enabled;
        return 0;
}
1620
_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
        int r;

        /* Switches a source between SD_EVENT_OFF, SD_EVENT_ON and
         * SD_EVENT_ONESHOT. Each source type needs its backing kernel object
         * (epoll registration, timerfd queues, signalfd, SIGCHLD accounting,
         * exit prioq) updated to match the new state. */

        assert_return(s, -EINVAL);
        assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* If we are dead anyway, we are fine with turning off
         * sources, but everything else needs to fail. */
        if (s->event->state == SD_EVENT_FINISHED)
                return m == SD_EVENT_OFF ? 0 : -ESTALE;

        if (s->enabled == m)
                return 0;

        if (m == SD_EVENT_OFF) {

                switch (s->type) {

                case SOURCE_IO:
                        /* Drop the fd from the epoll. */
                        source_io_unregister(s);
                        s->enabled = m;
                        break;

                case SOURCE_TIME_REALTIME:
                case SOURCE_TIME_BOOTTIME:
                case SOURCE_TIME_MONOTONIC:
                case SOURCE_TIME_REALTIME_ALARM:
                case SOURCE_TIME_BOOTTIME_ALARM: {
                        struct clock_data *d;

                        /* Reshuffle the queues so the disabled source no longer
                         * determines the timerfd deadline, then re-arm. */
                        s->enabled = m;
                        d = event_get_clock_data(s->event, s->type);
                        assert(d);

                        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(d->latest, s, &s->time.latest_index);
                        d->needs_rearm = true;
                        break;
                }

                case SOURCE_SIGNAL:
                        s->enabled = m;

                        /* Possibly drop the signal from its signalfd. */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                        break;

                case SOURCE_CHILD:
                        s->enabled = m;

                        assert(s->event->n_enabled_child_sources > 0);
                        s->event->n_enabled_child_sources--;

                        /* Possibly drop SIGCHLD from its signalfd. */
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                        break;

                case SOURCE_EXIT:
                        s->enabled = m;
                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                        break;

                case SOURCE_DEFER:
                case SOURCE_POST:
                        /* No kernel-side state to update. */
                        s->enabled = m;
                        break;

                default:
                        assert_not_reached("Wut? I shouldn't exist.");
                }

        } else {
                switch (s->type) {

                case SOURCE_IO:
                        /* (Re-)add the fd to the epoll with the new state. */
                        r = source_io_register(s, m, s->io.events);
                        if (r < 0)
                                return r;

                        s->enabled = m;
                        break;

                case SOURCE_TIME_REALTIME:
                case SOURCE_TIME_BOOTTIME:
                case SOURCE_TIME_MONOTONIC:
                case SOURCE_TIME_REALTIME_ALARM:
                case SOURCE_TIME_BOOTTIME_ALARM: {
                        struct clock_data *d;

                        s->enabled = m;
                        d = event_get_clock_data(s->event, s->type);
                        assert(d);

                        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(d->latest, s, &s->time.latest_index);
                        d->needs_rearm = true;
                        break;
                }

                case SOURCE_SIGNAL:

                        s->enabled = m;

                        /* Hook the signal back into the right signalfd; roll
                         * back to OFF on failure. */
                        r = event_make_signal_data(s->event, s->signal.sig, NULL);
                        if (r < 0) {
                                s->enabled = SD_EVENT_OFF;
                                event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                                return r;
                        }

                        break;

                case SOURCE_CHILD:

                        /* Only count the source once even when switching
                         * between ON and ONESHOT. */
                        if (s->enabled == SD_EVENT_OFF)
                                s->event->n_enabled_child_sources++;

                        s->enabled = m;

                        r = event_make_signal_data(s->event, SIGCHLD, NULL);
                        if (r < 0) {
                                s->enabled = SD_EVENT_OFF;
                                s->event->n_enabled_child_sources--;
                                event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                                return r;
                        }

                        break;

                case SOURCE_EXIT:
                        s->enabled = m;
                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                        break;

                case SOURCE_DEFER:
                case SOURCE_POST:
                        s->enabled = m;
                        break;

                default:
                        assert_not_reached("Wut? I shouldn't exist.");
                }
        }

        /* Enablement participates in the pending/prepare queue ordering. */
        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);

        return 0;
}
1772
_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* Returns the absolute trigger time of the timer source in *usec. */
        *usec = s->time.next;
        return 0;
}
1782
1783 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
1784         struct clock_data *d;
1785
1786         assert_return(s, -EINVAL);
1787         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
1788         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1789         assert_return(!event_pid_changed(s->event), -ECHILD);
1790
1791         s->time.next = usec;
1792
1793         source_set_pending(s, false);
1794
1795         d = event_get_clock_data(s->event, s->type);
1796         assert(d);
1797
1798         prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
1799         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1800         d->needs_rearm = true;
1801
1802         return 0;
1803 }
1804
_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* Returns the coalescing accuracy window of the timer source in *usec. */
        *usec = s->time.accuracy;
        return 0;
}
1814
1815 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
1816         struct clock_data *d;
1817
1818         assert_return(s, -EINVAL);
1819         assert_return(usec != (uint64_t) -1, -EINVAL);
1820         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
1821         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1822         assert_return(!event_pid_changed(s->event), -ECHILD);
1823
1824         if (usec == 0)
1825                 usec = DEFAULT_ACCURACY_USEC;
1826
1827         s->time.accuracy = usec;
1828
1829         source_set_pending(s, false);
1830
1831         d = event_get_clock_data(s->event, s->type);
1832         assert(d);
1833
1834         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1835         d->needs_rearm = true;
1836
1837         return 0;
1838 }
1839
_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
        assert_return(s, -EINVAL);
        assert_return(clock, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* Maps the source type back to the CLOCK_* id it was created with. */
        *clock = event_source_type_to_clock(s->type);
        return 0;
}
1849
_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
        assert_return(s, -EINVAL);
        assert_return(pid, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* Returns the PID the child source watches in *pid. */
        *pid = s->child.pid;
        return 0;
}
1859
1860 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
1861         int r;
1862
1863         assert_return(s, -EINVAL);
1864         assert_return(s->type != SOURCE_EXIT, -EDOM);
1865         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1866         assert_return(!event_pid_changed(s->event), -ECHILD);
1867
1868         if (s->prepare == callback)
1869                 return 0;
1870
1871         if (callback && s->prepare) {
1872                 s->prepare = callback;
1873                 return 0;
1874         }
1875
1876         r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
1877         if (r < 0)
1878                 return r;
1879
1880         s->prepare = callback;
1881
1882         if (callback) {
1883                 r = prioq_put(s->event->prepare, s, &s->prepare_index);
1884                 if (r < 0)
1885                         return r;
1886         } else
1887                 prioq_remove(s->event->prepare, s, &s->prepare_index);
1888
1889         return 0;
1890 }
1891
_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
        assert_return(s, NULL);

        /* Returns the opaque userdata pointer registered with the source. */
        return s->userdata;
}
1897
1898 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
1899         void *ret;
1900
1901         assert_return(s, NULL);
1902
1903         ret = s->userdata;
1904         s->userdata = userdata;
1905
1906         return ret;
1907 }
1908
static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
        usec_t c;
        assert(e);
        assert(a <= b);

        /* Picks a wakeup time within [a, b], preferring system-wide
         * coalescing points derived from the per-host perturbation value. */

        /* usec_t is unsigned, so this is really an a == 0 check. */
        if (a <= 0)
                return 0;
        if (a >= USEC_INFINITY)
                return USEC_INFINITY;

        /* Window too narrow to do anything clever. */
        if (b <= a + 1)
                return a;

        initialize_perturb(e);

        /* NOTE(review): if the boot ID could not be determined,
         * initialize_perturb() leaves e->perturb at USEC_INFINITY, and the
         * additions below wrap around (well-defined for unsigned, but the
         * chosen offset is then arbitrary) — presumably rare/acceptable;
         * verify. */

        /*
          Find a good time to wake up again between times a and b. We
          have two goals here:

          a) We want to wake up as seldom as possible, hence prefer
             later times over earlier times.

          b) But if we have to wake up, then let's make sure to
             dispatch as much as possible on the entire system.

          We implement this by waking up everywhere at the same time
          within any given minute if we can, synchronised via the
          perturbation value determined from the boot ID. If we can't,
          then we try to find the same spot in every 10s, then 1s and
          then 250ms step. Otherwise, we pick the last possible time
          to wake up.
        */

        /* Candidate: the perturbed point in b's minute (or the minute before,
         * if that point lies past b). */
        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
        if (c >= b) {
                /* Guard the subtraction against underflow near time zero. */
                if (_unlikely_(c < USEC_PER_MINUTE))
                        return b;

                c -= USEC_PER_MINUTE;
        }

        if (c >= a)
                return c;

        /* Same idea at 10s granularity. */
        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC*10))
                        return b;

                c -= USEC_PER_SEC*10;
        }

        if (c >= a)
                return c;

        /* Same idea at 1s granularity. */
        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC))
                        return b;

                c -= USEC_PER_SEC;
        }

        if (c >= a)
                return c;

        /* Same idea at 250ms granularity. */
        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MSEC*250))
                        return b;

                c -= USEC_PER_MSEC*250;
        }

        if (c >= a)
                return c;

        /* No coalescing point fits; wake up as late as allowed. */
        return b;
}
1988
1989 static int event_arm_timer(
1990                 sd_event *e,
1991                 struct clock_data *d) {
1992
1993         struct itimerspec its = {};
1994         sd_event_source *a, *b;
1995         usec_t t;
1996         int r;
1997
1998         assert(e);
1999         assert(d);
2000
2001         if (!d->needs_rearm)
2002                 return 0;
2003         else
2004                 d->needs_rearm = false;
2005
2006         a = prioq_peek(d->earliest);
2007         if (!a || a->enabled == SD_EVENT_OFF || a->time.next == USEC_INFINITY) {
2008
2009                 if (d->fd < 0)
2010                         return 0;
2011
2012                 if (d->next == USEC_INFINITY)
2013                         return 0;
2014
2015                 /* disarm */
2016                 r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2017                 if (r < 0)
2018                         return r;
2019
2020                 d->next = USEC_INFINITY;
2021                 return 0;
2022         }
2023
2024         b = prioq_peek(d->latest);
2025         assert_se(b && b->enabled != SD_EVENT_OFF);
2026
2027         t = sleep_between(e, a->time.next, time_event_source_latest(b));
2028         if (d->next == t)
2029                 return 0;
2030
2031         assert_se(d->fd >= 0);
2032
2033         if (t == 0) {
2034                 /* We don' want to disarm here, just mean some time looooong ago. */
2035                 its.it_value.tv_sec = 0;
2036                 its.it_value.tv_nsec = 1;
2037         } else
2038                 timespec_store(&its.it_value, t);
2039
2040         r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2041         if (r < 0)
2042                 return -errno;
2043
2044         d->next = t;
2045         return 0;
2046 }
2047
2048 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
2049         assert(e);
2050         assert(s);
2051         assert(s->type == SOURCE_IO);
2052
2053         /* If the event source was already pending, we just OR in the
2054          * new revents, otherwise we reset the value. The ORing is
2055          * necessary to handle EPOLLONESHOT events properly where
2056          * readability might happen independently of writability, and
2057          * we need to keep track of both */
2058
2059         if (s->pending)
2060                 s->io.revents |= revents;
2061         else
2062                 s->io.revents = revents;
2063
2064         return source_set_pending(s, true);
2065 }
2066
2067 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
2068         uint64_t x;
2069         ssize_t ss;
2070
2071         assert(e);
2072         assert(fd >= 0);
2073
2074         assert_return(events == EPOLLIN, -EIO);
2075
2076         ss = read(fd, &x, sizeof(x));
2077         if (ss < 0) {
2078                 if (IN_SET(errno, EAGAIN, EINTR))
2079                         return 0;
2080
2081                 return -errno;
2082         }
2083
2084         if (_unlikely_(ss != sizeof(x)))
2085                 return -EIO;
2086
2087         if (next)
2088                 *next = USEC_INFINITY;
2089
2090         return 0;
2091 }
2092
2093 static int process_timer(
2094                 sd_event *e,
2095                 usec_t n,
2096                 struct clock_data *d) {
2097
2098         sd_event_source *s;
2099         int r;
2100
2101         assert(e);
2102         assert(d);
2103
2104         for (;;) {
2105                 s = prioq_peek(d->earliest);
2106                 if (!s ||
2107                     s->time.next > n ||
2108                     s->enabled == SD_EVENT_OFF ||
2109                     s->pending)
2110                         break;
2111
2112                 r = source_set_pending(s, true);
2113                 if (r < 0)
2114                         return r;
2115
2116                 prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
2117                 prioq_reshuffle(d->latest, s, &s->time.latest_index);
2118                 d->needs_rearm = true;
2119         }
2120
2121         return 0;
2122 }
2123
/* Poll every watched child PID for a state change and mark the matching
 * event sources pending. Returns 0 on success, negative errno on failure. */
static int process_child(sd_event *e) {
        sd_event_source *s;
        Iterator i;
        int r;

        assert(e);

        e->need_process_child = false;

        /*
           So, this is ugly. We iteratively invoke waitid() with P_PID
           + WNOHANG for each PID we wait for, instead of using
           P_ALL. This is because we only want to get child
           information of very specific child processes, and not all
           of them. We might not have processed the SIGCHLD event of a
           previous invocation and we don't want to maintain a
           unbounded *per-child* event queue, hence we really don't
           want anything flushed out of the kernel's queue that we
           don't care about. Since this is O(n) this means that if you
           have a lot of processes you probably want to handle SIGCHLD
           yourself.

           We do not reap the children here (by using WNOWAIT), this
           is only done after the event source is dispatched so that
           the callback still sees the process as a zombie.
        */

        HASHMAP_FOREACH(s, e->child_sources, i) {
                assert(s->type == SOURCE_CHILD);

                /* Already queued for dispatch, or not interested right now. */
                if (s->pending)
                        continue;

                if (s->enabled == SD_EVENT_OFF)
                        continue;

                /* Clear siginfo first: with WNOHANG, waitid() leaves si_pid
                 * at 0 when there is no state change to report. */
                zero(s->child.siginfo);
                r = waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
                if (r < 0)
                        return -errno;

                if (s->child.siginfo.si_pid != 0) {
                        bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's
                                 * immediately remove the state change
                                 * from the queue, since there's no
                                 * benefit in leaving it queued */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
                        if (r < 0)
                                return r;
                }
        }

        return 0;
}
2187
/* Dequeue at most one signal from this priority's signalfd and mark the
 * matching signal source pending. Returns 1 if a source was queued, 0 if the
 * fd drained without a match, negative errno on failure. */
static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
        bool read_one = false;
        int r;

        assert(e);
        assert_return(events == EPOLLIN, -EIO);

        /* If there's a signal queued on this priority and SIGCHLD is
           on this priority too, then make sure to recheck the
           children we watch. This is because we only ever dequeue
           the first signal per priority, and if we dequeue one, and
           SIGCHLD might be enqueued later we wouldn't know, but we
           might have higher priority children we care about hence we
           need to check that explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

        /* If there's already an event source pending for this
         * priority we don't read another */
        if (d->current)
                return 0;

        for (;;) {
                struct signalfd_siginfo si;
                ssize_t n;
                sd_event_source *s = NULL;

                n = read(d->fd, &si, sizeof(si));
                if (n < 0) {
                        /* Drained the fd: report whether we dequeued anything. */
                        if (IN_SET(errno, EAGAIN, EINTR))
                                return read_one;

                        return -errno;
                }

                /* signalfd reads are all-or-nothing. */
                if (_unlikely_(n != sizeof(si)))
                        return -EIO;

                assert(SIGNAL_VALID(si.ssi_signo));

                read_one = true;

                /* Skip signals nobody subscribed to, and sources already
                 * pending; keep reading until we find one to queue. */
                if (e->signal_sources)
                        s = e->signal_sources[si.ssi_signo];
                if (!s)
                        continue;
                if (s->pending)
                        continue;

                /* Latch the siginfo for the callback and remember which
                 * source currently "owns" this priority's signalfd. */
                s->signal.siginfo = si;
                d->current = s;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                return 1;
        }
}
2248
/* Invoke the user callback of a pending (or exit) event source, handling
 * oneshot disabling, post-source propagation, error disabling and deferred
 * destruction. Returns 1 on success, negative errno on internal failure. */
static int source_dispatch(sd_event_source *s) {
        EventSourceType saved_type;
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        /* Save the event source type, here, so that we still know it after the event callback which might invalidate
         * the event. */
        saved_type = s->type;

        /* Defer and exit sources stay pending until explicitly disabled;
         * everything else is un-pended before the callback runs. */
        if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        if (s->type != SOURCE_POST) {
                sd_event_source *z;
                Iterator i;

                /* If we execute a non-post source, let's mark all
                 * post sources as pending */

                SET_FOREACH(z, s->event->post_sources, i) {
                        if (z->enabled == SD_EVENT_OFF)
                                continue;

                        r = source_set_pending(z, true);
                        if (r < 0)
                                return r;
                }
        }

        /* Oneshot sources are turned off before dispatching, so the callback
         * may re-enable them without racing. */
        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        /* Guard against recursive destruction: source_free() is deferred
         * while this flag is set (see the n_ref check below). */
        s->dispatching = true;

        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                /* Determine zombie-ness before the callback, which may
                 * overwrite the cached siginfo. */
                zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie)
                        waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_POST:
                r = s->post.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        s->dispatching = false;

        /* Use saved_type here: the callback may have invalidated s->type. */
        if (r < 0)
                log_debug_errno(r, "Event source %s (type %s) returned error, disabling: %m",
                                strna(s->description), event_source_type_to_string(saved_type));

        /* If the callback dropped the last reference, free the source now
         * that dispatching is over; otherwise disable it on error. */
        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                sd_event_source_set_enabled(s, SD_EVENT_OFF);

        return 1;
}
2354
/* Run the prepare callback of every source that has one, at most once per
 * loop iteration. Returns 0 on success, negative errno on failure. */
static int event_prepare(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                /* The prepare prioq is ordered so that sources not yet
                 * prepared this iteration come first; once the head has
                 * been prepared (or is off) we are done. */
                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
                        break;

                /* Stamp before reshuffling so the source sinks in the queue. */
                s->prepare_iteration = e->iteration;
                r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;

                assert(s->prepare);

                /* Defer destruction while the user callback runs. */
                s->dispatching = true;
                r = s->prepare(s, s->userdata);
                s->dispatching = false;

                if (r < 0)
                        log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, disabling: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

                /* Free now if the callback dropped the last reference,
                 * otherwise disable the source on error. */
                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        sd_event_source_set_enabled(s, SD_EVENT_OFF);
        }

        return 0;
}
2390
2391 static int dispatch_exit(sd_event *e) {
2392         sd_event_source *p;
2393         int r;
2394
2395         assert(e);
2396
2397         p = prioq_peek(e->exit);
2398         if (!p || p->enabled == SD_EVENT_OFF) {
2399                 e->state = SD_EVENT_FINISHED;
2400                 return 0;
2401         }
2402
2403         sd_event_ref(e);
2404         e->iteration++;
2405         e->state = SD_EVENT_EXITING;
2406
2407         r = source_dispatch(p);
2408
2409         e->state = SD_EVENT_INITIAL;
2410         sd_event_unref(e);
2411
2412         return r;
2413 }
2414
2415 static sd_event_source* event_next_pending(sd_event *e) {
2416         sd_event_source *p;
2417
2418         assert(e);
2419
2420         p = prioq_peek(e->pending);
2421         if (!p)
2422                 return NULL;
2423
2424         if (p->enabled == SD_EVENT_OFF)
2425                 return NULL;
2426
2427         return p;
2428 }
2429
2430 static int arm_watchdog(sd_event *e) {
2431         struct itimerspec its = {};
2432         usec_t t;
2433         int r;
2434
2435         assert(e);
2436         assert(e->watchdog_fd >= 0);
2437
2438         t = sleep_between(e,
2439                           e->watchdog_last + (e->watchdog_period / 2),
2440                           e->watchdog_last + (e->watchdog_period * 3 / 4));
2441
2442         timespec_store(&its.it_value, t);
2443
2444         /* Make sure we never set the watchdog to 0, which tells the
2445          * kernel to disable it. */
2446         if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
2447                 its.it_value.tv_nsec = 1;
2448
2449         r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
2450         if (r < 0)
2451                 return -errno;
2452
2453         return 0;
2454 }
2455
2456 static int process_watchdog(sd_event *e) {
2457         assert(e);
2458
2459         if (!e->watchdog)
2460                 return 0;
2461
2462         /* Don't notify watchdog too often */
2463         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
2464                 return 0;
2465
2466         sd_notify(false, "WATCHDOG=1");
2467         e->watchdog_last = e->timestamp.monotonic;
2468
2469         return arm_watchdog(e);
2470 }
2471
/* First stage of one loop iteration: run prepare callbacks and (re)arm the
 * per-clock timers. Returns > 0 if events are already pending, 0 if the loop
 * should go to sd_event_wait(), negative errno on failure. */
_public_ int sd_event_prepare(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        /* An exit request counts as immediately pending work. */
        if (e->exit_requested)
                goto pending;

        e->iteration++;

        e->state = SD_EVENT_PREPARING;
        r = event_prepare(e);
        e->state = SD_EVENT_INITIAL;
        if (r < 0)
                return r;

        /* Rearm each clock's timerfd for the earliest deadline. */
        r = event_arm_timer(e, &e->realtime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->monotonic);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime_alarm);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime_alarm);
        if (r < 0)
                return r;

        if (event_next_pending(e) || e->need_process_child)
                goto pending;

        e->state = SD_EVENT_ARMED;

        return 0;

pending:
        /* Something is already pending: do a zero-timeout wait, which flushes
         * kernel queues and transitions us to SD_EVENT_PENDING. Restore ARMED
         * if it found nothing after all. */
        e->state = SD_EVENT_ARMED;
        r = sd_event_wait(e, 0);
        if (r == 0)
                e->state = SD_EVENT_ARMED;

        return r;
}
2527
/* Second stage of one loop iteration: wait for events on the epoll fd (up to
 * 'timeout' usec, (uint64_t) -1 for infinity), then process timer, signal,
 * watchdog and child wake-ups. Returns 1 if something is pending for
 * sd_event_dispatch(), 0 on timeout, negative errno on failure. */
_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
        struct epoll_event *ev_queue;
        unsigned ev_queue_max;
        int r, m, i;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        /* One epoll_event slot per source is always enough; at least one. */
        ev_queue_max = MAX(e->n_sources, 1u);
        ev_queue = newa(struct epoll_event, ev_queue_max);

        /* Round the usec timeout up to whole milliseconds for epoll_wait(). */
        m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max,
                       timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
        if (m < 0) {
                if (errno == EINTR) {
                        /* Treat interruption as "something pending", so the
                         * caller re-enters the loop promptly. */
                        e->state = SD_EVENT_PENDING;
                        return 1;
                }

                r = -errno;
                goto finish;
        }

        /* Snapshot the current time once for this iteration. */
        triple_timestamp_get(&e->timestamp);

        for (i = 0; i < m; i++) {

                /* The watchdog fd is registered with a sentinel pointer
                 * rather than a WakeupType-tagged structure. */
                if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
                else {
                        /* All other registrations store a structure whose
                         * first field is a WakeupType discriminator. */
                        WakeupType *t = ev_queue[i].data.ptr;

                        switch (*t) {

                        case WAKEUP_EVENT_SOURCE:
                                r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = ev_queue[i].data.ptr;
                                r = flush_timer(e, d->fd, ev_queue[i].events, &d->next);
                                break;
                        }

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        default:
                                assert_not_reached("Invalid wake-up pointer");
                        }
                }
                if (r < 0)
                        goto finish;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        /* Queue all elapsed time sources, per clock. Note that the alarm
         * clocks share the realtime/boottime timestamps. */
        r = process_timer(e, e->timestamp.realtime, &e->realtime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
        if (r < 0)
                goto finish;

        if (e->need_process_child) {
                r = process_child(e);
                if (r < 0)
                        goto finish;
        }

        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;

                return 1;
        }

        r = 0;

finish:
        e->state = SD_EVENT_INITIAL;

        return r;
}
2635
2636 _public_ int sd_event_dispatch(sd_event *e) {
2637         sd_event_source *p;
2638         int r;
2639
2640         assert_return(e, -EINVAL);
2641         assert_return(e = event_resolve(e), -ENOPKG);
2642         assert_return(!event_pid_changed(e), -ECHILD);
2643         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2644         assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
2645
2646         if (e->exit_requested)
2647                 return dispatch_exit(e);
2648
2649         p = event_next_pending(e);
2650         if (p) {
2651                 sd_event_ref(e);
2652
2653                 e->state = SD_EVENT_RUNNING;
2654                 r = source_dispatch(p);
2655                 e->state = SD_EVENT_INITIAL;
2656
2657                 sd_event_unref(e);
2658
2659                 return r;
2660         }
2661
2662         e->state = SD_EVENT_INITIAL;
2663
2664         return 1;
2665 }
2666
2667 static void event_log_delays(sd_event *e) {
2668         char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1];
2669         unsigned i;
2670         int o;
2671
2672         for (i = o = 0; i < ELEMENTSOF(e->delays); i++) {
2673                 o += snprintf(&b[o], sizeof(b) - o, "%u ", e->delays[i]);
2674                 e->delays[i] = 0;
2675         }
2676         log_debug("Event loop iterations: %.*s", o, b);
2677 }
2678
/* Run a single loop iteration: prepare, wait up to 'timeout' usec, and
 * dispatch one pending source. Returns > 0 if a source was dispatched, 0 on
 * timeout, negative errno on failure. */
_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        /* Optional profiling: record the log2 of the delay since the last
         * iteration, and print the histogram every 5s. */
        if (e->profile_delays && e->last_run) {
                usec_t this_run;
                unsigned l;

                this_run = now(CLOCK_MONOTONIC);

                l = u64log2(this_run - e->last_run);
                assert(l < sizeof(e->delays));
                e->delays[l]++;

                if (this_run - e->last_log >= 5*USEC_PER_SEC) {
                        event_log_delays(e);
                        e->last_log = this_run;
                }
        }

        r = sd_event_prepare(e);
        if (r == 0)
                /* There was nothing? Then wait... */
                r = sd_event_wait(e, timeout);

        if (e->profile_delays)
                e->last_run = now(CLOCK_MONOTONIC);

        if (r > 0) {
                /* There's something now, then let's dispatch it */
                r = sd_event_dispatch(e);
                if (r < 0)
                        return r;

                return 1;
        }

        return r;
}
2723
2724 _public_ int sd_event_loop(sd_event *e) {
2725         int r;
2726
2727         assert_return(e, -EINVAL);
2728         assert_return(e = event_resolve(e), -ENOPKG);
2729         assert_return(!event_pid_changed(e), -ECHILD);
2730         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2731
2732         sd_event_ref(e);
2733
2734         while (e->state != SD_EVENT_FINISHED) {
2735                 r = sd_event_run(e, (uint64_t) -1);
2736                 if (r < 0)
2737                         goto finish;
2738         }
2739
2740         r = e->exit_code;
2741
2742 finish:
2743         sd_event_unref(e);
2744         return r;
2745 }
2746
/* Expose the loop's epoll fd, so it can be embedded in an outer poller. */
_public_ int sd_event_get_fd(sd_event *e) {

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->epoll_fd;
}
2755
/* Return the loop's current state (one of the SD_EVENT_* state constants). */
_public_ int sd_event_get_state(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->state;
}
2763
2764 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
2765         assert_return(e, -EINVAL);
2766         assert_return(e = event_resolve(e), -ENOPKG);
2767         assert_return(code, -EINVAL);
2768         assert_return(!event_pid_changed(e), -ECHILD);
2769
2770         if (!e->exit_requested)
2771                 return -ENODATA;
2772
2773         *code = e->exit_code;
2774         return 0;
2775 }
2776
2777 _public_ int sd_event_exit(sd_event *e, int code) {
2778         assert_return(e, -EINVAL);
2779         assert_return(e = event_resolve(e), -ENOPKG);
2780         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2781         assert_return(!event_pid_changed(e), -ECHILD);
2782
2783         e->exit_requested = true;
2784         e->exit_code = code;
2785
2786         return 0;
2787 }
2788
/* Return the iteration's cached timestamp for the given clock, falling back
 * to the live clock if the loop never ran. Returns 0 when the cached value
 * was used, 1 on fallback, negative errno on unsupported clocks. */
_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(usec, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Only clocks covered by the triple timestamp are supported. */
        if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
                return -EOPNOTSUPP;

        /* Generate a clean error in case CLOCK_BOOTTIME is not available. Note that don't use clock_supported() here,
         * for a reason: there are systems where CLOCK_BOOTTIME is supported, but CLOCK_BOOTTIME_ALARM is not, but for
         * the purpose of getting the time this doesn't matter. */
        if (IN_SET(clock, CLOCK_BOOTTIME, CLOCK_BOOTTIME_ALARM) && !clock_boottime_supported())
                return -EOPNOTSUPP;

        if (!triple_timestamp_is_set(&e->timestamp)) {
                /* Implicitly fall back to now() if we never ran
                 * before and thus have no cached time. */
                *usec = now(clock);
                return 1;
        }

        *usec = triple_timestamp_by_clock(&e->timestamp, clock);
        return 0;
}
2814
/* Return (a new reference to) this thread's default event loop, allocating
 * one lazily. With a NULL 'ret' this merely probes whether a default loop
 * already exists. Returns 1 when freshly allocated, 0 when reused, negative
 * errno on failure. */
_public_ int sd_event_default(sd_event **ret) {
        sd_event *e = NULL;
        int r;

        if (!ret)
                return !!default_event;

        /* Hand out another reference to the existing default. */
        if (default_event) {
                *ret = sd_event_ref(default_event);
                return 0;
        }

        r = sd_event_new(&e);
        if (r < 0)
                return r;

        /* Register as the thread's default; the back-pointer lets the
         * destructor clear the static when the loop is freed. */
        e->default_event_ptr = &default_event;
        e->tid = gettid();
        default_event = e;

        *ret = e;
        return 1;
}
2838
2839 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
2840         assert_return(e, -EINVAL);
2841         assert_return(e = event_resolve(e), -ENOPKG);
2842         assert_return(tid, -EINVAL);
2843         assert_return(!event_pid_changed(e), -ECHILD);
2844
2845         if (e->tid != 0) {
2846                 *tid = e->tid;
2847                 return 0;
2848         }
2849
2850         return -ENXIO;
2851 }
2852
/* Enable or disable sd_notify()-based watchdog keep-alive pings, driven by a
 * dedicated timerfd on the loop's epoll. Returns the new watchdog state
 * (0/1), or negative errno on failure; enabling is a no-op returning 0 when
 * the service manager did not request a watchdog. */
_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Already in the requested state. */
        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                struct epoll_event ev = {};

                /* r == 0 means no watchdog was requested via the environment;
                 * report that back without enabling anything. */
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                /* The watchdog fd is registered with a sentinel pointer, not
                 * a WakeupType-tagged structure (see sd_event_wait()). */
                ev.events = EPOLLIN;
                ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG);

                r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
                if (r < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                /* Disabling: deregister and close the timerfd, best-effort. */
                if (e->watchdog_fd >= 0) {
                        epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}
2905
/* Return whether watchdog keep-alive pings are currently enabled (0/1). */
_public_ int sd_event_get_watchdog(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->watchdog;
}
2913
2914 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
2915         assert_return(e, -EINVAL);
2916         assert_return(e = event_resolve(e), -ENOPKG);
2917         assert_return(!event_pid_changed(e), -ECHILD);
2918
2919         *ret = e->iteration;
2920         return 0;
2921 }