chiark / gitweb /
sd-event: voidify more things
[elogind.git] / src / libelogind / sd-event / sd-event.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3   This file is part of systemd.
4
5   Copyright 2013 Lennart Poettering
6 ***/
7
8 #include <sys/epoll.h>
9 #include <sys/timerfd.h>
10 #include <sys/wait.h>
11
12 #include "sd-daemon.h"
13 #include "sd-event.h"
14 #include "sd-id128.h"
15
16 #include "alloc-util.h"
17 #include "fd-util.h"
18 #include "hashmap.h"
19 #include "list.h"
20 #include "macro.h"
21 #include "missing.h"
22 #include "prioq.h"
23 #include "process-util.h"
24 #include "set.h"
25 #include "signal-util.h"
26 #include "string-table.h"
27 #include "string-util.h"
28 #include "time-util.h"
29 #include "util.h"
30
31 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
32
/* The kind of an event source. The five SOURCE_TIME_* entries each map
 * 1:1 to a kernel clock (see event_source_type_to_clock() below); the
 * enum is packed into a 5-bit bitfield in sd_event_source, so keep
 * _SOURCE_EVENT_SOURCE_TYPE_MAX within that range. */
typedef enum EventSourceType {
        SOURCE_IO,
        SOURCE_TIME_REALTIME,
        SOURCE_TIME_BOOTTIME,
        SOURCE_TIME_MONOTONIC,
        SOURCE_TIME_REALTIME_ALARM,
        SOURCE_TIME_BOOTTIME_ALARM,
        SOURCE_SIGNAL,
        SOURCE_CHILD,
        SOURCE_DEFER,
        SOURCE_POST,
        SOURCE_EXIT,
        SOURCE_WATCHDOG,
        _SOURCE_EVENT_SOURCE_TYPE_MAX,
        _SOURCE_EVENT_SOURCE_TYPE_INVALID = -1
} EventSourceType;
49
50 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
51         [SOURCE_IO] = "io",
52         [SOURCE_TIME_REALTIME] = "realtime",
53         [SOURCE_TIME_BOOTTIME] = "bootime",
54         [SOURCE_TIME_MONOTONIC] = "monotonic",
55         [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
56         [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
57         [SOURCE_SIGNAL] = "signal",
58         [SOURCE_CHILD] = "child",
59         [SOURCE_DEFER] = "defer",
60         [SOURCE_POST] = "post",
61         [SOURCE_EXIT] = "exit",
62         [SOURCE_WATCHDOG] = "watchdog",
63 };
64
65 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
66
67 /* All objects we use in epoll events start with this value, so that
68  * we know how to dispatch it */
typedef enum WakeupType {
        WAKEUP_NONE,
        WAKEUP_EVENT_SOURCE,  /* epoll data.ptr is an sd_event_source */
        WAKEUP_CLOCK_DATA,    /* epoll data.ptr is a struct clock_data */
        WAKEUP_SIGNAL_DATA,   /* epoll data.ptr is a struct signal_data */
        _WAKEUP_TYPE_MAX,
        _WAKEUP_TYPE_INVALID = -1,
} WakeupType;
77
78 #define EVENT_SOURCE_IS_TIME(t) IN_SET((t), SOURCE_TIME_REALTIME, SOURCE_TIME_BOOTTIME, SOURCE_TIME_MONOTONIC, SOURCE_TIME_REALTIME_ALARM, SOURCE_TIME_BOOTTIME_ALARM)
79
struct sd_event_source {
        /* Must come first: epoll's data.ptr points at this object and the
         * dispatcher reads this header field to tell the object kinds apart. */
        WakeupType wakeup;

        unsigned n_ref;

        sd_event *event;            /* loop we are attached to; NULL once disconnected */
        void *userdata;
        sd_event_handler_t prepare; /* optional callback invoked before each poll */

        char *description;          /* owned string, used in log messages; may be NULL */

        EventSourceType type:5;
        int enabled:3;              /* SD_EVENT_OFF / SD_EVENT_ON / SD_EVENT_ONESHOT */
        bool pending:1;             /* queued in event->pending */
        bool dispatching:1;
        bool floating:1;            /* owned by the loop itself, holds no loop ref */

        int64_t priority;
        unsigned pending_index;     /* prioq bookkeeping slots */
        unsigned prepare_index;
        uint64_t pending_iteration; /* loop iteration counters, used for fair ordering */
        uint64_t prepare_iteration;

        LIST_FIELDS(sd_event_source, sources);

        /* Per-type payload; the valid member is selected by .type above. */
        union {
                struct {
                        sd_event_io_handler_t callback;
                        int fd;
                        uint32_t events;   /* events we registered for */
                        uint32_t revents;  /* events last reported by epoll */
                        bool registered:1; /* currently added to the epoll fd */
                        bool owned:1;      /* we close the fd when the source dies */
                } io;
                struct {
                        sd_event_time_handler_t callback;
                        usec_t next, accuracy; /* earliest wakeup + allowed slack */
                        unsigned earliest_index;
                        unsigned latest_index;
                } time;
                struct {
                        sd_event_signal_handler_t callback;
                        struct signalfd_siginfo siginfo;
                        int sig;
                } signal;
                struct {
                        sd_event_child_handler_t callback;
                        siginfo_t siginfo;
                        pid_t pid;
                        int options; /* waitid() options (WEXITED, ...) */
                } child;
                struct {
                        sd_event_handler_t callback;
                } defer;
                struct {
                        sd_event_handler_t callback;
                } post;
                struct {
                        sd_event_handler_t callback;
                        unsigned prioq_index;
                } exit;
        };
};
143
struct clock_data {
        WakeupType wakeup; /* always WAKEUP_CLOCK_DATA; epoll data.ptr header */
        int fd;            /* timerfd for this clock, -1 until allocated lazily */

        /* For all clocks we maintain two priority queues each, one
         * ordered for the earliest times the events may be
         * dispatched, and one ordered by the latest times they must
         * have been dispatched. The range between the top entries in
         * the two prioqs is the time window we can freely schedule
         * wakeups in */

        Prioq *earliest;
        Prioq *latest;
        usec_t next;       /* wakeup deadline; USEC_INFINITY when unset (see sd_event_new()) */

        bool needs_rearm:1; /* set when the queues changed and the timerfd must be reprogrammed */
};
161
struct signal_data {
        WakeupType wakeup; /* always WAKEUP_SIGNAL_DATA; epoll data.ptr header */

        /* For each priority we maintain one signal fd, so that we
         * only have to dequeue a single event per priority at a
         * time. */

        int fd;                   /* signalfd covering .sigset */
        int64_t priority;         /* also the hashmap key (keyed by address of this field) */
        sigset_t sigset;          /* signals currently routed through this fd */
        sd_event_source *current; /* source whose siginfo was read but not yet dispatched */
};
174
struct sd_event {
        unsigned n_ref;

        int epoll_fd;
        int watchdog_fd;   /* timerfd used for sd_notify() watchdog pings */

        Prioq *pending;    /* sources with unprocessed events, by priority */
        Prioq *prepare;    /* sources with a prepare callback */

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;    /* boot-id-derived wakeup offset; USEC_INFINITY until computed */

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;            /* indexed by PID */
        unsigned n_enabled_child_sources;  /* keeps SIGCHLD masked while > 0 */

        Set *post_sources;

        Prioq *exit;

        pid_t original_pid; /* pid at allocation time, to detect use across fork() */

        uint64_t iteration;
        triple_timestamp timestamp;
        int state;          /* SD_EVENT_INITIAL/PREPARING/.../FINISHED */

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1; /* set via $SD_EVENT_PROFILE_DELAYS */

        int exit_code;

        pid_t tid;                    /* thread this loop is attached to, if any */
        sd_event **default_event_ptr; /* cleared on free so the thread-local doesn't dangle */

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;

        LIST_HEAD(sd_event_source, sources);

        /* Log2 histogram of dispatch latencies, for SD_EVENT_PROFILE_DELAYS */
        usec_t last_run, last_log;
        unsigned delays[sizeof(usec_t) * 8];
};
230
231 static thread_local sd_event *default_event = NULL;
232
233 static void source_disconnect(sd_event_source *s);
234
235 static sd_event *event_resolve(sd_event *e) {
236         return e == SD_EVENT_DEFAULT ? default_event : e;
237 }
238
239 static int pending_prioq_compare(const void *a, const void *b) {
240         const sd_event_source *x = a, *y = b;
241
242         assert(x->pending);
243         assert(y->pending);
244
245         /* Enabled ones first */
246         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
247                 return -1;
248         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
249                 return 1;
250
251         /* Lower priority values first */
252         if (x->priority < y->priority)
253                 return -1;
254         if (x->priority > y->priority)
255                 return 1;
256
257         /* Older entries first */
258         if (x->pending_iteration < y->pending_iteration)
259                 return -1;
260         if (x->pending_iteration > y->pending_iteration)
261                 return 1;
262
263         return 0;
264 }
265
266 static int prepare_prioq_compare(const void *a, const void *b) {
267         const sd_event_source *x = a, *y = b;
268
269         assert(x->prepare);
270         assert(y->prepare);
271
272         /* Enabled ones first */
273         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
274                 return -1;
275         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
276                 return 1;
277
278         /* Move most recently prepared ones last, so that we can stop
279          * preparing as soon as we hit one that has already been
280          * prepared in the current iteration */
281         if (x->prepare_iteration < y->prepare_iteration)
282                 return -1;
283         if (x->prepare_iteration > y->prepare_iteration)
284                 return 1;
285
286         /* Lower priority values first */
287         if (x->priority < y->priority)
288                 return -1;
289         if (x->priority > y->priority)
290                 return 1;
291
292         return 0;
293 }
294
295 static int earliest_time_prioq_compare(const void *a, const void *b) {
296         const sd_event_source *x = a, *y = b;
297
298         assert(EVENT_SOURCE_IS_TIME(x->type));
299         assert(x->type == y->type);
300
301         /* Enabled ones first */
302         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
303                 return -1;
304         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
305                 return 1;
306
307         /* Move the pending ones to the end */
308         if (!x->pending && y->pending)
309                 return -1;
310         if (x->pending && !y->pending)
311                 return 1;
312
313         /* Order by time */
314         if (x->time.next < y->time.next)
315                 return -1;
316         if (x->time.next > y->time.next)
317                 return 1;
318
319         return 0;
320 }
321
322 static usec_t time_event_source_latest(const sd_event_source *s) {
323         return usec_add(s->time.next, s->time.accuracy);
324 }
325
326 static int latest_time_prioq_compare(const void *a, const void *b) {
327         const sd_event_source *x = a, *y = b;
328
329         assert(EVENT_SOURCE_IS_TIME(x->type));
330         assert(x->type == y->type);
331
332         /* Enabled ones first */
333         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
334                 return -1;
335         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
336                 return 1;
337
338         /* Move the pending ones to the end */
339         if (!x->pending && y->pending)
340                 return -1;
341         if (x->pending && !y->pending)
342                 return 1;
343
344         /* Order by time */
345         if (time_event_source_latest(x) < time_event_source_latest(y))
346                 return -1;
347         if (time_event_source_latest(x) > time_event_source_latest(y))
348                 return 1;
349
350         return 0;
351 }
352
353 static int exit_prioq_compare(const void *a, const void *b) {
354         const sd_event_source *x = a, *y = b;
355
356         assert(x->type == SOURCE_EXIT);
357         assert(y->type == SOURCE_EXIT);
358
359         /* Enabled ones first */
360         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
361                 return -1;
362         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
363                 return 1;
364
365         /* Lower priority values first */
366         if (x->priority < y->priority)
367                 return -1;
368         if (x->priority > y->priority)
369                 return 1;
370
371         return 0;
372 }
373
374 static void free_clock_data(struct clock_data *d) {
375         assert(d);
376         assert(d->wakeup == WAKEUP_CLOCK_DATA);
377
378         safe_close(d->fd);
379         prioq_free(d->earliest);
380         prioq_free(d->latest);
381 }
382
/* Destroys the loop object. Only called with a zero refcount, hence every
 * remaining source must be floating (non-floating sources pin the loop). */
static void event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        /* Disconnect and drop all remaining (floating) sources first, since
         * disconnecting touches the per-type bookkeeping freed below. */
        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        /* Make sure the thread-local default pointer doesn't dangle */
        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);
        free(e);
}
419
/* Allocates a new event loop object with a refcount of 1. Returns 0 on
 * success, -ENOMEM/-errno on failure. On failure nothing is leaked:
 * event_free() handles the partially initialized object. */
_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new0(sd_event, 1);
        if (!e)
                return -ENOMEM;

        e->n_ref = 1;
        /* Initialize all fds to -1 and all clock deadlines to "none", so that
         * event_free() on the error path below is safe at any point. */
        e->watchdog_fd = e->epoll_fd = e->realtime.fd = e->boottime.fd = e->monotonic.fd = e->realtime_alarm.fd = e->boottime_alarm.fd = -1;
        e->realtime.next = e->boottime.next = e->monotonic.next = e->realtime_alarm.next = e->boottime_alarm.next = USEC_INFINITY;
        e->realtime.wakeup = e->boottime.wakeup = e->monotonic.wakeup = e->realtime_alarm.wakeup = e->boottime_alarm.wakeup = WAKEUP_CLOCK_DATA;
        e->original_pid = getpid_cached();
        e->perturb = USEC_INFINITY;

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        /* Keep fds 0-2 free so accidental stdio writes can't hit our epoll fd */
        e->epoll_fd = fd_move_above_stdio(e->epoll_fd);

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 ... 2^63 us will be logged every 5s.");
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}
461
462 _public_ sd_event* sd_event_ref(sd_event *e) {
463
464         if (!e)
465                 return NULL;
466
467         assert(e->n_ref >= 1);
468         e->n_ref++;
469
470         return e;
471 }
472
473 _public_ sd_event* sd_event_unref(sd_event *e) {
474
475         if (!e)
476                 return NULL;
477
478         assert(e->n_ref >= 1);
479         e->n_ref--;
480
481         if (e->n_ref <= 0)
482                 event_free(e);
483
484         return NULL;
485 }
486
487 static bool event_pid_changed(sd_event *e) {
488         assert(e);
489
490         /* We don't support people creating an event loop and keeping
491          * it around over a fork(). Let's complain. */
492
493         return e->original_pid != getpid_cached();
494 }
495
496 static void source_io_unregister(sd_event_source *s) {
497         int r;
498
499         assert(s);
500         assert(s->type == SOURCE_IO);
501
502         if (event_pid_changed(s->event))
503                 return;
504
505         if (!s->io.registered)
506                 return;
507
508         r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
509         if (r < 0)
510                 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m",
511                                 strna(s->description), event_source_type_to_string(s->type));
512
513         s->io.registered = false;
514 }
515
516 static int source_io_register(
517                 sd_event_source *s,
518                 int enabled,
519                 uint32_t events) {
520
521         struct epoll_event ev;
522         int r;
523
524         assert(s);
525         assert(s->type == SOURCE_IO);
526         assert(enabled != SD_EVENT_OFF);
527
528         ev = (struct epoll_event) {
529                 .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
530                 .data.ptr = s,
531         };
532
533         if (s->io.registered)
534                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
535         else
536                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
537         if (r < 0)
538                 return -errno;
539
540         s->io.registered = true;
541
542         return 0;
543 }
544
545 static clockid_t event_source_type_to_clock(EventSourceType t) {
546
547         switch (t) {
548
549         case SOURCE_TIME_REALTIME:
550                 return CLOCK_REALTIME;
551
552         case SOURCE_TIME_BOOTTIME:
553                 return CLOCK_BOOTTIME;
554
555         case SOURCE_TIME_MONOTONIC:
556                 return CLOCK_MONOTONIC;
557
558         case SOURCE_TIME_REALTIME_ALARM:
559                 return CLOCK_REALTIME_ALARM;
560
561         case SOURCE_TIME_BOOTTIME_ALARM:
562                 return CLOCK_BOOTTIME_ALARM;
563
564         default:
565                 return (clockid_t) -1;
566         }
567 }
568
569 static EventSourceType clock_to_event_source_type(clockid_t clock) {
570
571         switch (clock) {
572
573         case CLOCK_REALTIME:
574                 return SOURCE_TIME_REALTIME;
575
576         case CLOCK_BOOTTIME:
577                 return SOURCE_TIME_BOOTTIME;
578
579         case CLOCK_MONOTONIC:
580                 return SOURCE_TIME_MONOTONIC;
581
582         case CLOCK_REALTIME_ALARM:
583                 return SOURCE_TIME_REALTIME_ALARM;
584
585         case CLOCK_BOOTTIME_ALARM:
586                 return SOURCE_TIME_BOOTTIME_ALARM;
587
588         default:
589                 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
590         }
591 }
592
593 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
594         assert(e);
595
596         switch (t) {
597
598         case SOURCE_TIME_REALTIME:
599                 return &e->realtime;
600
601         case SOURCE_TIME_BOOTTIME:
602                 return &e->boottime;
603
604         case SOURCE_TIME_MONOTONIC:
605                 return &e->monotonic;
606
607         case SOURCE_TIME_REALTIME_ALARM:
608                 return &e->realtime_alarm;
609
610         case SOURCE_TIME_BOOTTIME_ALARM:
611                 return &e->boottime_alarm;
612
613         default:
614                 return NULL;
615         }
616 }
617
/* Ensures there is a signal_data object (and signalfd) covering signal
 * `sig` at the priority of its event source (or the normal priority if
 * there is none yet). Returns it in *ret if ret is non-NULL. */
static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct epoll_event ev;
        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_pid_changed(e))
                return -ECHILD;

        /* Pick the priority bucket this signal belongs into */
        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = SD_EVENT_PRIORITY_NORMAL;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                /* Existing bucket already covers the signal: nothing to do */
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                r = hashmap_ensure_allocated(&e->signal_data, &uint64_hash_ops);
                if (r < 0)
                        return r;

                d = new0(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                d->wakeup = WAKEUP_SIGNAL_DATA;
                d->fd  = -1;
                d->priority = priority;

                /* Keyed by the priority field inside the object itself */
                r = hashmap_put(e->signal_data, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        /* Extend the mask on a copy first, so the stored mask stays accurate
         * if signalfd() fails */
        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        /* With d->fd == -1 this allocates a new signalfd; otherwise it
         * updates the mask of the existing one in place */
        r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        /* Pre-existing fd: it is already registered with epoll, we're done */
        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        /* New fd (returned in r): keep it out of the stdio range and add it
         * to the epoll instance */
        d->fd = fd_move_above_stdio(r);

        ev = (struct epoll_event) {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev);
        if (r < 0)  {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        /* Only roll back structures we created in this call */
        if (added) {
                d->fd = safe_close(d->fd);
                hashmap_remove(e->signal_data, &d->priority);
                free(d);
        }

        return r;
}
713
/* Turns off the specified signal in the signal data object. If the
 * signal mask of the object becomes empty that way, the object (and its
 * signalfd) is removed and freed — the caller's `d` is then dangling. */
static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Not part of this bucket's mask: nothing to do */
        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {

                /* If all the mask is all-zero we can get rid of the structure */
                hashmap_remove(e->signal_data, &d->priority);
                safe_close(d->fd);
                free(d);
                return;
        }

        assert(d->fd >= 0);

        /* Shrink the kernel-side mask of the existing signalfd in place */
        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}
741
742 static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
743         struct signal_data *d;
744         static const int64_t zero_priority = 0;
745
746         assert(e);
747
748         /* Rechecks if the specified signal is still something we are
749          * interested in. If not, we'll unmask it, and possibly drop
750          * the signalfd for it. */
751
752         if (sig == SIGCHLD &&
753             e->n_enabled_child_sources > 0)
754                 return;
755
756         if (e->signal_sources &&
757             e->signal_sources[sig] &&
758             e->signal_sources[sig]->enabled != SD_EVENT_OFF)
759                 return;
760
761         /*
762          * The specified signal might be enabled in three different queues:
763          *
764          * 1) the one that belongs to the priority passed (if it is non-NULL)
765          * 2) the one that belongs to the priority of the event source of the signal (if there is one)
766          * 3) the 0 priority (to cover the SIGCHLD case)
767          *
768          * Hence, let's remove it from all three here.
769          */
770
771         if (priority) {
772                 d = hashmap_get(e->signal_data, priority);
773                 if (d)
774                         event_unmask_signal_data(e, d, sig);
775         }
776
777         if (e->signal_sources && e->signal_sources[sig]) {
778                 d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
779                 if (d)
780                         event_unmask_signal_data(e, d, sig);
781         }
782
783         d = hashmap_get(e->signal_data, &zero_priority);
784         if (d)
785                 event_unmask_signal_data(e, d, sig);
786 }
787
/* Detaches a source from its event loop: undoes all per-type registration
 * (epoll, prioqs, hashmaps, signal masks), removes it from the loop's
 * source list and drops the loop reference it held (unless floating).
 * Safe to call on an already-disconnected source. */
static void source_disconnect(sd_event_source *s) {
        sd_event *event;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM: {
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_remove(d->earliest, s, &s->time.earliest_index);
                prioq_remove(d->latest, s, &s->time.latest_index);
                /* The top of the queues changed, the timerfd must be re-armed */
                d->needs_rearm = true;
                break;
        }

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        /* Possibly unmask the signal / drop its signalfd */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                }

                break;

        case SOURCE_CHILD:
                if (s->child.pid > 0) {
                        /* Keep the enabled-children counter in sync; it gates
                         * whether SIGCHLD stays masked */
                        if (s->enabled != SD_EVENT_OFF) {
                                assert(s->event->n_enabled_child_sources > 0);
                                s->event->n_enabled_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                }

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        default:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        event = s->event;

        s->type = _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        s->event = NULL;
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Non-floating sources pinned the loop; release that pin last, since
         * it may free the loop object itself */
        if (!s->floating)
                sd_event_unref(event);
}
878
879 static void source_free(sd_event_source *s) {
880         assert(s);
881
882         source_disconnect(s);
883
884         if (s->type == SOURCE_IO && s->io.owned)
885                 safe_close(s->io.fd);
886
887         free(s->description);
888         free(s);
889 }
890
/* Marks a source as pending (queued for dispatch) or clears that state,
 * keeping the loop's pending prioq and the per-type bookkeeping in sync.
 * Returns 0 on success or a negative errno. */
static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        /* Exit sources live in their own prioq and are never "pending" */
        assert(s->type != SOURCE_EXIT);

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                /* Record when the source became pending, for fair ordering */
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type)) {
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                /* Pending-ness is part of the time prioq ordering, so the
                 * source's position must be recomputed */
                prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                prioq_reshuffle(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
        }

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                /* Drop the bucket's "current" pointer if it refers to us */
                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        return 0;
}
934
935 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
936         sd_event_source *s;
937
938         assert(e);
939
940         s = new0(sd_event_source, 1);
941         if (!s)
942                 return NULL;
943
944         s->n_ref = 1;
945         s->event = e;
946         s->floating = floating;
947         s->type = type;
948         s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;
949
950         if (!floating)
951                 sd_event_ref(e);
952
953         LIST_PREPEND(sources, e->sources, s);
954         e->n_sources++;
955
956         return s;
957 }
958
959 _public_ int sd_event_add_io(
960                 sd_event *e,
961                 sd_event_source **ret,
962                 int fd,
963                 uint32_t events,
964                 sd_event_io_handler_t callback,
965                 void *userdata) {
966
967         sd_event_source *s;
968         int r;
969
970         assert_return(e, -EINVAL);
971         assert_return(e = event_resolve(e), -ENOPKG);
972         assert_return(fd >= 0, -EBADF);
973         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
974         assert_return(callback, -EINVAL);
975         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
976         assert_return(!event_pid_changed(e), -ECHILD);
977
978         s = source_new(e, !ret, SOURCE_IO);
979         if (!s)
980                 return -ENOMEM;
981
982         s->wakeup = WAKEUP_EVENT_SOURCE;
983         s->io.fd = fd;
984         s->io.events = events;
985         s->io.callback = callback;
986         s->userdata = userdata;
987         s->enabled = SD_EVENT_ON;
988
989         r = source_io_register(s, s->enabled, events);
990         if (r < 0) {
991                 source_free(s);
992                 return r;
993         }
994
995         if (ret)
996                 *ret = s;
997
998         return 0;
999 }
1000
/* Computes e->perturb lazily, once per loop (it stays USEC_INFINITY if
 * the boot ID cannot be determined, and is then retried on next call). */
static void initialize_perturb(sd_event *e) {
        sd_id128_t bootid = {};

        /* When we sleep for longer, we try to realign the wakeup to
           the same time within each minute/second/250ms, so that
           events all across the system can be coalesced into a single
           CPU wakeup. However, let's take some system-specific
           randomness for this value, so that in a network of systems
           with synced clocks timer events are distributed a
           bit. Here, we calculate a perturbation usec offset from the
           boot ID. */

        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        if (sd_id128_get_boot(&bootid) >= 0)
                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
}
1019
/* Lazily creates the timerfd backing the given clock_data and registers it
 * in the event loop's epoll instance. Returns 0 on success (including when
 * the fd already exists), a negative errno on failure. */
static int event_setup_timer_fd(
                sd_event *e,
                struct clock_data *d,
                clockid_t clock) {

        struct epoll_event ev;
        int r, fd;

        assert(e);
        assert(d);

        /* Already set up? Then there's nothing to do. */
        if (_likely_(d->fd >= 0))
                return 0;

        fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
        if (fd < 0)
                return -errno;

        /* Keep fds 0/1/2 free, so stdio redirection can't collide with ours. */
        fd = fd_move_above_stdio(fd);

        ev = (struct epoll_event) {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
        if (r < 0) {
                /* NOTE(review): assumes safe_close() preserves errno from
                 * epoll_ctl() — confirm against fd-util. */
                safe_close(fd);
                return -errno;
        }

        d->fd = fd;
        return 0;
}
1054
1055 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1056         assert(s);
1057
1058         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1059 }
1060
/* Adds a timer event source for the given clock, firing at absolute time
 * 'usec' with the given accuracy (0 selects DEFAULT_ACCURACY_USEC;
 * (uint64_t) -1 is rejected). A NULL callback installs a default handler
 * that exits the event loop with the code from userdata. */
_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        sd_event_source *s;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(accuracy != (uint64_t) -1, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
                return -EOPNOTSUPP;

        type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
        if (type < 0)
                return -EOPNOTSUPP;

        if (!callback)
                callback = time_exit_callback;

        d = event_get_clock_data(e, type);
        assert(d);

        /* Make sure both per-clock priority queues exist before allocating
         * the source, so failures here need no unwinding. */
        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        /* Lazily create the timerfd for this clock on first use. */
        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        /* If the caller doesn't want a reference back, the source is created
         * "floating", owned by the event loop. */
        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        d->needs_rearm = true;

        r = prioq_put(d->earliest, s, &s->time.earliest_index);
        if (r < 0)
                goto fail;

        r = prioq_put(d->latest, s, &s->time.latest_index);
        if (r < 0)
                goto fail;

        if (ret)
                *ret = s;

        return 0;

fail:
        source_free(s);
        return r;
}
1138
1139 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1140         assert(s);
1141
1142         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1143 }
1144
/* Adds an event source that fires when the given UNIX signal arrives. The
 * signal must already be blocked in the calling thread (checked against the
 * current sigmask below), and at most one source per signal may exist per
 * event loop. A NULL callback installs a default handler that exits the
 * event loop with the code from userdata. */
_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        struct signal_data *d;
        sigset_t ss;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(SIGNAL_VALID(sig), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = signal_exit_callback;

        /* pthread_sigmask() returns a positive errno-style value, hence negate. */
        r = pthread_sigmask(SIG_SETMASK, NULL, &ss);
        if (r != 0)
                return -r;

        /* Refuse signals the caller has not blocked. */
        if (!sigismember(&ss, sig))
                return -EBUSY;

        /* Lazily allocate the per-signal source table, one slot per signal number. */
        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        /* Set up the signal backend state for this signal; on failure the
         * source is freed (which presumably also clears the table slot via
         * source_free() — TODO confirm in source_disconnect()). */
        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                source_free(s);
                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;

        return 0;
}
1205
/* Adds an event source watching state changes of the child process with the
 * given PID (> 1; watching init is refused). options combines WEXITED,
 * WSTOPPED, WCONTINUED — at least one must be set. Only one source per PID
 * is allowed per event loop. Child sources are driven by SIGCHLD, hence the
 * SIGCHLD signal backend is set up here too. */
_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        /* At most one watcher per child PID. */
        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0) {
                source_free(s);
                return r;
        }

        e->n_enabled_child_sources++;

        /* Hook up SIGCHLD; unwind the enabled counter if that fails. */
        r = event_make_signal_data(e, SIGCHLD, NULL);
        if (r < 0) {
                e->n_enabled_child_sources--;
                source_free(s);
                return r;
        }

        e->need_process_child = true;

        if (ret)
                *ret = s;

        return 0;
}
1265
1266 _public_ int sd_event_add_defer(
1267                 sd_event *e,
1268                 sd_event_source **ret,
1269                 sd_event_handler_t callback,
1270                 void *userdata) {
1271
1272         sd_event_source *s;
1273         int r;
1274
1275         assert_return(e, -EINVAL);
1276         assert_return(e = event_resolve(e), -ENOPKG);
1277         assert_return(callback, -EINVAL);
1278         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1279         assert_return(!event_pid_changed(e), -ECHILD);
1280
1281         s = source_new(e, !ret, SOURCE_DEFER);
1282         if (!s)
1283                 return -ENOMEM;
1284
1285         s->defer.callback = callback;
1286         s->userdata = userdata;
1287         s->enabled = SD_EVENT_ONESHOT;
1288
1289         r = source_set_pending(s, true);
1290         if (r < 0) {
1291                 source_free(s);
1292                 return r;
1293         }
1294
1295         if (ret)
1296                 *ret = s;
1297
1298         return 0;
1299 }
1300
1301 _public_ int sd_event_add_post(
1302                 sd_event *e,
1303                 sd_event_source **ret,
1304                 sd_event_handler_t callback,
1305                 void *userdata) {
1306
1307         sd_event_source *s;
1308         int r;
1309
1310         assert_return(e, -EINVAL);
1311         assert_return(e = event_resolve(e), -ENOPKG);
1312         assert_return(callback, -EINVAL);
1313         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1314         assert_return(!event_pid_changed(e), -ECHILD);
1315
1316         r = set_ensure_allocated(&e->post_sources, NULL);
1317         if (r < 0)
1318                 return r;
1319
1320         s = source_new(e, !ret, SOURCE_POST);
1321         if (!s)
1322                 return -ENOMEM;
1323
1324         s->post.callback = callback;
1325         s->userdata = userdata;
1326         s->enabled = SD_EVENT_ON;
1327
1328         r = set_put(e->post_sources, s);
1329         if (r < 0) {
1330                 source_free(s);
1331                 return r;
1332         }
1333
1334         if (ret)
1335                 *ret = s;
1336
1337         return 0;
1338 }
1339
1340 _public_ int sd_event_add_exit(
1341                 sd_event *e,
1342                 sd_event_source **ret,
1343                 sd_event_handler_t callback,
1344                 void *userdata) {
1345
1346         sd_event_source *s;
1347         int r;
1348
1349         assert_return(e, -EINVAL);
1350         assert_return(e = event_resolve(e), -ENOPKG);
1351         assert_return(callback, -EINVAL);
1352         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1353         assert_return(!event_pid_changed(e), -ECHILD);
1354
1355         r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1356         if (r < 0)
1357                 return r;
1358
1359         s = source_new(e, !ret, SOURCE_EXIT);
1360         if (!s)
1361                 return -ENOMEM;
1362
1363         s->exit.callback = callback;
1364         s->userdata = userdata;
1365         s->exit.prioq_index = PRIOQ_IDX_NULL;
1366         s->enabled = SD_EVENT_ONESHOT;
1367
1368         r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1369         if (r < 0) {
1370                 source_free(s);
1371                 return r;
1372         }
1373
1374         if (ret)
1375                 *ret = s;
1376
1377         return 0;
1378 }
1379
1380 _public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
1381
1382         if (!s)
1383                 return NULL;
1384
1385         assert(s->n_ref >= 1);
1386         s->n_ref++;
1387
1388         return s;
1389 }
1390
/* Drops one reference from the event source, destroying it when the count
 * reaches zero. NULL is accepted as a no-op; always returns NULL. */
_public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {

        if (!s)
                return NULL;

        assert(s->n_ref >= 1);
        s->n_ref--;

        if (s->n_ref <= 0) {
                /* Here's a special hack: when we are called from a
                 * dispatch handler we won't free the event source
                 * immediately, but we will detach the fd from the
                 * epoll. This way it is safe for the caller to unref
                 * the event source and immediately close the fd, but
                 * we still retain a valid event source object after
                 * the callback. */

                if (s->dispatching) {
                        if (s->type == SOURCE_IO)
                                source_io_unregister(s);

                        source_disconnect(s);
                } else
                        source_free(s);
        }

        return NULL;
}
1419
/* Sets (or, with NULL, clears) the free-form description of the event
 * source, used for logging and debugging. */
_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
        assert_return(s, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return free_and_strdup(&s->description, description);
}
1426
/* Returns the description string previously set; -ENXIO if none is set.
 * The returned pointer stays owned by the event source. */
_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
        assert_return(s, -EINVAL);
        assert_return(description, -EINVAL);
        assert_return(s->description, -ENXIO);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *description = s->description;
        return 0;
}
1436
/* Returns the event loop the source is attached to (no new reference is taken). */
_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
        assert_return(s, NULL);

        return s->event;
}
1442
/* Returns whether the source is currently pending dispatch (> 0 if so).
 * Not defined for exit sources. */
_public_ int sd_event_source_get_pending(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->pending;
}
1451
/* Returns the file descriptor an I/O event source watches. */
_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->io.fd;
}
1459
1460 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
1461         int r;
1462
1463         assert_return(s, -EINVAL);
1464         assert_return(fd >= 0, -EBADF);
1465         assert_return(s->type == SOURCE_IO, -EDOM);
1466         assert_return(!event_pid_changed(s->event), -ECHILD);
1467
1468         if (s->io.fd == fd)
1469                 return 0;
1470
1471         if (s->enabled == SD_EVENT_OFF) {
1472                 s->io.fd = fd;
1473                 s->io.registered = false;
1474         } else {
1475                 int saved_fd;
1476
1477                 saved_fd = s->io.fd;
1478                 assert(s->io.registered);
1479
1480                 s->io.fd = fd;
1481                 s->io.registered = false;
1482
1483                 r = source_io_register(s, s->enabled, s->io.events);
1484                 if (r < 0) {
1485                         s->io.fd = saved_fd;
1486                         s->io.registered = true;
1487                         return r;
1488                 }
1489
1490                 epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
1491         }
1492
1493         return 0;
1494 }
1495
/* Returns whether the I/O source owns its fd (i.e. will close it on free). */
_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);

        return s->io.owned;
}
1502
/* Controls whether the I/O source owns its fd (i.e. closes it on free). */
_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);

        s->io.owned = own;
        return 0;
}
1510
/* Returns the epoll event mask the I/O source is configured for. */
_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
        assert_return(s, -EINVAL);
        assert_return(events, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *events = s->io.events;
        return 0;
}
1520
/* Changes the epoll event mask the I/O source watches. Re-registers with
 * the kernel immediately if the source is enabled. */
_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* edge-triggered updates are never skipped, so we can reset edges */
        if (s->io.events == events && !(events & EPOLLET))
                return 0;

        /* Whatever made the source pending no longer applies under a new mask. */
        r = source_set_pending(s, false);
        if (r < 0)
                return r;

        if (s->enabled != SD_EVENT_OFF) {
                r = source_io_register(s, s->enabled, events);
                if (r < 0)
                        return r;
        }

        /* Only commit the new mask once the kernel registration succeeded. */
        s->io.events = events;

        return 0;
}
1548
/* Returns the epoll events that actually triggered; only valid while the
 * source is pending. */
_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
        assert_return(s, -EINVAL);
        assert_return(revents, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(s->pending, -ENODATA);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *revents = s->io.revents;
        return 0;
}
1559
/* Returns the signal number a signal event source watches. */
_public_ int sd_event_source_get_signal(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->signal.sig;
}
1567
1568 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
1569         assert_return(s, -EINVAL);
1570         assert_return(!event_pid_changed(s->event), -ECHILD);
1571
1572         *priority = s->priority;
1573         return 0;
1574 }
1575
/* Changes the dispatch priority of the event source. Enabled signal sources
 * are migrated between the per-priority signalfd instances; all priority
 * queues the source sits in are reshuffled afterwards. */
_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->priority == priority)
                return 0;

        if (s->type == SOURCE_SIGNAL && s->enabled != SD_EVENT_OFF) {
                struct signal_data *old, *d;

                /* Move us from the signalfd belonging to the old
                 * priority to the signalfd of the new priority */

                assert_se(old = hashmap_get(s->event->signal_data, &s->priority));

                s->priority = priority;

                r = event_make_signal_data(s->event, s->signal.sig, &d);
                if (r < 0) {
                        /* Roll back to the old priority on failure. */
                        s->priority = old->priority;
                        return r;
                }

                event_unmask_signal_data(s->event, old, s->signal.sig);
        } else
                s->priority = priority;

        /* A different priority may move the source within its queues. */
        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);

        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        return 0;
}
1617
/* Returns the enablement state (SD_EVENT_OFF/ON/ONESHOT) via *m. */
_public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
        assert_return(s, -EINVAL);
        assert_return(m, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *m = s->enabled;
        return 0;
}
1626
/* Enables or disables the event source. m is one of SD_EVENT_OFF,
 * SD_EVENT_ON, SD_EVENT_ONESHOT. Besides flipping the flag, this registers
 * or unregisters the source with the type-specific backend (epoll, timer
 * priority queues, signal data, SIGCHLD accounting, exit queue). */
_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* If we are dead anyway, we are fine with turning off
         * sources, but everything else needs to fail. */
        if (s->event->state == SD_EVENT_FINISHED)
                return m == SD_EVENT_OFF ? 0 : -ESTALE;

        if (s->enabled == m)
                return 0;

        if (m == SD_EVENT_OFF) {

                /* Unset the pending flag when this event source is disabled */
                if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                        r = source_set_pending(s, false);
                        if (r < 0)
                                return r;
                }

                switch (s->type) {

                case SOURCE_IO:
                        source_io_unregister(s);
                        s->enabled = m;
                        break;

                case SOURCE_TIME_REALTIME:
                case SOURCE_TIME_BOOTTIME:
                case SOURCE_TIME_MONOTONIC:
                case SOURCE_TIME_REALTIME_ALARM:
                case SOURCE_TIME_BOOTTIME_ALARM: {
                        struct clock_data *d;

                        /* Timer sources stay in the queues; reshuffling and
                         * marking the clock for rearm is sufficient. */
                        s->enabled = m;
                        d = event_get_clock_data(s->event, s->type);
                        assert(d);

                        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(d->latest, s, &s->time.latest_index);
                        d->needs_rearm = true;
                        break;
                }

                case SOURCE_SIGNAL:
                        s->enabled = m;

                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                        break;

                case SOURCE_CHILD:
                        s->enabled = m;

                        assert(s->event->n_enabled_child_sources > 0);
                        s->event->n_enabled_child_sources--;

                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                        break;

                case SOURCE_EXIT:
                        s->enabled = m;
                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                        break;

                case SOURCE_DEFER:
                case SOURCE_POST:
                        s->enabled = m;
                        break;

                default:
                        assert_not_reached("Wut? I shouldn't exist.");
                }

        } else {

                /* Unset the pending flag when this event source is enabled */
                if (s->enabled == SD_EVENT_OFF && !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                        r = source_set_pending(s, false);
                        if (r < 0)
                                return r;
                }

                switch (s->type) {

                case SOURCE_IO:
                        r = source_io_register(s, m, s->io.events);
                        if (r < 0)
                                return r;

                        s->enabled = m;
                        break;

                case SOURCE_TIME_REALTIME:
                case SOURCE_TIME_BOOTTIME:
                case SOURCE_TIME_MONOTONIC:
                case SOURCE_TIME_REALTIME_ALARM:
                case SOURCE_TIME_BOOTTIME_ALARM: {
                        struct clock_data *d;

                        s->enabled = m;
                        d = event_get_clock_data(s->event, s->type);
                        assert(d);

                        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(d->latest, s, &s->time.latest_index);
                        d->needs_rearm = true;
                        break;
                }

                case SOURCE_SIGNAL:

                        s->enabled = m;

                        r = event_make_signal_data(s->event, s->signal.sig, NULL);
                        if (r < 0) {
                                /* Roll back: disable again and drop any partial
                                 * signal backend state we may have created. */
                                s->enabled = SD_EVENT_OFF;
                                event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                                return r;
                        }

                        break;

                case SOURCE_CHILD:

                        if (s->enabled == SD_EVENT_OFF)
                                s->event->n_enabled_child_sources++;

                        s->enabled = m;

                        r = event_make_signal_data(s->event, SIGCHLD, NULL);
                        if (r < 0) {
                                /* Roll back the counter bump and state changes. */
                                s->enabled = SD_EVENT_OFF;
                                s->event->n_enabled_child_sources--;
                                event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                                return r;
                        }

                        break;

                case SOURCE_EXIT:
                        s->enabled = m;
                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                        break;

                case SOURCE_DEFER:
                case SOURCE_POST:
                        s->enabled = m;
                        break;

                default:
                        assert_not_reached("Wut? I shouldn't exist.");
                }
        }

        /* The enablement state influences queue ordering, so reshuffle. */
        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);

        return 0;
}
1793
/* Returns the absolute trigger time of a timer event source via *usec. */
_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.next;
        return 0;
}
1803
/* Changes the absolute trigger time of a timer event source and reshuffles
 * the per-clock priority queues accordingly. */
_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
        struct clock_data *d;
        int r;

        assert_return(s, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* A pending dispatch refers to the old deadline; drop it first. */
        r = source_set_pending(s, false);
        if (r < 0)
                return r;

        s->time.next = usec;

        d = event_get_clock_data(s->event, s->type);
        assert(d);

        /* The new deadline may move this source within both queues. */
        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
        prioq_reshuffle(d->latest, s, &s->time.latest_index);
        d->needs_rearm = true;

        return 0;
}
1828
/* Returns the accuracy (coalescing slack, in usec) of a timer source. */
_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.accuracy;
        return 0;
}
1838
/* Changes the accuracy of a timer source; 0 selects DEFAULT_ACCURACY_USEC,
 * (uint64_t) -1 is rejected. Only the "latest" queue depends on accuracy,
 * hence only it is reshuffled. */
_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
        struct clock_data *d;
        int r;

        assert_return(s, -EINVAL);
        assert_return(usec != (uint64_t) -1, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* A pending dispatch refers to the old parameters; drop it first. */
        r = source_set_pending(s, false);
        if (r < 0)
                return r;

        if (usec == 0)
                usec = DEFAULT_ACCURACY_USEC;

        s->time.accuracy = usec;

        d = event_get_clock_data(s->event, s->type);
        assert(d);

        prioq_reshuffle(d->latest, s, &s->time.latest_index);
        d->needs_rearm = true;

        return 0;
}
1866
/* Returns the clockid_t a timer event source is based on. */
_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
        assert_return(s, -EINVAL);
        assert_return(clock, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *clock = event_source_type_to_clock(s->type);
        return 0;
}
1876
/* Returns the PID a child event source watches. */
_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
        assert_return(s, -EINVAL);
        assert_return(pid, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *pid = s->child.pid;
        return 0;
}
1886
/* Installs (or, with NULL, removes) a preparation callback on the event
 * source, tracked in the event loop's prepare priority queue. Not allowed
 * on exit sources. */
_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->prepare == callback)
                return 0;

        /* Swapping one callback for another: the source is already in the
         * prepare queue, only the function pointer changes. */
        if (callback && s->prepare) {
                s->prepare = callback;
                return 0;
        }

        r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
        if (r < 0)
                return r;

        s->prepare = callback;

        /* Setting a callback adds the source to the queue, clearing one
         * removes it. */
        if (callback) {
                r = prioq_put(s->event->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;
        } else
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        return 0;
}
1918
/* Returns the userdata pointer associated with the event source. */
_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
        assert_return(s, NULL);

        return s->userdata;
}
1924
1925 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
1926         void *ret;
1927
1928         assert_return(s, NULL);
1929
1930         ret = s->userdata;
1931         s->userdata = userdata;
1932
1933         return ret;
1934 }
1935
/* Picks a wakeup time within [a, b], preferring later times and aligning to
 * the per-boot perturbation offset so that timers across the system tend to
 * coalesce (see the block comment below). */
static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
        usec_t c;
        assert(e);
        assert(a <= b);

        /* usec_t is unsigned, so this only triggers for a == 0: wake immediately. */
        if (a <= 0)
                return 0;
        if (a >= USEC_INFINITY)
                return USEC_INFINITY;

        /* Window too narrow to realign anything, wake at its start. */
        if (b <= a + 1)
                return a;

        /* NOTE(review): if initialize_perturb() could not determine a boot
         * ID, e->perturb may still be USEC_INFINITY here, which would wrap
         * the additions below — confirm. */
        initialize_perturb(e);

        /*
          Find a good time to wake up again between times a and b. We
          have two goals here:

          a) We want to wake up as seldom as possible, hence prefer
             later times over earlier times.

          b) But if we have to wake up, then let's make sure to
             dispatch as much as possible on the entire system.

          We implement this by waking up everywhere at the same time
          within any given minute if we can, synchronised via the
          perturbation value determined from the boot ID. If we can't,
          then we try to find the same spot in every 10s, then 1s and
          then 250ms step. Otherwise, we pick the last possible time
          to wake up.
        */

        /* Per interval size: c is the perturbation-aligned point in the
         * interval containing b; step back one interval if it lies past b
         * (guarding against unsigned underflow), and accept it if it still
         * lies at or after a. */
        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MINUTE))
                        return b;

                c -= USEC_PER_MINUTE;
        }

        if (c >= a)
                return c;

        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC*10))
                        return b;

                c -= USEC_PER_SEC*10;
        }

        if (c >= a)
                return c;

        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC))
                        return b;

                c -= USEC_PER_SEC;
        }

        if (c >= a)
                return c;

        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MSEC*250))
                        return b;

                c -= USEC_PER_MSEC*250;
        }

        if (c >= a)
                return c;

        /* No aligned spot fits, take the latest possible wakeup. */
        return b;
}
2015
2016 static int event_arm_timer(
2017                 sd_event *e,
2018                 struct clock_data *d) {
2019
2020         struct itimerspec its = {};
2021         sd_event_source *a, *b;
2022         usec_t t;
2023         int r;
2024
2025         assert(e);
2026         assert(d);
2027
2028         if (!d->needs_rearm)
2029                 return 0;
2030         else
2031                 d->needs_rearm = false;
2032
2033         a = prioq_peek(d->earliest);
2034         if (!a || a->enabled == SD_EVENT_OFF || a->time.next == USEC_INFINITY) {
2035
2036                 if (d->fd < 0)
2037                         return 0;
2038
2039                 if (d->next == USEC_INFINITY)
2040                         return 0;
2041
2042                 /* disarm */
2043                 r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2044                 if (r < 0)
2045                         return r;
2046
2047                 d->next = USEC_INFINITY;
2048                 return 0;
2049         }
2050
2051         b = prioq_peek(d->latest);
2052         assert_se(b && b->enabled != SD_EVENT_OFF);
2053
2054         t = sleep_between(e, a->time.next, time_event_source_latest(b));
2055         if (d->next == t)
2056                 return 0;
2057
2058         assert_se(d->fd >= 0);
2059
2060         if (t == 0) {
2061                 /* We don' want to disarm here, just mean some time looooong ago. */
2062                 its.it_value.tv_sec = 0;
2063                 its.it_value.tv_nsec = 1;
2064         } else
2065                 timespec_store(&its.it_value, t);
2066
2067         r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2068         if (r < 0)
2069                 return -errno;
2070
2071         d->next = t;
2072         return 0;
2073 }
2074
2075 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
2076         assert(e);
2077         assert(s);
2078         assert(s->type == SOURCE_IO);
2079
2080         /* If the event source was already pending, we just OR in the
2081          * new revents, otherwise we reset the value. The ORing is
2082          * necessary to handle EPOLLONESHOT events properly where
2083          * readability might happen independently of writability, and
2084          * we need to keep track of both */
2085
2086         if (s->pending)
2087                 s->io.revents |= revents;
2088         else
2089                 s->io.revents = revents;
2090
2091         return source_set_pending(s, true);
2092 }
2093
2094 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
2095         uint64_t x;
2096         ssize_t ss;
2097
2098         assert(e);
2099         assert(fd >= 0);
2100
2101         assert_return(events == EPOLLIN, -EIO);
2102
2103         ss = read(fd, &x, sizeof(x));
2104         if (ss < 0) {
2105                 if (IN_SET(errno, EAGAIN, EINTR))
2106                         return 0;
2107
2108                 return -errno;
2109         }
2110
2111         if (_unlikely_(ss != sizeof(x)))
2112                 return -EIO;
2113
2114         if (next)
2115                 *next = USEC_INFINITY;
2116
2117         return 0;
2118 }
2119
2120 static int process_timer(
2121                 sd_event *e,
2122                 usec_t n,
2123                 struct clock_data *d) {
2124
2125         sd_event_source *s;
2126         int r;
2127
2128         assert(e);
2129         assert(d);
2130
2131         for (;;) {
2132                 s = prioq_peek(d->earliest);
2133                 if (!s ||
2134                     s->time.next > n ||
2135                     s->enabled == SD_EVENT_OFF ||
2136                     s->pending)
2137                         break;
2138
2139                 r = source_set_pending(s, true);
2140                 if (r < 0)
2141                         return r;
2142
2143                 prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
2144                 prioq_reshuffle(d->latest, s, &s->time.latest_index);
2145                 d->needs_rearm = true;
2146         }
2147
2148         return 0;
2149 }
2150
/* Poll every watched child PID with waitid(WNOHANG) and mark the matching
 * event sources pending. Invoked when a SIGCHLD may have been coalesced away
 * (see process_signal()). Returns 0 or a negative errno. */
static int process_child(sd_event *e) {
        sd_event_source *s;
        Iterator i;
        int r;

        assert(e);

        e->need_process_child = false;

        /*
           So, this is ugly. We iteratively invoke waitid() with P_PID
           + WNOHANG for each PID we wait for, instead of using
           P_ALL. This is because we only want to get child
           information of very specific child processes, and not all
           of them. We might not have processed the SIGCHLD event of a
           previous invocation and we don't want to maintain an
           unbounded *per-child* event queue, hence we really don't
           want anything flushed out of the kernel's queue that we
           don't care about. Since this is O(n) this means that if you
           have a lot of processes you probably want to handle SIGCHLD
           yourself.

           We do not reap the children here (by using WNOWAIT), this
           is only done after the event source is dispatched so that
           the callback still sees the process as a zombie.
        */

        HASHMAP_FOREACH(s, e->child_sources, i) {
                assert(s->type == SOURCE_CHILD);

                if (s->pending)
                        continue;

                if (s->enabled == SD_EVENT_OFF)
                        continue;

                /* Clear the siginfo first: si_pid below stays 0 when waitid()
                 * has no state to report for this child. */
                zero(s->child.siginfo);
                r = waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
                if (r < 0)
                        return -errno;

                if (s->child.siginfo.si_pid != 0) {
                        bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's
                                 * immediately remove the state change
                                 * from the queue, since there's no
                                 * benefit in leaving it queued */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
                        if (r < 0)
                                return r;
                }
        }

        return 0;
}
2214
/* Dequeue at most one signal from this priority level's signalfd and mark
 * the matching signal event source pending. Returns a negative errno on
 * failure, 1 if a source was made pending, and on EAGAIN the value of
 * read_one (i.e. whether any signal was consumed at all this call). */
static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
        bool read_one = false;
        int r;

        assert(e);
        assert_return(events == EPOLLIN, -EIO);

        /* If there's a signal queued on this priority and SIGCHLD is
           on this priority too, then make sure to recheck the
           children we watch. This is because we only ever dequeue
           the first signal per priority, and if we dequeue one, and
           SIGCHLD might be enqueued later we wouldn't know, but we
           might have higher priority children we care about hence we
           need to check that explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

        /* If there's already an event source pending for this
         * priority we don't read another */
        if (d->current)
                return 0;

        for (;;) {
                struct signalfd_siginfo si;
                ssize_t n;
                sd_event_source *s = NULL;

                n = read(d->fd, &si, sizeof(si));
                if (n < 0) {
                        /* Queue drained (or interrupted): report whether we
                         * consumed anything. */
                        if (IN_SET(errno, EAGAIN, EINTR))
                                return read_one;

                        return -errno;
                }

                if (_unlikely_(n != sizeof(si)))
                        return -EIO;

                assert(SIGNAL_VALID(si.ssi_signo));

                read_one = true;

                /* Signals nobody subscribed to (or already pending ones) are
                 * discarded; keep reading until a dispatchable one shows up. */
                if (e->signal_sources)
                        s = e->signal_sources[si.ssi_signo];
                if (!s)
                        continue;
                if (s->pending)
                        continue;

                /* Remember which source blocks this priority level from
                 * dequeuing further signals until it is dispatched. */
                s->signal.siginfo = si;
                d->current = s;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                return 1;
        }
}
2275
/* Run one pending event source's user callback, with all the bookkeeping
 * around it: clear the pending flag, mark post sources pending, honor
 * SD_EVENT_ONESHOT, reap dispatched child processes, and disable the source
 * if its callback returned an error. Returns 1, or a negative errno from the
 * bookkeeping itself (callback errors are logged, not propagated). */
static int source_dispatch(sd_event_source *s) {
        EventSourceType saved_type;
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        /* Save the event source type, here, so that we still know it after the event callback which might invalidate
         * the event. */
        saved_type = s->type;

        /* Defer and exit sources stay pending across dispatches; everything
         * else is un-pended before its callback runs. */
        if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        if (s->type != SOURCE_POST) {
                sd_event_source *z;
                Iterator i;

                /* If we execute a non-post source, let's mark all
                 * post sources as pending */

                SET_FOREACH(z, s->event->post_sources, i) {
                        if (z->enabled == SD_EVENT_OFF)
                                continue;

                        r = source_set_pending(z, true);
                        if (r < 0)
                                return r;
                }
        }

        /* Oneshot sources are disabled before the callback, so the callback
         * may re-enable them if it wants another round. */
        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        /* While 'dispatching' is set, an unref from inside the callback must
         * not free the source out from under us (see n_ref check below). */
        s->dispatching = true;

        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie)
                        (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_POST:
                r = s->post.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        s->dispatching = false;

        if (r < 0)
                log_debug_errno(r, "Event source %s (type %s) returned error, disabling: %m",
                                strna(s->description), event_source_type_to_string(saved_type));

        /* The callback may have dropped the last reference while we held the
         * dispatching flag; free the source only now that it is clear. */
        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                sd_event_source_set_enabled(s, SD_EVENT_OFF);

        return 1;
}
2381
/* Invoke the prepare callbacks of all enabled sources that have not yet been
 * prepared this iteration, in prepare-prioq order. Sources that error are
 * logged and disabled, mirroring source_dispatch(). Returns 0 or a negative
 * errno from the queue bookkeeping. */
static int event_prepare(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                /* The prioq sorts already-prepared and disabled sources to
                 * the back, so hitting one means we are done. */
                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
                        break;

                /* Stamp and reshuffle *before* the callback, so the loop
                 * advances even if the callback manipulates the source. */
                s->prepare_iteration = e->iteration;
                r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;

                assert(s->prepare);

                s->dispatching = true;
                r = s->prepare(s, s->userdata);
                s->dispatching = false;

                if (r < 0)
                        log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, disabling: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

                /* See source_dispatch(): the callback may have dropped the
                 * last reference while 'dispatching' was set. */
                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        sd_event_source_set_enabled(s, SD_EVENT_OFF);
        }

        return 0;
}
2417
2418 static int dispatch_exit(sd_event *e) {
2419         sd_event_source *p;
2420         _cleanup_(sd_event_unrefp) sd_event *ref = NULL;
2421         int r;
2422
2423         assert(e);
2424
2425         p = prioq_peek(e->exit);
2426         if (!p || p->enabled == SD_EVENT_OFF) {
2427                 e->state = SD_EVENT_FINISHED;
2428                 return 0;
2429         }
2430
2431         ref = sd_event_ref(e);
2432         e->iteration++;
2433         e->state = SD_EVENT_EXITING;
2434         r = source_dispatch(p);
2435         e->state = SD_EVENT_INITIAL;
2436         return r;
2437 }
2438
2439 static sd_event_source* event_next_pending(sd_event *e) {
2440         sd_event_source *p;
2441
2442         assert(e);
2443
2444         p = prioq_peek(e->pending);
2445         if (!p)
2446                 return NULL;
2447
2448         if (p->enabled == SD_EVENT_OFF)
2449                 return NULL;
2450
2451         return p;
2452 }
2453
2454 static int arm_watchdog(sd_event *e) {
2455         struct itimerspec its = {};
2456         usec_t t;
2457         int r;
2458
2459         assert(e);
2460         assert(e->watchdog_fd >= 0);
2461
2462         t = sleep_between(e,
2463                           e->watchdog_last + (e->watchdog_period / 2),
2464                           e->watchdog_last + (e->watchdog_period * 3 / 4));
2465
2466         timespec_store(&its.it_value, t);
2467
2468         /* Make sure we never set the watchdog to 0, which tells the
2469          * kernel to disable it. */
2470         if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
2471                 its.it_value.tv_nsec = 1;
2472
2473         r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
2474         if (r < 0)
2475                 return -errno;
2476
2477         return 0;
2478 }
2479
2480 static int process_watchdog(sd_event *e) {
2481         assert(e);
2482
2483         if (!e->watchdog)
2484                 return 0;
2485
2486         /* Don't notify watchdog too often */
2487         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
2488                 return 0;
2489
2490         sd_notify(false, "WATCHDOG=1");
2491         e->watchdog_last = e->timestamp.monotonic;
2492
2493         return arm_watchdog(e);
2494 }
2495
/* Phase 1 of a loop iteration: run all prepare callbacks and (re)arm the
 * per-clock timerfds. Moves the state machine INITIAL -> ARMED; when
 * something is already dispatchable it additionally lets sd_event_wait(e, 0)
 * advance us to PENDING. Returns > 0 if events are ready, 0 if the caller
 * should wait, negative errno on failure. */
_public_ int sd_event_prepare(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        /* Exit requested: skip preparing; the wait below will short-circuit. */
        if (e->exit_requested)
                goto pending;

        e->iteration++;

        e->state = SD_EVENT_PREPARING;
        r = event_prepare(e);
        e->state = SD_EVENT_INITIAL;
        if (r < 0)
                return r;

        /* Rearm every clock whose timer set changed since last time. */
        r = event_arm_timer(e, &e->realtime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->monotonic);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime_alarm);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime_alarm);
        if (r < 0)
                return r;

        if (event_next_pending(e) || e->need_process_child)
                goto pending;

        e->state = SD_EVENT_ARMED;

        return 0;

pending:
        /* Something is (or may be) dispatchable already: do a zero-timeout
         * wait to collect kernel events and land in the PENDING state. */
        e->state = SD_EVENT_ARMED;
        r = sd_event_wait(e, 0);
        if (r == 0)
                e->state = SD_EVENT_ARMED;

        return r;
}
2551
/* Phase 2 of a loop iteration: wait up to 'timeout' usec ((uint64_t) -1 means
 * forever) for kernel activity, then convert what arrived into pending event
 * sources. Moves ARMED -> PENDING and returns 1 when something is
 * dispatchable, otherwise returns to INITIAL with 0; negative errno on
 * failure. */
_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
        struct epoll_event *ev_queue;
        unsigned ev_queue_max;
        int r, m, i;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        /* Exit requested: don't poll, let sd_event_dispatch() run the exit
         * sources right away. */
        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        /* One slot per registered source always suffices for one wait. */
        ev_queue_max = MAX(e->n_sources, 1u);
        ev_queue = newa(struct epoll_event, ev_queue_max);

        /* Round the usec timeout up to whole milliseconds for epoll_wait(). */
        m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max,
                       timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
        if (m < 0) {
                if (errno == EINTR) {
                        /* Treat EINTR as "possibly pending"; the subsequent
                         * dispatch will sort it out. */
                        e->state = SD_EVENT_PENDING;
                        return 1;
                }

                r = -errno;
                goto finish;
        }

        /* Snapshot all three clocks once; timer processing below and
         * sd_event_now() use this coherent timestamp. */
        triple_timestamp_get(&e->timestamp);

        for (i = 0; i < m; i++) {

                if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
                else {
                        /* Every other registration stores a pointer whose
                         * first field is a WakeupType discriminator. */
                        WakeupType *t = ev_queue[i].data.ptr;

                        switch (*t) {

                        case WAKEUP_EVENT_SOURCE:
                                r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = ev_queue[i].data.ptr;
                                r = flush_timer(e, d->fd, ev_queue[i].events, &d->next);
                                break;
                        }

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        default:
                                assert_not_reached("Invalid wake-up pointer");
                        }
                }
                if (r < 0)
                        goto finish;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        /* Mark all elapsed timer sources on every clock as pending. */
        r = process_timer(e, e->timestamp.realtime, &e->realtime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
        if (r < 0)
                goto finish;

        /* Set by process_signal() when SIGCHLD may have been coalesced. */
        if (e->need_process_child) {
                r = process_child(e);
                if (r < 0)
                        goto finish;
        }

        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;

                return 1;
        }

        r = 0;

finish:
        e->state = SD_EVENT_INITIAL;

        return r;
}
2659
/* Phase 3 of a loop iteration: run the callback of the highest-priority
 * pending source (or the exit sources if an exit was requested). Moves
 * PENDING -> RUNNING -> INITIAL. Returns source_dispatch()'s result, or 1 if
 * nothing was pending after all; negative errno on failure. */
_public_ int sd_event_dispatch(sd_event *e) {
        sd_event_source *p;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_PENDING, -EBUSY);

        if (e->exit_requested)
                return dispatch_exit(e);

        p = event_next_pending(e);
        if (p) {
                _cleanup_(sd_event_unrefp) sd_event *ref = NULL;

                /* Pin the event object so the callback cannot free it while
                 * we are still inside the dispatch. */
                ref = sd_event_ref(e);
                e->state = SD_EVENT_RUNNING;
                r = source_dispatch(p);
                e->state = SD_EVENT_INITIAL;
                return r;
        }

        e->state = SD_EVENT_INITIAL;

        return 1;
}
2688
2689 static void event_log_delays(sd_event *e) {
2690         char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1];
2691         unsigned i;
2692         int o;
2693
2694         for (i = o = 0; i < ELEMENTSOF(e->delays); i++) {
2695                 o += snprintf(&b[o], sizeof(b) - o, "%u ", e->delays[i]);
2696                 e->delays[i] = 0;
2697         }
2698         log_debug("Event loop iterations: %.*s", o, b);
2699 }
2700
/* Run one full loop iteration: prepare, wait (up to 'timeout' usec) if
 * nothing was immediately pending, then dispatch. Returns > 0 if a source
 * was dispatched, 0 if the timeout elapsed idle, negative errno on failure.
 * With e->profile_delays set, also histograms the gap between successive
 * runs in log2 buckets and logs the histogram every 5 seconds. */
_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        if (e->profile_delays && e->last_run) {
                usec_t this_run;
                unsigned l;

                this_run = now(CLOCK_MONOTONIC);

                /* NOTE(review): 'sizeof' here should arguably be
                 * ELEMENTSOF(e->delays); it is only safe because u64log2()
                 * cannot exceed 63 — confirm e->delays has >= 64 entries. */
                l = u64log2(this_run - e->last_run);
                assert(l < sizeof(e->delays));
                e->delays[l]++;

                if (this_run - e->last_log >= 5*USEC_PER_SEC) {
                        event_log_delays(e);
                        e->last_log = this_run;
                }
        }

        r = sd_event_prepare(e);
        if (r == 0)
                /* There was nothing? Then wait... */
                r = sd_event_wait(e, timeout);

        if (e->profile_delays)
                e->last_run = now(CLOCK_MONOTONIC);

        if (r > 0) {
                /* There's something now, then let's dispatch it */
                r = sd_event_dispatch(e);
                if (r < 0)
                        return r;

                return 1;
        }

        return r;
}
2745
2746 _public_ int sd_event_loop(sd_event *e) {
2747         _cleanup_(sd_event_unrefp) sd_event *ref = NULL;
2748         int r;
2749
2750         assert_return(e, -EINVAL);
2751         assert_return(e = event_resolve(e), -ENOPKG);
2752         assert_return(!event_pid_changed(e), -ECHILD);
2753         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2754
2755         ref = sd_event_ref(e);
2756
2757         while (e->state != SD_EVENT_FINISHED) {
2758                 r = sd_event_run(e, (uint64_t) -1);
2759                 if (r < 0)
2760                         return r;
2761         }
2762
2763         return e->exit_code;
2764 }
2765
2766 _public_ int sd_event_get_fd(sd_event *e) {
2767
2768         assert_return(e, -EINVAL);
2769         assert_return(e = event_resolve(e), -ENOPKG);
2770         assert_return(!event_pid_changed(e), -ECHILD);
2771
2772         return e->epoll_fd;
2773 }
2774
2775 _public_ int sd_event_get_state(sd_event *e) {
2776         assert_return(e, -EINVAL);
2777         assert_return(e = event_resolve(e), -ENOPKG);
2778         assert_return(!event_pid_changed(e), -ECHILD);
2779
2780         return e->state;
2781 }
2782
2783 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
2784         assert_return(e, -EINVAL);
2785         assert_return(e = event_resolve(e), -ENOPKG);
2786         assert_return(code, -EINVAL);
2787         assert_return(!event_pid_changed(e), -ECHILD);
2788
2789         if (!e->exit_requested)
2790                 return -ENODATA;
2791
2792         *code = e->exit_code;
2793         return 0;
2794 }
2795
2796 _public_ int sd_event_exit(sd_event *e, int code) {
2797         assert_return(e, -EINVAL);
2798         assert_return(e = event_resolve(e), -ENOPKG);
2799         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2800         assert_return(!event_pid_changed(e), -ECHILD);
2801
2802         e->exit_requested = true;
2803         e->exit_code = code;
2804
2805         return 0;
2806 }
2807
/* Return the timestamp cached at the start of the current/last iteration for
 * the given clock. Returns 0 on a cached value, 1 when falling back to a
 * fresh now() because the loop never ran, -EOPNOTSUPP for unsupported
 * clocks. */
_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(usec, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
                return -EOPNOTSUPP;

        /* Generate a clean error in case CLOCK_BOOTTIME is not available. Note that don't use clock_supported() here,
         * for a reason: there are systems where CLOCK_BOOTTIME is supported, but CLOCK_BOOTTIME_ALARM is not, but for
         * the purpose of getting the time this doesn't matter. */
        if (IN_SET(clock, CLOCK_BOOTTIME, CLOCK_BOOTTIME_ALARM) && !clock_boottime_supported())
                return -EOPNOTSUPP;

        if (!triple_timestamp_is_set(&e->timestamp)) {
                /* Implicitly fall back to now() if we never ran
                 * before and thus have no cached time. */
                *usec = now(clock);
                return 1;
        }

        *usec = triple_timestamp_by_clock(&e->timestamp, clock);
        return 0;
}
2833
/* Return a reference to the per-process default event loop, creating it
 * lazily on first use and recording the creating thread's TID (see
 * sd_event_get_tid()). With a NULL 'ret', merely report (0/1) whether a
 * default loop already exists. Returns 1 when a new loop was created, 0 when
 * an existing one was referenced, negative errno on failure. */
_public_ int sd_event_default(sd_event **ret) {
        sd_event *e = NULL;
        int r;

        if (!ret)
                return !!default_event;

        if (default_event) {
                *ret = sd_event_ref(default_event);
                return 0;
        }

        r = sd_event_new(&e);
        if (r < 0)
                return r;

        /* Let the object clear the global when it is freed. */
        e->default_event_ptr = &default_event;
        e->tid = gettid();
        default_event = e;

        *ret = e;
        return 1;
}
2857
2858 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
2859         assert_return(e, -EINVAL);
2860         assert_return(e = event_resolve(e), -ENOPKG);
2861         assert_return(tid, -EINVAL);
2862         assert_return(!event_pid_changed(e), -ECHILD);
2863
2864         if (e->tid != 0) {
2865                 *tid = e->tid;
2866                 return 0;
2867         }
2868
2869         return -ENXIO;
2870 }
2871
/* Enable or disable automatic watchdog keep-alive handling: when enabled
 * (and the service manager requested a watchdog), a timerfd is hooked into
 * the epoll loop that triggers periodic "WATCHDOG=1" notifications. Returns
 * the new enablement state (0/1), or a negative errno; returns 0 without
 * enabling when no watchdog was requested by the manager. */
_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Already in the requested state? */
        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                struct epoll_event ev;

                /* 'false': keep $WATCHDOG_USEC set for child processes.
                 * r == 0 means no watchdog was requested; report that. */
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                /* The sentinel pointer distinguishes watchdog wakeups from
                 * regular WakeupType-tagged registrations in sd_event_wait(). */
                ev = (struct epoll_event) {
                        .events = EPOLLIN,
                        .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
                };

                r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
                if (r < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                if (e->watchdog_fd >= 0) {
                        epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        /* Undo the partial setup; e->watchdog stays false. */
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}
2926
2927 _public_ int sd_event_get_watchdog(sd_event *e) {
2928         assert_return(e, -EINVAL);
2929         assert_return(e = event_resolve(e), -ENOPKG);
2930         assert_return(!event_pid_changed(e), -ECHILD);
2931
2932         return e->watchdog;
2933 }
2934
2935 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
2936         assert_return(e, -EINVAL);
2937         assert_return(e = event_resolve(e), -ENOPKG);
2938         assert_return(!event_pid_changed(e), -ECHILD);
2939
2940         *ret = e->iteration;
2941         return 0;
2942 }