chiark / gitweb /
sd-event: drop pending events when we turn off/on an event source
[elogind.git] / src / libelogind / sd-event / sd-event.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3   This file is part of systemd.
4
5   Copyright 2013 Lennart Poettering
6 ***/
7
8 #include <sys/epoll.h>
9 #include <sys/timerfd.h>
10 #include <sys/wait.h>
11
12 #include "sd-daemon.h"
13 #include "sd-event.h"
14 #include "sd-id128.h"
15
16 #include "alloc-util.h"
17 #include "fd-util.h"
18 #include "hashmap.h"
19 #include "list.h"
20 #include "macro.h"
21 #include "missing.h"
22 #include "prioq.h"
23 #include "process-util.h"
24 #include "set.h"
25 #include "signal-util.h"
26 #include "string-table.h"
27 #include "string-util.h"
28 #include "time-util.h"
29 #include "util.h"
30
31 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
32
/* The kinds of event sources the loop implements. The five SOURCE_TIME_*
 * types share one code path and map 1:1 to kernel clocks (see
 * event_source_type_to_clock()). */
typedef enum EventSourceType {
        SOURCE_IO,                      /* fd readiness, dispatched via epoll */
        SOURCE_TIME_REALTIME,           /* CLOCK_REALTIME timer */
        SOURCE_TIME_BOOTTIME,           /* CLOCK_BOOTTIME timer */
        SOURCE_TIME_MONOTONIC,          /* CLOCK_MONOTONIC timer */
        SOURCE_TIME_REALTIME_ALARM,     /* CLOCK_REALTIME_ALARM timer */
        SOURCE_TIME_BOOTTIME_ALARM,     /* CLOCK_BOOTTIME_ALARM timer */
        SOURCE_SIGNAL,                  /* UNIX signal, via signalfd */
        SOURCE_CHILD,                   /* child process state change (SIGCHLD) */
        SOURCE_DEFER,                   /* dispatched on every iteration when enabled */
        SOURCE_POST,                    /* dispatched after other sources */
        SOURCE_EXIT,                    /* dispatched at loop shutdown */
        SOURCE_WATCHDOG,                /* internal watchdog notification */
        _SOURCE_EVENT_SOURCE_TYPE_MAX,
        _SOURCE_EVENT_SOURCE_TYPE_INVALID = -1
} EventSourceType;
49
50 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
51         [SOURCE_IO] = "io",
52         [SOURCE_TIME_REALTIME] = "realtime",
53         [SOURCE_TIME_BOOTTIME] = "bootime",
54         [SOURCE_TIME_MONOTONIC] = "monotonic",
55         [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
56         [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
57         [SOURCE_SIGNAL] = "signal",
58         [SOURCE_CHILD] = "child",
59         [SOURCE_DEFER] = "defer",
60         [SOURCE_POST] = "post",
61         [SOURCE_EXIT] = "exit",
62         [SOURCE_WATCHDOG] = "watchdog",
63 };
64
65 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
66
/* All objects we use in epoll events start with this value, so that
 * we know how to dispatch it */
typedef enum WakeupType {
        WAKEUP_NONE,
        WAKEUP_EVENT_SOURCE,    /* data.ptr is an sd_event_source (io sources) */
        WAKEUP_CLOCK_DATA,      /* data.ptr is a struct clock_data (timerfd) */
        WAKEUP_SIGNAL_DATA,     /* data.ptr is a struct signal_data (signalfd) */
        _WAKEUP_TYPE_MAX,
        _WAKEUP_TYPE_INVALID = -1,
} WakeupType;

/* True for the five time-based source types, which share one code path. */
#define EVENT_SOURCE_IS_TIME(t) IN_SET((t), SOURCE_TIME_REALTIME, SOURCE_TIME_BOOTTIME, SOURCE_TIME_MONOTONIC, SOURCE_TIME_REALTIME_ALARM, SOURCE_TIME_BOOTTIME_ALARM)
79
/* One registered event source on an event loop: an fd watch, a timer, a
 * signal, a child, or one of the defer/post/exit pseudo-sources. */
struct sd_event_source {
        WakeupType wakeup;      /* must be first: epoll data.ptr dispatch tag */

        unsigned n_ref;

        sd_event *event;        /* owning loop; NULL after source_disconnect() */
        void *userdata;
        sd_event_handler_t prepare;     /* optional pre-poll callback */

        char *description;      /* free-form name, used in log messages */

        EventSourceType type:5;
        int enabled:3;          /* SD_EVENT_OFF / ON / ONESHOT */
        bool pending:1;         /* queued in event->pending */
        bool dispatching:1;     /* currently inside its callback */
        bool floating:1;        /* does not hold a ref on the loop (see source_new()) */

        int64_t priority;       /* lower values dispatch first */
        unsigned pending_index;         /* prioq bookkeeping slots */
        unsigned prepare_index;
        uint64_t pending_iteration;     /* loop iteration when queued/prepared; */
        uint64_t prepare_iteration;     /* tie-breakers in the prioq comparators */

        LIST_FIELDS(sd_event_source, sources);

        /* Per-type state; only the member matching 'type' is meaningful. */
        union {
                struct {
                        sd_event_io_handler_t callback;
                        int fd;
                        uint32_t events;        /* EPOLL* mask we asked for */
                        uint32_t revents;       /* EPOLL* mask last received */
                        bool registered:1;      /* currently in the epoll set */
                        bool owned:1;           /* close fd when source is freed */
                } io;
                struct {
                        sd_event_time_handler_t callback;
                        usec_t next, accuracy;  /* deadline and allowed slack */
                        unsigned earliest_index;        /* clock_data prioq slots */
                        unsigned latest_index;
                } time;
                struct {
                        sd_event_signal_handler_t callback;
                        struct signalfd_siginfo siginfo;
                        int sig;
                } signal;
                struct {
                        sd_event_child_handler_t callback;
                        siginfo_t siginfo;
                        pid_t pid;
                        int options;    /* waitid() flags, e.g. WEXITED */
                } child;
                struct {
                        sd_event_handler_t callback;
                } defer;
                struct {
                        sd_event_handler_t callback;
                } post;
                struct {
                        sd_event_handler_t callback;
                        unsigned prioq_index;
                } exit;
        };
};
143
/* Per-clock state: one timerfd plus the scheduling queues of all time
 * sources attached to that clock. */
struct clock_data {
        WakeupType wakeup;      /* WAKEUP_CLOCK_DATA: epoll dispatch tag */
        int fd;                 /* timerfd; -1 until lazily created */

        /* For all clocks we maintain two priority queues each, one
         * ordered for the earliest times the events may be
         * dispatched, and one ordered by the latest times they must
         * have been dispatched. The range between the top entries in
         * the two prioqs is the time window we can freely schedule
         * wakeups in */

        Prioq *earliest;
        Prioq *latest;
        usec_t next;            /* next programmed wakeup; USEC_INFINITY when unarmed */

        bool needs_rearm:1;     /* prioqs changed, timerfd must be reprogrammed */
};
161
/* Per-priority signal state. */
struct signal_data {
        WakeupType wakeup;      /* WAKEUP_SIGNAL_DATA: epoll dispatch tag */

        /* For each priority we maintain one signal fd, so that we
         * only have to dequeue a single event per priority at a
         * time. */

        int fd;                 /* signalfd covering 'sigset'; -1 until created */
        int64_t priority;       /* also the hashmap key (see event_make_signal_data()) */
        sigset_t sigset;        /* signals currently routed through this fd */
        sd_event_source *current;       /* source whose signal is being dispatched */
};
174
/* The event loop object itself. */
struct sd_event {
        unsigned n_ref;

        int epoll_fd;           /* central poll fd; everything is dispatched from here */
        int watchdog_fd;        /* timerfd driving sd_watchdog keep-alives; -1 if unused */

        Prioq *pending;         /* sources with an undelivered event */
        Prioq *prepare;         /* sources with a prepare callback installed */

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;         /* per-system wakeup offset, see initialize_perturb() */

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;         /* pid -> SOURCE_CHILD source */
        unsigned n_enabled_child_sources;       /* keeps SIGCHLD watching alive */

        Set *post_sources;

        Prioq *exit;            /* SOURCE_EXIT sources, ordered by priority */

        pid_t original_pid;     /* to detect (unsupported) use across fork() */

        uint64_t iteration;     /* monotonically increasing loop counter */
        triple_timestamp timestamp;     /* when the current iteration woke up */
        int state;              /* SD_EVENT_* state machine value */

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1;  /* SD_EVENT_PROFILE_DELAYS env var was set */

        int exit_code;          /* value handed to sd_event_exit() */

        pid_t tid;
        sd_event **default_event_ptr;   /* thread-local slot to clear on free */

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;     /* sources currently linked on 'sources' */

        LIST_HEAD(sd_event_source, sources);

        usec_t last_run, last_log;
        unsigned delays[sizeof(usec_t) * 8];    /* log2 latency histogram for profiling */
};
230
231 static thread_local sd_event *default_event = NULL;
232
233 static void source_disconnect(sd_event_source *s);
234
235 static sd_event *event_resolve(sd_event *e) {
236         return e == SD_EVENT_DEFAULT ? default_event : e;
237 }
238
239 static int pending_prioq_compare(const void *a, const void *b) {
240         const sd_event_source *x = a, *y = b;
241
242         assert(x->pending);
243         assert(y->pending);
244
245         /* Enabled ones first */
246         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
247                 return -1;
248         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
249                 return 1;
250
251         /* Lower priority values first */
252         if (x->priority < y->priority)
253                 return -1;
254         if (x->priority > y->priority)
255                 return 1;
256
257         /* Older entries first */
258         if (x->pending_iteration < y->pending_iteration)
259                 return -1;
260         if (x->pending_iteration > y->pending_iteration)
261                 return 1;
262
263         return 0;
264 }
265
266 static int prepare_prioq_compare(const void *a, const void *b) {
267         const sd_event_source *x = a, *y = b;
268
269         assert(x->prepare);
270         assert(y->prepare);
271
272         /* Enabled ones first */
273         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
274                 return -1;
275         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
276                 return 1;
277
278         /* Move most recently prepared ones last, so that we can stop
279          * preparing as soon as we hit one that has already been
280          * prepared in the current iteration */
281         if (x->prepare_iteration < y->prepare_iteration)
282                 return -1;
283         if (x->prepare_iteration > y->prepare_iteration)
284                 return 1;
285
286         /* Lower priority values first */
287         if (x->priority < y->priority)
288                 return -1;
289         if (x->priority > y->priority)
290                 return 1;
291
292         return 0;
293 }
294
295 static int earliest_time_prioq_compare(const void *a, const void *b) {
296         const sd_event_source *x = a, *y = b;
297
298         assert(EVENT_SOURCE_IS_TIME(x->type));
299         assert(x->type == y->type);
300
301         /* Enabled ones first */
302         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
303                 return -1;
304         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
305                 return 1;
306
307         /* Move the pending ones to the end */
308         if (!x->pending && y->pending)
309                 return -1;
310         if (x->pending && !y->pending)
311                 return 1;
312
313         /* Order by time */
314         if (x->time.next < y->time.next)
315                 return -1;
316         if (x->time.next > y->time.next)
317                 return 1;
318
319         return 0;
320 }
321
/* Latest time the source may be dispatched: its deadline plus the configured
 * accuracy slack (added via usec_add). */
static usec_t time_event_source_latest(const sd_event_source *s) {
        return usec_add(s->time.next, s->time.accuracy);
}
325
326 static int latest_time_prioq_compare(const void *a, const void *b) {
327         const sd_event_source *x = a, *y = b;
328
329         assert(EVENT_SOURCE_IS_TIME(x->type));
330         assert(x->type == y->type);
331
332         /* Enabled ones first */
333         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
334                 return -1;
335         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
336                 return 1;
337
338         /* Move the pending ones to the end */
339         if (!x->pending && y->pending)
340                 return -1;
341         if (x->pending && !y->pending)
342                 return 1;
343
344         /* Order by time */
345         if (time_event_source_latest(x) < time_event_source_latest(y))
346                 return -1;
347         if (time_event_source_latest(x) > time_event_source_latest(y))
348                 return 1;
349
350         return 0;
351 }
352
353 static int exit_prioq_compare(const void *a, const void *b) {
354         const sd_event_source *x = a, *y = b;
355
356         assert(x->type == SOURCE_EXIT);
357         assert(y->type == SOURCE_EXIT);
358
359         /* Enabled ones first */
360         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
361                 return -1;
362         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
363                 return 1;
364
365         /* Lower priority values first */
366         if (x->priority < y->priority)
367                 return -1;
368         if (x->priority > y->priority)
369                 return 1;
370
371         return 0;
372 }
373
374 static void free_clock_data(struct clock_data *d) {
375         assert(d);
376         assert(d->wakeup == WAKEUP_CLOCK_DATA);
377
378         safe_close(d->fd);
379         prioq_free(d->earliest);
380         prioq_free(d->latest);
381 }
382
/* Tears down an event loop once the last reference is gone: disconnects the
 * remaining sources, closes all fds, and frees all containers. */
static void event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        /* Only floating sources can still be linked here: non-floating ones
         * hold a reference on the loop, so the refcount could not have
         * reached zero while one existed. */
        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        /* Unhook ourselves from the thread-local default-event slot. */
        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);
        free(e);
}
419
/* Allocates a new event loop with an epoll fd and an empty pending queue.
 * Timer fds and signal fds are created lazily later. Returns 0 on success,
 * negative errno on failure. */
_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new0(sd_event, 1);
        if (!e)
                return -ENOMEM;

        e->n_ref = 1;
        /* All fds start out unallocated; clock deadlines start at infinity. */
        e->watchdog_fd = e->epoll_fd = e->realtime.fd = e->boottime.fd = e->monotonic.fd = e->realtime_alarm.fd = e->boottime_alarm.fd = -1;
        e->realtime.next = e->boottime.next = e->monotonic.next = e->realtime_alarm.next = e->boottime_alarm.next = USEC_INFINITY;
        e->realtime.wakeup = e->boottime.wakeup = e->monotonic.wakeup = e->realtime_alarm.wakeup = e->boottime_alarm.wakeup = WAKEUP_CLOCK_DATA;
        e->original_pid = getpid_cached();      /* to detect use across fork() */
        e->perturb = USEC_INFINITY;             /* computed lazily in initialize_perturb() */

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        /* Keep our fds out of the stdio range */
        e->epoll_fd = fd_move_above_stdio(e->epoll_fd);

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 ... 2^63 us will be logged every 5s.");
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        /* event_free() copes with the partially initialized object. */
        event_free(e);
        return r;
}
461
462 _public_ sd_event* sd_event_ref(sd_event *e) {
463
464         if (!e)
465                 return NULL;
466
467         assert(e->n_ref >= 1);
468         e->n_ref++;
469
470         return e;
471 }
472
473 _public_ sd_event* sd_event_unref(sd_event *e) {
474
475         if (!e)
476                 return NULL;
477
478         assert(e->n_ref >= 1);
479         e->n_ref--;
480
481         if (e->n_ref <= 0)
482                 event_free(e);
483
484         return NULL;
485 }
486
487 static bool event_pid_changed(sd_event *e) {
488         assert(e);
489
490         /* We don't support people creating an event loop and keeping
491          * it around over a fork(). Let's complain. */
492
493         return e->original_pid != getpid_cached();
494 }
495
496 static void source_io_unregister(sd_event_source *s) {
497         int r;
498
499         assert(s);
500         assert(s->type == SOURCE_IO);
501
502         if (event_pid_changed(s->event))
503                 return;
504
505         if (!s->io.registered)
506                 return;
507
508         r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
509         if (r < 0)
510                 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m",
511                                 strna(s->description), event_source_type_to_string(s->type));
512
513         s->io.registered = false;
514 }
515
516 static int source_io_register(
517                 sd_event_source *s,
518                 int enabled,
519                 uint32_t events) {
520
521         struct epoll_event ev;
522         int r;
523
524         assert(s);
525         assert(s->type == SOURCE_IO);
526         assert(enabled != SD_EVENT_OFF);
527
528         ev = (struct epoll_event) {
529                 .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
530                 .data.ptr = s,
531         };
532
533         if (s->io.registered)
534                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
535         else
536                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
537         if (r < 0)
538                 return -errno;
539
540         s->io.registered = true;
541
542         return 0;
543 }
544
545 static clockid_t event_source_type_to_clock(EventSourceType t) {
546
547         switch (t) {
548
549         case SOURCE_TIME_REALTIME:
550                 return CLOCK_REALTIME;
551
552         case SOURCE_TIME_BOOTTIME:
553                 return CLOCK_BOOTTIME;
554
555         case SOURCE_TIME_MONOTONIC:
556                 return CLOCK_MONOTONIC;
557
558         case SOURCE_TIME_REALTIME_ALARM:
559                 return CLOCK_REALTIME_ALARM;
560
561         case SOURCE_TIME_BOOTTIME_ALARM:
562                 return CLOCK_BOOTTIME_ALARM;
563
564         default:
565                 return (clockid_t) -1;
566         }
567 }
568
569 static EventSourceType clock_to_event_source_type(clockid_t clock) {
570
571         switch (clock) {
572
573         case CLOCK_REALTIME:
574                 return SOURCE_TIME_REALTIME;
575
576         case CLOCK_BOOTTIME:
577                 return SOURCE_TIME_BOOTTIME;
578
579         case CLOCK_MONOTONIC:
580                 return SOURCE_TIME_MONOTONIC;
581
582         case CLOCK_REALTIME_ALARM:
583                 return SOURCE_TIME_REALTIME_ALARM;
584
585         case CLOCK_BOOTTIME_ALARM:
586                 return SOURCE_TIME_BOOTTIME_ALARM;
587
588         default:
589                 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
590         }
591 }
592
593 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
594         assert(e);
595
596         switch (t) {
597
598         case SOURCE_TIME_REALTIME:
599                 return &e->realtime;
600
601         case SOURCE_TIME_BOOTTIME:
602                 return &e->boottime;
603
604         case SOURCE_TIME_MONOTONIC:
605                 return &e->monotonic;
606
607         case SOURCE_TIME_REALTIME_ALARM:
608                 return &e->realtime_alarm;
609
610         case SOURCE_TIME_BOOTTIME_ALARM:
611                 return &e->boottime_alarm;
612
613         default:
614                 return NULL;
615         }
616 }
617
/* Ensures a signal_data object (and thus a signalfd) exists for the priority
 * associated with 'sig', with 'sig' included in its mask. On success returns
 * it in *ret if ret is non-NULL. On failure, structures created within this
 * call are rolled back. */
static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct epoll_event ev;
        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_pid_changed(e))
                return -ECHILD;

        /* Signals are grouped by the priority of their event source; a
         * signal with no dedicated source uses the normal priority. */
        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = SD_EVENT_PRIORITY_NORMAL;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                /* Existing signalfd for this priority already covers sig? */
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                r = hashmap_ensure_allocated(&e->signal_data, &uint64_hash_ops);
                if (r < 0)
                        return r;

                d = new0(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                d->wakeup = WAKEUP_SIGNAL_DATA;
                d->fd  = -1;
                d->priority = priority;

                /* The key is the priority field inside d itself. */
                r = hashmap_put(e->signal_data, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        /* Extend a copy of the mask first so d->sigset stays valid if
         * signalfd() fails. */
        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        /* If we merely updated an existing signalfd it is already in the
         * epoll set and we are done. */
        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        /* Fresh signalfd: keep it out of the stdio range and register it. */
        d->fd = fd_move_above_stdio(r);

        ev = (struct epoll_event) {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev);
        if (r < 0)  {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        /* Only undo what this call created; pre-existing signal_data stays. */
        if (added) {
                d->fd = safe_close(d->fd);
                hashmap_remove(e->signal_data, &d->priority);
                free(d);
        }

        return r;
}
713
static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty that
         * way removes it. */

        if (sigismember(&d->sigset, sig) == 0)
                return; /* not watched here, nothing to do */

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {

                /* If all the mask is all-zero we can get rid of the structure */
                hashmap_remove(e->signal_data, &d->priority);
                safe_close(d->fd);
                free(d);
                return;
        }

        assert(d->fd >= 0);

        /* Re-program the existing signalfd with the reduced mask. */
        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}
741
static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are
         * interested in. If not, we'll unmask it, and possibly drop
         * the signalfd for it. */

        /* SIGCHLD stays masked as long as any child source is enabled. */
        if (sig == SIGCHLD &&
            e->n_enabled_child_sources > 0)
                return;

        /* An enabled signal source for sig also keeps it masked. */
        if (e->signal_sources &&
            e->signal_sources[sig] &&
            e->signal_sources[sig]->enabled != SD_EVENT_OFF)
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}
787
/* Detaches a source from its loop: unregisters it from epoll/prioqs/hashmaps
 * as appropriate for its type and severs the event<->source link. Safe to
 * call on an already disconnected source. */
static void source_disconnect(sd_event_source *s) {
        sd_event *event;

        assert(s);

        if (!s->event)
                return; /* already disconnected */

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM: {
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                /* Drop from both scheduling queues and force the clock's
                 * timerfd to be reprogrammed. */
                prioq_remove(d->earliest, s, &s->time.earliest_index);
                prioq_remove(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
                break;
        }

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        /* May unmask the signal and drop its signalfd */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                }

                break;

        case SOURCE_CHILD:
                if (s->child.pid > 0) {
                        if (s->enabled != SD_EVENT_OFF) {
                                assert(s->event->n_enabled_child_sources > 0);
                                s->event->n_enabled_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                        /* SIGCHLD watching may no longer be needed */
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                }

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        default:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        event = s->event;

        s->type = _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        s->event = NULL;
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Non-floating sources hold a reference on the loop; release it. */
        if (!s->floating)
                sd_event_unref(event);
}
878
/* Destroys a source: disconnects it from its loop, closes the io fd if
 * ownership was handed to the source (io.owned), and frees its memory. */
static void source_free(sd_event_source *s) {
        assert(s);

        source_disconnect(s);

        if (s->type == SOURCE_IO && s->io.owned)
                safe_close(s->io.fd);

        free(s->description);
        free(s);
}
890
/* Puts the source onto (b=true) or takes it off (b=false) the loop's pending
 * queue. Idempotent. Returns 0, or negative errno if queueing fails. */
static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT); /* exit sources use their own prioq */

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        /* Roll back the flag so state stays consistent */
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        /* The time prioqs sort pending sources last, so the position of a
         * time source changes whenever its pending bit flips. */
        if (EVENT_SOURCE_IS_TIME(s->type)) {
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                prioq_reshuffle(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
        }

        /* A signal source that stops being pending must not stay referenced
         * as the currently dispatched source of its signal_data. */
        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        return 0;
}
934
935 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
936         sd_event_source *s;
937
938         assert(e);
939
940         s = new0(sd_event_source, 1);
941         if (!s)
942                 return NULL;
943
944         s->n_ref = 1;
945         s->event = e;
946         s->floating = floating;
947         s->type = type;
948         s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;
949
950         if (!floating)
951                 sd_event_ref(e);
952
953         LIST_PREPEND(sources, e->sources, s);
954         e->n_sources++;
955
956         return s;
957 }
958
/* Adds an I/O event source watching 'fd' for the given EPOLL* event mask.
 * The source starts out enabled (SD_EVENT_ON). If ret is NULL the source is
 * created floating, i.e. its lifetime is tied to the loop. Returns 0 on
 * success, negative errno on failure. */
_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Floating when the caller keeps no reference of its own */
        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0) {
                source_free(s);
                return r;
        }

        if (ret)
                *ret = s;

        return 0;
}
1000
1001 static void initialize_perturb(sd_event *e) {
1002         sd_id128_t bootid = {};
1003
1004         /* When we sleep for longer, we try to realign the wakeup to
1005            the same time wihtin each minute/second/250ms, so that
1006            events all across the system can be coalesced into a single
1007            CPU wakeup. However, let's take some system-specific
1008            randomness for this value, so that in a network of systems
1009            with synced clocks timer events are distributed a
1010            bit. Here, we calculate a perturbation usec offset from the
1011            boot ID. */
1012
1013         if (_likely_(e->perturb != USEC_INFINITY))
1014                 return;
1015
1016         if (sd_id128_get_boot(&bootid) >= 0)
1017                 e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
1018 }
1019
/* Lazily create the timerfd backing the given per-clock data structure and register it
 * with the loop's epoll instance. Idempotent: a no-op if d->fd is already set up.
 * Returns 0 on success, a negative errno-style error on failure. */
static int event_setup_timer_fd(
                sd_event *e,
                struct clock_data *d,
                clockid_t clock) {

        struct epoll_event ev;
        int r, fd;

        assert(e);
        assert(d);

        if (_likely_(d->fd >= 0))
                return 0;

        fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
        if (fd < 0)
                return -errno;

        /* Keep fds 0-2 free so we never shadow stdio. */
        fd = fd_move_above_stdio(fd);

        /* epoll wakeups for this fd carry the clock_data pointer, not an event source. */
        ev = (struct epoll_event) {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
        if (r < 0) {
                safe_close(fd);
                return -errno;
        }

        d->fd = fd;
        return 0;
}
1054
/* Default handler installed by sd_event_add_time() when the caller passes no callback:
 * asks the loop to exit, with the exit code smuggled through the userdata pointer. */
static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}
1060
/* Add a one-shot timer event source firing at absolute time 'usec' on 'clock', with the
 * given accuracy (0 selects DEFAULT_ACCURACY_USEC). A NULL callback makes the timer
 * request loop exit instead. On success, *ret (if non-NULL) receives a new reference;
 * passing NULL for ret creates a "floating" source owned by the loop (see source_new).
 * Returns 0 on success, negative errno-style error on failure. */
_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        sd_event_source *s;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(accuracy != (uint64_t) -1, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
                return -EOPNOTSUPP;

        type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
        if (type < 0)
                return -EOPNOTSUPP;

        if (!callback)
                callback = time_exit_callback;

        d = event_get_clock_data(e, type);
        assert(d);

        /* Make sure the two priority queues (sorted by earliest possible dispatch time and
         * by latest permissible dispatch time) exist before we allocate the source. */
        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* The kernel timer needs reprogramming now that a new source joined the queues. */
        d->needs_rearm = true;

        r = prioq_put(d->earliest, s, &s->time.earliest_index);
        if (r < 0)
                goto fail;

        r = prioq_put(d->latest, s, &s->time.latest_index);
        if (r < 0)
                goto fail;

        if (ret)
                *ret = s;

        return 0;

fail:
        /* source_free() also removes the source from any prioq it was already added to. */
        source_free(s);
        return r;
}
1138
/* Default handler installed by sd_event_add_signal() when the caller passes no callback:
 * asks the loop to exit, with the exit code smuggled through the userdata pointer. */
static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}
1144
/* Add an event source dispatched when signal 'sig' is received (via signalfd). The caller
 * must already have the signal blocked in the calling thread, otherwise -EBUSY is
 * returned; likewise at most one source per signal number may exist per loop. A NULL
 * callback makes delivery request loop exit. Returns 0 on success, negative errno-style
 * error on failure. */
_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        struct signal_data *d;
        sigset_t ss;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(SIGNAL_VALID(sig), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = signal_exit_callback;

        /* Query (not modify) the current thread's signal mask; pthread_sigmask() returns a
         * positive errno value directly, not -1/errno. */
        r = pthread_sigmask(SIG_SETMASK, NULL, &ss);
        if (r != 0)
                return -r;

        if (!sigismember(&ss, sig))
                return -EBUSY;

        /* Lazily allocate the per-signal source table, indexed by signal number. */
        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        /* Hook the signal up with the signalfd of the matching priority. */
        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                source_free(s);
                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;

        return 0;
}
1205
/* Add a one-shot event source watching child process 'pid' for the waitid() state
 * changes selected by 'options' (WEXITED/WSTOPPED/WCONTINUED). Only one source per PID
 * is allowed (-EBUSY otherwise). Relies on SIGCHLD delivery, hence registers SIGCHLD
 * signal data as a side effect. Returns 0 on success, negative errno-style error on
 * failure. */
_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0) {
                source_free(s);
                return r;
        }

        e->n_enabled_child_sources++;

        r = event_make_signal_data(e, SIGCHLD, NULL);
        if (r < 0) {
                /* Roll back the counter bump before freeing; source_free() also removes the
                 * hashmap entry. */
                e->n_enabled_child_sources--;
                source_free(s);
                return r;
        }

        /* Tell the dispatch logic there might already be zombies to reap. */
        e->need_process_child = true;

        if (ret)
                *ret = s;

        return 0;
}
1265
1266 _public_ int sd_event_add_defer(
1267                 sd_event *e,
1268                 sd_event_source **ret,
1269                 sd_event_handler_t callback,
1270                 void *userdata) {
1271
1272         sd_event_source *s;
1273         int r;
1274
1275         assert_return(e, -EINVAL);
1276         assert_return(e = event_resolve(e), -ENOPKG);
1277         assert_return(callback, -EINVAL);
1278         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1279         assert_return(!event_pid_changed(e), -ECHILD);
1280
1281         s = source_new(e, !ret, SOURCE_DEFER);
1282         if (!s)
1283                 return -ENOMEM;
1284
1285         s->defer.callback = callback;
1286         s->userdata = userdata;
1287         s->enabled = SD_EVENT_ONESHOT;
1288
1289         r = source_set_pending(s, true);
1290         if (r < 0) {
1291                 source_free(s);
1292                 return r;
1293         }
1294
1295         if (ret)
1296                 *ret = s;
1297
1298         return 0;
1299 }
1300
1301 _public_ int sd_event_add_post(
1302                 sd_event *e,
1303                 sd_event_source **ret,
1304                 sd_event_handler_t callback,
1305                 void *userdata) {
1306
1307         sd_event_source *s;
1308         int r;
1309
1310         assert_return(e, -EINVAL);
1311         assert_return(e = event_resolve(e), -ENOPKG);
1312         assert_return(callback, -EINVAL);
1313         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1314         assert_return(!event_pid_changed(e), -ECHILD);
1315
1316         r = set_ensure_allocated(&e->post_sources, NULL);
1317         if (r < 0)
1318                 return r;
1319
1320         s = source_new(e, !ret, SOURCE_POST);
1321         if (!s)
1322                 return -ENOMEM;
1323
1324         s->post.callback = callback;
1325         s->userdata = userdata;
1326         s->enabled = SD_EVENT_ON;
1327
1328         r = set_put(e->post_sources, s);
1329         if (r < 0) {
1330                 source_free(s);
1331                 return r;
1332         }
1333
1334         if (ret)
1335                 *ret = s;
1336
1337         return 0;
1338 }
1339
1340 _public_ int sd_event_add_exit(
1341                 sd_event *e,
1342                 sd_event_source **ret,
1343                 sd_event_handler_t callback,
1344                 void *userdata) {
1345
1346         sd_event_source *s;
1347         int r;
1348
1349         assert_return(e, -EINVAL);
1350         assert_return(e = event_resolve(e), -ENOPKG);
1351         assert_return(callback, -EINVAL);
1352         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1353         assert_return(!event_pid_changed(e), -ECHILD);
1354
1355         r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1356         if (r < 0)
1357                 return r;
1358
1359         s = source_new(e, !ret, SOURCE_EXIT);
1360         if (!s)
1361                 return -ENOMEM;
1362
1363         s->exit.callback = callback;
1364         s->userdata = userdata;
1365         s->exit.prioq_index = PRIOQ_IDX_NULL;
1366         s->enabled = SD_EVENT_ONESHOT;
1367
1368         r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1369         if (r < 0) {
1370                 source_free(s);
1371                 return r;
1372         }
1373
1374         if (ret)
1375                 *ret = s;
1376
1377         return 0;
1378 }
1379
1380 _public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
1381
1382         if (!s)
1383                 return NULL;
1384
1385         assert(s->n_ref >= 1);
1386         s->n_ref++;
1387
1388         return s;
1389 }
1390
/* Drop a reference on an event source, freeing it when the count hits zero — except while
 * the source's own callback is being dispatched, where we only detach it (see comment
 * below). NULL is accepted as a no-op. Always returns NULL so callers can write
 * "s = sd_event_source_unref(s);". */
_public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {

        if (!s)
                return NULL;

        assert(s->n_ref >= 1);
        s->n_ref--;

        if (s->n_ref <= 0) {
                /* Here's a special hack: when we are called from a
                 * dispatch handler we won't free the event source
                 * immediately, but we will detach the fd from the
                 * epoll. This way it is safe for the caller to unref
                 * the event source and immediately close the fd, but
                 * we still retain a valid event source object after
                 * the callback. */

                if (s->dispatching) {
                        if (s->type == SOURCE_IO)
                                source_io_unregister(s);

                        source_disconnect(s);
                } else
                        source_free(s);
        }

        return NULL;
}
1419
/* Set (or clear, with NULL) the free-form description of an event source, used in log
 * messages. Returns 0/1 on success, -ENOMEM on allocation failure. */
_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
        assert_return(s, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return free_and_strdup(&s->description, description);
}
1426
/* Retrieve the description previously set on the source; -ENXIO if none was set. The
 * returned string is owned by the source and stays valid only as long as it does. */
_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
        assert_return(s, -EINVAL);
        assert_return(description, -EINVAL);
        assert_return(s->description, -ENXIO);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *description = s->description;
        return 0;
}
1436
/* Return the event loop this source is attached to (borrowed reference; no ref taken). */
_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
        assert_return(s, NULL);

        return s->event;
}
1442
/* Return whether the source currently has an event pending dispatch (>0 pending,
 * 0 not pending). Not defined for exit sources, hence -EDOM there. */
_public_ int sd_event_source_get_pending(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->pending;
}
1451
/* Return the file descriptor an I/O event source watches; -EDOM for non-I/O sources. */
_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->io.fd;
}
1459
/* Replace the file descriptor an I/O event source watches. For an enabled source the new
 * fd is registered with epoll before the old one is dropped, so a failure leaves the
 * original registration fully intact. Returns 0 on success, negative errno-style error
 * on failure. */
_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(fd >= 0, -EBADF);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->io.fd == fd)
                return 0;

        if (s->enabled == SD_EVENT_OFF) {
                /* Disabled sources are not registered with epoll; just swap the fd. */
                s->io.fd = fd;
                s->io.registered = false;
        } else {
                int saved_fd;

                saved_fd = s->io.fd;
                assert(s->io.registered);

                s->io.fd = fd;
                s->io.registered = false;

                /* Register the new fd first; on failure, roll back to the old fd which is
                 * still present in the epoll set. */
                r = source_io_register(s, s->enabled, s->io.events);
                if (r < 0) {
                        s->io.fd = saved_fd;
                        s->io.registered = true;
                        return r;
                }

                /* Best-effort removal of the old fd; its registration becomes moot anyway
                 * once the fd is closed by the caller. */
                epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
        }

        return 0;
}
1495
/* Return whether the source owns its fd, i.e. closes it when the source is freed. */
_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);

        return s->io.owned;
}
1502
/* Control whether the source owns its fd, i.e. closes it when the source is freed. */
_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);

        s->io.owned = own;
        return 0;
}
1510
/* Retrieve the EPOLL* event mask the I/O source is subscribed to. */
_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
        assert_return(s, -EINVAL);
        assert_return(events, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *events = s->io.events;
        return 0;
}
1520
/* Change the EPOLL* event mask of an I/O source, re-registering with epoll if the source
 * is enabled. Any pending (not yet dispatched) event for the old mask is dropped.
 * Returns 0 on success, negative errno-style error on failure. */
_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* edge-triggered updates are never skipped, so we can reset edges */
        if (s->io.events == events && !(events & EPOLLET))
                return 0;

        if (s->enabled != SD_EVENT_OFF) {
                r = source_io_register(s, s->enabled, events);
                if (r < 0)
                        return r;
        }

        s->io.events = events;
        /* Drop any event queued for the previous mask. */
        source_set_pending(s, false);

        return 0;
}
1545
/* Retrieve the EPOLL* events that actually triggered; only valid while the source is
 * pending (-ENODATA otherwise). */
_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
        assert_return(s, -EINVAL);
        assert_return(revents, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(s->pending, -ENODATA);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *revents = s->io.revents;
        return 0;
}
1556
/* Return the signal number a signal event source watches; -EDOM for other source types. */
_public_ int sd_event_source_get_signal(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->signal.sig;
}
1564
1565 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
1566         assert_return(s, -EINVAL);
1567         assert_return(!event_pid_changed(s->event), -ECHILD);
1568
1569         *priority = s->priority;
1570         return 0;
1571 }
1572
/* Change the dispatch priority of an event source (lower values dispatch earlier). For
 * enabled signal sources this migrates the signal between per-priority signalfds; for all
 * sources the relevant priority queues are reshuffled. Returns 0 on success, negative
 * errno-style error on failure (priority then left unchanged). */
_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->priority == priority)
                return 0;

        if (s->type == SOURCE_SIGNAL && s->enabled != SD_EVENT_OFF) {
                struct signal_data *old, *d;

                /* Move us from the signalfd belonging to the old
                 * priority to the signalfd of the new priority */

                assert_se(old = hashmap_get(s->event->signal_data, &s->priority));

                s->priority = priority;

                r = event_make_signal_data(s->event, s->signal.sig, &d);
                if (r < 0) {
                        /* Roll back: restore the priority the old signal_data still carries. */
                        s->priority = old->priority;
                        return r;
                }

                /* Remove the signal from the old priority's signalfd mask. */
                event_unmask_signal_data(s->event, old, s->signal.sig);
        } else
                s->priority = priority;

        /* Priority is the sort key of these queues, so reposition the source. */
        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);

        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        return 0;
}
1614
/* Retrieve the enablement state (SD_EVENT_OFF/ON/ONESHOT) into *m. */
_public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
        assert_return(s, -EINVAL);
        assert_return(m, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *m = s->enabled;
        return 0;
}
1623
/* Switch an event source between SD_EVENT_OFF, SD_EVENT_ON and SD_EVENT_ONESHOT,
 * performing the per-type (un)registration this implies: epoll membership for I/O
 * sources, timer prioq position for time sources, signalfd masks for signal/child
 * sources, and the exit prioq for exit sources. Any event already queued but not yet
 * dispatched is dropped when crossing the off/on boundary (defer and exit sources
 * excepted, since pending is part of their semantics). Returns 0 on success, negative
 * errno-style error on failure, in which case the source is left disabled or unchanged
 * as noted inline. */
_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* If we are dead anyway, we are fine with turning off
         * sources, but everything else needs to fail. */
        if (s->event->state == SD_EVENT_FINISHED)
                return m == SD_EVENT_OFF ? 0 : -ESTALE;

        if (s->enabled == m)
                return 0;

        if (m == SD_EVENT_OFF) {

                /* Unset the pending flag when this event source is disabled */
                if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                        r = source_set_pending(s, false);
                        if (r < 0)
                                return r;
                }

                switch (s->type) {

                case SOURCE_IO:
                        source_io_unregister(s);
                        s->enabled = m;
                        break;

                case SOURCE_TIME_REALTIME:
                case SOURCE_TIME_BOOTTIME:
                case SOURCE_TIME_MONOTONIC:
                case SOURCE_TIME_REALTIME_ALARM:
                case SOURCE_TIME_BOOTTIME_ALARM: {
                        struct clock_data *d;

                        /* Enablement is a prioq sort key, so set it before reshuffling. */
                        s->enabled = m;
                        d = event_get_clock_data(s->event, s->type);
                        assert(d);

                        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(d->latest, s, &s->time.latest_index);
                        d->needs_rearm = true;
                        break;
                }

                case SOURCE_SIGNAL:
                        s->enabled = m;

                        /* Drop the signal from its signalfd if no other source needs it. */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                        break;

                case SOURCE_CHILD:
                        s->enabled = m;

                        assert(s->event->n_enabled_child_sources > 0);
                        s->event->n_enabled_child_sources--;

                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                        break;

                case SOURCE_EXIT:
                        s->enabled = m;
                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                        break;

                case SOURCE_DEFER:
                case SOURCE_POST:
                        s->enabled = m;
                        break;

                default:
                        assert_not_reached("Wut? I shouldn't exist.");
                }

        } else {

                /* Unset the pending flag when this event source is enabled */
                if (s->enabled == SD_EVENT_OFF && !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                        r = source_set_pending(s, false);
                        if (r < 0)
                                return r;
                }

                switch (s->type) {

                case SOURCE_IO:
                        r = source_io_register(s, m, s->io.events);
                        if (r < 0)
                                return r;

                        s->enabled = m;
                        break;

                case SOURCE_TIME_REALTIME:
                case SOURCE_TIME_BOOTTIME:
                case SOURCE_TIME_MONOTONIC:
                case SOURCE_TIME_REALTIME_ALARM:
                case SOURCE_TIME_BOOTTIME_ALARM: {
                        struct clock_data *d;

                        s->enabled = m;
                        d = event_get_clock_data(s->event, s->type);
                        assert(d);

                        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(d->latest, s, &s->time.latest_index);
                        d->needs_rearm = true;
                        break;
                }

                case SOURCE_SIGNAL:

                        s->enabled = m;

                        r = event_make_signal_data(s->event, s->signal.sig, NULL);
                        if (r < 0) {
                                /* Roll back to disabled and GC any half-done signalfd state. */
                                s->enabled = SD_EVENT_OFF;
                                event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                                return r;
                        }

                        break;

                case SOURCE_CHILD:

                        if (s->enabled == SD_EVENT_OFF)
                                s->event->n_enabled_child_sources++;

                        s->enabled = m;

                        r = event_make_signal_data(s->event, SIGCHLD, NULL);
                        if (r < 0) {
                                s->enabled = SD_EVENT_OFF;
                                s->event->n_enabled_child_sources--;
                                event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                                return r;
                        }

                        break;

                case SOURCE_EXIT:
                        s->enabled = m;
                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                        break;

                case SOURCE_DEFER:
                case SOURCE_POST:
                        s->enabled = m;
                        break;

                default:
                        assert_not_reached("Wut? I shouldn't exist.");
                }
        }

        /* Enablement affects the ordering of the generic pending/prepare queues too. */
        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);

        return 0;
}
1790
/* Retrieve the absolute expiry time (in µs on the source's clock) into *usec. */
_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.next;
        return 0;
}
1800
/* Change the absolute expiry time of a timer source, dropping any already-queued event
 * and repositioning the source in its clock's priority queues so the kernel timer is
 * re-armed on the next loop iteration. */
_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
        struct clock_data *d;

        assert_return(s, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        s->time.next = usec;

        /* An event queued for the old deadline is no longer valid. */
        source_set_pending(s, false);

        d = event_get_clock_data(s->event, s->type);
        assert(d);

        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
        prioq_reshuffle(d->latest, s, &s->time.latest_index);
        d->needs_rearm = true;

        return 0;
}
1822
/* Retrieve the timer's accuracy (maximum permissible dispatch delay, in µs) into *usec. */
_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.accuracy;
        return 0;
}
1832
/* Change the timer's accuracy (0 selects DEFAULT_ACCURACY_USEC). Accuracy only affects
 * the latest permissible dispatch time, so only the "latest" prioq needs reshuffling. */
_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
        struct clock_data *d;

        assert_return(s, -EINVAL);
        assert_return(usec != (uint64_t) -1, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (usec == 0)
                usec = DEFAULT_ACCURACY_USEC;

        s->time.accuracy = usec;

        /* An event queued under the old accuracy window is no longer valid. */
        source_set_pending(s, false);

        d = event_get_clock_data(s->event, s->type);
        assert(d);

        prioq_reshuffle(d->latest, s, &s->time.latest_index);
        d->needs_rearm = true;

        return 0;
}
1857
/* Retrieve the clockid_t a timer source runs on, derived from its internal source type. */
_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
        assert_return(s, -EINVAL);
        assert_return(clock, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *clock = event_source_type_to_clock(s->type);
        return 0;
}
1867
/* Retrieve the PID a child event source watches into *pid. */
_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
        assert_return(s, -EINVAL);
        assert_return(pid, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *pid = s->child.pid;
        return 0;
}
1877
/* Install, replace or (with NULL) remove a "prepare" callback, invoked right before the
 * loop polls. The source is tracked in the loop's prepare prioq only while a callback is
 * installed. Returns 0 on success, negative errno-style error on failure. */
_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->prepare == callback)
                return 0;

        /* Replacing one callback by another: prioq membership is already correct. */
        if (callback && s->prepare) {
                s->prepare = callback;
                return 0;
        }

        r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
        if (r < 0)
                return r;

        s->prepare = callback;

        if (callback) {
                r = prioq_put(s->event->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;
        } else
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        return 0;
}
1909
/* Return the opaque userdata pointer attached to the event source. */
_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
        assert_return(s, NULL);

        return s->userdata;
}
1915
1916 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
1917         void *ret;
1918
1919         assert_return(s, NULL);
1920
1921         ret = s->userdata;
1922         s->userdata = userdata;
1923
1924         return ret;
1925 }
1926
/* Pick a wakeup time within the permissible window [a, b] (earliest/latest dispatch
 * time). Prefers the latest "aligned" moment — the per-machine perturbed spot within the
 * minute, then 10s, 1s and 250ms grids — so that timers across the whole system fire
 * together; falls back to b if no aligned spot fits the window. */
static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
        usec_t c;
        assert(e);
        assert(a <= b);

        /* usec_t is unsigned, so "a <= 0" is effectively "a == 0": wake up immediately. */
        if (a <= 0)
                return 0;
        if (a >= USEC_INFINITY)
                return USEC_INFINITY;

        /* Window of 1µs or less: no room to align, use the earliest time. */
        if (b <= a + 1)
                return a;

        initialize_perturb(e);

        /*
          Find a good time to wake up again between times a and b. We
          have two goals here:

          a) We want to wake up as seldom as possible, hence prefer
             later times over earlier times.

          b) But if we have to wake up, then let's make sure to
             dispatch as much as possible on the entire system.

          We implement this by waking up everywhere at the same time
          within any given minute if we can, synchronised via the
          perturbation value determined from the boot ID. If we can't,
          then we try to find the same spot in every 10s, then 1s and
          then 250ms step. Otherwise, we pick the last possible time
          to wake up.
        */

        /* Latest perturbed minute boundary at or before b; step one minute back if it
         * overshoots b, guarding against unsigned underflow near time 0. */
        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MINUTE))
                        return b;

                c -= USEC_PER_MINUTE;
        }

        if (c >= a)
                return c;

        /* Same scheme on the 10s grid. */
        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC*10))
                        return b;

                c -= USEC_PER_SEC*10;
        }

        if (c >= a)
                return c;

        /* Same scheme on the 1s grid. */
        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC))
                        return b;

                c -= USEC_PER_SEC;
        }

        if (c >= a)
                return c;

        /* Same scheme on the 250ms grid. */
        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MSEC*250))
                        return b;

                c -= USEC_PER_MSEC*250;
        }

        if (c >= a)
                return c;

        /* No aligned spot fits the window; wake at the last permissible moment. */
        return b;
}
2006
2007 static int event_arm_timer(
2008                 sd_event *e,
2009                 struct clock_data *d) {
2010
2011         struct itimerspec its = {};
2012         sd_event_source *a, *b;
2013         usec_t t;
2014         int r;
2015
2016         assert(e);
2017         assert(d);
2018
2019         if (!d->needs_rearm)
2020                 return 0;
2021         else
2022                 d->needs_rearm = false;
2023
2024         a = prioq_peek(d->earliest);
2025         if (!a || a->enabled == SD_EVENT_OFF || a->time.next == USEC_INFINITY) {
2026
2027                 if (d->fd < 0)
2028                         return 0;
2029
2030                 if (d->next == USEC_INFINITY)
2031                         return 0;
2032
2033                 /* disarm */
2034                 r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2035                 if (r < 0)
2036                         return r;
2037
2038                 d->next = USEC_INFINITY;
2039                 return 0;
2040         }
2041
2042         b = prioq_peek(d->latest);
2043         assert_se(b && b->enabled != SD_EVENT_OFF);
2044
2045         t = sleep_between(e, a->time.next, time_event_source_latest(b));
2046         if (d->next == t)
2047                 return 0;
2048
2049         assert_se(d->fd >= 0);
2050
2051         if (t == 0) {
2052                 /* We don' want to disarm here, just mean some time looooong ago. */
2053                 its.it_value.tv_sec = 0;
2054                 its.it_value.tv_nsec = 1;
2055         } else
2056                 timespec_store(&its.it_value, t);
2057
2058         r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2059         if (r < 0)
2060                 return -errno;
2061
2062         d->next = t;
2063         return 0;
2064 }
2065
2066 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
2067         assert(e);
2068         assert(s);
2069         assert(s->type == SOURCE_IO);
2070
2071         /* If the event source was already pending, we just OR in the
2072          * new revents, otherwise we reset the value. The ORing is
2073          * necessary to handle EPOLLONESHOT events properly where
2074          * readability might happen independently of writability, and
2075          * we need to keep track of both */
2076
2077         if (s->pending)
2078                 s->io.revents |= revents;
2079         else
2080                 s->io.revents = revents;
2081
2082         return source_set_pending(s, true);
2083 }
2084
2085 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
2086         uint64_t x;
2087         ssize_t ss;
2088
2089         assert(e);
2090         assert(fd >= 0);
2091
2092         assert_return(events == EPOLLIN, -EIO);
2093
2094         ss = read(fd, &x, sizeof(x));
2095         if (ss < 0) {
2096                 if (IN_SET(errno, EAGAIN, EINTR))
2097                         return 0;
2098
2099                 return -errno;
2100         }
2101
2102         if (_unlikely_(ss != sizeof(x)))
2103                 return -EIO;
2104
2105         if (next)
2106                 *next = USEC_INFINITY;
2107
2108         return 0;
2109 }
2110
/* Mark every enabled time event source on clock 'd' whose deadline is
 * <= 'n' as pending. Pending sources sort differently, so both prioqs
 * are reshuffled and the clock flagged for rearming. Returns 0 on
 * success, negative errno from source_set_pending() on failure. */
static int process_timer(
                sd_event *e,
                usec_t n,
                struct clock_data *d) {

        sd_event_source *s;
        int r;

        assert(e);
        assert(d);

        for (;;) {
                /* Stop at the first source that hasn't elapsed yet, is
                 * disabled, or is already queued as pending. */
                s = prioq_peek(d->earliest);
                if (!s ||
                    s->time.next > n ||
                    s->enabled == SD_EVENT_OFF ||
                    s->pending)
                        break;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                /* The pending flag changed the source's ordering; fix
                 * both queues and remember to reprogram the timerfd. */
                prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                prioq_reshuffle(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
        }

        return 0;
}
2141
/* Poll each watched child with waitid(WNOHANG) and mark the matching
 * event sources pending when a state change was reported. Returns 0 on
 * success, -errno when waitid() fails. */
static int process_child(sd_event *e) {
        sd_event_source *s;
        Iterator i;
        int r;

        assert(e);

        e->need_process_child = false;

        /*
           So, this is ugly. We iteratively invoke waitid() with P_PID
           + WNOHANG for each PID we wait for, instead of using
           P_ALL. This is because we only want to get child
           information of very specific child processes, and not all
           of them. We might not have processed the SIGCHLD even of a
           previous invocation and we don't want to maintain a
           unbounded *per-child* event queue, hence we really don't
           want anything flushed out of the kernel's queue that we
           don't care about. Since this is O(n) this means that if you
           have a lot of processes you probably want to handle SIGCHLD
           yourself.

           We do not reap the children here (by using WNOWAIT), this
           is only done after the event source is dispatched so that
           the callback still sees the process as a zombie.
        */

        HASHMAP_FOREACH(s, e->child_sources, i) {
                assert(s->type == SOURCE_CHILD);

                if (s->pending)
                        continue;

                if (s->enabled == SD_EVENT_OFF)
                        continue;

                /* Zero siginfo first: waitid() leaves si_pid at 0 when
                 * there was nothing to report, which we test below. */
                zero(s->child.siginfo);
                r = waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
                if (r < 0)
                        return -errno;

                if (s->child.siginfo.si_pid != 0) {
                        bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's
                                 * immediately remove the state change
                                 * from the queue, since there's no
                                 * benefit in leaving it queued */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
                        if (r < 0)
                                return r;
                }
        }

        return 0;
}
2205
/* Read one signal from the signalfd backing this priority's
 * signal_data and mark the matching signal event source pending.
 * Returns 1 if a source was queued, 0 if nothing relevant was read,
 * negative errno on failure. At most one signal per priority is kept
 * in flight (d->current). */
static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
        bool read_one = false;
        int r;

        assert(e);
        assert_return(events == EPOLLIN, -EIO);

        /* If there's a signal queued on this priority and SIGCHLD is
           on this priority too, then make sure to recheck the
           children we watch. This is because we only ever dequeue
           the first signal per priority, and if we dequeue one, and
           SIGCHLD might be enqueued later we wouldn't know, but we
           might have higher priority children we care about hence we
           need to check that explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

        /* If there's already an event source pending for this
         * priority we don't read another */
        if (d->current)
                return 0;

        for (;;) {
                struct signalfd_siginfo si;
                ssize_t n;
                sd_event_source *s = NULL;

                n = read(d->fd, &si, sizeof(si));
                if (n < 0) {
                        /* Queue drained (or interrupted): report whether
                         * we consumed anything at all. */
                        if (IN_SET(errno, EAGAIN, EINTR))
                                return read_one;

                        return -errno;
                }

                if (_unlikely_(n != sizeof(si)))
                        return -EIO;

                assert(SIGNAL_VALID(si.ssi_signo));

                read_one = true;

                /* Look up the source registered for this signal; drop
                 * signals nobody watches or that are already pending. */
                if (e->signal_sources)
                        s = e->signal_sources[si.ssi_signo];
                if (!s)
                        continue;
                if (s->pending)
                        continue;

                s->signal.siginfo = si;
                d->current = s;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                return 1;
        }
}
2266
/* Run the callback of a pending (or exit) event source: clears the
 * pending flag, fans out to post sources, turns ONESHOT sources off
 * before dispatching, invokes the type-specific callback, and finally
 * frees the source if the callback dropped its last reference or
 * disables it if the callback failed. Returns 1 on success, negative
 * errno on internal failure. */
static int source_dispatch(sd_event_source *s) {
        EventSourceType saved_type;
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        /* Save the event source type, here, so that we still know it after the event callback which might invalidate
         * the event. */
        saved_type = s->type;

        /* DEFER and EXIT sources stay pending across dispatches; all
         * others are de-queued before their callback runs. */
        if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        if (s->type != SOURCE_POST) {
                sd_event_source *z;
                Iterator i;

                /* If we execute a non-post source, let's mark all
                 * post sources as pending */

                SET_FOREACH(z, s->event->post_sources, i) {
                        if (z->enabled == SD_EVENT_OFF)
                                continue;

                        r = source_set_pending(z, true);
                        if (r < 0)
                                return r;
                }
        }

        /* ONESHOT sources are switched off before their callback runs. */
        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        s->dispatching = true;

        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie)
                        waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_POST:
                r = s->post.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        s->dispatching = false;

        if (r < 0)
                log_debug_errno(r, "Event source %s (type %s) returned error, disabling: %m",
                                strna(s->description), event_source_type_to_string(saved_type));

        /* The callback may have unref'ed its own source: free it now
         * that dispatching is over, or disable it on callback failure. */
        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                sd_event_source_set_enabled(s, SD_EVENT_OFF);

        return 1;
}
2372
/* Invoke the prepare() callbacks of all enabled sources that have not
 * yet run during this loop iteration, in prepare-prioq order. Sources
 * whose callback fails are disabled; sources unref'ed by their own
 * callback are freed. Returns 0 on success, negative errno on prioq
 * failure. */
static int event_prepare(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                /* The prioq sorts already-prepared sources to the back,
                 * so peeking one that ran this iteration means we're done. */
                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
                        break;

                s->prepare_iteration = e->iteration;
                r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;

                assert(s->prepare);

                s->dispatching = true;
                r = s->prepare(s, s->userdata);
                s->dispatching = false;

                if (r < 0)
                        log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, disabling: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        sd_event_source_set_enabled(s, SD_EVENT_OFF);
        }

        return 0;
}
2408
2409 static int dispatch_exit(sd_event *e) {
2410         sd_event_source *p;
2411         _cleanup_(sd_event_unrefp) sd_event *ref = NULL;
2412         int r;
2413
2414         assert(e);
2415
2416         p = prioq_peek(e->exit);
2417         if (!p || p->enabled == SD_EVENT_OFF) {
2418                 e->state = SD_EVENT_FINISHED;
2419                 return 0;
2420         }
2421
2422         ref = sd_event_ref(e);
2423         e->iteration++;
2424         e->state = SD_EVENT_EXITING;
2425         r = source_dispatch(p);
2426         e->state = SD_EVENT_INITIAL;
2427         return r;
2428 }
2429
2430 static sd_event_source* event_next_pending(sd_event *e) {
2431         sd_event_source *p;
2432
2433         assert(e);
2434
2435         p = prioq_peek(e->pending);
2436         if (!p)
2437                 return NULL;
2438
2439         if (p->enabled == SD_EVENT_OFF)
2440                 return NULL;
2441
2442         return p;
2443 }
2444
2445 static int arm_watchdog(sd_event *e) {
2446         struct itimerspec its = {};
2447         usec_t t;
2448         int r;
2449
2450         assert(e);
2451         assert(e->watchdog_fd >= 0);
2452
2453         t = sleep_between(e,
2454                           e->watchdog_last + (e->watchdog_period / 2),
2455                           e->watchdog_last + (e->watchdog_period * 3 / 4));
2456
2457         timespec_store(&its.it_value, t);
2458
2459         /* Make sure we never set the watchdog to 0, which tells the
2460          * kernel to disable it. */
2461         if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
2462                 its.it_value.tv_nsec = 1;
2463
2464         r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
2465         if (r < 0)
2466                 return -errno;
2467
2468         return 0;
2469 }
2470
2471 static int process_watchdog(sd_event *e) {
2472         assert(e);
2473
2474         if (!e->watchdog)
2475                 return 0;
2476
2477         /* Don't notify watchdog too often */
2478         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
2479                 return 0;
2480
2481         sd_notify(false, "WATCHDOG=1");
2482         e->watchdog_last = e->timestamp.monotonic;
2483
2484         return arm_watchdog(e);
2485 }
2486
/* First stage of a loop iteration: run prepare callbacks, rearm all
 * clock timerfds, and determine whether anything is already pending.
 * Leaves the loop in SD_EVENT_ARMED (and returns the result of a
 * zero-timeout sd_event_wait()) when dispatchable work exists,
 * otherwise returns 0 with state SD_EVENT_ARMED. */
_public_ int sd_event_prepare(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        /* An exit request short-circuits straight to the pending path. */
        if (e->exit_requested)
                goto pending;

        e->iteration++;

        e->state = SD_EVENT_PREPARING;
        r = event_prepare(e);
        e->state = SD_EVENT_INITIAL;
        if (r < 0)
                return r;

        /* Reprogram every clock's timerfd that was flagged for rearming. */
        r = event_arm_timer(e, &e->realtime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->monotonic);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime_alarm);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime_alarm);
        if (r < 0)
                return r;

        if (event_next_pending(e) || e->need_process_child)
                goto pending;

        e->state = SD_EVENT_ARMED;

        return 0;

pending:
        /* Something is pending already: do a zero-timeout wait to pick
         * it up, restoring ARMED if the wait found nothing after all. */
        e->state = SD_EVENT_ARMED;
        r = sd_event_wait(e, 0);
        if (r == 0)
                e->state = SD_EVENT_ARMED;

        return r;
}
2542
/* Second stage of a loop iteration: epoll_wait() with the given
 * timeout (in usec; (uint64_t) -1 means infinite), feed the resulting
 * wakeups into the IO/timer/signal/child machinery, and report whether
 * anything became pending. Returns 1 (state SD_EVENT_PENDING) when
 * there is something to dispatch, 0 (state SD_EVENT_INITIAL) when not,
 * negative errno on failure. */
_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
        struct epoll_event *ev_queue;
        unsigned ev_queue_max;
        int r, m, i;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        /* One slot per source is enough; at least one even when empty. */
        ev_queue_max = MAX(e->n_sources, 1u);
        ev_queue = newa(struct epoll_event, ev_queue_max);

        /* usec -> ms, rounding up so we never wake early. */
        m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max,
                       timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
        if (m < 0) {
                if (errno == EINTR) {
                        /* Treat EINTR as "pending": the caller will go
                         * through dispatch and come back. */
                        e->state = SD_EVENT_PENDING;
                        return 1;
                }

                r = -errno;
                goto finish;
        }

        triple_timestamp_get(&e->timestamp);

        /* Route each wakeup by the type tag stored in epoll data.ptr. */
        for (i = 0; i < m; i++) {

                if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
                else {
                        WakeupType *t = ev_queue[i].data.ptr;

                        switch (*t) {

                        case WAKEUP_EVENT_SOURCE:
                                r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = ev_queue[i].data.ptr;
                                r = flush_timer(e, d->fd, ev_queue[i].events, &d->next);
                                break;
                        }

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        default:
                                assert_not_reached("Invalid wake-up pointer");
                        }
                }
                if (r < 0)
                        goto finish;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        /* Promote elapsed time sources to pending, per clock. Note that
         * the alarm clocks share the realtime/boottime timestamps. */
        r = process_timer(e, e->timestamp.realtime, &e->realtime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
        if (r < 0)
                goto finish;

        if (e->need_process_child) {
                r = process_child(e);
                if (r < 0)
                        goto finish;
        }

        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;

                return 1;
        }

        r = 0;

finish:
        e->state = SD_EVENT_INITIAL;

        return r;
}
2650
/* Third stage of a loop iteration: dispatch the highest-priority
 * pending source (or the exit sources if an exit was requested),
 * holding a reference on the event object across the callback.
 * Returns 1 on success (or when nothing was pending), negative errno
 * on failure. */
_public_ int sd_event_dispatch(sd_event *e) {
        sd_event_source *p;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_PENDING, -EBUSY);

        if (e->exit_requested)
                return dispatch_exit(e);

        p = event_next_pending(e);
        if (p) {
                _cleanup_(sd_event_unrefp) sd_event *ref = NULL;

                /* Keep the loop alive even if the callback drops the
                 * caller's last reference. */
                ref = sd_event_ref(e);
                e->state = SD_EVENT_RUNNING;
                r = source_dispatch(p);
                e->state = SD_EVENT_INITIAL;
                return r;
        }

        e->state = SD_EVENT_INITIAL;

        return 1;
}
2679
2680 static void event_log_delays(sd_event *e) {
2681         char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1];
2682         unsigned i;
2683         int o;
2684
2685         for (i = o = 0; i < ELEMENTSOF(e->delays); i++) {
2686                 o += snprintf(&b[o], sizeof(b) - o, "%u ", e->delays[i]);
2687                 e->delays[i] = 0;
2688         }
2689         log_debug("Event loop iterations: %.*s", o, b);
2690 }
2691
2692 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
2693         int r;
2694
2695         assert_return(e, -EINVAL);
2696         assert_return(e = event_resolve(e), -ENOPKG);
2697         assert_return(!event_pid_changed(e), -ECHILD);
2698         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2699         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2700
2701         if (e->profile_delays && e->last_run) {
2702                 usec_t this_run;
2703                 unsigned l;
2704
2705                 this_run = now(CLOCK_MONOTONIC);
2706
2707                 l = u64log2(this_run - e->last_run);
2708                 assert(l < sizeof(e->delays));
2709                 e->delays[l]++;
2710
2711                 if (this_run - e->last_log >= 5*USEC_PER_SEC) {
2712                         event_log_delays(e);
2713                         e->last_log = this_run;
2714                 }
2715         }
2716
2717         r = sd_event_prepare(e);
2718         if (r == 0)
2719                 /* There was nothing? Then wait... */
2720                 r = sd_event_wait(e, timeout);
2721
2722         if (e->profile_delays)
2723                 e->last_run = now(CLOCK_MONOTONIC);
2724
2725         if (r > 0) {
2726                 /* There's something now, then let's dispatch it */
2727                 r = sd_event_dispatch(e);
2728                 if (r < 0)
2729                         return r;
2730
2731                 return 1;
2732         }
2733
2734         return r;
2735 }
2736
2737 _public_ int sd_event_loop(sd_event *e) {
2738         _cleanup_(sd_event_unrefp) sd_event *ref = NULL;
2739         int r;
2740
2741         assert_return(e, -EINVAL);
2742         assert_return(e = event_resolve(e), -ENOPKG);
2743         assert_return(!event_pid_changed(e), -ECHILD);
2744         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2745
2746         ref = sd_event_ref(e);
2747
2748         while (e->state != SD_EVENT_FINISHED) {
2749                 r = sd_event_run(e, (uint64_t) -1);
2750                 if (r < 0)
2751                         return r;
2752         }
2753
2754         return e->exit_code;
2755 }
2756
/* Return the epoll fd backing the event loop, so callers can embed it
 * into an outer poll loop. Negative errno-style value on bad input. */
_public_ int sd_event_get_fd(sd_event *e) {

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->epoll_fd;
}
2765
/* Return the loop's current state (SD_EVENT_INITIAL, ..._ARMED,
 * ..._PENDING, ..._RUNNING, ..._EXITING, ..._FINISHED). */
_public_ int sd_event_get_state(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->state;
}
2773
/* Store the exit code passed to sd_event_exit() in *code. Returns
 * -ENODATA when no exit has been requested yet. */
_public_ int sd_event_get_exit_code(sd_event *e, int *code) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(code, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!e->exit_requested)
                return -ENODATA;

        *code = e->exit_code;
        return 0;
}
2786
/* Request loop termination with the given exit code: the next
 * prepare/dispatch cycle will run the exit sources and then finish. */
_public_ int sd_event_exit(sd_event *e, int code) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        e->exit_requested = true;
        e->exit_code = code;

        return 0;
}
2798
/* Return the timestamp of the current loop iteration for the given
 * clock (cheaper and more consistent than calling now() repeatedly
 * from callbacks). Returns 0 when the cached triple timestamp was
 * used, 1 when it fell back to a live now() because the loop has not
 * run yet, -EOPNOTSUPP for unsupported clocks. */
_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(usec, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
                return -EOPNOTSUPP;

        /* Generate a clean error in case CLOCK_BOOTTIME is not available. Note that don't use clock_supported() here,
         * for a reason: there are systems where CLOCK_BOOTTIME is supported, but CLOCK_BOOTTIME_ALARM is not, but for
         * the purpose of getting the time this doesn't matter. */
        if (IN_SET(clock, CLOCK_BOOTTIME, CLOCK_BOOTTIME_ALARM) && !clock_boottime_supported())
                return -EOPNOTSUPP;

        if (!triple_timestamp_is_set(&e->timestamp)) {
                /* Implicitly fall back to now() if we never ran
                 * before and thus have no cached time. */
                *usec = now(clock);
                return 1;
        }

        *usec = triple_timestamp_by_clock(&e->timestamp, clock);
        return 0;
}
2824
/* Return a reference to the default event loop in *ret, creating it on
 * first use (default_event is declared outside this view — presumably
 * thread-local given the gettid() recorded below; confirm at its
 * declaration). With ret == NULL, merely reports whether a default
 * loop already exists. Returns 1 when newly created, 0 when an
 * existing loop was referenced. */
_public_ int sd_event_default(sd_event **ret) {
        sd_event *e = NULL;
        int r;

        if (!ret)
                return !!default_event;

        if (default_event) {
                *ret = sd_event_ref(default_event);
                return 0;
        }

        r = sd_event_new(&e);
        if (r < 0)
                return r;

        /* Remember where to clear the pointer when the loop is freed. */
        e->default_event_ptr = &default_event;
        e->tid = gettid();
        default_event = e;

        *ret = e;
        return 1;
}
2848
2849 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
2850         assert_return(e, -EINVAL);
2851         assert_return(e = event_resolve(e), -ENOPKG);
2852         assert_return(tid, -EINVAL);
2853         assert_return(!event_pid_changed(e), -ECHILD);
2854
2855         if (e->tid != 0) {
2856                 *tid = e->tid;
2857                 return 0;
2858         }
2859
2860         return -ENXIO;
2861 }
2862
/* Enable or disable automatic "WATCHDOG=1" keep-alive notifications,
 * driven by a CLOCK_MONOTONIC timerfd registered in the epoll. When
 * enabling, the watchdog period is taken from sd_watchdog_enabled();
 * if that reports the watchdog as unconfigured (<= 0) its value is
 * returned unchanged. Otherwise returns the new watchdog state (0/1),
 * or negative errno on failure. */
_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Already in the requested state. */
        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                struct epoll_event ev;

                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                /* The watchdog timerfd is tagged with a plain enum value
                 * in data.ptr, unlike regular sources which carry a
                 * WakeupType pointer. */
                ev = (struct epoll_event) {
                        .events = EPOLLIN,
                        .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
                };

                r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
                if (r < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                if (e->watchdog_fd >= 0) {
                        epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}
2917
/* Return whether watchdog notifications are currently enabled (0/1). */
_public_ int sd_event_get_watchdog(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->watchdog;
}
2925
2926 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
2927         assert_return(e, -EINVAL);
2928         assert_return(e = event_resolve(e), -ENOPKG);
2929         assert_return(!event_pid_changed(e), -ECHILD);
2930
2931         *ret = e->iteration;
2932         return 0;
2933 }