chiark / gitweb /
Prep v236 : Add missing SPDX-License-Identifier (4/9) src/libelogind
[elogind.git] / src / libelogind / sd-event / sd-event.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3   This file is part of systemd.
4
5   Copyright 2013 Lennart Poettering
6
7   systemd is free software; you can redistribute it and/or modify it
8   under the terms of the GNU Lesser General Public License as published by
9   the Free Software Foundation; either version 2.1 of the License, or
10   (at your option) any later version.
11
12   systemd is distributed in the hope that it will be useful, but
13   WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15   Lesser General Public License for more details.
16
17   You should have received a copy of the GNU Lesser General Public License
18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <sys/epoll.h>
22 #include <sys/timerfd.h>
23 #include <sys/wait.h>
24
25 #include "sd-daemon.h"
26 #include "sd-event.h"
27 #include "sd-id128.h"
28
29 #include "alloc-util.h"
30 #include "fd-util.h"
31 #include "hashmap.h"
32 #include "list.h"
33 #include "macro.h"
34 #include "missing.h"
35 #include "prioq.h"
36 #include "process-util.h"
37 #include "set.h"
38 #include "signal-util.h"
39 #include "string-table.h"
40 #include "string-util.h"
41 #include "time-util.h"
42 #include "util.h"
43
44 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
45
/* The kinds of event sources an event loop can carry. The five SOURCE_TIME_*
 * entries map 1:1 to kernel clock IDs (see event_source_type_to_clock()).
 * _MAX/_INVALID follow the usual enum convention and are not real types. */
typedef enum EventSourceType {
        SOURCE_IO,                  /* fd-based I/O, registered with epoll */
        SOURCE_TIME_REALTIME,       /* CLOCK_REALTIME */
        SOURCE_TIME_BOOTTIME,       /* CLOCK_BOOTTIME */
        SOURCE_TIME_MONOTONIC,      /* CLOCK_MONOTONIC */
        SOURCE_TIME_REALTIME_ALARM, /* CLOCK_REALTIME_ALARM */
        SOURCE_TIME_BOOTTIME_ALARM, /* CLOCK_BOOTTIME_ALARM */
        SOURCE_SIGNAL,              /* UNIX signal, delivered via signalfd */
        SOURCE_CHILD,               /* child process state change, driven by SIGCHLD */
        SOURCE_DEFER,
        SOURCE_POST,
        SOURCE_EXIT,
        SOURCE_WATCHDOG,
        _SOURCE_EVENT_SOURCE_TYPE_MAX,
        _SOURCE_EVENT_SOURCE_TYPE_INVALID = -1
} EventSourceType;
62
63 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
64         [SOURCE_IO] = "io",
65         [SOURCE_TIME_REALTIME] = "realtime",
66         [SOURCE_TIME_BOOTTIME] = "bootime",
67         [SOURCE_TIME_MONOTONIC] = "monotonic",
68         [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
69         [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
70         [SOURCE_SIGNAL] = "signal",
71         [SOURCE_CHILD] = "child",
72         [SOURCE_DEFER] = "defer",
73         [SOURCE_POST] = "post",
74         [SOURCE_EXIT] = "exit",
75         [SOURCE_WATCHDOG] = "watchdog",
76 };
77
78 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
79
/* All objects we use in epoll events start with this value, so that
 * we know how to dispatch it */
typedef enum WakeupType {
        WAKEUP_NONE,
        WAKEUP_EVENT_SOURCE, /* epoll data.ptr points to an sd_event_source */
        WAKEUP_CLOCK_DATA,   /* epoll data.ptr points to a struct clock_data */
        WAKEUP_SIGNAL_DATA,  /* epoll data.ptr points to a struct signal_data */
        _WAKEUP_TYPE_MAX,
        _WAKEUP_TYPE_INVALID = -1,
} WakeupType;
90
91 #define EVENT_SOURCE_IS_TIME(t) IN_SET((t), SOURCE_TIME_REALTIME, SOURCE_TIME_BOOTTIME, SOURCE_TIME_MONOTONIC, SOURCE_TIME_REALTIME_ALARM, SOURCE_TIME_BOOTTIME_ALARM)
92
struct sd_event_source {
        WakeupType wakeup; /* must be first: epoll's data.ptr points at this object */

        unsigned n_ref;

        sd_event *event;           /* loop this source is attached to; NULL once disconnected */
        void *userdata;
        sd_event_handler_t prepare; /* optional callback run before each poll */

        char *description; /* free-form name for log messages, may be NULL */

        EventSourceType type:5;
        int enabled:3;       /* SD_EVENT_OFF/ON/ONESHOT; 'int' so all values fit in a bitfield */
        bool pending:1;      /* has an undispatched event queued in event->pending */
        bool dispatching:1;  /* currently inside its callback */
        bool floating:1;     /* does not pin the loop; the loop owns it (see source_new()) */

        int64_t priority;          /* dispatch priority; lower values dispatch first */
        unsigned pending_index;    /* position in event->pending prioq */
        unsigned prepare_index;    /* position in event->prepare prioq */
        uint64_t pending_iteration; /* loop iteration at which it became pending */
        uint64_t prepare_iteration; /* loop iteration at which it was last prepared */

        LIST_FIELDS(sd_event_source, sources); /* linkage in event->sources */

        /* Per-type payload; which member is valid is determined by 'type' */
        union {
                struct {
                        sd_event_io_handler_t callback;
                        int fd;
                        uint32_t events;     /* EPOLL* mask we asked for */
                        uint32_t revents;    /* events last reported by epoll */
                        bool registered:1;   /* currently added to the epoll fd */
                } io;
                struct {
                        sd_event_time_handler_t callback;
                        usec_t next, accuracy; /* earliest elapse time and allowed slack */
                        unsigned earliest_index; /* position in clock_data->earliest */
                        unsigned latest_index;   /* position in clock_data->latest */
                } time;
                struct {
                        sd_event_signal_handler_t callback;
                        struct signalfd_siginfo siginfo;
                        int sig;
                } signal;
                struct {
                        sd_event_child_handler_t callback;
                        siginfo_t siginfo;
                        pid_t pid;
                        int options; /* waitid() options — TODO confirm against dispatch code */
                } child;
                struct {
                        sd_event_handler_t callback;
                } defer;
                struct {
                        sd_event_handler_t callback;
                } post;
                struct {
                        sd_event_handler_t callback;
                        unsigned prioq_index; /* position in event->exit */
                } exit;
        };
};
155
struct clock_data {
        WakeupType wakeup; /* always WAKEUP_CLOCK_DATA, for epoll dispatch */
        int fd;            /* timerfd for this clock, or -1 until created lazily */

        /* For all clocks we maintain two priority queues each, one
         * ordered for the earliest times the events may be
         * dispatched, and one ordered by the latest times they must
         * have been dispatched. The range between the top entries in
         * the two prioqs is the time window we can freely schedule
         * wakeups in */

        Prioq *earliest; /* keyed on time.next (see earliest_time_prioq_compare) */
        Prioq *latest;   /* keyed on time.next + accuracy (see latest_time_prioq_compare) */
        usec_t next;     /* presumably the deadline currently armed on the timerfd; USEC_INFINITY if none */

        bool needs_rearm:1; /* set whenever the prioqs changed and the timerfd must be reprogrammed */
};
173
struct signal_data {
        WakeupType wakeup; /* always WAKEUP_SIGNAL_DATA, for epoll dispatch */

        /* For each priority we maintain one signal fd, so that we
         * only have to dequeue a single event per priority at a
         * time. */

        int fd;                   /* signalfd carrying 'sigset', or -1 */
        int64_t priority;         /* hash key in sd_event.signal_data */
        sigset_t sigset;          /* signals currently routed through this fd */
        sd_event_source *current; /* source whose event is in flight; cleared in source_set_pending() */
};
186
/* The event loop object proper. */
struct sd_event {
        unsigned n_ref;

        int epoll_fd;    /* the epoll instance everything is registered with */
        int watchdog_fd; /* fd used for watchdog support, or -1 (see sd_event_new()) */

        Prioq *pending; /* pending sources, in dispatch order (pending_prioq_compare) */
        Prioq *prepare; /* sources with a prepare callback (prepare_prioq_compare) */

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb; /* boot-ID derived offset for aligning wakeups, see initialize_perturb() */

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;           /* pid_t (via PID_TO_PTR) -> SOURCE_CHILD source */
        unsigned n_enabled_child_sources; /* child sources not in SD_EVENT_OFF state */

        Set *post_sources; /* all SOURCE_POST sources */

        Prioq *exit; /* SOURCE_EXIT sources, by priority (exit_prioq_compare) */

        pid_t original_pid; /* PID at creation time, to detect use across fork() */

        uint64_t iteration; /* monotonically increasing loop iteration counter */
        triple_timestamp timestamp;
        int state;

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1; /* $SD_EVENT_PROFILE_DELAYS histogram logging enabled */

        int exit_code;

        pid_t tid;
        sd_event **default_event_ptr; /* if this is a default loop: the TLS slot to clear on free */

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources; /* number of attached sources, including disabled ones */

        LIST_HEAD(sd_event_source, sources); /* every source attached to this loop */

        usec_t last_run, last_log;
        unsigned delays[sizeof(usec_t) * 8]; /* log2 histogram of loop latencies (profiling) */
};
242
243 static void source_disconnect(sd_event_source *s);
244
245 static int pending_prioq_compare(const void *a, const void *b) {
246         const sd_event_source *x = a, *y = b;
247
248         assert(x->pending);
249         assert(y->pending);
250
251         /* Enabled ones first */
252         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
253                 return -1;
254         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
255                 return 1;
256
257         /* Lower priority values first */
258         if (x->priority < y->priority)
259                 return -1;
260         if (x->priority > y->priority)
261                 return 1;
262
263         /* Older entries first */
264         if (x->pending_iteration < y->pending_iteration)
265                 return -1;
266         if (x->pending_iteration > y->pending_iteration)
267                 return 1;
268
269         return 0;
270 }
271
272 static int prepare_prioq_compare(const void *a, const void *b) {
273         const sd_event_source *x = a, *y = b;
274
275         assert(x->prepare);
276         assert(y->prepare);
277
278         /* Enabled ones first */
279         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
280                 return -1;
281         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
282                 return 1;
283
284         /* Move most recently prepared ones last, so that we can stop
285          * preparing as soon as we hit one that has already been
286          * prepared in the current iteration */
287         if (x->prepare_iteration < y->prepare_iteration)
288                 return -1;
289         if (x->prepare_iteration > y->prepare_iteration)
290                 return 1;
291
292         /* Lower priority values first */
293         if (x->priority < y->priority)
294                 return -1;
295         if (x->priority > y->priority)
296                 return 1;
297
298         return 0;
299 }
300
301 static int earliest_time_prioq_compare(const void *a, const void *b) {
302         const sd_event_source *x = a, *y = b;
303
304         assert(EVENT_SOURCE_IS_TIME(x->type));
305         assert(x->type == y->type);
306
307         /* Enabled ones first */
308         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
309                 return -1;
310         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
311                 return 1;
312
313         /* Move the pending ones to the end */
314         if (!x->pending && y->pending)
315                 return -1;
316         if (x->pending && !y->pending)
317                 return 1;
318
319         /* Order by time */
320         if (x->time.next < y->time.next)
321                 return -1;
322         if (x->time.next > y->time.next)
323                 return 1;
324
325         return 0;
326 }
327
/* Latest time by which the timer must have fired: its elapse time plus the
 * permitted accuracy slack (usec_add() presumably saturates on overflow —
 * it is a project helper not visible here). */
static usec_t time_event_source_latest(const sd_event_source *s) {
        return usec_add(s->time.next, s->time.accuracy);
}
331
332 static int latest_time_prioq_compare(const void *a, const void *b) {
333         const sd_event_source *x = a, *y = b;
334
335         assert(EVENT_SOURCE_IS_TIME(x->type));
336         assert(x->type == y->type);
337
338         /* Enabled ones first */
339         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
340                 return -1;
341         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
342                 return 1;
343
344         /* Move the pending ones to the end */
345         if (!x->pending && y->pending)
346                 return -1;
347         if (x->pending && !y->pending)
348                 return 1;
349
350         /* Order by time */
351         if (time_event_source_latest(x) < time_event_source_latest(y))
352                 return -1;
353         if (time_event_source_latest(x) > time_event_source_latest(y))
354                 return 1;
355
356         return 0;
357 }
358
359 static int exit_prioq_compare(const void *a, const void *b) {
360         const sd_event_source *x = a, *y = b;
361
362         assert(x->type == SOURCE_EXIT);
363         assert(y->type == SOURCE_EXIT);
364
365         /* Enabled ones first */
366         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
367                 return -1;
368         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
369                 return 1;
370
371         /* Lower priority values first */
372         if (x->priority < y->priority)
373                 return -1;
374         if (x->priority > y->priority)
375                 return 1;
376
377         return 0;
378 }
379
/* Releases everything a clock_data member owns: the timerfd (safe_close()
 * tolerates -1 for never-opened fds) and both scheduling prioqs. The
 * structure itself is embedded in sd_event and not freed here. */
static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}
388
/* Destroys the loop and everything it owns. Only floating sources can still
 * be attached at this point: non-floating ones hold a reference on the loop,
 * so the refcount could not have reached zero while they exist. */
static void event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        /* Disconnect and release all remaining (floating) sources first */
        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        /* If installed as a default loop, clear the back pointer so the
         * stale object is not handed out again */
        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);
        free(e);
}
425
/* Allocates a new event loop with a fresh epoll instance. On any failure all
 * partially initialized state is torn down via event_free(). Returns 0 on
 * success, a negative errno-style error otherwise. */
_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new0(sd_event, 1);
        if (!e)
                return -ENOMEM;

        e->n_ref = 1;
        /* Mark all fds as "not opened yet" so event_free() can close them safely */
        e->watchdog_fd = e->epoll_fd = e->realtime.fd = e->boottime.fd = e->monotonic.fd = e->realtime_alarm.fd = e->boottime_alarm.fd = -1;
        e->realtime.next = e->boottime.next = e->monotonic.next = e->realtime_alarm.next = e->boottime_alarm.next = USEC_INFINITY;
        e->realtime.wakeup = e->boottime.wakeup = e->monotonic.wakeup = e->realtime_alarm.wakeup = e->boottime_alarm.wakeup = WAKEUP_CLOCK_DATA;
        e->original_pid = getpid_cached(); /* for detecting use across fork() */
        e->perturb = USEC_INFINITY;        /* computed lazily in initialize_perturb() */

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 ... 2^63 us will be logged every 5s.");
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}
465
466 _public_ sd_event* sd_event_ref(sd_event *e) {
467
468         if (!e)
469                 return NULL;
470
471         assert(e->n_ref >= 1);
472         e->n_ref++;
473
474         return e;
475 }
476
477 _public_ sd_event* sd_event_unref(sd_event *e) {
478
479         if (!e)
480                 return NULL;
481
482         assert(e->n_ref >= 1);
483         e->n_ref--;
484
485         if (e->n_ref <= 0)
486                 event_free(e);
487
488         return NULL;
489 }
490
491 static bool event_pid_changed(sd_event *e) {
492         assert(e);
493
494         /* We don't support people creating an event loop and keeping
495          * it around over a fork(). Let's complain. */
496
497         return e->original_pid != getpid_cached();
498 }
499
/* Removes an I/O source's fd from the epoll instance if it is currently
 * registered. Failure is only logged — by the time this runs there is
 * nothing better to do. No-op after fork(), since the epoll fd belongs to
 * the original process. */
static void source_io_unregister(sd_event_source *s) {
        int r;

        assert(s);
        assert(s->type == SOURCE_IO);

        if (event_pid_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
        if (r < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}
519
520 static int source_io_register(
521                 sd_event_source *s,
522                 int enabled,
523                 uint32_t events) {
524
525         struct epoll_event ev = {};
526         int r;
527
528         assert(s);
529         assert(s->type == SOURCE_IO);
530         assert(enabled != SD_EVENT_OFF);
531
532         ev.events = events;
533         ev.data.ptr = s;
534
535         if (enabled == SD_EVENT_ONESHOT)
536                 ev.events |= EPOLLONESHOT;
537
538         if (s->io.registered)
539                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
540         else
541                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
542         if (r < 0)
543                 return -errno;
544
545         s->io.registered = true;
546
547         return 0;
548 }
549
550 static clockid_t event_source_type_to_clock(EventSourceType t) {
551
552         switch (t) {
553
554         case SOURCE_TIME_REALTIME:
555                 return CLOCK_REALTIME;
556
557         case SOURCE_TIME_BOOTTIME:
558                 return CLOCK_BOOTTIME;
559
560         case SOURCE_TIME_MONOTONIC:
561                 return CLOCK_MONOTONIC;
562
563         case SOURCE_TIME_REALTIME_ALARM:
564                 return CLOCK_REALTIME_ALARM;
565
566         case SOURCE_TIME_BOOTTIME_ALARM:
567                 return CLOCK_BOOTTIME_ALARM;
568
569         default:
570                 return (clockid_t) -1;
571         }
572 }
573
574 static EventSourceType clock_to_event_source_type(clockid_t clock) {
575
576         switch (clock) {
577
578         case CLOCK_REALTIME:
579                 return SOURCE_TIME_REALTIME;
580
581         case CLOCK_BOOTTIME:
582                 return SOURCE_TIME_BOOTTIME;
583
584         case CLOCK_MONOTONIC:
585                 return SOURCE_TIME_MONOTONIC;
586
587         case CLOCK_REALTIME_ALARM:
588                 return SOURCE_TIME_REALTIME_ALARM;
589
590         case CLOCK_BOOTTIME_ALARM:
591                 return SOURCE_TIME_BOOTTIME_ALARM;
592
593         default:
594                 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
595         }
596 }
597
598 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
599         assert(e);
600
601         switch (t) {
602
603         case SOURCE_TIME_REALTIME:
604                 return &e->realtime;
605
606         case SOURCE_TIME_BOOTTIME:
607                 return &e->boottime;
608
609         case SOURCE_TIME_MONOTONIC:
610                 return &e->monotonic;
611
612         case SOURCE_TIME_REALTIME_ALARM:
613                 return &e->realtime_alarm;
614
615         case SOURCE_TIME_BOOTTIME_ALARM:
616                 return &e->boottime_alarm;
617
618         default:
619                 return NULL;
620         }
621 }
622
/* Ensures a signal_data object (one signalfd per priority) exists that has
 * 'sig' in its mask. The priority is taken from the registered signal source
 * for 'sig', if any, otherwise 0. On success optionally returns the object
 * in *ret; returns 0 on success, a negative errno-style error otherwise. */
static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct epoll_event ev = {};
        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_pid_changed(e))
                return -ECHILD;

        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = 0;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                /* Object exists and already covers this signal: nothing to do */
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                /* No object for this priority yet, allocate one (fd created below) */
                r = hashmap_ensure_allocated(&e->signal_data, &uint64_hash_ops);
                if (r < 0)
                        return r;

                d = new0(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                d->wakeup = WAKEUP_SIGNAL_DATA;
                d->fd  = -1;
                d->priority = priority;

                r = hashmap_put(e->signal_data, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        /* Extend the mask on a copy first, so d->sigset stays accurate if
         * the signalfd() call fails */
        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        /* With d->fd >= 0 this updates the existing fd's mask in place;
         * with -1 it allocates a new signalfd (see signalfd(2)) */
        r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        /* If we merely updated an existing fd, it is already in epoll */
        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        d->fd = r;

        ev.events = EPOLLIN;
        ev.data.ptr = d;

        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev);
        if (r < 0)  {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        /* Only roll back objects we created in this call; pre-existing ones
         * keep their previous (still valid) state */
        if (added) {
                d->fd = safe_close(d->fd);
                hashmap_remove(e->signal_data, &d->priority);
                free(d);
        }

        return r;
}
716
static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty that
         * way removes it. */

        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {

                /* If all the mask is all-zero we can get rid of the structure */
                hashmap_remove(e->signal_data, &d->priority);
                safe_close(d->fd);
                free(d);
                return;
        }

        assert(d->fd >= 0);

        /* Shrink the mask of the existing signalfd in place; failure is
         * non-fatal, we just keep receiving the signal for now */
        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}
744
static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are
         * interested in. If not, we'll unmask it, and possibly drop
         * the signalfd for it. */

        /* SIGCHLD stays masked as long as any child source is enabled */
        if (sig == SIGCHLD &&
            e->n_enabled_child_sources > 0)
                return;

        /* Likewise if an enabled signal source still exists for it */
        if (e->signal_sources &&
            e->signal_sources[sig] &&
            e->signal_sources[sig]->enabled != SD_EVENT_OFF)
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}
790
/* Detaches a source from its event loop: undoes the type-specific
 * registration (epoll, prioqs, signalfd masks, hashmaps), removes it from
 * the pending/prepare queues and the source list, and drops the loop
 * reference non-floating sources hold. Safe to call on an already
 * disconnected source (s->event == NULL). */
static void source_disconnect(sd_event_source *s) {
        sd_event *event;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM: {
                struct clock_data *d;

                /* Drop the source from both scheduling prioqs of its clock */
                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_remove(d->earliest, s, &s->time.earliest_index);
                prioq_remove(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
                break;
        }

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        /* Possibly unmask the signal / drop its signalfd */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                }

                break;

        case SOURCE_CHILD:
                if (s->child.pid > 0) {
                        if (s->enabled != SD_EVENT_OFF) {
                                assert(s->event->n_enabled_child_sources > 0);
                                s->event->n_enabled_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                        /* SIGCHLD may no longer be needed once this child is gone */
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                }

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        default:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        /* Stash the loop pointer: we null s->event before dropping our
         * reference, which may destroy the loop */
        event = s->event;

        s->type = _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        s->event = NULL;
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        if (!s->floating)
                sd_event_unref(event);
}
881
/* Destroys an event source: detaches it from its loop, then releases its
 * description string and the object itself. */
static void source_free(sd_event_source *s) {
        assert(s);

        source_disconnect(s);
        free(s->description);
        free(s);
}
889
/* Flags a source as pending (it has an event awaiting dispatch) or clears
 * the flag, keeping the pending prioq and the per-clock prioqs consistent.
 * Returns 0 on success, negative on OOM while queueing. */
static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT); /* exit sources are dispatched from their own prioq */

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type)) {
                struct clock_data *d;

                /* A time source's position in both clock prioqs depends on
                 * its pending state (see the comparators above), hence
                 * reshuffle and force a timerfd rearm */
                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                prioq_reshuffle(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
        }

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                /* No longer pending: forget us as the signalfd's in-flight source */
                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        return 0;
}
933
/* Allocates a fresh event source of the given type attached to loop 'e'.
 * A "floating" source does not pin the loop with a reference; instead the
 * loop owns and eventually frees it (see event_free()). Returns NULL on OOM. */
static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
        sd_event_source *s;

        assert(e);

        s = new0(sd_event_source, 1);
        if (!s)
                return NULL;

        s->n_ref = 1;
        s->event = e;
        s->floating = floating;
        s->type = type;
        s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;

        if (!floating)
                sd_event_ref(e);

        LIST_PREPEND(sources, e->sources, s);
        e->n_sources++;

        return s;
}
957
/* Adds an I/O event source watching 'fd' for the given EPOLL* event mask.
 * The source is created enabled (SD_EVENT_ON) and registered with epoll
 * immediately; if 'ret' is NULL it is created floating, i.e. owned by the
 * loop. Returns 0 on success, a negative errno-style error otherwise. */
_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0) {
                source_free(s);
                return r;
        }

        if (ret)
                *ret = s;

        return 0;
}
998
static void initialize_perturb(sd_event *e) {
        sd_id128_t bootid = {};

        /* When we sleep for longer, we try to realign the wakeup to
           the same time within each minute/second/250ms, so that
           events all across the system can be coalesced into a single
           CPU wakeup. However, let's take some system-specific
           randomness for this value, so that in a network of systems
           with synced clocks timer events are distributed a
           bit. Here, we calculate a perturbation usec offset from the
           boot ID. */

        /* Already computed earlier (USEC_INFINITY doubles as "unset") */
        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        /* On failure e->perturb stays USEC_INFINITY and will be retried
         * next time */
        if (sd_id128_get_boot(&bootid) >= 0)
                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
}
1017
1018 static int event_setup_timer_fd(
1019                 sd_event *e,
1020                 struct clock_data *d,
1021                 clockid_t clock) {
1022
1023         struct epoll_event ev = {};
1024         int r, fd;
1025
1026         assert(e);
1027         assert(d);
1028
1029         if (_likely_(d->fd >= 0))
1030                 return 0;
1031
1032         fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1033         if (fd < 0)
1034                 return -errno;
1035
1036         ev.events = EPOLLIN;
1037         ev.data.ptr = d;
1038
1039         r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
1040         if (r < 0) {
1041                 safe_close(fd);
1042                 return -errno;
1043         }
1044
1045         d->fd = fd;
1046         return 0;
1047 }
1048
1049 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1050         assert(s);
1051
1052         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1053 }
1054
_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        sd_event_source *s;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(accuracy != (uint64_t) -1, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
                return -EOPNOTSUPP;

        type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
        if (type < 0)
                return -EOPNOTSUPP;

        /* Without an explicit callback the timer requests loop exit with the
         * code packed in userdata (see time_exit_callback()). */
        if (!callback)
                callback = time_exit_callback;

        d = event_get_clock_data(e, type);
        assert(d);

        /* Make sure both per-clock priority queues (ordered by earliest
         * respectively latest permissible dispatch time) exist before the
         * source is allocated, so early failures leave nothing to undo. */
        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        /* Lazily create the timerfd backing this clock. */
        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        s = source_new(e, !ret, SOURCE_IO == type ? SOURCE_IO : type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy; /* 0 selects the 250ms default */
        s->time.callback = callback;
        s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT; /* timers fire once by default */

        d->needs_rearm = true; /* the timerfd must be reprogrammed before the next wait */

        r = prioq_put(d->earliest, s, &s->time.earliest_index);
        if (r < 0)
                goto fail;

        r = prioq_put(d->latest, s, &s->time.latest_index);
        if (r < 0)
                goto fail;

        if (ret)
                *ret = s;

        return 0;

fail:
        source_free(s);
        return r;
}
1131
1132 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1133         assert(s);
1134
1135         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1136 }
1137
_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        struct signal_data *d;
        sigset_t ss;
        int r;

        assert_return(e, -EINVAL);
        assert_return(SIGNAL_VALID(sig), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Without an explicit callback the handler requests loop exit with
         * the code packed in userdata (see signal_exit_callback()). */
        if (!callback)
                callback = signal_exit_callback;

        /* The caller must have blocked the signal already, otherwise we
         * refuse with -EBUSY below. pthread_sigmask() returns a positive
         * errno-style value, hence the negation. */
        r = pthread_sigmask(SIG_SETMASK, NULL, &ss);
        if (r != 0)
                return -r;

        if (!sigismember(&ss, sig))
                return -EBUSY;

        /* At most one source per signal number; the table is indexed by
         * signal number and allocated lazily. */
        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        /* Attach the signal to the signal data for this source's priority.
         * NOTE(review): the error path relies on source_free() also clearing
         * e->signal_sources[sig] set above — confirm in source_disconnect(). */
        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                source_free(s);
                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;

        return 0;
}
1197
_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(pid > 1, -EINVAL); /* refuse PID 0/1 and negative values */
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* At most one source per child PID. */
        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT; /* a child exits only once */

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0) {
                source_free(s);
                return r;
        }

        e->n_enabled_child_sources++;

        /* Child sources piggy-back on SIGCHLD delivery; enable it for this
         * event loop, rolling back the counter on failure. */
        r = event_make_signal_data(e, SIGCHLD, NULL);
        if (r < 0) {
                e->n_enabled_child_sources--;
                source_free(s);
                return r;
        }

        /* NOTE(review): presumably tells the next loop iteration to check for
         * waitable children right away — confirm where this flag is consumed. */
        e->need_process_child = true;

        if (ret)
                *ret = s;

        return 0;
}
1256
1257 _public_ int sd_event_add_defer(
1258                 sd_event *e,
1259                 sd_event_source **ret,
1260                 sd_event_handler_t callback,
1261                 void *userdata) {
1262
1263         sd_event_source *s;
1264         int r;
1265
1266         assert_return(e, -EINVAL);
1267         assert_return(callback, -EINVAL);
1268         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1269         assert_return(!event_pid_changed(e), -ECHILD);
1270
1271         s = source_new(e, !ret, SOURCE_DEFER);
1272         if (!s)
1273                 return -ENOMEM;
1274
1275         s->defer.callback = callback;
1276         s->userdata = userdata;
1277         s->enabled = SD_EVENT_ONESHOT;
1278
1279         r = source_set_pending(s, true);
1280         if (r < 0) {
1281                 source_free(s);
1282                 return r;
1283         }
1284
1285         if (ret)
1286                 *ret = s;
1287
1288         return 0;
1289 }
1290
1291 _public_ int sd_event_add_post(
1292                 sd_event *e,
1293                 sd_event_source **ret,
1294                 sd_event_handler_t callback,
1295                 void *userdata) {
1296
1297         sd_event_source *s;
1298         int r;
1299
1300         assert_return(e, -EINVAL);
1301         assert_return(callback, -EINVAL);
1302         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1303         assert_return(!event_pid_changed(e), -ECHILD);
1304
1305         r = set_ensure_allocated(&e->post_sources, NULL);
1306         if (r < 0)
1307                 return r;
1308
1309         s = source_new(e, !ret, SOURCE_POST);
1310         if (!s)
1311                 return -ENOMEM;
1312
1313         s->post.callback = callback;
1314         s->userdata = userdata;
1315         s->enabled = SD_EVENT_ON;
1316
1317         r = set_put(e->post_sources, s);
1318         if (r < 0) {
1319                 source_free(s);
1320                 return r;
1321         }
1322
1323         if (ret)
1324                 *ret = s;
1325
1326         return 0;
1327 }
1328
1329 _public_ int sd_event_add_exit(
1330                 sd_event *e,
1331                 sd_event_source **ret,
1332                 sd_event_handler_t callback,
1333                 void *userdata) {
1334
1335         sd_event_source *s;
1336         int r;
1337
1338         assert_return(e, -EINVAL);
1339         assert_return(callback, -EINVAL);
1340         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1341         assert_return(!event_pid_changed(e), -ECHILD);
1342
1343         r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1344         if (r < 0)
1345                 return r;
1346
1347         s = source_new(e, !ret, SOURCE_EXIT);
1348         if (!s)
1349                 return -ENOMEM;
1350
1351         s->exit.callback = callback;
1352         s->userdata = userdata;
1353         s->exit.prioq_index = PRIOQ_IDX_NULL;
1354         s->enabled = SD_EVENT_ONESHOT;
1355
1356         r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1357         if (r < 0) {
1358                 source_free(s);
1359                 return r;
1360         }
1361
1362         if (ret)
1363                 *ret = s;
1364
1365         return 0;
1366 }
1367
_public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {

        /* Takes an additional reference on the event source. A NULL argument
         * is tolerated and simply passed through. */

        if (!s)
                return NULL;

        assert(s->n_ref >= 1);
        s->n_ref++;

        return s;
}
1378
_public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {

        /* Drops one reference; when the last one is gone the source is
         * released — except while it is being dispatched, see below. Always
         * returns NULL, so callers can write "s = sd_event_source_unref(s)". */

        if (!s)
                return NULL;

        assert(s->n_ref >= 1);
        s->n_ref--;

        if (s->n_ref <= 0) {
                /* Here's a special hack: when we are called from a
                 * dispatch handler we won't free the event source
                 * immediately, but we will detach the fd from the
                 * epoll. This way it is safe for the caller to unref
                 * the event source and immediately close the fd, but
                 * we still retain a valid event source object after
                 * the callback. */

                if (s->dispatching) {
                        if (s->type == SOURCE_IO)
                                source_io_unregister(s);

                        source_disconnect(s);
                } else
                        source_free(s);
        }

        return NULL;
}
1407
_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
        /* Replaces the source's human-readable description string; ownership
         * of the old string is handled by free_and_strdup(). */
        assert_return(s, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return free_and_strdup(&s->description, description);
}
1414
_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
        /* Returns the source's description, or -ENXIO if none was set. The
         * pointer handed out remains owned by the event source. */
        assert_return(s, -EINVAL);
        assert_return(description, -EINVAL);
        assert_return(s->description, -ENXIO);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *description = s->description;
        return 0;
}
1424
_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
        /* Returns the event loop the source is attached to; no reference is
         * taken here. */
        assert_return(s, NULL);

        return s->event;
}
1430
_public_ int sd_event_source_get_pending(sd_event_source *s) {
        /* Returns the source's pending flag. Exit sources carry no pending
         * state, hence -EDOM for those. */
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->pending;
}
1439
_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
        /* Returns the file descriptor an IO source watches; only valid for
         * SOURCE_IO, hence -EDOM otherwise. */
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->io.fd;
}
1447
1448 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
1449         int r;
1450
1451         assert_return(s, -EINVAL);
1452         assert_return(fd >= 0, -EBADF);
1453         assert_return(s->type == SOURCE_IO, -EDOM);
1454         assert_return(!event_pid_changed(s->event), -ECHILD);
1455
1456         if (s->io.fd == fd)
1457                 return 0;
1458
1459         if (s->enabled == SD_EVENT_OFF) {
1460                 s->io.fd = fd;
1461                 s->io.registered = false;
1462         } else {
1463                 int saved_fd;
1464
1465                 saved_fd = s->io.fd;
1466                 assert(s->io.registered);
1467
1468                 s->io.fd = fd;
1469                 s->io.registered = false;
1470
1471                 r = source_io_register(s, s->enabled, s->io.events);
1472                 if (r < 0) {
1473                         s->io.fd = saved_fd;
1474                         s->io.registered = true;
1475                         return r;
1476                 }
1477
1478                 epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
1479         }
1480
1481         return 0;
1482 }
1483
_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
        /* Copies out the epoll event mask configured for this IO source. */
        assert_return(s, -EINVAL);
        assert_return(events, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *events = s->io.events;
        return 0;
}
1493
1494 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
1495         int r;
1496
1497         assert_return(s, -EINVAL);
1498         assert_return(s->type == SOURCE_IO, -EDOM);
1499         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1500         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1501         assert_return(!event_pid_changed(s->event), -ECHILD);
1502
1503         /* edge-triggered updates are never skipped, so we can reset edges */
1504         if (s->io.events == events && !(events & EPOLLET))
1505                 return 0;
1506
1507         if (s->enabled != SD_EVENT_OFF) {
1508                 r = source_io_register(s, s->enabled, events);
1509                 if (r < 0)
1510                         return r;
1511         }
1512
1513         s->io.events = events;
1514         source_set_pending(s, false);
1515
1516         return 0;
1517 }
1518
_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
        /* Copies out the events that actually triggered; only meaningful
         * while the source is pending, hence -ENODATA otherwise. */
        assert_return(s, -EINVAL);
        assert_return(revents, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(s->pending, -ENODATA);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *revents = s->io.revents;
        return 0;
}
1529
_public_ int sd_event_source_get_signal(sd_event_source *s) {
        /* Returns the signal number a SOURCE_SIGNAL source watches. */
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->signal.sig;
}
1537
1538 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
1539         assert_return(s, -EINVAL);
1540         assert_return(!event_pid_changed(s->event), -ECHILD);
1541
1542         *priority = s->priority;
1543         return 0;
1544 }
1545
_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->priority == priority)
                return 0;

        if (s->type == SOURCE_SIGNAL && s->enabled != SD_EVENT_OFF) {
                struct signal_data *old, *d;

                /* Move us from the signalfd belonging to the old
                 * priority to the signalfd of the new priority */

                assert_se(old = hashmap_get(s->event->signal_data, &s->priority));

                s->priority = priority;

                r = event_make_signal_data(s->event, s->signal.sig, &d);
                if (r < 0) {
                        /* Roll the priority back so the source stays attached
                         * to its previous signal data. */
                        s->priority = old->priority;
                        return r;
                }

                event_unmask_signal_data(s->event, old, s->signal.sig);
        } else
                s->priority = priority;

        /* The priority takes part in the ordering of these queues, hence
         * reshuffle wherever the source is currently enqueued. */
        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);

        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        return 0;
}
1587
_public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
        /* Copies out the current enablement state (SD_EVENT_OFF, SD_EVENT_ON
         * or SD_EVENT_ONESHOT). */
        assert_return(s, -EINVAL);
        assert_return(m, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *m = s->enabled;
        return 0;
}
1596
_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* If we are dead anyway, we are fine with turning off
         * sources, but everything else needs to fail. */
        if (s->event->state == SD_EVENT_FINISHED)
                return m == SD_EVENT_OFF ? 0 : -ESTALE;

        if (s->enabled == m)
                return 0;

        if (m == SD_EVENT_OFF) {

                /* Disabling: detach the source from its backend per type. */
                switch (s->type) {

                case SOURCE_IO:
                        /* Drop the fd from the epoll. */
                        source_io_unregister(s);
                        s->enabled = m;
                        break;

                case SOURCE_TIME_REALTIME:
                case SOURCE_TIME_BOOTTIME:
                case SOURCE_TIME_MONOTONIC:
                case SOURCE_TIME_REALTIME_ALARM:
                case SOURCE_TIME_BOOTTIME_ALARM: {
                        struct clock_data *d;

                        /* Re-sort both time queues (enablement takes part in
                         * their ordering) and force a timerfd reprogram. */
                        s->enabled = m;
                        d = event_get_clock_data(s->event, s->type);
                        assert(d);

                        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(d->latest, s, &s->time.latest_index);
                        d->needs_rearm = true;
                        break;
                }

                case SOURCE_SIGNAL:
                        s->enabled = m;

                        /* Release the signal data if no other source needs it. */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                        break;

                case SOURCE_CHILD:
                        s->enabled = m;

                        assert(s->event->n_enabled_child_sources > 0);
                        s->event->n_enabled_child_sources--;

                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                        break;

                case SOURCE_EXIT:
                        s->enabled = m;
                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                        break;

                case SOURCE_DEFER:
                case SOURCE_POST:
                        /* Nothing to detach, just record the state. */
                        s->enabled = m;
                        break;

                default:
                        assert_not_reached("Wut? I shouldn't exist.");
                }

        } else {
                /* Enabling (ON or ONESHOT): re-attach the source, rolling
                 * back to OFF if the backend registration fails. */
                switch (s->type) {

                case SOURCE_IO:
                        r = source_io_register(s, m, s->io.events);
                        if (r < 0)
                                return r;

                        s->enabled = m;
                        break;

                case SOURCE_TIME_REALTIME:
                case SOURCE_TIME_BOOTTIME:
                case SOURCE_TIME_MONOTONIC:
                case SOURCE_TIME_REALTIME_ALARM:
                case SOURCE_TIME_BOOTTIME_ALARM: {
                        struct clock_data *d;

                        s->enabled = m;
                        d = event_get_clock_data(s->event, s->type);
                        assert(d);

                        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(d->latest, s, &s->time.latest_index);
                        d->needs_rearm = true;
                        break;
                }

                case SOURCE_SIGNAL:

                        s->enabled = m;

                        r = event_make_signal_data(s->event, s->signal.sig, NULL);
                        if (r < 0) {
                                s->enabled = SD_EVENT_OFF;
                                event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                                return r;
                        }

                        break;

                case SOURCE_CHILD:

                        /* Only bump the counter when transitioning from OFF
                         * (ON <-> ONESHOT transitions keep it unchanged). */
                        if (s->enabled == SD_EVENT_OFF)
                                s->event->n_enabled_child_sources++;

                        s->enabled = m;

                        r = event_make_signal_data(s->event, SIGCHLD, NULL);
                        if (r < 0) {
                                s->enabled = SD_EVENT_OFF;
                                s->event->n_enabled_child_sources--;
                                event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                                return r;
                        }

                        break;

                case SOURCE_EXIT:
                        s->enabled = m;
                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                        break;

                case SOURCE_DEFER:
                case SOURCE_POST:
                        s->enabled = m;
                        break;

                default:
                        assert_not_reached("Wut? I shouldn't exist.");
                }
        }

        /* Enablement takes part in the ordering of these queues too. */
        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);

        return 0;
}
1748
_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
        /* Copies out the configured expiry time of a timer source. */
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.next;
        return 0;
}
1758
1759 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
1760         struct clock_data *d;
1761
1762         assert_return(s, -EINVAL);
1763         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
1764         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1765         assert_return(!event_pid_changed(s->event), -ECHILD);
1766
1767         s->time.next = usec;
1768
1769         source_set_pending(s, false);
1770
1771         d = event_get_clock_data(s->event, s->type);
1772         assert(d);
1773
1774         prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
1775         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1776         d->needs_rearm = true;
1777
1778         return 0;
1779 }
1780
_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
        /* Copies out the configured accuracy (permissible dispatch slack) of
         * a timer source. */
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.accuracy;
        return 0;
}
1790
1791 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
1792         struct clock_data *d;
1793
1794         assert_return(s, -EINVAL);
1795         assert_return(usec != (uint64_t) -1, -EINVAL);
1796         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
1797         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1798         assert_return(!event_pid_changed(s->event), -ECHILD);
1799
1800         if (usec == 0)
1801                 usec = DEFAULT_ACCURACY_USEC;
1802
1803         s->time.accuracy = usec;
1804
1805         source_set_pending(s, false);
1806
1807         d = event_get_clock_data(s->event, s->type);
1808         assert(d);
1809
1810         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1811         d->needs_rearm = true;
1812
1813         return 0;
1814 }
1815
_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
        /* Maps the source's type back to the clock id it was created with. */
        assert_return(s, -EINVAL);
        assert_return(clock, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *clock = event_source_type_to_clock(s->type);
        return 0;
}
1825
_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
        /* Copies out the PID a SOURCE_CHILD source watches. */
        assert_return(s, -EINVAL);
        assert_return(pid, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *pid = s->child.pid;
        return 0;
}
1835
1836 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
1837         int r;
1838
1839         assert_return(s, -EINVAL);
1840         assert_return(s->type != SOURCE_EXIT, -EDOM);
1841         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1842         assert_return(!event_pid_changed(s->event), -ECHILD);
1843
1844         if (s->prepare == callback)
1845                 return 0;
1846
1847         if (callback && s->prepare) {
1848                 s->prepare = callback;
1849                 return 0;
1850         }
1851
1852         r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
1853         if (r < 0)
1854                 return r;
1855
1856         s->prepare = callback;
1857
1858         if (callback) {
1859                 r = prioq_put(s->event->prepare, s, &s->prepare_index);
1860                 if (r < 0)
1861                         return r;
1862         } else
1863                 prioq_remove(s->event->prepare, s, &s->prepare_index);
1864
1865         return 0;
1866 }
1867
_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
        /* Returns the opaque userdata pointer attached to the source. */
        assert_return(s, NULL);

        return s->userdata;
}
1873
1874 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
1875         void *ret;
1876
1877         assert_return(s, NULL);
1878
1879         ret = s->userdata;
1880         s->userdata = userdata;
1881
1882         return ret;
1883 }
1884
static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
        usec_t c;
        assert(e);
        assert(a <= b);

        /* Degenerate ranges first: wake immediately, never, or at the only
         * possible time. */
        if (a <= 0)
                return 0;
        if (a >= USEC_INFINITY)
                return USEC_INFINITY;

        if (b <= a + 1)
                return a;

        initialize_perturb(e);

        /*
          Find a good time to wake up again between times a and b. We
          have two goals here:

          a) We want to wake up as seldom as possible, hence prefer
             later times over earlier times.

          b) But if we have to wake up, then let's make sure to
             dispatch as much as possible on the entire system.

          We implement this by waking up everywhere at the same time
          within any given minute if we can, synchronised via the
          perturbation value determined from the boot ID. If we can't,
          then we try to find the same spot in every 10s, then 1s and
          then 250ms step. Otherwise, we pick the last possible time
          to wake up.
        */

        /* Per-minute grid. NOTE(review): this clause adds e->perturb without
         * reducing it modulo the grain (the later clauses do). That is fine
         * when initialize_perturb() succeeded (perturb < USEC_PER_MINUTE),
         * but if sd_id128_get_boot() failed, perturb stays USEC_INFINITY and
         * the addition wraps — confirm the fallback behavior is acceptable. */
        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MINUTE))
                        return b;

                c -= USEC_PER_MINUTE;
        }

        if (c >= a)
                return c;

        /* Same idea on the 10s grid. */
        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC*10))
                        return b;

                c -= USEC_PER_SEC*10;
        }

        if (c >= a)
                return c;

        /* 1s grid. */
        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC))
                        return b;

                c -= USEC_PER_SEC;
        }

        if (c >= a)
                return c;

        /* 250ms grid. */
        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MSEC*250))
                        return b;

                c -= USEC_PER_MSEC*250;
        }

        if (c >= a)
                return c;

        /* No aligned spot fits into [a, b]; wake up as late as possible. */
        return b;
}
1964
1965 static int event_arm_timer(
1966                 sd_event *e,
1967                 struct clock_data *d) {
1968
1969         struct itimerspec its = {};
1970         sd_event_source *a, *b;
1971         usec_t t;
1972         int r;
1973
1974         assert(e);
1975         assert(d);
1976
1977         if (!d->needs_rearm)
1978                 return 0;
1979         else
1980                 d->needs_rearm = false;
1981
1982         a = prioq_peek(d->earliest);
1983         if (!a || a->enabled == SD_EVENT_OFF || a->time.next == USEC_INFINITY) {
1984
1985                 if (d->fd < 0)
1986                         return 0;
1987
1988                 if (d->next == USEC_INFINITY)
1989                         return 0;
1990
1991                 /* disarm */
1992                 r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
1993                 if (r < 0)
1994                         return r;
1995
1996                 d->next = USEC_INFINITY;
1997                 return 0;
1998         }
1999
2000         b = prioq_peek(d->latest);
2001         assert_se(b && b->enabled != SD_EVENT_OFF);
2002
2003         t = sleep_between(e, a->time.next, time_event_source_latest(b));
2004         if (d->next == t)
2005                 return 0;
2006
2007         assert_se(d->fd >= 0);
2008
2009         if (t == 0) {
2010                 /* We don' want to disarm here, just mean some time looooong ago. */
2011                 its.it_value.tv_sec = 0;
2012                 its.it_value.tv_nsec = 1;
2013         } else
2014                 timespec_store(&its.it_value, t);
2015
2016         r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2017         if (r < 0)
2018                 return -errno;
2019
2020         d->next = t;
2021         return 0;
2022 }
2023
2024 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
2025         assert(e);
2026         assert(s);
2027         assert(s->type == SOURCE_IO);
2028
2029         /* If the event source was already pending, we just OR in the
2030          * new revents, otherwise we reset the value. The ORing is
2031          * necessary to handle EPOLLONESHOT events properly where
2032          * readability might happen independently of writability, and
2033          * we need to keep track of both */
2034
2035         if (s->pending)
2036                 s->io.revents |= revents;
2037         else
2038                 s->io.revents = revents;
2039
2040         return source_set_pending(s, true);
2041 }
2042
2043 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
2044         uint64_t x;
2045         ssize_t ss;
2046
2047         assert(e);
2048         assert(fd >= 0);
2049
2050         assert_return(events == EPOLLIN, -EIO);
2051
2052         ss = read(fd, &x, sizeof(x));
2053         if (ss < 0) {
2054                 if (IN_SET(errno, EAGAIN, EINTR))
2055                         return 0;
2056
2057                 return -errno;
2058         }
2059
2060         if (_unlikely_(ss != sizeof(x)))
2061                 return -EIO;
2062
2063         if (next)
2064                 *next = USEC_INFINITY;
2065
2066         return 0;
2067 }
2068
2069 static int process_timer(
2070                 sd_event *e,
2071                 usec_t n,
2072                 struct clock_data *d) {
2073
2074         sd_event_source *s;
2075         int r;
2076
2077         assert(e);
2078         assert(d);
2079
2080         for (;;) {
2081                 s = prioq_peek(d->earliest);
2082                 if (!s ||
2083                     s->time.next > n ||
2084                     s->enabled == SD_EVENT_OFF ||
2085                     s->pending)
2086                         break;
2087
2088                 r = source_set_pending(s, true);
2089                 if (r < 0)
2090                         return r;
2091
2092                 prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
2093                 prioq_reshuffle(d->latest, s, &s->time.latest_index);
2094                 d->needs_rearm = true;
2095         }
2096
2097         return 0;
2098 }
2099
/* Poll each watched child process for a state change via waitid() and mark
 * the matching child event sources as pending. Returns 0 on success, a
 * negative errno-style error if waitid() fails. */
static int process_child(sd_event *e) {
        sd_event_source *s;
        Iterator i;
        int r;

        assert(e);

        e->need_process_child = false;

        /*
           So, this is ugly. We iteratively invoke waitid() with P_PID
           + WNOHANG for each PID we wait for, instead of using
           P_ALL. This is because we only want to get child
           information of very specific child processes, and not all
           of them. We might not have processed the SIGCHLD event of a
           previous invocation and we don't want to maintain a
           unbounded *per-child* event queue, hence we really don't
           want anything flushed out of the kernel's queue that we
           don't care about. Since this is O(n) this means that if you
           have a lot of processes you probably want to handle SIGCHLD
           yourself.

           We do not reap the children here (by using WNOWAIT), this
           is only done after the event source is dispatched so that
           the callback still sees the process as a zombie.
        */

        HASHMAP_FOREACH(s, e->child_sources, i) {
                assert(s->type == SOURCE_CHILD);

                /* Skip sources that are already queued for dispatch... */
                if (s->pending)
                        continue;

                /* ...and sources that are turned off. */
                if (s->enabled == SD_EVENT_OFF)
                        continue;

                zero(s->child.siginfo);
                /* WNOWAIT is only added when the caller asked for WEXITED, so
                 * that actual exits are left queued (and reaped later, in
                 * source_dispatch()), while other state changes are consumed
                 * right away. */
                r = waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
                if (r < 0)
                        return -errno;

                /* si_pid != 0 indicates that a state change was actually
                 * reported for this child. */
                if (s->child.siginfo.si_pid != 0) {
                        bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's
                                 * immediately remove the state change
                                 * from the queue, since there's no
                                 * benefit in leaving it queued */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
                        if (r < 0)
                                return r;
                }
        }

        return 0;
}
2163
/* Dequeue at most one signal from the signalfd of the given priority bucket
 * and mark the matching signal event source as pending. Returns > 0 if a
 * signal was read, 0 if nothing was queued, a negative errno-style error on
 * failure. */
static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
        bool read_one = false;
        int r;

        assert(e);
        assert_return(events == EPOLLIN, -EIO);

        /* If there's a signal queued on this priority and SIGCHLD is
           on this priority too, then make sure to recheck the
           children we watch. This is because we only ever dequeue
           the first signal per priority, and if we dequeue one, and
           SIGCHLD might be enqueued later we wouldn't know, but we
           might have higher priority children we care about hence we
           need to check that explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

        /* If there's already an event source pending for this
         * priority we don't read another */
        if (d->current)
                return 0;

        for (;;) {
                struct signalfd_siginfo si;
                ssize_t n;
                sd_event_source *s = NULL;

                n = read(d->fd, &si, sizeof(si));
                if (n < 0) {
                        /* Nothing (more) queued: report whether we dequeued
                         * one earlier in this loop. */
                        if (IN_SET(errno, EAGAIN, EINTR))
                                return read_one;

                        return -errno;
                }

                /* signalfd always delivers complete records */
                if (_unlikely_(n != sizeof(si)))
                        return -EIO;

                assert(SIGNAL_VALID(si.ssi_signo));

                read_one = true;

                /* Signals without a matching enabled source are dropped. */
                if (e->signal_sources)
                        s = e->signal_sources[si.ssi_signo];
                if (!s)
                        continue;
                if (s->pending)
                        continue;

                /* Stash the siginfo for the callback, and remember which
                 * source currently occupies this priority's signal slot. */
                s->signal.siginfo = si;
                d->current = s;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                return 1;
        }
}
2224
/* Invoke the user callback of a pending (or exit) event source, taking care
 * of ONESHOT disabling, marking post sources pending, and the source being
 * unreffed or failing from inside its own callback. Returns 1 on success,
 * a negative errno-style error otherwise. */
static int source_dispatch(sd_event_source *s) {
        EventSourceType saved_type;
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        /* Save the event source type, here, so that we still know it after the event callback which might invalidate
         * the event. */
        saved_type = s->type;

        if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        if (s->type != SOURCE_POST) {
                sd_event_source *z;
                Iterator i;

                /* If we execute a non-post source, let's mark all
                 * post sources as pending */

                SET_FOREACH(z, s->event->post_sources, i) {
                        if (z->enabled == SD_EVENT_OFF)
                                continue;

                        r = source_set_pending(z, true);
                        if (r < 0)
                                return r;
                }
        }

        /* ONESHOT sources are disabled before the callback runs; the callback
         * may re-enable the source via sd_event_source_set_enabled(). */
        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        /* Flag that we are inside the user callback — presumably so other
         * entry points can tell re-entrant calls apart; confirm against the
         * users of s->dispatching elsewhere in this file. */
        s->dispatching = true;

        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                /* Determine before the callback whether the child actually
                 * died, since the callback may alter the stored siginfo. */
                zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie)
                        waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_POST:
                r = s->post.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        s->dispatching = false;

        if (r < 0)
                log_debug_errno(r, "Event source %s (type %s) returned error, disabling: %m",
                                strna(s->description), event_source_type_to_string(saved_type));

        /* The callback may have dropped the last reference; only now is it
         * safe to actually free the source. A failing callback disables the
         * source instead of tearing down the whole loop. */
        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                sd_event_source_set_enabled(s, SD_EVENT_OFF);

        return 1;
}
2330
2331 static int event_prepare(sd_event *e) {
2332         int r;
2333
2334         assert(e);
2335
2336         for (;;) {
2337                 sd_event_source *s;
2338
2339                 s = prioq_peek(e->prepare);
2340                 if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
2341                         break;
2342
2343                 s->prepare_iteration = e->iteration;
2344                 r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
2345                 if (r < 0)
2346                         return r;
2347
2348                 assert(s->prepare);
2349
2350                 s->dispatching = true;
2351                 r = s->prepare(s, s->userdata);
2352                 s->dispatching = false;
2353
2354                 if (r < 0)
2355                         log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, disabling: %m",
2356                                         strna(s->description), event_source_type_to_string(s->type));
2357
2358                 if (s->n_ref == 0)
2359                         source_free(s);
2360                 else if (r < 0)
2361                         sd_event_source_set_enabled(s, SD_EVENT_OFF);
2362         }
2363
2364         return 0;
2365 }
2366
2367 static int dispatch_exit(sd_event *e) {
2368         sd_event_source *p;
2369         int r;
2370
2371         assert(e);
2372
2373         p = prioq_peek(e->exit);
2374         if (!p || p->enabled == SD_EVENT_OFF) {
2375                 e->state = SD_EVENT_FINISHED;
2376                 return 0;
2377         }
2378
2379         sd_event_ref(e);
2380         e->iteration++;
2381         e->state = SD_EVENT_EXITING;
2382
2383         r = source_dispatch(p);
2384
2385         e->state = SD_EVENT_INITIAL;
2386         sd_event_unref(e);
2387
2388         return r;
2389 }
2390
2391 static sd_event_source* event_next_pending(sd_event *e) {
2392         sd_event_source *p;
2393
2394         assert(e);
2395
2396         p = prioq_peek(e->pending);
2397         if (!p)
2398                 return NULL;
2399
2400         if (p->enabled == SD_EVENT_OFF)
2401                 return NULL;
2402
2403         return p;
2404 }
2405
2406 static int arm_watchdog(sd_event *e) {
2407         struct itimerspec its = {};
2408         usec_t t;
2409         int r;
2410
2411         assert(e);
2412         assert(e->watchdog_fd >= 0);
2413
2414         t = sleep_between(e,
2415                           e->watchdog_last + (e->watchdog_period / 2),
2416                           e->watchdog_last + (e->watchdog_period * 3 / 4));
2417
2418         timespec_store(&its.it_value, t);
2419
2420         /* Make sure we never set the watchdog to 0, which tells the
2421          * kernel to disable it. */
2422         if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
2423                 its.it_value.tv_nsec = 1;
2424
2425         r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
2426         if (r < 0)
2427                 return -errno;
2428
2429         return 0;
2430 }
2431
2432 static int process_watchdog(sd_event *e) {
2433         assert(e);
2434
2435         if (!e->watchdog)
2436                 return 0;
2437
2438         /* Don't notify watchdog too often */
2439         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
2440                 return 0;
2441
2442         sd_notify(false, "WATCHDOG=1");
2443         e->watchdog_last = e->timestamp.monotonic;
2444
2445         return arm_watchdog(e);
2446 }
2447
2448 _public_ int sd_event_prepare(sd_event *e) {
2449         int r;
2450
2451         assert_return(e, -EINVAL);
2452         assert_return(!event_pid_changed(e), -ECHILD);
2453         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2454         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2455
2456         if (e->exit_requested)
2457                 goto pending;
2458
2459         e->iteration++;
2460
2461         e->state = SD_EVENT_PREPARING;
2462         r = event_prepare(e);
2463         e->state = SD_EVENT_INITIAL;
2464         if (r < 0)
2465                 return r;
2466
2467         r = event_arm_timer(e, &e->realtime);
2468         if (r < 0)
2469                 return r;
2470
2471         r = event_arm_timer(e, &e->boottime);
2472         if (r < 0)
2473                 return r;
2474
2475         r = event_arm_timer(e, &e->monotonic);
2476         if (r < 0)
2477                 return r;
2478
2479         r = event_arm_timer(e, &e->realtime_alarm);
2480         if (r < 0)
2481                 return r;
2482
2483         r = event_arm_timer(e, &e->boottime_alarm);
2484         if (r < 0)
2485                 return r;
2486
2487         if (event_next_pending(e) || e->need_process_child)
2488                 goto pending;
2489
2490         e->state = SD_EVENT_ARMED;
2491
2492         return 0;
2493
2494 pending:
2495         e->state = SD_EVENT_ARMED;
2496         r = sd_event_wait(e, 0);
2497         if (r == 0)
2498                 e->state = SD_EVENT_ARMED;
2499
2500         return r;
2501 }
2502
/* Wait up to the given timeout (in usec, (uint64_t) -1 meaning "forever")
 * for events on the loop's epoll fd, then turn each wakeup into pending
 * event sources. Returns 1 if events are ready for dispatching (state
 * PENDING), 0 on timeout (state back to INITIAL), negative errno on
 * failure. */
_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
        struct epoll_event *ev_queue;
        unsigned ev_queue_max;
        int r, m, i;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        /* One slot per source suffices; at least one so the allocation size
         * stays positive for an empty loop. */
        ev_queue_max = MAX(e->n_sources, 1u);
        ev_queue = newa(struct epoll_event, ev_queue_max);

        /* Convert the usec timeout to ms, rounding up so we never wake early. */
        m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max,
                       timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
        if (m < 0) {
                if (errno == EINTR) {
                        e->state = SD_EVENT_PENDING;
                        return 1;
                }

                r = -errno;
                goto finish;
        }

        /* Take the timestamps used below for elapsing the timer sources. */
        triple_timestamp_get(&e->timestamp);

        for (i = 0; i < m; i++) {

                if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
                else {
                        /* All other registrations store a pointer that can be
                         * read as a leading WakeupType tag, identifying what
                         * kind of object woke us up. */
                        WakeupType *t = ev_queue[i].data.ptr;

                        switch (*t) {

                        case WAKEUP_EVENT_SOURCE:
                                r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = ev_queue[i].data.ptr;
                                r = flush_timer(e, d->fd, ev_queue[i].events, &d->next);
                                break;
                        }

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        default:
                                assert_not_reached("Invalid wake-up pointer");
                        }
                }
                if (r < 0)
                        goto finish;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        /* Elapse timer sources on every clock against the timestamps taken
         * right after epoll_wait() returned. */
        r = process_timer(e, e->timestamp.realtime, &e->realtime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
        if (r < 0)
                goto finish;

        if (e->need_process_child) {
                r = process_child(e);
                if (r < 0)
                        goto finish;
        }

        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;

                return 1;
        }

        r = 0;

finish:
        e->state = SD_EVENT_INITIAL;

        return r;
}
2609
2610 _public_ int sd_event_dispatch(sd_event *e) {
2611         sd_event_source *p;
2612         int r;
2613
2614         assert_return(e, -EINVAL);
2615         assert_return(!event_pid_changed(e), -ECHILD);
2616         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2617         assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
2618
2619         if (e->exit_requested)
2620                 return dispatch_exit(e);
2621
2622         p = event_next_pending(e);
2623         if (p) {
2624                 sd_event_ref(e);
2625
2626                 e->state = SD_EVENT_RUNNING;
2627                 r = source_dispatch(p);
2628                 e->state = SD_EVENT_INITIAL;
2629
2630                 sd_event_unref(e);
2631
2632                 return r;
2633         }
2634
2635         e->state = SD_EVENT_INITIAL;
2636
2637         return 1;
2638 }
2639
2640 static void event_log_delays(sd_event *e) {
2641         char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1];
2642         unsigned i;
2643         int o;
2644
2645         for (i = o = 0; i < ELEMENTSOF(e->delays); i++) {
2646                 o += snprintf(&b[o], sizeof(b) - o, "%u ", e->delays[i]);
2647                 e->delays[i] = 0;
2648         }
2649         log_debug("Event loop iterations: %.*s", o, b);
2650 }
2651
2652 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
2653         int r;
2654
2655         assert_return(e, -EINVAL);
2656         assert_return(!event_pid_changed(e), -ECHILD);
2657         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2658         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2659
2660         if (e->profile_delays && e->last_run) {
2661                 usec_t this_run;
2662                 unsigned l;
2663
2664                 this_run = now(CLOCK_MONOTONIC);
2665
2666                 l = u64log2(this_run - e->last_run);
2667                 assert(l < sizeof(e->delays));
2668                 e->delays[l]++;
2669
2670                 if (this_run - e->last_log >= 5*USEC_PER_SEC) {
2671                         event_log_delays(e);
2672                         e->last_log = this_run;
2673                 }
2674         }
2675
2676         r = sd_event_prepare(e);
2677         if (r == 0)
2678                 /* There was nothing? Then wait... */
2679                 r = sd_event_wait(e, timeout);
2680
2681         if (e->profile_delays)
2682                 e->last_run = now(CLOCK_MONOTONIC);
2683
2684         if (r > 0) {
2685                 /* There's something now, then let's dispatch it */
2686                 r = sd_event_dispatch(e);
2687                 if (r < 0)
2688                         return r;
2689
2690                 return 1;
2691         }
2692
2693         return r;
2694 }
2695
2696 _public_ int sd_event_loop(sd_event *e) {
2697         int r;
2698
2699         assert_return(e, -EINVAL);
2700         assert_return(!event_pid_changed(e), -ECHILD);
2701         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2702
2703         sd_event_ref(e);
2704
2705         while (e->state != SD_EVENT_FINISHED) {
2706                 r = sd_event_run(e, (uint64_t) -1);
2707                 if (r < 0)
2708                         goto finish;
2709         }
2710
2711         r = e->exit_code;
2712
2713 finish:
2714         sd_event_unref(e);
2715         return r;
2716 }
2717
2718 _public_ int sd_event_get_fd(sd_event *e) {
2719
2720         assert_return(e, -EINVAL);
2721         assert_return(!event_pid_changed(e), -ECHILD);
2722
2723         return e->epoll_fd;
2724 }
2725
2726 _public_ int sd_event_get_state(sd_event *e) {
2727         assert_return(e, -EINVAL);
2728         assert_return(!event_pid_changed(e), -ECHILD);
2729
2730         return e->state;
2731 }
2732
2733 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
2734         assert_return(e, -EINVAL);
2735         assert_return(code, -EINVAL);
2736         assert_return(!event_pid_changed(e), -ECHILD);
2737
2738         if (!e->exit_requested)
2739                 return -ENODATA;
2740
2741         *code = e->exit_code;
2742         return 0;
2743 }
2744
2745 _public_ int sd_event_exit(sd_event *e, int code) {
2746         assert_return(e, -EINVAL);
2747         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2748         assert_return(!event_pid_changed(e), -ECHILD);
2749
2750         e->exit_requested = true;
2751         e->exit_code = code;
2752
2753         return 0;
2754 }
2755
2756 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
2757         assert_return(e, -EINVAL);
2758         assert_return(usec, -EINVAL);
2759         assert_return(!event_pid_changed(e), -ECHILD);
2760
2761         if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
2762                 return -EOPNOTSUPP;
2763
2764         /* Generate a clean error in case CLOCK_BOOTTIME is not available. Note that don't use clock_supported() here,
2765          * for a reason: there are systems where CLOCK_BOOTTIME is supported, but CLOCK_BOOTTIME_ALARM is not, but for
2766          * the purpose of getting the time this doesn't matter. */
2767         if (IN_SET(clock, CLOCK_BOOTTIME, CLOCK_BOOTTIME_ALARM) && !clock_boottime_supported())
2768                 return -EOPNOTSUPP;
2769
2770         if (!triple_timestamp_is_set(&e->timestamp)) {
2771                 /* Implicitly fall back to now() if we never ran
2772                  * before and thus have no cached time. */
2773                 *usec = now(clock);
2774                 return 1;
2775         }
2776
2777         *usec = triple_timestamp_by_clock(&e->timestamp, clock);
2778         return 0;
2779 }
2780
2781 _public_ int sd_event_default(sd_event **ret) {
2782
2783         static thread_local sd_event *default_event = NULL;
2784         sd_event *e = NULL;
2785         int r;
2786
2787         if (!ret)
2788                 return !!default_event;
2789
2790         if (default_event) {
2791                 *ret = sd_event_ref(default_event);
2792                 return 0;
2793         }
2794
2795         r = sd_event_new(&e);
2796         if (r < 0)
2797                 return r;
2798
2799         e->default_event_ptr = &default_event;
2800         e->tid = gettid();
2801         default_event = e;
2802
2803         *ret = e;
2804         return 1;
2805 }
2806
2807 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
2808         assert_return(e, -EINVAL);
2809         assert_return(tid, -EINVAL);
2810         assert_return(!event_pid_changed(e), -ECHILD);
2811
2812         if (e->tid != 0) {
2813                 *tid = e->tid;
2814                 return 0;
2815         }
2816
2817         return -ENXIO;
2818 }
2819
/* Enable or disable automatic WATCHDOG=1 keep-alive pings to the service
 * manager, driven by a timerfd registered on the loop's epoll fd. Returns
 * the new watchdog state (0/1), or a negative errno-style error. */
_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Nothing to do if the requested state is already in effect. */
        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                struct epoll_event ev = {};

                /* Fetches the configured period as a side effect; r <= 0 is
                 * passed through (0 means the manager requested no watchdog,
                 * per the sd_watchdog_enabled() API). */
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                ev.events = EPOLLIN;
                ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG);

                r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
                if (r < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                if (e->watchdog_fd >= 0) {
                        /* Best-effort deregistration; the fd gets closed
                         * right after anyway. */
                        epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}
2871
2872 _public_ int sd_event_get_watchdog(sd_event *e) {
2873         assert_return(e, -EINVAL);
2874         assert_return(!event_pid_changed(e), -ECHILD);
2875
2876         return e->watchdog;
2877 }
2878
2879 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
2880         assert_return(e, -EINVAL);
2881         assert_return(!event_pid_changed(e), -ECHILD);
2882
2883         *ret = e->iteration;
2884         return 0;
2885 }