chiark / gitweb /
Prep 229.9: Make all supportable API functions visible.
[elogind.git] / src / libelogind / sd-event / sd-event.c
1 /***
2   This file is part of systemd.
3
4   Copyright 2013 Lennart Poettering
5
6   systemd is free software; you can redistribute it and/or modify it
7   under the terms of the GNU Lesser General Public License as published by
8   the Free Software Foundation; either version 2.1 of the License, or
9   (at your option) any later version.
10
11   systemd is distributed in the hope that it will be useful, but
12   WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public License
17   along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <sys/epoll.h>
21 #include <sys/timerfd.h>
22 #include <sys/wait.h>
23
24 #include "sd-daemon.h"
25 #include "sd-event.h"
26 #include "sd-id128.h"
27
28 #include "alloc-util.h"
29 #include "fd-util.h"
30 #include "hashmap.h"
31 #include "list.h"
32 #include "macro.h"
33 #include "missing.h"
34 #include "prioq.h"
35 #include "process-util.h"
36 #include "set.h"
37 #include "signal-util.h"
38 #include "string-table.h"
39 #include "string-util.h"
40 #include "time-util.h"
41 #include "util.h"
42
43 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
44
/* The kind of thing an event source watches. The five time types exist
 * because each clock gets its own timerfd and scheduling prioqs. */
typedef enum EventSourceType {
        SOURCE_IO,                  /* file descriptor readiness, via epoll */
        SOURCE_TIME_REALTIME,
        SOURCE_TIME_BOOTTIME,
        SOURCE_TIME_MONOTONIC,
        SOURCE_TIME_REALTIME_ALARM,
        SOURCE_TIME_BOOTTIME_ALARM,
        SOURCE_SIGNAL,              /* UNIX signal, via signalfd */
        SOURCE_CHILD,               /* child process state change (SIGCHLD + waitid) */
        SOURCE_DEFER,               /* dispatched on every iteration while enabled */
        SOURCE_POST,                /* dispatched after other sources in an iteration */
        SOURCE_EXIT,                /* dispatched during event loop shutdown */
        SOURCE_WATCHDOG,            /* internal sd_notify() watchdog pings */
        _SOURCE_EVENT_SOURCE_TYPE_MAX,
        _SOURCE_EVENT_SOURCE_TYPE_INVALID = -1
} EventSourceType;
61
62 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
63         [SOURCE_IO] = "io",
64         [SOURCE_TIME_REALTIME] = "realtime",
65         [SOURCE_TIME_BOOTTIME] = "bootime",
66         [SOURCE_TIME_MONOTONIC] = "monotonic",
67         [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
68         [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
69         [SOURCE_SIGNAL] = "signal",
70         [SOURCE_CHILD] = "child",
71         [SOURCE_DEFER] = "defer",
72         [SOURCE_POST] = "post",
73         [SOURCE_EXIT] = "exit",
74         [SOURCE_WATCHDOG] = "watchdog",
75 };
76
77 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
78
/* All objects we use in epoll events start with this value, so that
 * we know how to dispatch it. Each structure registered with epoll
 * (event source, clock data, signal data) embeds a WakeupType as its
 * first member; the dispatcher reads it through the data.ptr to tell
 * the object kinds apart. */
typedef enum WakeupType {
        WAKEUP_NONE,
        WAKEUP_EVENT_SOURCE, /* data.ptr is an sd_event_source */
        WAKEUP_CLOCK_DATA,   /* data.ptr is a struct clock_data */
        WAKEUP_SIGNAL_DATA,  /* data.ptr is a struct signal_data */
        _WAKEUP_TYPE_MAX,
        _WAKEUP_TYPE_INVALID = -1,
} WakeupType;
89
90 #define EVENT_SOURCE_IS_TIME(t) IN_SET((t), SOURCE_TIME_REALTIME, SOURCE_TIME_BOOTTIME, SOURCE_TIME_MONOTONIC, SOURCE_TIME_REALTIME_ALARM, SOURCE_TIME_BOOTTIME_ALARM)
91
/* A single event source attached to an event loop. The type-specific
 * state lives in the anonymous union at the end. */
struct sd_event_source {
        WakeupType wakeup; /* must be first, see WakeupType */

        unsigned n_ref;

        sd_event *event;            /* loop we are attached to (NULL once disconnected) */
        void *userdata;
        sd_event_handler_t prepare; /* optional callback run before polling */

        char *description; /* free-form name for logging, owned */

        EventSourceType type:5;
        int enabled:3;        /* SD_EVENT_OFF/ON/ONESHOT */
        bool pending:1;       /* queued in event->pending, waiting for dispatch */
        bool dispatching:1;   /* currently inside its callback */
        bool floating:1;      /* keeps a ref on the loop instead of vice versa */

        int64_t priority;           /* lower value = dispatched earlier */
        unsigned pending_index;     /* index in event->pending prioq */
        unsigned prepare_index;     /* index in event->prepare prioq */
        uint64_t pending_iteration; /* loop iteration when it became pending */
        uint64_t prepare_iteration; /* loop iteration when it was last prepared */

        LIST_FIELDS(sd_event_source, sources); /* membership in event->sources */

        /* Per-type state; which member is valid is determined by 'type'. */
        union {
                struct {
                        sd_event_io_handler_t callback;
                        int fd;
                        uint32_t events;      /* EPOLL* mask we asked for */
                        uint32_t revents;     /* EPOLL* mask last seen */
                        bool registered:1;    /* currently added to the epoll fd */
                } io;
                struct {
                        sd_event_time_handler_t callback;
                        usec_t next, accuracy; /* earliest dispatch time and slack */
                        unsigned earliest_index; /* index in clock_data->earliest */
                        unsigned latest_index;   /* index in clock_data->latest */
                } time;
                struct {
                        sd_event_signal_handler_t callback;
                        struct signalfd_siginfo siginfo; /* info of the last read signal */
                        int sig;
                } signal;
                struct {
                        sd_event_child_handler_t callback;
                        siginfo_t siginfo; /* result of waitid() for the child */
                        pid_t pid;
                        int options; /* WEXITED/WSTOPPED/WCONTINUED mask */
                } child;
                struct {
                        sd_event_handler_t callback;
                } defer;
                struct {
                        sd_event_handler_t callback;
                } post;
                struct {
                        sd_event_handler_t callback;
                        unsigned prioq_index; /* index in event->exit prioq */
                } exit;
        };
};
154
/* Per-clock state: one timerfd plus two scheduling queues. */
struct clock_data {
        WakeupType wakeup; /* must be first, see WakeupType; always WAKEUP_CLOCK_DATA */
        int fd;            /* timerfd for this clock, or -1 if not created yet */

        /* For all clocks we maintain two priority queues each, one
         * ordered for the earliest times the events may be
         * dispatched, and one ordered by the latest times they must
         * have been dispatched. The range between the top entries in
         * the two prioqs is the time window we can freely schedule
         * wakeups in */

        Prioq *earliest;
        Prioq *latest;
        usec_t next; /* absolute time the timerfd is currently armed for */

        bool needs_rearm:1; /* set when the prioqs changed and the timerfd must be re-armed */
};
172
/* Per-priority signalfd state. */
struct signal_data {
        WakeupType wakeup; /* must be first, see WakeupType; always WAKEUP_SIGNAL_DATA */

        /* For each priority we maintain one signal fd, so that we
         * only have to dequeue a single event per priority at a
         * time. */

        int fd;                   /* signalfd, or -1 */
        int64_t priority;         /* hashmap key: priority this fd serves */
        sigset_t sigset;          /* signals currently routed through this fd */
        sd_event_source *current; /* source whose signal we read but did not dispatch yet */
};
185
/* The event loop object itself. */
struct sd_event {
        unsigned n_ref;

        int epoll_fd;    /* the main epoll instance everything is registered with */
        int watchdog_fd; /* timerfd driving sd_notify() watchdog pings, or -1 */

        Prioq *pending; /* sources with an undelivered event, in dispatch order */
        Prioq *prepare; /* sources with a prepare callback, least-recently-prepared first */

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb; /* boot-id derived offset to de-synchronize timers across machines */

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;          /* pid -> SOURCE_CHILD source */
        unsigned n_enabled_child_sources; /* how many of those are not SD_EVENT_OFF */

        Set *post_sources;

        Prioq *exit; /* SOURCE_EXIT sources, by priority */

        pid_t original_pid; /* pid at creation time, to detect use across fork() */

        uint64_t iteration;         /* monotonically increasing loop iteration counter */
        triple_timestamp timestamp; /* timestamps taken when the current events arrived */
        int state;                  /* SD_EVENT_INITIAL/PREPARING/ARMED/... */

        bool exit_requested:1;     /* sd_event_exit() was called */
        bool need_process_child:1; /* SIGCHLD seen, waitid() pass still owed */
        bool watchdog:1;           /* watchdog ping logic is enabled */
        bool profile_delays:1;     /* collect the delays[] histogram below */

        int exit_code; /* return value for sd_event_loop() once exiting */

        pid_t tid;                    /* thread that owns this loop (for default loops) */
        sd_event **default_event_ptr; /* points at the per-thread default slot to clear on free */

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;

        LIST_HEAD(sd_event_source, sources); /* all sources attached to this loop */

        usec_t last_run, last_log;
        unsigned delays[sizeof(usec_t) * 8]; /* log2 histogram of iteration delays */
};
241
242 static void source_disconnect(sd_event_source *s);
243
244 static int pending_prioq_compare(const void *a, const void *b) {
245         const sd_event_source *x = a, *y = b;
246
247         assert(x->pending);
248         assert(y->pending);
249
250         /* Enabled ones first */
251         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
252                 return -1;
253         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
254                 return 1;
255
256         /* Lower priority values first */
257         if (x->priority < y->priority)
258                 return -1;
259         if (x->priority > y->priority)
260                 return 1;
261
262         /* Older entries first */
263         if (x->pending_iteration < y->pending_iteration)
264                 return -1;
265         if (x->pending_iteration > y->pending_iteration)
266                 return 1;
267
268         return 0;
269 }
270
271 static int prepare_prioq_compare(const void *a, const void *b) {
272         const sd_event_source *x = a, *y = b;
273
274         assert(x->prepare);
275         assert(y->prepare);
276
277         /* Enabled ones first */
278         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
279                 return -1;
280         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
281                 return 1;
282
283         /* Move most recently prepared ones last, so that we can stop
284          * preparing as soon as we hit one that has already been
285          * prepared in the current iteration */
286         if (x->prepare_iteration < y->prepare_iteration)
287                 return -1;
288         if (x->prepare_iteration > y->prepare_iteration)
289                 return 1;
290
291         /* Lower priority values first */
292         if (x->priority < y->priority)
293                 return -1;
294         if (x->priority > y->priority)
295                 return 1;
296
297         return 0;
298 }
299
300 static int earliest_time_prioq_compare(const void *a, const void *b) {
301         const sd_event_source *x = a, *y = b;
302
303         assert(EVENT_SOURCE_IS_TIME(x->type));
304         assert(x->type == y->type);
305
306         /* Enabled ones first */
307         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
308                 return -1;
309         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
310                 return 1;
311
312         /* Move the pending ones to the end */
313         if (!x->pending && y->pending)
314                 return -1;
315         if (x->pending && !y->pending)
316                 return 1;
317
318         /* Order by time */
319         if (x->time.next < y->time.next)
320                 return -1;
321         if (x->time.next > y->time.next)
322                 return 1;
323
324         return 0;
325 }
326
327 static usec_t time_event_source_latest(const sd_event_source *s) {
328         return usec_add(s->time.next, s->time.accuracy);
329 }
330
331 static int latest_time_prioq_compare(const void *a, const void *b) {
332         const sd_event_source *x = a, *y = b;
333
334         assert(EVENT_SOURCE_IS_TIME(x->type));
335         assert(x->type == y->type);
336
337         /* Enabled ones first */
338         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
339                 return -1;
340         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
341                 return 1;
342
343         /* Move the pending ones to the end */
344         if (!x->pending && y->pending)
345                 return -1;
346         if (x->pending && !y->pending)
347                 return 1;
348
349         /* Order by time */
350         if (time_event_source_latest(x) < time_event_source_latest(y))
351                 return -1;
352         if (time_event_source_latest(x) > time_event_source_latest(y))
353                 return 1;
354
355         return 0;
356 }
357
358 static int exit_prioq_compare(const void *a, const void *b) {
359         const sd_event_source *x = a, *y = b;
360
361         assert(x->type == SOURCE_EXIT);
362         assert(y->type == SOURCE_EXIT);
363
364         /* Enabled ones first */
365         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
366                 return -1;
367         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
368                 return 1;
369
370         /* Lower priority values first */
371         if (x->priority < y->priority)
372                 return -1;
373         if (x->priority > y->priority)
374                 return 1;
375
376         return 0;
377 }
378
379 static void free_clock_data(struct clock_data *d) {
380         assert(d);
381         assert(d->wakeup == WAKEUP_CLOCK_DATA);
382
383         safe_close(d->fd);
384         prioq_free(d->earliest);
385         prioq_free(d->latest);
386 }
387
/* Destroys an event loop. Called only once the last reference is
 * gone, hence any source still attached at this point must be
 * floating (a non-floating source would hold a reference itself). */
static void event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        /* Disconnect and drop all remaining (floating) sources. */
        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        /* If we were installed as the thread's default loop, clear
         * that slot so a new default can be created later. */
        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);
        free(e);
}
424
/* Allocates a new event loop object with one reference and returns it
 * in *ret. Returns 0 on success, a negative errno-style error
 * otherwise. */
_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new0(sd_event, 1);
        if (!e)
                return -ENOMEM;

        /* Initialize all fds to -1 and all timer state to "not armed"
         * before anything can fail, so event_free() is safe on every
         * error path below. */
        e->n_ref = 1;
        e->watchdog_fd = e->epoll_fd = e->realtime.fd = e->boottime.fd = e->monotonic.fd = e->realtime_alarm.fd = e->boottime_alarm.fd = -1;
        e->realtime.next = e->boottime.next = e->monotonic.next = e->realtime_alarm.next = e->boottime_alarm.next = USEC_INFINITY;
        e->realtime.wakeup = e->boottime.wakeup = e->monotonic.wakeup = e->realtime_alarm.wakeup = e->boottime_alarm.wakeup = WAKEUP_CLOCK_DATA;
        e->original_pid = getpid();
        e->perturb = USEC_INFINITY; /* computed lazily in initialize_perturb() */

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        /* Opt-in latency profiling, toggled via the environment. */
        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 ... 2^63 us will be logged every 5s.");
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}
464
465 _public_ sd_event* sd_event_ref(sd_event *e) {
466
467         if (!e)
468                 return NULL;
469
470         assert(e->n_ref >= 1);
471         e->n_ref++;
472
473         return e;
474 }
475
476 _public_ sd_event* sd_event_unref(sd_event *e) {
477
478         if (!e)
479                 return NULL;
480
481         assert(e->n_ref >= 1);
482         e->n_ref--;
483
484         if (e->n_ref <= 0)
485                 event_free(e);
486
487         return NULL;
488 }
489
490 static bool event_pid_changed(sd_event *e) {
491         assert(e);
492
493         /* We don't support people creating an event loop and keeping
494          * it around over a fork(). Let's complain. */
495
496         return e->original_pid != getpid();
497 }
498
/* Removes an IO source's fd from the epoll instance, if it is
 * currently registered. Failure to remove is only logged: the fd may
 * already be gone. */
static void source_io_unregister(sd_event_source *s) {
        int r;

        assert(s);
        assert(s->type == SOURCE_IO);

        /* After a fork() the epoll fd belongs to the parent; don't
         * touch it from the child. */
        if (event_pid_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
        if (r < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}
518
519 static int source_io_register(
520                 sd_event_source *s,
521                 int enabled,
522                 uint32_t events) {
523
524         struct epoll_event ev = {};
525         int r;
526
527         assert(s);
528         assert(s->type == SOURCE_IO);
529         assert(enabled != SD_EVENT_OFF);
530
531         ev.events = events;
532         ev.data.ptr = s;
533
534         if (enabled == SD_EVENT_ONESHOT)
535                 ev.events |= EPOLLONESHOT;
536
537         if (s->io.registered)
538                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
539         else
540                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
541         if (r < 0)
542                 return -errno;
543
544         s->io.registered = true;
545
546         return 0;
547 }
548
549 static clockid_t event_source_type_to_clock(EventSourceType t) {
550
551         switch (t) {
552
553         case SOURCE_TIME_REALTIME:
554                 return CLOCK_REALTIME;
555
556         case SOURCE_TIME_BOOTTIME:
557                 return CLOCK_BOOTTIME;
558
559         case SOURCE_TIME_MONOTONIC:
560                 return CLOCK_MONOTONIC;
561
562         case SOURCE_TIME_REALTIME_ALARM:
563                 return CLOCK_REALTIME_ALARM;
564
565         case SOURCE_TIME_BOOTTIME_ALARM:
566                 return CLOCK_BOOTTIME_ALARM;
567
568         default:
569                 return (clockid_t) -1;
570         }
571 }
572
573 static EventSourceType clock_to_event_source_type(clockid_t clock) {
574
575         switch (clock) {
576
577         case CLOCK_REALTIME:
578                 return SOURCE_TIME_REALTIME;
579
580         case CLOCK_BOOTTIME:
581                 return SOURCE_TIME_BOOTTIME;
582
583         case CLOCK_MONOTONIC:
584                 return SOURCE_TIME_MONOTONIC;
585
586         case CLOCK_REALTIME_ALARM:
587                 return SOURCE_TIME_REALTIME_ALARM;
588
589         case CLOCK_BOOTTIME_ALARM:
590                 return SOURCE_TIME_BOOTTIME_ALARM;
591
592         default:
593                 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
594         }
595 }
596
597 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
598         assert(e);
599
600         switch (t) {
601
602         case SOURCE_TIME_REALTIME:
603                 return &e->realtime;
604
605         case SOURCE_TIME_BOOTTIME:
606                 return &e->boottime;
607
608         case SOURCE_TIME_MONOTONIC:
609                 return &e->monotonic;
610
611         case SOURCE_TIME_REALTIME_ALARM:
612                 return &e->realtime_alarm;
613
614         case SOURCE_TIME_BOOTTIME_ALARM:
615                 return &e->boottime_alarm;
616
617         default:
618                 return NULL;
619         }
620 }
621
/* Ensures a signalfd exists that carries the specified signal at the
 * priority of its event source (or priority 0 if there is none), and
 * that the signal is included in that fd's mask. On success,
 * optionally returns the signal_data via *ret. Returns 0 on success,
 * a negative errno-style error otherwise. */
static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct epoll_event ev = {};
        struct signal_data *d;
        bool added = false; /* whether we created 'd' here, for error unwinding */
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_pid_changed(e))
                return -ECHILD;

        /* The signalfd is keyed by the priority of the signal's
         * source; sources without one (e.g. SIGCHLD for child
         * sources) use priority 0. */
        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = 0;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                /* Fast path: signal already in this fd's mask. */
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                /* No signal_data for this priority yet, create one. */
                r = hashmap_ensure_allocated(&e->signal_data, &uint64_hash_ops);
                if (r < 0)
                        return r;

                d = new0(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                d->wakeup = WAKEUP_SIGNAL_DATA;
                d->fd  = -1;
                d->priority = priority;

                r = hashmap_put(e->signal_data, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        /* Update the mask on a copy first, so the stored sigset stays
         * consistent if signalfd() fails. */
        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        /* If the fd already existed, signalfd() just updated its mask
         * and it is already registered with epoll; we are done. */
        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        /* Otherwise signalfd() created a new fd; register it. */
        d->fd = r;

        ev.events = EPOLLIN;
        ev.data.ptr = d;

        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev);
        if (r < 0)  {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        /* Only tear down the signal_data if we created it above; a
         * pre-existing one stays valid with its old mask. */
        if (added) {
                d->fd = safe_close(d->fd);
                hashmap_remove(e->signal_data, &d->priority);
                free(d);
        }

        return r;
}
715
/* Turns off the specified signal in the signal data object. If the
 * signal mask of the object becomes empty this way, the object (and
 * its signalfd) is removed entirely. */
static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Nothing to do if the signal isn't in the mask anyway. */
        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {

                /* If the mask is now all-zero we can get rid of the structure */
                hashmap_remove(e->signal_data, &d->priority);
                safe_close(d->fd);
                free(d);
                return;
        }

        assert(d->fd >= 0);

        /* Shrink the kernel-side mask to match. Best-effort: a
         * failure here only means we may see spurious wakeups. */
        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}
743
/* Rechecks if the specified signal is still something we are
 * interested in. If not, we'll unmask it, and possibly drop the
 * signalfd for it. */
static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* SIGCHLD is still needed as long as any child source is
         * enabled. */
        if (sig == SIGCHLD &&
            e->n_enabled_child_sources > 0)
                return;

        /* Likewise if an enabled signal source for it exists. */
        if (e->signal_sources &&
            e->signal_sources[sig] &&
            e->signal_sources[sig]->enabled != SD_EVENT_OFF)
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}
789
/* Detaches a source from its event loop: deregisters type-specific
 * kernel state, removes it from all queues and the source list, and
 * drops the loop reference a non-floating source holds. The source
 * object itself stays allocated (freed by source_free()). Safe to
 * call on an already-disconnected source. */
static void source_disconnect(sd_event_source *s) {
        sd_event *event;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM: {
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                /* Pull out of both scheduling queues and flag the
                 * clock's timerfd for re-arming. */
                prioq_remove(d->earliest, s, &s->time.earliest_index);
                prioq_remove(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
                break;
        }

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        /* Possibly drop the signalfd routing for this signal. */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                }

                break;

        case SOURCE_CHILD:
                if (s->child.pid > 0) {
                        if (s->enabled != SD_EVENT_OFF) {
                                assert(s->event->n_enabled_child_sources > 0);
                                s->event->n_enabled_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                        /* SIGCHLD may no longer be needed now. */
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                }

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        default:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        event = s->event;

        s->type = _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        s->event = NULL;
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Non-floating sources pin the loop; release that pin last,
         * since the unref may free the loop. */
        if (!s->floating)
                sd_event_unref(event);
}
880
881 static void source_free(sd_event_source *s) {
882         assert(s);
883
884         source_disconnect(s);
885         free(s->description);
886         free(s);
887 }
888
/* Marks a source as pending (an event arrived, dispatch is owed) or
 * not pending, keeping the loop's pending prioq and per-type
 * bookkeeping in sync. Returns 0 on success or if nothing changed, a
 * negative errno-style error otherwise. */
static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT); /* exit sources use their own queue */

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                /* Record when it became pending, for FIFO ordering
                 * within the same priority. */
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type)) {
                struct clock_data *d;

                /* The pending flag participates in the time prioq
                 * ordering, so reposition the source and re-arm. */
                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                prioq_reshuffle(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
        }

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                /* Don't keep pointing at a signal we are no longer
                 * about to dispatch. */
                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        return 0;
}
932
933 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
934         sd_event_source *s;
935
936         assert(e);
937
938         s = new0(sd_event_source, 1);
939         if (!s)
940                 return NULL;
941
942         s->n_ref = 1;
943         s->event = e;
944         s->floating = floating;
945         s->type = type;
946         s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;
947
948         if (!floating)
949                 sd_event_ref(e);
950
951         LIST_PREPEND(sources, e->sources, s);
952         e->n_sources++;
953
954         return s;
955 }
956
/* Adds an IO event source watching 'fd' for the given EPOLL* event
 * mask, enabled (SD_EVENT_ON) immediately. If 'ret' is NULL the
 * source is created floating (owned by the loop). Returns 0 on
 * success, a negative errno-style error otherwise. */
_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0) {
                source_free(s);
                return r;
        }

        if (ret)
                *ret = s;

        return 0;
}
997
/* Lazily computes the per-machine timer perturbation offset. */
static void initialize_perturb(sd_event *e) {
        sd_id128_t bootid = {};

        /* When we sleep for longer, we try to realign the wakeup to
           the same time within each minute/second/250ms, so that
           events all across the system can be coalesced into a single
           CPU wakeup. However, let's take some system-specific
           randomness for this value, so that in a network of systems
           with synced clocks timer events are distributed a
           bit. Here, we calculate a perturbation usec offset from the
           boot ID. */

        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        /* Best-effort: if the boot ID is unavailable, perturb stays
         * USEC_INFINITY and we retry on the next call. */
        if (sd_id128_get_boot(&bootid) >= 0)
                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
}
1016
1017 static int event_setup_timer_fd(
1018                 sd_event *e,
1019                 struct clock_data *d,
1020                 clockid_t clock) {
1021
1022         struct epoll_event ev = {};
1023         int r, fd;
1024
1025         assert(e);
1026         assert(d);
1027
1028         if (_likely_(d->fd >= 0))
1029                 return 0;
1030
1031         fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1032         if (fd < 0)
1033                 return -errno;
1034
1035         ev.events = EPOLLIN;
1036         ev.data.ptr = d;
1037
1038         r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
1039         if (r < 0) {
1040                 safe_close(fd);
1041                 return -errno;
1042         }
1043
1044         d->fd = fd;
1045         return 0;
1046 }
1047
/* Default handler installed by sd_event_add_time() when the caller passes a
 * NULL callback: makes the event loop exit, using the userdata pointer
 * (converted back to an int) as the exit code. */
static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}
1053
/* Adds a timer event source firing at absolute time 'usec' on the given
 * clock, with the given accuracy (0 selects DEFAULT_ACCURACY_USEC). A NULL
 * callback installs time_exit_callback(), i.e. the timer makes the loop
 * exit with the code encoded in 'userdata'. On success returns 0 and, if
 * 'ret' is non-NULL, stores a new reference in *ret (a NULL 'ret' creates a
 * "floating" source, see source_new()). */
_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        sd_event_source *s;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        /* (uint64_t) -1 == USEC_INFINITY is not a meaningful accuracy */
        assert_return(accuracy != (uint64_t) -1, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
                return -EOPNOTSUPP;

        type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
        if (type < 0)
                return -EOPNOTSUPP;

        if (!callback)
                callback = time_exit_callback;

        d = event_get_clock_data(e, type);
        assert(d);

        /* Make sure both priority queues and the timerfd exist before we
         * allocate the source, so that failures below leave less to undo. */
        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* The timerfd must be reprogrammed on the next loop iteration */
        d->needs_rearm = true;

        r = prioq_put(d->earliest, s, &s->time.earliest_index);
        if (r < 0)
                goto fail;

        r = prioq_put(d->latest, s, &s->time.latest_index);
        if (r < 0)
                goto fail;

        if (ret)
                *ret = s;

        return 0;

fail:
        source_free(s);
        return r;
}
1130
/* Default handler installed by sd_event_add_signal() when the caller passes
 * a NULL callback: makes the event loop exit with the code encoded in the
 * userdata pointer. */
static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}
1136
/* Adds an event source dispatched when the UNIX signal 'sig' is received.
 * The signal must already be blocked in the calling thread's mask (checked
 * via pthread_sigmask() below), and at most one source may exist per signal
 * number per loop (-EBUSY otherwise). A NULL callback installs
 * signal_exit_callback(). */
_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        struct signal_data *d;
        sigset_t ss;
        int r;

        assert_return(e, -EINVAL);
        assert_return(SIGNAL_VALID(sig), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = signal_exit_callback;

        /* Query (not modify) the current thread's signal mask */
        r = pthread_sigmask(SIG_SETMASK, NULL, &ss);
        if (r != 0)
                return -r;

        /* Refuse signals the caller has not blocked — they could not be
         * picked up via a signalfd otherwise. */
        if (!sigismember(&ss, sig))
                return -EBUSY;

        /* signal_sources[] is indexed by signal number, allocated lazily */
        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        /* Register in the per-signal table before creating the signalfd data,
         * so a later source_free(s) can find and undo it. */
        e->signal_sources[sig] = s;

        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                source_free(s);
                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;

        return 0;
}
1196
/* Adds a child-process event source for 'pid', dispatched via SIGCHLD when
 * a state matching 'options' (WEXITED/WSTOPPED/WCONTINUED, at least one
 * required) occurs. Only one source per PID is allowed (-EBUSY otherwise),
 * and the source starts enabled as ONESHOT. */
_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        /* pid 1 and 0 are never legitimate children of us */
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0) {
                source_free(s);
                return r;
        }

        e->n_enabled_child_sources++;

        /* Child sources are delivered through a SIGCHLD signalfd */
        r = event_make_signal_data(e, SIGCHLD, NULL);
        if (r < 0) {
                /* Roll back the counter bump before freeing the source */
                e->n_enabled_child_sources--;
                source_free(s);
                return r;
        }

        /* Ask the loop to reap/inspect children on the next iteration */
        e->need_process_child = true;

        if (ret)
                *ret = s;

        return 0;
}
1255
/* Adds a "defer" event source: it is immediately marked pending, so the
 * callback runs on the next event-loop iteration. Enabled as ONESHOT, i.e.
 * it fires once unless re-enabled by the caller. */
_public_ int sd_event_add_defer(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        s = source_new(e, !ret, SOURCE_DEFER);
        if (!s)
                return -ENOMEM;

        s->defer.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* Queue it for dispatch right away */
        r = source_set_pending(s, true);
        if (r < 0) {
                source_free(s);
                return r;
        }

        if (ret)
                *ret = s;

        return 0;
}
1289
/* Adds a "post" event source, kept in the loop's post_sources set. Starts
 * enabled as SD_EVENT_ON (unlike defer sources, which are ONESHOT). */
_public_ int sd_event_add_post(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        r = set_ensure_allocated(&e->post_sources, NULL);
        if (r < 0)
                return r;

        s = source_new(e, !ret, SOURCE_POST);
        if (!s)
                return -ENOMEM;

        s->post.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = set_put(e->post_sources, s);
        if (r < 0) {
                source_free(s);
                return r;
        }

        if (ret)
                *ret = s;

        return 0;
}
1327
/* Adds an "exit" event source, dispatched (in priority order, via the exit
 * priority queue) when the event loop is shutting down. Enabled as ONESHOT. */
_public_ int sd_event_add_exit(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
        if (r < 0)
                return r;

        s = source_new(e, !ret, SOURCE_EXIT);
        if (!s)
                return -ENOMEM;

        s->exit.callback = callback;
        s->userdata = userdata;
        s->exit.prioq_index = PRIOQ_IDX_NULL;
        s->enabled = SD_EVENT_ONESHOT;

        r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
        if (r < 0) {
                source_free(s);
                return r;
        }

        if (ret)
                *ret = s;

        return 0;
}
1366
/* Takes an additional reference on the event source. NULL-tolerant; returns
 * its argument for convenient chaining. */
_public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {

        if (!s)
                return NULL;

        assert(s->n_ref >= 1);
        s->n_ref++;

        return s;
}
1377
/* Drops a reference on the event source. NULL-tolerant; always returns NULL
 * so callers can write "s = sd_event_source_unref(s);". When the last
 * reference is dropped the source is freed — unless it is currently being
 * dispatched, in which case it is only disconnected here (see comment
 * below; the actual freeing then happens after dispatch, elsewhere in this
 * file). */
_public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {

        if (!s)
                return NULL;

        assert(s->n_ref >= 1);
        s->n_ref--;

        if (s->n_ref <= 0) {
                /* Here's a special hack: when we are called from a
                 * dispatch handler we won't free the event source
                 * immediately, but we will detach the fd from the
                 * epoll. This way it is safe for the caller to unref
                 * the event source and immediately close the fd, but
                 * we still retain a valid event source object after
                 * the callback. */

                if (s->dispatching) {
                        if (s->type == SOURCE_IO)
                                source_io_unregister(s);

                        source_disconnect(s);
                } else
                        source_free(s);
        }

        return NULL;
}
1406
/* Sets (or, with a NULL description, clears) the free-form description
 * string of the event source, replacing any previous one. */
_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
        assert_return(s, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return free_and_strdup(&s->description, description);
}
1413
/* Returns the source's description string in *description. Fails with
 * -ENXIO if no description was ever set. The returned pointer stays owned
 * by the event source. */
_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
        assert_return(s, -EINVAL);
        assert_return(description, -EINVAL);
        assert_return(s->description, -ENXIO);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *description = s->description;
        return 0;
}
1423
/* Returns the event loop this source is attached to (no new reference is
 * taken), or NULL if s is NULL. */
_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
        assert_return(s, NULL);

        return s->event;
}
1429
/* Returns whether the source is currently queued for dispatch (>0 pending,
 * 0 not pending). Not defined for exit sources (-EDOM). */
_public_ int sd_event_source_get_pending(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->pending;
}
1438
/* Returns the file descriptor an IO event source watches. Only valid for
 * SOURCE_IO sources (-EDOM otherwise). */
_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->io.fd;
}
1446
/* Replaces the file descriptor watched by an IO event source. If the source
 * is enabled, the new fd is registered with epoll first and the old one is
 * removed only after that succeeded, so on failure the source is rolled
 * back to its previous state. The old fd is not closed here — it remains
 * owned by the caller. */
_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(fd >= 0, -EBADF);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->io.fd == fd)
                return 0;

        if (s->enabled == SD_EVENT_OFF) {
                /* Disabled: nothing registered in epoll, just swap the fd */
                s->io.fd = fd;
                s->io.registered = false;
        } else {
                int saved_fd;

                saved_fd = s->io.fd;
                assert(s->io.registered);

                s->io.fd = fd;
                s->io.registered = false;

                r = source_io_register(s, s->enabled, s->io.events);
                if (r < 0) {
                        /* Restore the old fd/registration on failure */
                        s->io.fd = saved_fd;
                        s->io.registered = true;
                        return r;
                }

                /* Drop the old fd from epoll; return value deliberately ignored */
                epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
        }

        return 0;
}
1482
/* Returns the EPOLL* event mask an IO event source is subscribed to. */
_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
        assert_return(s, -EINVAL);
        assert_return(events, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *events = s->io.events;
        return 0;
}
1492
/* Changes the EPOLL* event mask of an IO event source. If the source is
 * enabled the epoll registration is updated first; only on success is the
 * stored mask changed and any pending state cleared. */
_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* edge-triggered updates are never skipped, so we can reset edges */
        if (s->io.events == events && !(events & EPOLLET))
                return 0;

        if (s->enabled != SD_EVENT_OFF) {
                r = source_io_register(s, s->enabled, events);
                if (r < 0)
                        return r;
        }

        s->io.events = events;
        /* Any previously queued revents may no longer match the new mask;
         * return value deliberately ignored (clearing cannot fail here). */
        source_set_pending(s, false);

        return 0;
}
1517
/* Returns the events that actually triggered a pending IO source. Only
 * valid while the source is pending (-ENODATA otherwise). */
_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
        assert_return(s, -EINVAL);
        assert_return(revents, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(s->pending, -ENODATA);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *revents = s->io.revents;
        return 0;
}
1528
/* Returns the signal number a signal event source watches. */
_public_ int sd_event_source_get_signal(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->signal.sig;
}
1536
1537 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
1538         assert_return(s, -EINVAL);
1539         assert_return(!event_pid_changed(s->event), -ECHILD);
1540
1541         *priority = s->priority;
1542         return 0;
1543 }
1544
/* Changes the dispatch priority of an event source. For enabled signal
 * sources the signalfd is chosen per priority, so the source is migrated
 * from the old priority's signalfd to the new one's (with rollback on
 * failure). Finally all priority queues the source sits in are reshuffled
 * to reflect the new ordering. */
_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->priority == priority)
                return 0;

        if (s->type == SOURCE_SIGNAL && s->enabled != SD_EVENT_OFF) {
                struct signal_data *old, *d;

                /* Move us from the signalfd belonging to the old
                 * priority to the signalfd of the new priority */

                assert_se(old = hashmap_get(s->event->signal_data, &s->priority));

                s->priority = priority;

                r = event_make_signal_data(s->event, s->signal.sig, &d);
                if (r < 0) {
                        /* Roll back to the old priority on failure */
                        s->priority = old->priority;
                        return r;
                }

                /* Stop watching the signal on the old priority's signalfd */
                event_unmask_signal_data(s->event, old, s->signal.sig);
        } else
                s->priority = priority;

        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);

        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        return 0;
}
1586
/* Returns the enablement state (SD_EVENT_OFF/ON/ONESHOT) in *m. */
_public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
        assert_return(s, -EINVAL);
        assert_return(m, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *m = s->enabled;
        return 0;
}
1595
/* Switches an event source between SD_EVENT_OFF, SD_EVENT_ON and
 * SD_EVENT_ONESHOT, performing the per-type bookkeeping this implies:
 * (de)registering with epoll for IO sources, reshuffling the time priority
 * queues and flagging a timerfd rearm for timer sources, adjusting signalfd
 * membership for signal/child sources, and reshuffling the exit queue for
 * exit sources. On failure of a kernel-facing step the source is rolled
 * back to OFF. */
_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(m == SD_EVENT_OFF || m == SD_EVENT_ON || m == SD_EVENT_ONESHOT, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* If we are dead anyway, we are fine with turning off
         * sources, but everything else needs to fail. */
        if (s->event->state == SD_EVENT_FINISHED)
                return m == SD_EVENT_OFF ? 0 : -ESTALE;

        if (s->enabled == m)
                return 0;

        if (m == SD_EVENT_OFF) {

                switch (s->type) {

                case SOURCE_IO:
                        /* Stop watching the fd in epoll */
                        source_io_unregister(s);
                        s->enabled = m;
                        break;

                case SOURCE_TIME_REALTIME:
                case SOURCE_TIME_BOOTTIME:
                case SOURCE_TIME_MONOTONIC:
                case SOURCE_TIME_REALTIME_ALARM:
                case SOURCE_TIME_BOOTTIME_ALARM: {
                        struct clock_data *d;

                        s->enabled = m;
                        d = event_get_clock_data(s->event, s->type);
                        assert(d);

                        /* Disabled sources sort differently; reposition and
                         * reprogram the timerfd on the next iteration */
                        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(d->latest, s, &s->time.latest_index);
                        d->needs_rearm = true;
                        break;
                }

                case SOURCE_SIGNAL:
                        s->enabled = m;

                        /* Drop the signal from its signalfd if no longer needed */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                        break;

                case SOURCE_CHILD:
                        s->enabled = m;

                        assert(s->event->n_enabled_child_sources > 0);
                        s->event->n_enabled_child_sources--;

                        /* Possibly drop SIGCHLD from the signalfd */
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                        break;

                case SOURCE_EXIT:
                        s->enabled = m;
                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                        break;

                case SOURCE_DEFER:
                case SOURCE_POST:
                        s->enabled = m;
                        break;

                default:
                        assert_not_reached("Wut? I shouldn't exist.");
                }

        } else {
                switch (s->type) {

                case SOURCE_IO:
                        /* (Re-)register with epoll before flipping the state */
                        r = source_io_register(s, m, s->io.events);
                        if (r < 0)
                                return r;

                        s->enabled = m;
                        break;

                case SOURCE_TIME_REALTIME:
                case SOURCE_TIME_BOOTTIME:
                case SOURCE_TIME_MONOTONIC:
                case SOURCE_TIME_REALTIME_ALARM:
                case SOURCE_TIME_BOOTTIME_ALARM: {
                        struct clock_data *d;

                        s->enabled = m;
                        d = event_get_clock_data(s->event, s->type);
                        assert(d);

                        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(d->latest, s, &s->time.latest_index);
                        d->needs_rearm = true;
                        break;
                }

                case SOURCE_SIGNAL:

                        s->enabled = m;

                        r = event_make_signal_data(s->event, s->signal.sig, NULL);
                        if (r < 0) {
                                /* Roll back to OFF and GC the signalfd state */
                                s->enabled = SD_EVENT_OFF;
                                event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                                return r;
                        }

                        break;

                case SOURCE_CHILD:

                        /* Only bump the counter on an OFF -> enabled transition */
                        if (s->enabled == SD_EVENT_OFF)
                                s->event->n_enabled_child_sources++;

                        s->enabled = m;

                        r = event_make_signal_data(s->event, SIGCHLD, NULL);
                        if (r < 0) {
                                /* Roll back counter and signalfd state */
                                s->enabled = SD_EVENT_OFF;
                                s->event->n_enabled_child_sources--;
                                event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                                return r;
                        }

                        break;

                case SOURCE_EXIT:
                        s->enabled = m;
                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                        break;

                case SOURCE_DEFER:
                case SOURCE_POST:
                        s->enabled = m;
                        break;

                default:
                        assert_not_reached("Wut? I shouldn't exist.");
                }
        }

        /* Enablement affects ordering in the pending/prepare queues too */
        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);

        return 0;
}
1747
/* Returns the absolute trigger time of a timer event source in *usec. */
_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.next;
        return 0;
}
1757
/* Changes the absolute trigger time of a timer event source, clears any
 * pending state, repositions the source in both time priority queues and
 * flags the clock's timerfd for rearming. */
_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
        struct clock_data *d;

        assert_return(s, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        s->time.next = usec;

        /* Return value deliberately ignored; clearing cannot fail here */
        source_set_pending(s, false);

        d = event_get_clock_data(s->event, s->type);
        assert(d);

        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
        prioq_reshuffle(d->latest, s, &s->time.latest_index);
        d->needs_rearm = true;

        return 0;
}
1779
/* Returns the accuracy (maximum dispatch delay, in usec) of a timer source. */
_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.accuracy;
        return 0;
}
1789
/* Changes the accuracy of a timer event source (0 selects
 * DEFAULT_ACCURACY_USEC). Accuracy only affects the latest admissible
 * wakeup time, hence only the "latest" priority queue is reshuffled. */
_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
        struct clock_data *d;

        assert_return(s, -EINVAL);
        /* (uint64_t) -1 == USEC_INFINITY is not a meaningful accuracy */
        assert_return(usec != (uint64_t) -1, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (usec == 0)
                usec = DEFAULT_ACCURACY_USEC;

        s->time.accuracy = usec;

        /* Return value deliberately ignored; clearing cannot fail here */
        source_set_pending(s, false);

        d = event_get_clock_data(s->event, s->type);
        assert(d);

        prioq_reshuffle(d->latest, s, &s->time.latest_index);
        d->needs_rearm = true;

        return 0;
}
1814
/* Returns the clock id (CLOCK_REALTIME, CLOCK_MONOTONIC, ...) a timer
 * event source is based on, derived from its internal source type. */
_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
        assert_return(s, -EINVAL);
        assert_return(clock, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *clock = event_source_type_to_clock(s->type);
        return 0;
}
1824
/* Returns the PID a child event source watches in *pid. */
_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
        assert_return(s, -EINVAL);
        assert_return(pid, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *pid = s->child.pid;
        return 0;
}
1834
/* Installs, replaces or (with NULL) removes a "prepare" callback, invoked
 * by the loop shortly before polling. Sources with a prepare callback are
 * tracked in the event's prepare priority queue: newly installing one adds
 * the source to the queue, removing the callback takes it out, and merely
 * replacing an existing callback just swaps the function pointer. Not
 * supported for exit sources (-EDOM). */
_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->prepare == callback)
                return 0;

        /* callback -> callback: no queue membership change needed */
        if (callback && s->prepare) {
                s->prepare = callback;
                return 0;
        }

        r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
        if (r < 0)
                return r;

        s->prepare = callback;

        if (callback) {
                r = prioq_put(s->event->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;
        } else
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        return 0;
}
1866
/* Returns the userdata pointer associated with the event source, or NULL
 * if s is NULL. */
_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
        assert_return(s, NULL);

        return s->userdata;
}
1872
1873 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
1874         void *ret;
1875
1876         assert_return(s, NULL);
1877
1878         ret = s->userdata;
1879         s->userdata = userdata;
1880
1881         return ret;
1882 }
1883
/* Picks the actual wakeup time within the admissible window [a, b],
 * preferring system-wide coalescing points derived from the per-boot
 * perturbation offset. The result is always within [a, b]. */
static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
        usec_t c;
        assert(e);
        assert(a <= b);

        /* usec_t is unsigned, so this triggers only for a == 0 */
        if (a <= 0)
                return 0;
        if (a >= USEC_INFINITY)
                return USEC_INFINITY;

        /* Window of one usec or less: no room to coalesce anything */
        if (b <= a + 1)
                return a;

        initialize_perturb(e);

        /*
          Find a good time to wake up again between times a and b. We
          have two goals here:

          a) We want to wake up as seldom as possible, hence prefer
             later times over earlier times.

          b) But if we have to wake up, then let's make sure to
             dispatch as much as possible on the entire system.

          We implement this by waking up everywhere at the same time
          within any given minute if we can, synchronised via the
          perturbation value determined from the boot ID. If we can't,
          then we try to find the same spot in every 10s, then 1s and
          then 250ms step. Otherwise, we pick the last possible time
          to wake up.
        */

        /* Try the per-minute coalescing point first */
        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MINUTE))
                        return b;

                c -= USEC_PER_MINUTE;
        }

        if (c >= a)
                return c;

        /* Then the per-10s point */
        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC*10))
                        return b;

                c -= USEC_PER_SEC*10;
        }

        if (c >= a)
                return c;

        /* Then the per-second point */
        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC))
                        return b;

                c -= USEC_PER_SEC;
        }

        if (c >= a)
                return c;

        /* Then the per-250ms point */
        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MSEC*250))
                        return b;

                c -= USEC_PER_MSEC*250;
        }

        if (c >= a)
                return c;

        /* No coalescing point fits the window — wake up as late as possible */
        return b;
}
1963
1964 static int event_arm_timer(
1965                 sd_event *e,
1966                 struct clock_data *d) {
1967
1968         struct itimerspec its = {};
1969         sd_event_source *a, *b;
1970         usec_t t;
1971         int r;
1972
1973         assert(e);
1974         assert(d);
1975
1976         if (!d->needs_rearm)
1977                 return 0;
1978         else
1979                 d->needs_rearm = false;
1980
1981         a = prioq_peek(d->earliest);
1982         if (!a || a->enabled == SD_EVENT_OFF || a->time.next == USEC_INFINITY) {
1983
1984                 if (d->fd < 0)
1985                         return 0;
1986
1987                 if (d->next == USEC_INFINITY)
1988                         return 0;
1989
1990                 /* disarm */
1991                 r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
1992                 if (r < 0)
1993                         return r;
1994
1995                 d->next = USEC_INFINITY;
1996                 return 0;
1997         }
1998
1999         b = prioq_peek(d->latest);
2000         assert_se(b && b->enabled != SD_EVENT_OFF);
2001
2002         t = sleep_between(e, a->time.next, time_event_source_latest(b));
2003         if (d->next == t)
2004                 return 0;
2005
2006         assert_se(d->fd >= 0);
2007
2008         if (t == 0) {
2009                 /* We don' want to disarm here, just mean some time looooong ago. */
2010                 its.it_value.tv_sec = 0;
2011                 its.it_value.tv_nsec = 1;
2012         } else
2013                 timespec_store(&its.it_value, t);
2014
2015         r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2016         if (r < 0)
2017                 return -errno;
2018
2019         d->next = t;
2020         return 0;
2021 }
2022
2023 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
2024         assert(e);
2025         assert(s);
2026         assert(s->type == SOURCE_IO);
2027
2028         /* If the event source was already pending, we just OR in the
2029          * new revents, otherwise we reset the value. The ORing is
2030          * necessary to handle EPOLLONESHOT events properly where
2031          * readability might happen independently of writability, and
2032          * we need to keep track of both */
2033
2034         if (s->pending)
2035                 s->io.revents |= revents;
2036         else
2037                 s->io.revents = revents;
2038
2039         return source_set_pending(s, true);
2040 }
2041
2042 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
2043         uint64_t x;
2044         ssize_t ss;
2045
2046         assert(e);
2047         assert(fd >= 0);
2048
2049         assert_return(events == EPOLLIN, -EIO);
2050
2051         ss = read(fd, &x, sizeof(x));
2052         if (ss < 0) {
2053                 if (errno == EAGAIN || errno == EINTR)
2054                         return 0;
2055
2056                 return -errno;
2057         }
2058
2059         if (_unlikely_(ss != sizeof(x)))
2060                 return -EIO;
2061
2062         if (next)
2063                 *next = USEC_INFINITY;
2064
2065         return 0;
2066 }
2067
2068 static int process_timer(
2069                 sd_event *e,
2070                 usec_t n,
2071                 struct clock_data *d) {
2072
2073         sd_event_source *s;
2074         int r;
2075
2076         assert(e);
2077         assert(d);
2078
2079         for (;;) {
2080                 s = prioq_peek(d->earliest);
2081                 if (!s ||
2082                     s->time.next > n ||
2083                     s->enabled == SD_EVENT_OFF ||
2084                     s->pending)
2085                         break;
2086
2087                 r = source_set_pending(s, true);
2088                 if (r < 0)
2089                         return r;
2090
2091                 prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
2092                 prioq_reshuffle(d->latest, s, &s->time.latest_index);
2093                 d->needs_rearm = true;
2094         }
2095
2096         return 0;
2097 }
2098
/* Polls every watched child process with waitid(WNOHANG) and marks the
 * corresponding event sources pending where a state change is queued.
 * Returns 0 on success, negative errno if waitid() or
 * source_set_pending() fails. */
static int process_child(sd_event *e) {
        sd_event_source *s;
        Iterator i;
        int r;

        assert(e);

        /* Clear the flag before iterating, so that it may be set again
         * while we are still processing. */
        e->need_process_child = false;

        /*
           So, this is ugly. We iteratively invoke waitid() with P_PID
           + WNOHANG for each PID we wait for, instead of using
           P_ALL. This is because we only want to get child
           information of very specific child processes, and not all
           of them. We might not have processed the SIGCHLD event of a
           previous invocation and we don't want to maintain an
           unbounded *per-child* event queue, hence we really don't
           want anything flushed out of the kernel's queue that we
           don't care about. Since this is O(n) this means that if you
           have a lot of processes you probably want to handle SIGCHLD
           yourself.

           We do not reap the children here (by using WNOWAIT), this
           is only done after the event source is dispatched so that
           the callback still sees the process as a zombie.
        */

        HASHMAP_FOREACH(s, e->child_sources, i) {
                assert(s->type == SOURCE_CHILD);

                /* Already pending: the previous state change hasn't been
                 * dispatched yet, don't query again. */
                if (s->pending)
                        continue;

                if (s->enabled == SD_EVENT_OFF)
                        continue;

                /* si_pid stays 0 if no state change is queued, see below. */
                zero(s->child.siginfo);
                /* WNOWAIT (only valid together with WEXITED) keeps the
                 * process waitable so the dispatch callback can still
                 * inspect the zombie; reaping happens in source_dispatch(). */
                r = waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
                if (r < 0)
                        return -errno;

                if (s->child.siginfo.si_pid != 0) {
                        bool zombie =
                                s->child.siginfo.si_code == CLD_EXITED ||
                                s->child.siginfo.si_code == CLD_KILLED ||
                                s->child.siginfo.si_code == CLD_DUMPED;

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's
                                 * immediately remove the state change
                                 * from the queue, since there's no
                                 * benefit in leaving it queued */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
                        if (r < 0)
                                return r;
                }
        }

        return 0;
}
2165
/* Dequeues at most one watched signal from this priority's signalfd and
 * marks the matching signal event source pending. Returns > 0 if a
 * source was made pending, 0 if the queue was drained without finding a
 * watched signal (or a source is already pending for this priority),
 * negative errno on failure. */
static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
        bool read_one = false;
        int r;

        assert(e);
        assert_return(events == EPOLLIN, -EIO);

        /* If there's a signal queued on this priority and SIGCHLD is
           on this priority too, then make sure to recheck the
           children we watch. This is because we only ever dequeue
           the first signal per priority, and if we dequeue one, and
           SIGCHLD might be enqueued later we wouldn't know, but we
           might have higher priority children we care about hence we
           need to check that explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

        /* If there's already an event source pending for this
         * priority we don't read another */
        if (d->current)
                return 0;

        for (;;) {
                struct signalfd_siginfo si;
                ssize_t n;
                sd_event_source *s = NULL;

                n = read(d->fd, &si, sizeof(si));
                if (n < 0) {
                        /* EAGAIN: queue drained; report whether we read
                         * anything at all. */
                        if (errno == EAGAIN || errno == EINTR)
                                return read_one;

                        return -errno;
                }

                /* signalfd reads always deliver whole siginfo structures. */
                if (_unlikely_(n != sizeof(si)))
                        return -EIO;

                assert(SIGNAL_VALID(si.ssi_signo));

                read_one = true;

                /* Signals nobody subscribed to are simply discarded —
                 * reading already removed them from the queue. */
                if (e->signal_sources)
                        s = e->signal_sources[si.ssi_signo];
                if (!s)
                        continue;
                if (s->pending)
                        continue;

                /* Stash the siginfo for the dispatch callback and
                 * remember this priority has a signal in flight, so we
                 * don't dequeue another until it was dispatched. */
                s->signal.siginfo = si;
                d->current = s;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                return 1;
        }
}
2226
/* Invokes the callback of a single pending (or exit) event source.
 * Along the way: marks all post sources pending (for non-post sources),
 * turns SD_EVENT_ONESHOT sources off before dispatching, reaps dead
 * children after their callback ran, disables sources whose callback
 * failed, and frees sources that dropped their last reference while
 * dispatching. Returns 1 on success, negative errno on internal
 * failure. */
static int source_dispatch(sd_event_source *s) {
        EventSourceType saved_type;
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        /* Save the event source type, here, so that we still know it after the event callback which might invalidate
         * the event. */
        saved_type = s->type;

        /* Defer and exit sources stay pending across dispatches; all
         * others are un-pended before their callback runs. */
        if (s->type != SOURCE_DEFER && s->type != SOURCE_EXIT) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        if (s->type != SOURCE_POST) {
                sd_event_source *z;
                Iterator i;

                /* If we execute a non-post source, let's mark all
                 * post sources as pending */

                SET_FOREACH(z, s->event->post_sources, i) {
                        if (z->enabled == SD_EVENT_OFF)
                                continue;

                        r = source_set_pending(z, true);
                        if (r < 0)
                                return r;
                }
        }

        /* Oneshot sources are disabled before dispatching, so the
         * callback may re-enable them if it wants another round. */
        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        /* Flag the source while the user callback runs, so that
         * re-entrant API calls can detect this state. */
        s->dispatching = true;

        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                zombie = s->child.siginfo.si_code == CLD_EXITED ||
                         s->child.siginfo.si_code == CLD_KILLED ||
                         s->child.siginfo.si_code == CLD_DUMPED;

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie)
                        waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_POST:
                r = s->post.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        s->dispatching = false;

        /* Use saved_type here: the callback may have invalidated s->type. */
        if (r < 0)
                log_debug_errno(r, "Event source %s (type %s) returned error, disabling: %m",
                                strna(s->description), event_source_type_to_string(saved_type));

        /* If the callback dropped the last reference, free the source
         * now; a failing callback merely disables its source. */
        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                sd_event_source_set_enabled(s, SD_EVENT_OFF);

        return 1;
}
2334
/* Runs the prepare() callbacks of all enabled sources that haven't been
 * prepared during this loop iteration yet, in prepare-queue order. A
 * failing prepare callback gets its source disabled (or freed, if the
 * callback dropped the last reference). Returns 0 on success, negative
 * errno if the priority queue cannot be reshuffled. */
static int event_prepare(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                /* The prepare queue sorts un-prepared sources first, so
                 * peeking a source already prepared this iteration (or a
                 * disabled one) means we are done. */
                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
                        break;

                /* Mark as prepared *before* reshuffling, so the source
                 * sinks behind the remaining un-prepared ones. */
                s->prepare_iteration = e->iteration;
                r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;

                assert(s->prepare);

                /* Flag the source while the user callback runs. */
                s->dispatching = true;
                r = s->prepare(s, s->userdata);
                s->dispatching = false;

                if (r < 0)
                        log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, disabling: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        sd_event_source_set_enabled(s, SD_EVENT_OFF);
        }

        return 0;
}
2370
2371 static int dispatch_exit(sd_event *e) {
2372         sd_event_source *p;
2373         int r;
2374
2375         assert(e);
2376
2377         p = prioq_peek(e->exit);
2378         if (!p || p->enabled == SD_EVENT_OFF) {
2379                 e->state = SD_EVENT_FINISHED;
2380                 return 0;
2381         }
2382
2383         sd_event_ref(e);
2384         e->iteration++;
2385         e->state = SD_EVENT_EXITING;
2386
2387         r = source_dispatch(p);
2388
2389         e->state = SD_EVENT_INITIAL;
2390         sd_event_unref(e);
2391
2392         return r;
2393 }
2394
2395 static sd_event_source* event_next_pending(sd_event *e) {
2396         sd_event_source *p;
2397
2398         assert(e);
2399
2400         p = prioq_peek(e->pending);
2401         if (!p)
2402                 return NULL;
2403
2404         if (p->enabled == SD_EVENT_OFF)
2405                 return NULL;
2406
2407         return p;
2408 }
2409
2410 static int arm_watchdog(sd_event *e) {
2411         struct itimerspec its = {};
2412         usec_t t;
2413         int r;
2414
2415         assert(e);
2416         assert(e->watchdog_fd >= 0);
2417
2418         t = sleep_between(e,
2419                           e->watchdog_last + (e->watchdog_period / 2),
2420                           e->watchdog_last + (e->watchdog_period * 3 / 4));
2421
2422         timespec_store(&its.it_value, t);
2423
2424         /* Make sure we never set the watchdog to 0, which tells the
2425          * kernel to disable it. */
2426         if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
2427                 its.it_value.tv_nsec = 1;
2428
2429         r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
2430         if (r < 0)
2431                 return -errno;
2432
2433         return 0;
2434 }
2435
2436 static int process_watchdog(sd_event *e) {
2437         assert(e);
2438
2439         if (!e->watchdog)
2440                 return 0;
2441
2442         /* Don't notify watchdog too often */
2443         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
2444                 return 0;
2445
2446         sd_notify(false, "WATCHDOG=1");
2447         e->watchdog_last = e->timestamp.monotonic;
2448
2449         return arm_watchdog(e);
2450 }
2451
/* First phase of an event loop iteration: runs all prepare callbacks
 * and arms the per-clock timerfds. Returns > 0 if event sources are
 * pending already (state becomes SD_EVENT_PENDING, skip the wait), 0 if
 * the caller should proceed to sd_event_wait() (state SD_EVENT_ARMED),
 * negative errno on failure. Must be called in SD_EVENT_INITIAL
 * state. */
_public_ int sd_event_prepare(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        /* After sd_event_exit() only exit sources matter; skip straight
         * to reporting "pending" so they get dispatched. */
        if (e->exit_requested)
                goto pending;

        e->iteration++;

        e->state = SD_EVENT_PREPARING;
        r = event_prepare(e);
        e->state = SD_EVENT_INITIAL;
        if (r < 0)
                return r;

        /* Rearm the timerfd of every clock whose queue changed. */
        r = event_arm_timer(e, &e->realtime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->monotonic);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime_alarm);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime_alarm);
        if (r < 0)
                return r;

        if (event_next_pending(e) || e->need_process_child)
                goto pending;

        e->state = SD_EVENT_ARMED;

        return 0;

pending:
        /* Something is pending already: do a zero-timeout wait so the
         * wait-phase bookkeeping still runs (sd_event_wait() requires
         * the ARMED state), then restore ARMED if nothing came of it. */
        e->state = SD_EVENT_ARMED;
        r = sd_event_wait(e, 0);
        if (r == 0)
                e->state = SD_EVENT_ARMED;

        return r;
}
2506
/* Second phase of an iteration: waits up to the specified timeout (in
 * µs; (uint64_t) -1 means forever) for epoll events, then converts IO,
 * timer, signal and child wakeups into pending event sources. Returns
 * > 0 if sources are pending and ready for sd_event_dispatch() (state
 * SD_EVENT_PENDING), 0 if nothing became pending (state back to
 * SD_EVENT_INITIAL), negative errno on failure. Must be called in
 * SD_EVENT_ARMED state, i.e. after sd_event_prepare(). */
_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
        struct epoll_event *ev_queue;
        unsigned ev_queue_max;
        int r, m, i;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        /* After sd_event_exit(), report "pending" immediately so the
         * exit sources get dispatched without waiting. */
        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        /* Stack-allocate one epoll slot per source, at least one. */
        ev_queue_max = MAX(e->n_sources, 1u);
        ev_queue = newa(struct epoll_event, ev_queue_max);

        /* Round the µs timeout up to whole ms, since epoll_wait() only
         * has millisecond granularity. */
        m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max,
                       timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
        if (m < 0) {
                if (errno == EINTR) {
                        /* Treat an interrupted wait like a spurious
                         * wakeup: claim something is pending and let the
                         * dispatch phase find nothing. */
                        e->state = SD_EVENT_PENDING;
                        return 1;
                }

                r = -errno;
                goto finish;
        }

        /* Snapshot the current time once; all timer processing below
         * uses this cached triple timestamp. */
        triple_timestamp_get(&e->timestamp);

        for (i = 0; i < m; i++) {

                if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
                else {
                        /* Every other registration stores a pointer to an
                         * object that starts with a WakeupType field,
                         * which discriminates what we registered. */
                        WakeupType *t = ev_queue[i].data.ptr;

                        switch (*t) {

                        case WAKEUP_EVENT_SOURCE:
                                r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = ev_queue[i].data.ptr;
                                r = flush_timer(e, d->fd, ev_queue[i].events, &d->next);
                                break;
                        }

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        default:
                                assert_not_reached("Invalid wake-up pointer");
                        }
                }
                if (r < 0)
                        goto finish;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        /* Turn elapsed timers on every clock into pending sources. Note
         * both alarm clocks share the realtime/boottime timestamps. */
        r = process_timer(e, e->timestamp.realtime, &e->realtime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
        if (r < 0)
                goto finish;

        /* Set by process_signal() when SIGCHLD may have been consumed. */
        if (e->need_process_child) {
                r = process_child(e);
                if (r < 0)
                        goto finish;
        }

        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;

                return 1;
        }

        r = 0;

finish:
        e->state = SD_EVENT_INITIAL;

        return r;
}
2613
2614 _public_ int sd_event_dispatch(sd_event *e) {
2615         sd_event_source *p;
2616         int r;
2617
2618         assert_return(e, -EINVAL);
2619         assert_return(!event_pid_changed(e), -ECHILD);
2620         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2621         assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
2622
2623         if (e->exit_requested)
2624                 return dispatch_exit(e);
2625
2626         p = event_next_pending(e);
2627         if (p) {
2628                 sd_event_ref(e);
2629
2630                 e->state = SD_EVENT_RUNNING;
2631                 r = source_dispatch(p);
2632                 e->state = SD_EVENT_INITIAL;
2633
2634                 sd_event_unref(e);
2635
2636                 return r;
2637         }
2638
2639         e->state = SD_EVENT_INITIAL;
2640
2641         return 1;
2642 }
2643
2644 static void event_log_delays(sd_event *e) {
2645         char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1];
2646         unsigned i;
2647         int o;
2648
2649         for (i = o = 0; i < ELEMENTSOF(e->delays); i++) {
2650                 o += snprintf(&b[o], sizeof(b) - o, "%u ", e->delays[i]);
2651                 e->delays[i] = 0;
2652         }
2653         log_debug("Event loop iterations: %.*s", o, b);
2654 }
2655
2656 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
2657         int r;
2658
2659         assert_return(e, -EINVAL);
2660         assert_return(!event_pid_changed(e), -ECHILD);
2661         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2662         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2663
2664         if (e->profile_delays && e->last_run) {
2665                 usec_t this_run;
2666                 unsigned l;
2667
2668                 this_run = now(CLOCK_MONOTONIC);
2669
2670                 l = u64log2(this_run - e->last_run);
2671                 assert(l < sizeof(e->delays));
2672                 e->delays[l]++;
2673
2674                 if (this_run - e->last_log >= 5*USEC_PER_SEC) {
2675                         event_log_delays(e);
2676                         e->last_log = this_run;
2677                 }
2678         }
2679
2680         r = sd_event_prepare(e);
2681         if (r == 0)
2682                 /* There was nothing? Then wait... */
2683                 r = sd_event_wait(e, timeout);
2684
2685         if (e->profile_delays)
2686                 e->last_run = now(CLOCK_MONOTONIC);
2687
2688         if (r > 0) {
2689                 /* There's something now, then let's dispatch it */
2690                 r = sd_event_dispatch(e);
2691                 if (r < 0)
2692                         return r;
2693
2694                 return 1;
2695         }
2696
2697         return r;
2698 }
2699
2700 _public_ int sd_event_loop(sd_event *e) {
2701         int r;
2702
2703         assert_return(e, -EINVAL);
2704         assert_return(!event_pid_changed(e), -ECHILD);
2705         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2706
2707         sd_event_ref(e);
2708
2709         while (e->state != SD_EVENT_FINISHED) {
2710                 r = sd_event_run(e, (uint64_t) -1);
2711                 if (r < 0)
2712                         goto finish;
2713         }
2714
2715         r = e->exit_code;
2716
2717 finish:
2718         sd_event_unref(e);
2719         return r;
2720 }
2721
2722 _public_ int sd_event_get_fd(sd_event *e) {
2723
2724         assert_return(e, -EINVAL);
2725         assert_return(!event_pid_changed(e), -ECHILD);
2726
2727         return e->epoll_fd;
2728 }
2729
2730 _public_ int sd_event_get_state(sd_event *e) {
2731         assert_return(e, -EINVAL);
2732         assert_return(!event_pid_changed(e), -ECHILD);
2733
2734         return e->state;
2735 }
2736
2737 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
2738         assert_return(e, -EINVAL);
2739         assert_return(code, -EINVAL);
2740         assert_return(!event_pid_changed(e), -ECHILD);
2741
2742         if (!e->exit_requested)
2743                 return -ENODATA;
2744
2745         *code = e->exit_code;
2746         return 0;
2747 }
2748
2749 _public_ int sd_event_exit(sd_event *e, int code) {
2750         assert_return(e, -EINVAL);
2751         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2752         assert_return(!event_pid_changed(e), -ECHILD);
2753
2754         e->exit_requested = true;
2755         e->exit_code = code;
2756
2757         return 0;
2758 }
2759
/* Stores the loop's cached timestamp (taken once per iteration, in
 * sd_event_wait()) for the given clock in *usec. Returns 0 on success,
 * 1 if the loop never ran and the live clock value was returned
 * instead, -EOPNOTSUPP for unsupported clocks. */
_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
        assert_return(e, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
                return -EOPNOTSUPP;

        /* Generate a clean error in case CLOCK_BOOTTIME is not available. Note that don't use clock_supported() here,
         * for a reason: there are systems where CLOCK_BOOTTIME is supported, but CLOCK_BOOTTIME_ALARM is not, but for
         * the purpose of getting the time this doesn't matter. */
        if (IN_SET(clock, CLOCK_BOOTTIME, CLOCK_BOOTTIME_ALARM) && !clock_boottime_supported())
                return -EOPNOTSUPP;

        if (!triple_timestamp_is_set(&e->timestamp)) {
                /* Implicitly fall back to now() if we never ran
                 * before and thus have no cached time. */
                *usec = now(clock);
                return 1;
        }

        *usec = triple_timestamp_by_clock(&e->timestamp, clock);
        return 0;
}
2784
2785 _public_ int sd_event_default(sd_event **ret) {
2786
2787         static thread_local sd_event *default_event = NULL;
2788         sd_event *e = NULL;
2789         int r;
2790
2791         if (!ret)
2792                 return !!default_event;
2793
2794         if (default_event) {
2795                 *ret = sd_event_ref(default_event);
2796                 return 0;
2797         }
2798
2799         r = sd_event_new(&e);
2800         if (r < 0)
2801                 return r;
2802
2803         e->default_event_ptr = &default_event;
2804         e->tid = gettid();
2805         default_event = e;
2806
2807         *ret = e;
2808         return 1;
2809 }
2810
2811 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
2812         assert_return(e, -EINVAL);
2813         assert_return(tid, -EINVAL);
2814         assert_return(!event_pid_changed(e), -ECHILD);
2815
2816         if (e->tid != 0) {
2817                 *tid = e->tid;
2818                 return 0;
2819         }
2820
2821         return -ENXIO;
2822 }
2823
/* Enables (b != 0) or disables (b == 0) service-manager watchdog
 * support: when enabled and $WATCHDOG_USEC is set, the loop pings the
 * manager with WATCHDOG=1 via a dedicated timerfd (see arm_watchdog()
 * and process_watchdog()). Returns the resulting on/off state (0 also
 * when the manager requested no watchdog), negative errno on failure. */
_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Already in the requested state: nothing to do. */
        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                struct epoll_event ev = {};

                /* Query the watchdog period the service manager set up
                 * for us; r == 0 means no watchdog was requested. */
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                /* Tag the registration with the SOURCE_WATCHDOG marker
                 * so sd_event_wait() can recognize this wakeup. */
                ev.events = EPOLLIN;
                ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG);

                r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
                if (r < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                /* Disable: deregister and close the timerfd, if any. */
                if (e->watchdog_fd >= 0) {
                        epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}
2875
2876 _public_ int sd_event_get_watchdog(sd_event *e) {
2877         assert_return(e, -EINVAL);
2878         assert_return(!event_pid_changed(e), -ECHILD);
2879
2880         return e->watchdog;
2881 }
2882
2883 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
2884         assert_return(e, -EINVAL);
2885         assert_return(!event_pid_changed(e), -ECHILD);
2886
2887         *ret = e->iteration;
2888         return 0;
2889 }