1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2013 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
22 #include <sys/epoll.h>
23 #include <sys/timerfd.h>
27 #include "sd-daemon.h"
32 #include "time-util.h"
37 #define EPOLL_QUEUE_MAX 64
38 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
40 typedef enum EventSourceType {
51 struct sd_event_source {
56 sd_event_handler_t prepare;
58 EventSourceType type:4;
64 unsigned pending_index;
65 unsigned prepare_index;
66 unsigned pending_iteration;
67 unsigned prepare_iteration;
71 sd_event_io_handler_t callback;
78 sd_event_time_handler_t callback;
79 usec_t next, accuracy;
80 unsigned earliest_index;
81 unsigned latest_index;
84 sd_event_signal_handler_t callback;
85 struct signalfd_siginfo siginfo;
89 sd_event_child_handler_t callback;
95 sd_event_handler_t callback;
98 sd_event_handler_t callback;
116 /* For both clocks we maintain two priority queues each, one
117 * ordered for the earliest times the events may be
118 * dispatched, and one ordered by the latest times they must
119 * have been dispatched. The range between the top entries in
120 * the two prioqs is the time window we can freely schedule
122 Prioq *monotonic_earliest;
123 Prioq *monotonic_latest;
124 Prioq *realtime_earliest;
125 Prioq *realtime_latest;
127 usec_t realtime_next, monotonic_next;
131 sd_event_source **signal_sources;
133 Hashmap *child_sources;
134 unsigned n_enabled_child_sources;
141 dual_timestamp timestamp;
144 bool exit_requested:1;
145 bool need_process_child:1;
151 sd_event **default_event_ptr;
153 usec_t watchdog_last, watchdog_period;
156 static int pending_prioq_compare(const void *a, const void *b) {
157 const sd_event_source *x = a, *y = b;
162 /* Enabled ones first */
163 if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
165 if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
168 /* Lower priority values first */
169 if (x->priority < y->priority)
171 if (x->priority > y->priority)
174 /* Older entries first */
175 if (x->pending_iteration < y->pending_iteration)
177 if (x->pending_iteration > y->pending_iteration)
180 /* Stability for the rest */
189 static int prepare_prioq_compare(const void *a, const void *b) {
190 const sd_event_source *x = a, *y = b;
195 /* Move most recently prepared ones last, so that we can stop
196 * preparing as soon as we hit one that has already been
197 * prepared in the current iteration */
198 if (x->prepare_iteration < y->prepare_iteration)
200 if (x->prepare_iteration > y->prepare_iteration)
203 /* Enabled ones first */
204 if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
206 if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
209 /* Lower priority values first */
210 if (x->priority < y->priority)
212 if (x->priority > y->priority)
215 /* Stability for the rest */
224 static int earliest_time_prioq_compare(const void *a, const void *b) {
225 const sd_event_source *x = a, *y = b;
227 assert(x->type == SOURCE_MONOTONIC || x->type == SOURCE_REALTIME);
228 assert(y->type == SOURCE_MONOTONIC || y->type == SOURCE_REALTIME);
230 /* Enabled ones first */
231 if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
233 if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
236 /* Move the pending ones to the end */
237 if (!x->pending && y->pending)
239 if (x->pending && !y->pending)
243 if (x->time.next < y->time.next)
245 if (x->time.next > y->time.next)
248 /* Stability for the rest */
257 static int latest_time_prioq_compare(const void *a, const void *b) {
258 const sd_event_source *x = a, *y = b;
260 assert((x->type == SOURCE_MONOTONIC && y->type == SOURCE_MONOTONIC) ||
261 (x->type == SOURCE_REALTIME && y->type == SOURCE_REALTIME));
263 /* Enabled ones first */
264 if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
266 if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
269 /* Move the pending ones to the end */
270 if (!x->pending && y->pending)
272 if (x->pending && !y->pending)
276 if (x->time.next + x->time.accuracy < y->time.next + y->time.accuracy)
278 if (x->time.next + x->time.accuracy > y->time.next + y->time.accuracy)
281 /* Stability for the rest */
290 static int exit_prioq_compare(const void *a, const void *b) {
291 const sd_event_source *x = a, *y = b;
293 assert(x->type == SOURCE_EXIT);
294 assert(y->type == SOURCE_EXIT);
296 /* Enabled ones first */
297 if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
299 if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
302 /* Lower priority values first */
303 if (x->priority < y->priority)
305 if (x->priority > y->priority)
308 /* Stability for the rest */
317 static void event_free(sd_event *e) {
320 if (e->default_event_ptr)
321 *(e->default_event_ptr) = NULL;
323 if (e->epoll_fd >= 0)
324 close_nointr_nofail(e->epoll_fd);
326 if (e->signal_fd >= 0)
327 close_nointr_nofail(e->signal_fd);
329 if (e->realtime_fd >= 0)
330 close_nointr_nofail(e->realtime_fd);
332 if (e->monotonic_fd >= 0)
333 close_nointr_nofail(e->monotonic_fd);
335 if (e->watchdog_fd >= 0)
336 close_nointr_nofail(e->watchdog_fd);
338 prioq_free(e->pending);
339 prioq_free(e->prepare);
340 prioq_free(e->monotonic_earliest);
341 prioq_free(e->monotonic_latest);
342 prioq_free(e->realtime_earliest);
343 prioq_free(e->realtime_latest);
346 free(e->signal_sources);
348 hashmap_free(e->child_sources);
352 _public_ int sd_event_new(sd_event** ret) {
356 assert_return(ret, -EINVAL);
358 e = new0(sd_event, 1);
363 e->signal_fd = e->realtime_fd = e->monotonic_fd = e->watchdog_fd = e->epoll_fd = -1;
364 e->realtime_next = e->monotonic_next = (usec_t) -1;
365 e->original_pid = getpid();
367 assert_se(sigemptyset(&e->sigset) == 0);
369 e->pending = prioq_new(pending_prioq_compare);
375 e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
376 if (e->epoll_fd < 0) {
389 _public_ sd_event* sd_event_ref(sd_event *e) {
390 assert_return(e, NULL);
392 assert(e->n_ref >= 1);
398 _public_ sd_event* sd_event_unref(sd_event *e) {
403 assert(e->n_ref >= 1);
412 static bool event_pid_changed(sd_event *e) {
415 /* We don't support people creating am event loop and keeping
416 * it around over a fork(). Let's complain. */
418 return e->original_pid != getpid();
421 static int source_io_unregister(sd_event_source *s) {
425 assert(s->type == SOURCE_IO);
427 if (!s->io.registered)
430 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
434 s->io.registered = false;
438 static int source_io_register(
443 struct epoll_event ev = {};
447 assert(s->type == SOURCE_IO);
448 assert(enabled != SD_EVENT_OFF);
453 if (enabled == SD_EVENT_ONESHOT)
454 ev.events |= EPOLLONESHOT;
456 if (s->io.registered)
457 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
459 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
464 s->io.registered = true;
469 static void source_free(sd_event_source *s) {
477 source_io_unregister(s);
481 case SOURCE_MONOTONIC:
482 prioq_remove(s->event->monotonic_earliest, s, &s->time.earliest_index);
483 prioq_remove(s->event->monotonic_latest, s, &s->time.latest_index);
486 case SOURCE_REALTIME:
487 prioq_remove(s->event->realtime_earliest, s, &s->time.earliest_index);
488 prioq_remove(s->event->realtime_latest, s, &s->time.latest_index);
492 if (s->signal.sig > 0) {
493 if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0)
494 assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0);
496 if (s->event->signal_sources)
497 s->event->signal_sources[s->signal.sig] = NULL;
503 if (s->child.pid > 0) {
504 if (s->enabled != SD_EVENT_OFF) {
505 assert(s->event->n_enabled_child_sources > 0);
506 s->event->n_enabled_child_sources--;
509 if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD])
510 assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0);
512 hashmap_remove(s->event->child_sources, INT_TO_PTR(s->child.pid));
522 prioq_remove(s->event->exit, s, &s->exit.prioq_index);
527 prioq_remove(s->event->pending, s, &s->pending_index);
530 prioq_remove(s->event->prepare, s, &s->prepare_index);
532 sd_event_unref(s->event);
538 static int source_set_pending(sd_event_source *s, bool b) {
542 assert(s->type != SOURCE_EXIT);
550 s->pending_iteration = s->event->iteration;
552 r = prioq_put(s->event->pending, s, &s->pending_index);
558 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
560 if (s->type == SOURCE_REALTIME) {
561 prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
562 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
563 } else if (s->type == SOURCE_MONOTONIC) {
564 prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
565 prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
571 static sd_event_source *source_new(sd_event *e, EventSourceType type) {
576 s = new0(sd_event_source, 1);
581 s->event = sd_event_ref(e);
583 s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;
588 _public_ int sd_event_add_io(
592 sd_event_io_handler_t callback,
594 sd_event_source **ret) {
599 assert_return(e, -EINVAL);
600 assert_return(fd >= 0, -EINVAL);
601 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
602 assert_return(callback, -EINVAL);
603 assert_return(ret, -EINVAL);
604 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
605 assert_return(!event_pid_changed(e), -ECHILD);
607 s = source_new(e, SOURCE_IO);
612 s->io.events = events;
613 s->io.callback = callback;
614 s->userdata = userdata;
615 s->enabled = SD_EVENT_ON;
617 r = source_io_register(s, s->enabled, events);
627 static int event_setup_timer_fd(
629 EventSourceType type,
633 struct epoll_event ev = {};
640 if (_likely_(*timer_fd >= 0))
643 fd = timerfd_create(id, TFD_NONBLOCK|TFD_CLOEXEC);
648 ev.data.ptr = INT_TO_PTR(type);
650 r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
652 close_nointr_nofail(fd);
656 /* When we sleep for longer, we try to realign the wakeup to
657 the same time within each minute/second/250ms, so that
658 events all across the system can be coalesced into a single
659 CPU wakeup. However, let's take some system-specific
660 randomness for this value, so that in a network of systems
661 with synced clocks timer events are distributed a
662 bit. Here, we calculate a perturbation usec offset from the
665 if (sd_id128_get_boot(&bootid) >= 0)
666 e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
672 static int event_add_time_internal(
674 EventSourceType type,
681 sd_event_time_handler_t callback,
683 sd_event_source **ret) {
688 assert_return(e, -EINVAL);
689 assert_return(callback, -EINVAL);
690 assert_return(ret, -EINVAL);
691 assert_return(usec != (uint64_t) -1, -EINVAL);
692 assert_return(accuracy != (uint64_t) -1, -EINVAL);
693 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
694 assert_return(!event_pid_changed(e), -ECHILD);
701 *earliest = prioq_new(earliest_time_prioq_compare);
707 *latest = prioq_new(latest_time_prioq_compare);
713 r = event_setup_timer_fd(e, type, timer_fd, id);
718 s = source_new(e, type);
723 s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
724 s->time.callback = callback;
725 s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
726 s->userdata = userdata;
727 s->enabled = SD_EVENT_ONESHOT;
729 r = prioq_put(*earliest, s, &s->time.earliest_index);
733 r = prioq_put(*latest, s, &s->time.latest_index);
745 _public_ int sd_event_add_monotonic(sd_event *e,
748 sd_event_time_handler_t callback,
750 sd_event_source **ret) {
752 return event_add_time_internal(e, SOURCE_MONOTONIC, &e->monotonic_fd, CLOCK_MONOTONIC, &e->monotonic_earliest, &e->monotonic_latest, usec, accuracy, callback, userdata, ret);
755 _public_ int sd_event_add_realtime(sd_event *e,
758 sd_event_time_handler_t callback,
760 sd_event_source **ret) {
762 return event_add_time_internal(e, SOURCE_REALTIME, &e->realtime_fd, CLOCK_REALTIME, &e->realtime_earliest, &e->monotonic_latest, usec, accuracy, callback, userdata, ret);
765 static int event_update_signal_fd(sd_event *e) {
766 struct epoll_event ev = {};
772 add_to_epoll = e->signal_fd < 0;
774 r = signalfd(e->signal_fd, &e->sigset, SFD_NONBLOCK|SFD_CLOEXEC);
784 ev.data.ptr = INT_TO_PTR(SOURCE_SIGNAL);
786 r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->signal_fd, &ev);
788 close_nointr_nofail(e->signal_fd);
797 _public_ int sd_event_add_signal(
800 sd_event_signal_handler_t callback,
802 sd_event_source **ret) {
807 assert_return(e, -EINVAL);
808 assert_return(sig > 0, -EINVAL);
809 assert_return(sig < _NSIG, -EINVAL);
810 assert_return(callback, -EINVAL);
811 assert_return(ret, -EINVAL);
812 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
813 assert_return(!event_pid_changed(e), -ECHILD);
815 if (!e->signal_sources) {
816 e->signal_sources = new0(sd_event_source*, _NSIG);
817 if (!e->signal_sources)
819 } else if (e->signal_sources[sig])
822 s = source_new(e, SOURCE_SIGNAL);
827 s->signal.callback = callback;
828 s->userdata = userdata;
829 s->enabled = SD_EVENT_ON;
831 e->signal_sources[sig] = s;
832 assert_se(sigaddset(&e->sigset, sig) == 0);
834 if (sig != SIGCHLD || e->n_enabled_child_sources == 0) {
835 r = event_update_signal_fd(e);
846 _public_ int sd_event_add_child(
850 sd_event_child_handler_t callback,
852 sd_event_source **ret) {
857 assert_return(e, -EINVAL);
858 assert_return(pid > 1, -EINVAL);
859 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
860 assert_return(options != 0, -EINVAL);
861 assert_return(callback, -EINVAL);
862 assert_return(ret, -EINVAL);
863 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
864 assert_return(!event_pid_changed(e), -ECHILD);
866 r = hashmap_ensure_allocated(&e->child_sources, trivial_hash_func, trivial_compare_func);
870 if (hashmap_contains(e->child_sources, INT_TO_PTR(pid)))
873 s = source_new(e, SOURCE_CHILD);
878 s->child.options = options;
879 s->child.callback = callback;
880 s->userdata = userdata;
881 s->enabled = SD_EVENT_ONESHOT;
883 r = hashmap_put(e->child_sources, INT_TO_PTR(pid), s);
889 e->n_enabled_child_sources ++;
891 assert_se(sigaddset(&e->sigset, SIGCHLD) == 0);
893 if (!e->signal_sources || !e->signal_sources[SIGCHLD]) {
894 r = event_update_signal_fd(e);
901 e->need_process_child = true;
907 _public_ int sd_event_add_defer(
909 sd_event_handler_t callback,
911 sd_event_source **ret) {
916 assert_return(e, -EINVAL);
917 assert_return(callback, -EINVAL);
918 assert_return(ret, -EINVAL);
919 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
920 assert_return(!event_pid_changed(e), -ECHILD);
922 s = source_new(e, SOURCE_DEFER);
926 s->defer.callback = callback;
927 s->userdata = userdata;
928 s->enabled = SD_EVENT_ONESHOT;
930 r = source_set_pending(s, true);
940 _public_ int sd_event_add_exit(
942 sd_event_handler_t callback,
944 sd_event_source **ret) {
949 assert_return(e, -EINVAL);
950 assert_return(callback, -EINVAL);
951 assert_return(ret, -EINVAL);
952 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
953 assert_return(!event_pid_changed(e), -ECHILD);
956 e->exit = prioq_new(exit_prioq_compare);
961 s = source_new(e, SOURCE_EXIT);
965 s->exit.callback = callback;
966 s->userdata = userdata;
967 s->exit.prioq_index = PRIOQ_IDX_NULL;
968 s->enabled = SD_EVENT_ONESHOT;
970 r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
980 _public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
981 assert_return(s, NULL);
983 assert(s->n_ref >= 1);
989 _public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {
994 assert(s->n_ref >= 1);
998 /* Here's a special hack: when we are called from a
999 * dispatch handler we won't free the event source
1000 * immediately, but we will detach the fd from the
1001 * epoll. This way it is safe for the caller to unref
1002 * the event source and immediately close the fd, but
1003 * we still retain a valid event source object after
1006 if (s->dispatching) {
1007 if (s->type == SOURCE_IO)
1008 source_io_unregister(s);
1016 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
1017 assert_return(s, NULL);
1022 _public_ int sd_event_source_get_pending(sd_event_source *s) {
1023 assert_return(s, -EINVAL);
1024 assert_return(s->type != SOURCE_EXIT, -EDOM);
1025 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1026 assert_return(!event_pid_changed(s->event), -ECHILD);
1031 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
1032 assert_return(s, -EINVAL);
1033 assert_return(s->type == SOURCE_IO, -EDOM);
1034 assert_return(!event_pid_changed(s->event), -ECHILD);
1039 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
1040 assert_return(s, -EINVAL);
1041 assert_return(events, -EINVAL);
1042 assert_return(s->type == SOURCE_IO, -EDOM);
1043 assert_return(!event_pid_changed(s->event), -ECHILD);
1045 *events = s->io.events;
1049 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
1052 assert_return(s, -EINVAL);
1053 assert_return(s->type == SOURCE_IO, -EDOM);
1054 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1055 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1056 assert_return(!event_pid_changed(s->event), -ECHILD);
1058 if (s->io.events == events)
1061 if (s->enabled != SD_EVENT_OFF) {
1062 r = source_io_register(s, s->enabled, events);
1067 s->io.events = events;
1068 source_set_pending(s, false);
1073 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
1074 assert_return(s, -EINVAL);
1075 assert_return(revents, -EINVAL);
1076 assert_return(s->type == SOURCE_IO, -EDOM);
1077 assert_return(s->pending, -ENODATA);
1078 assert_return(!event_pid_changed(s->event), -ECHILD);
1080 *revents = s->io.revents;
1084 _public_ int sd_event_source_get_signal(sd_event_source *s) {
1085 assert_return(s, -EINVAL);
1086 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
1087 assert_return(!event_pid_changed(s->event), -ECHILD);
1089 return s->signal.sig;
1092 _public_ int sd_event_source_get_priority(sd_event_source *s, int *priority) {
1093 assert_return(s, -EINVAL);
1094 assert_return(!event_pid_changed(s->event), -ECHILD);
1099 _public_ int sd_event_source_set_priority(sd_event_source *s, int priority) {
1100 assert_return(s, -EINVAL);
1101 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1102 assert_return(!event_pid_changed(s->event), -ECHILD);
1104 if (s->priority == priority)
1107 s->priority = priority;
1110 prioq_reshuffle(s->event->pending, s, &s->pending_index);
1113 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
1115 if (s->type == SOURCE_EXIT)
1116 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1121 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
1122 assert_return(s, -EINVAL);
1123 assert_return(m, -EINVAL);
1124 assert_return(!event_pid_changed(s->event), -ECHILD);
1130 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
1133 assert_return(s, -EINVAL);
1134 assert_return(m == SD_EVENT_OFF || m == SD_EVENT_ON || m == SD_EVENT_ONESHOT, -EINVAL);
1135 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1136 assert_return(!event_pid_changed(s->event), -ECHILD);
1138 if (s->enabled == m)
1141 if (m == SD_EVENT_OFF) {
1146 r = source_io_unregister(s);
1153 case SOURCE_MONOTONIC:
1155 prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
1156 prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
1159 case SOURCE_REALTIME:
1161 prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
1162 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
1167 if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0) {
1168 assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0);
1169 event_update_signal_fd(s->event);
1177 assert(s->event->n_enabled_child_sources > 0);
1178 s->event->n_enabled_child_sources--;
1180 if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) {
1181 assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0);
1182 event_update_signal_fd(s->event);
1189 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1201 r = source_io_register(s, m, s->io.events);
1208 case SOURCE_MONOTONIC:
1210 prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
1211 prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
1214 case SOURCE_REALTIME:
1216 prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
1217 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
1223 if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0) {
1224 assert_se(sigaddset(&s->event->sigset, s->signal.sig) == 0);
1225 event_update_signal_fd(s->event);
1232 if (s->enabled == SD_EVENT_OFF) {
1233 s->event->n_enabled_child_sources++;
1235 if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) {
1236 assert_se(sigaddset(&s->event->sigset, SIGCHLD) == 0);
1237 event_update_signal_fd(s->event);
1244 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1254 prioq_reshuffle(s->event->pending, s, &s->pending_index);
1257 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
1262 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
1263 assert_return(s, -EINVAL);
1264 assert_return(usec, -EINVAL);
1265 assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
1266 assert_return(!event_pid_changed(s->event), -ECHILD);
1268 *usec = s->time.next;
1272 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
1273 assert_return(s, -EINVAL);
1274 assert_return(usec != (uint64_t) -1, -EINVAL);
1275 assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
1276 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1277 assert_return(!event_pid_changed(s->event), -ECHILD);
1279 s->time.next = usec;
1281 source_set_pending(s, false);
1283 if (s->type == SOURCE_REALTIME) {
1284 prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
1285 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
1287 prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
1288 prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
1294 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
1295 assert_return(s, -EINVAL);
1296 assert_return(usec, -EINVAL);
1297 assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
1298 assert_return(!event_pid_changed(s->event), -ECHILD);
1300 *usec = s->time.accuracy;
1304 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
1305 assert_return(s, -EINVAL);
1306 assert_return(usec != (uint64_t) -1, -EINVAL);
1307 assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
1308 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1309 assert_return(!event_pid_changed(s->event), -ECHILD);
1312 usec = DEFAULT_ACCURACY_USEC;
1314 s->time.accuracy = usec;
1316 source_set_pending(s, false);
1318 if (s->type == SOURCE_REALTIME)
1319 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
1321 prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
1326 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
1327 assert_return(s, -EINVAL);
1328 assert_return(pid, -EINVAL);
1329 assert_return(s->type == SOURCE_CHILD, -EDOM);
1330 assert_return(!event_pid_changed(s->event), -ECHILD);
1332 *pid = s->child.pid;
1336 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
1339 assert_return(s, -EINVAL);
1340 assert_return(s->type != SOURCE_EXIT, -EDOM);
1341 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1342 assert_return(!event_pid_changed(s->event), -ECHILD);
1344 if (s->prepare == callback)
1347 if (callback && s->prepare) {
1348 s->prepare = callback;
1352 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
1356 s->prepare = callback;
1359 r = prioq_put(s->event->prepare, s, &s->prepare_index);
1363 prioq_remove(s->event->prepare, s, &s->prepare_index);
1368 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
1369 assert_return(s, NULL);
1374 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
1377 assert_return(s, NULL);
1380 s->userdata = userdata;
1385 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
1397 Find a good time to wake up again between times a and b. We
1398 have two goals here:
1400 a) We want to wake up as seldom as possible, hence prefer
1401 later times over earlier times.
1403 b) But if we have to wake up, then let's make sure to
1404 dispatch as much as possible on the entire system.
1406 We implement this by waking up everywhere at the same time
1407 within any given minute if we can, synchronised via the
1408 perturbation value determined from the boot ID. If we can't,
1409 then we try to find the same spot in every 10s, then 1s and
1410 then 250ms step. Otherwise, we pick the last possible time
1414 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
1416 if (_unlikely_(c < USEC_PER_MINUTE))
1419 c -= USEC_PER_MINUTE;
1425 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
1427 if (_unlikely_(c < USEC_PER_SEC*10))
1430 c -= USEC_PER_SEC*10;
1436 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
1438 if (_unlikely_(c < USEC_PER_SEC))
1447 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
1449 if (_unlikely_(c < USEC_PER_MSEC*250))
1452 c -= USEC_PER_MSEC*250;
1461 static int event_arm_timer(
1468 struct itimerspec its = {};
1469 sd_event_source *a, *b;
1476 a = prioq_peek(earliest);
1477 if (!a || a->enabled == SD_EVENT_OFF) {
1482 if (*next == (usec_t) -1)
1486 r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL);
1490 *next = (usec_t) -1;
1495 b = prioq_peek(latest);
1496 assert_se(b && b->enabled != SD_EVENT_OFF);
1498 t = sleep_between(e, a->time.next, b->time.next + b->time.accuracy);
1502 assert_se(timer_fd >= 0);
1505 /* We don't want to disarm here, just mean some time looooong ago. */
1506 its.it_value.tv_sec = 0;
1507 its.it_value.tv_nsec = 1;
1509 timespec_store(&its.it_value, t);
1511 r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL);
1519 static int process_io(sd_event *e, sd_event_source *s, uint32_t events) {
1522 assert(s->type == SOURCE_IO);
1524 s->io.revents = events;
1526 return source_set_pending(s, true);
1529 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
1536 assert_return(events == EPOLLIN, -EIO);
1538 ss = read(fd, &x, sizeof(x));
1540 if (errno == EAGAIN || errno == EINTR)
1546 if (ss != sizeof(x))
1550 *next = (usec_t) -1;
1555 static int process_timer(
1567 s = prioq_peek(earliest);
1570 s->enabled == SD_EVENT_OFF ||
1574 r = source_set_pending(s, true);
1578 prioq_reshuffle(earliest, s, &s->time.earliest_index);
1579 prioq_reshuffle(latest, s, &s->time.latest_index);
1585 static int process_child(sd_event *e) {
1592 e->need_process_child = false;
1595 So, this is ugly. We iteratively invoke waitid() with P_PID
1596 + WNOHANG for each PID we wait for, instead of using
1597 P_ALL. This is because we only want to get child
1598 information of very specific child processes, and not all
1599 of them. We might not have processed the SIGCHLD even of a
1600 previous invocation and we don't want to maintain a
1601 unbounded *per-child* event queue, hence we really don't
1602 want anything flushed out of the kernel's queue that we
1603 don't care about. Since this is O(n) this means that if you
1604 have a lot of processes you probably want to handle SIGCHLD
1607 We do not reap the children here (by using WNOWAIT), this
1608 is only done after the event source is dispatched so that
1609 the callback still sees the process as a zombie.
1612 HASHMAP_FOREACH(s, e->child_sources, i) {
1613 assert(s->type == SOURCE_CHILD);
1618 if (s->enabled == SD_EVENT_OFF)
1621 zero(s->child.siginfo);
1622 r = waitid(P_PID, s->child.pid, &s->child.siginfo,
1623 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
1627 if (s->child.siginfo.si_pid != 0) {
1629 s->child.siginfo.si_code == CLD_EXITED ||
1630 s->child.siginfo.si_code == CLD_KILLED ||
1631 s->child.siginfo.si_code == CLD_DUMPED;
1633 if (!zombie && (s->child.options & WEXITED)) {
1634 /* If the child isn't dead then let's
1635 * immediately remove the state change
1636 * from the queue, since there's no
1637 * benefit in leaving it queued */
1639 assert(s->child.options & (WSTOPPED|WCONTINUED));
1640 waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
1643 r = source_set_pending(s, true);
1652 static int process_signal(sd_event *e, uint32_t events) {
1653 bool read_one = false;
1657 assert(e->signal_sources);
1659 assert_return(events == EPOLLIN, -EIO);
1662 struct signalfd_siginfo si;
1666 ss = read(e->signal_fd, &si, sizeof(si));
1668 if (errno == EAGAIN || errno == EINTR)
1674 if (ss != sizeof(si))
1679 s = e->signal_sources[si.ssi_signo];
1680 if (si.ssi_signo == SIGCHLD) {
1681 r = process_child(e);
1690 s->signal.siginfo = si;
1691 r = source_set_pending(s, true);
1699 static int source_dispatch(sd_event_source *s) {
1703 assert(s->pending || s->type == SOURCE_EXIT);
1705 if (s->type != SOURCE_DEFER && s->type != SOURCE_EXIT) {
1706 r = source_set_pending(s, false);
1711 if (s->enabled == SD_EVENT_ONESHOT) {
1712 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
1717 s->dispatching = true;
1722 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
1725 case SOURCE_MONOTONIC:
1726 r = s->time.callback(s, s->time.next, s->userdata);
1729 case SOURCE_REALTIME:
1730 r = s->time.callback(s, s->time.next, s->userdata);
1734 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
1737 case SOURCE_CHILD: {
1740 zombie = s->child.siginfo.si_code == CLD_EXITED ||
1741 s->child.siginfo.si_code == CLD_KILLED ||
1742 s->child.siginfo.si_code == CLD_DUMPED;
1744 r = s->child.callback(s, &s->child.siginfo, s->userdata);
1746 /* Now, reap the PID for good. */
1748 waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
1754 r = s->defer.callback(s, s->userdata);
1758 r = s->exit.callback(s, s->userdata);
1762 s->dispatching = false;
1765 log_debug("Event source %p returned error, disabling: %s", s, strerror(-r));
1770 sd_event_source_set_enabled(s, SD_EVENT_OFF);
1775 static int event_prepare(sd_event *e) {
1783 s = prioq_peek(e->prepare);
1784 if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
1787 s->prepare_iteration = e->iteration;
1788 r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
1794 s->dispatching = true;
1795 r = s->prepare(s, s->userdata);
1796 s->dispatching = false;
1799 log_debug("Prepare callback of event source %p returned error, disabling: %s", s, strerror(-r));
1804 sd_event_source_set_enabled(s, SD_EVENT_OFF);
/* Called from sd_event_run() once exit has been requested: dispatch
 * the highest-priority enabled exit source. When no enabled exit
 * source remains, the loop transitions to SD_EVENT_FINISHED. */
1810 static int dispatch_exit(sd_event *e) {
1816 p = prioq_peek(e->exit);
1817 if (!p || p->enabled == SD_EVENT_OFF) {
/* Nothing (left) to run on exit: the loop is done. */
1818 e->state = SD_EVENT_FINISHED;
/* The loop is in the EXITING state while exit callbacks run, and
 * returns to PASSIVE between dispatches. */
1824 e->state = SD_EVENT_EXITING;
1826 r = source_dispatch(p);
1828 e->state = SD_EVENT_PASSIVE;
/* Return the highest-priority pending event source, or NULL if the
 * head of the pending queue is absent or disabled (disabled sources
 * sort to positions where they do not mask enabled ones —
 * NOTE(review): ordering guarantee lives in the prioq compare
 * functions, not visible here). */
1834 static sd_event_source* event_next_pending(sd_event *e) {
1839 p = prioq_peek(e->pending);
1843 if (p->enabled == SD_EVENT_OFF)
/* (Re-)program the watchdog timerfd to fire somewhere between 1/2
 * and 3/4 of the watchdog period after the last ping, using an
 * absolute CLOCK_MONOTONIC expiry. sleep_between() picks a point in
 * that window that coalesces with other wakeups where possible. */
1849 static int arm_watchdog(sd_event *e) {
1850 struct itimerspec its = {};
1855 assert(e->watchdog_fd >= 0);
1857 t = sleep_between(e,
1858 e->watchdog_last + (e->watchdog_period / 2),
1859 e->watchdog_last + (e->watchdog_period * 3 / 4));
1861 timespec_store(&its.it_value, t);
/* TFD_TIMER_ABSTIME: "t" is an absolute timestamp, not a delta. */
1863 r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
/* Send a keep-alive ping to the service manager and re-arm the
 * watchdog timer. Pings are rate-limited: if less than a quarter of
 * the watchdog period has elapsed since the last ping (measured
 * against the timestamp taken at loop wakeup), do nothing. */
1870 static int process_watchdog(sd_event *e) {
1876 /* Don't notify watchdog too often */
1877 if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
1880 sd_notify(false, "WATCHDOG=1");
1881 e->watchdog_last = e->timestamp.monotonic;
1883 return arm_watchdog(e);
/* Run a single iteration of the event loop: run prepare callbacks,
 * arm the clock timerfds, wait on the epoll fd (up to "timeout"
 * microseconds, (uint64_t) -1 meaning wait forever), demultiplex the
 * wakeup causes, and dispatch exactly one pending event source.
 * Returns an error for misuse (recursive invocation, wrong process,
 * finished loop); once an exit has been requested, iterations
 * dispatch exit sources instead. */
1886 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
1887 struct epoll_event ev_queue[EPOLL_QUEUE_MAX];
1891 assert_return(e, -EINVAL);
/* The loop object is not usable across fork(). */
1892 assert_return(!event_pid_changed(e), -ECHILD);
1893 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
/* PASSIVE means we are not already inside sd_event_run(). */
1894 assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY);
/* After sd_event_exit() only exit sources are dispatched. */
1896 if (e->exit_requested)
1897 return dispatch_exit(e);
1901 e->state = SD_EVENT_RUNNING;
1903 r = event_prepare(e);
/* Program each clock's timerfd from its earliest/latest prioq pair
 * (the window within which its sources may be coalesced). */
1907 r = event_arm_timer(e, e->monotonic_fd, e->monotonic_earliest, e->monotonic_latest, &e->monotonic_next);
1911 r = event_arm_timer(e, e->realtime_fd, e->realtime_earliest, e->realtime_latest, &e->realtime_next);
/* If work is already queued, don't block in epoll_wait —
 * NOTE(review): the elided branch presumably forces a zero
 * timeout here; confirm against full source. */
1915 if (event_next_pending(e) || e->need_process_child)
/* Convert the usec timeout to ms, rounding up so we never wake
 * early; -1 requests an unbounded wait. */
1918 m = epoll_wait(e->epoll_fd, ev_queue, EPOLL_QUEUE_MAX,
1919 timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
/* EAGAIN/EINTR are not errors, just an empty iteration. */
1921 r = errno == EAGAIN || errno == EINTR ? 0 : -errno;
/* Snapshot both clocks once per wakeup; all subsequent processing
 * (timers, watchdog, sd_event_get_now_*()) uses this timestamp. */
1925 dual_timestamp_get(&e->timestamp);
/* Demultiplex: internal fds carry a SOURCE_* tag in data.ptr,
 * anything else is a user I/O source pointer. */
1927 for (i = 0; i < m; i++) {
1929 if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_MONOTONIC))
1930 r = flush_timer(e, e->monotonic_fd, ev_queue[i].events, &e->monotonic_next);
1931 else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_REALTIME))
1932 r = flush_timer(e, e->realtime_fd, ev_queue[i].events, &e->realtime_next);
1933 else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_SIGNAL))
1934 r = process_signal(e, ev_queue[i].events);
1935 else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
1936 r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
1938 r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
1944 r = process_watchdog(e);
/* Mark elapsed timer sources pending, per clock. */
1948 r = process_timer(e, e->timestamp.monotonic, e->monotonic_earliest, e->monotonic_latest);
1952 r = process_timer(e, e->timestamp.realtime, e->realtime_earliest, e->realtime_latest);
/* SIGCHLD arrived (or child sources changed): reap/refresh. */
1956 if (e->need_process_child) {
1957 r = process_child(e);
/* Dispatch exactly one source per iteration, so callers can
 * interleave their own work between events. */
1962 p = event_next_pending(e);
1968 r = source_dispatch(p);
1971 e->state = SD_EVENT_PASSIVE;
/* Run the event loop until it reaches SD_EVENT_FINISHED (i.e. until
 * sd_event_exit() has been requested and all exit sources have run),
 * blocking indefinitely in each iteration. */
1977 _public_ int sd_event_loop(sd_event *e) {
1980 assert_return(e, -EINVAL);
1981 assert_return(!event_pid_changed(e), -ECHILD);
/* Must not be called re-entrantly from within a dispatch. */
1982 assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY);
1986 while (e->state != SD_EVENT_FINISHED) {
/* (uint64_t) -1 == infinite timeout for each iteration. */
1987 r = sd_event_run(e, (uint64_t) -1);
/* Public accessor for the loop's current SD_EVENT_* state. Rejects
 * a NULL object and use from a process other than the creator. */
1999 _public_ int sd_event_get_state(sd_event *e) {
2000 assert_return(e, -EINVAL);
2001 assert_return(!event_pid_changed(e), -ECHILD);
/* Retrieve the exit code previously set via sd_event_exit(). If no
 * exit was requested yet, the code is not reported (the elided
 * branch returns an error before *code is written). */
2006 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
2007 assert_return(e, -EINVAL);
2008 assert_return(code, -EINVAL);
2009 assert_return(!event_pid_changed(e), -ECHILD);
2011 if (!e->exit_requested)
2014 *code = e->exit_code;
/* Request termination of the event loop with the given exit code.
 * The request is recorded here and acted upon by the next
 * sd_event_run() invocation, which switches to dispatching exit
 * sources (see dispatch_exit()). */
2018 _public_ int sd_event_exit(sd_event *e, int code) {
2019 assert_return(e, -EINVAL);
2020 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2021 assert_return(!event_pid_changed(e), -ECHILD);
2023 e->exit_requested = true;
2024 e->exit_code = code;
/* Return the CLOCK_REALTIME timestamp captured at the start of the
 * current/last loop wakeup (dual_timestamp_get() in sd_event_run()),
 * not a fresh clock read. Fails with -ENODATA before the first
 * iteration has taken a timestamp. */
2029 _public_ int sd_event_get_now_realtime(sd_event *e, uint64_t *usec) {
2030 assert_return(e, -EINVAL);
2031 assert_return(usec, -EINVAL);
2032 assert_return(dual_timestamp_is_set(&e->timestamp), -ENODATA);
2033 assert_return(!event_pid_changed(e), -ECHILD);
2035 *usec = e->timestamp.realtime;
/* Same as sd_event_get_now_realtime(), but for the CLOCK_MONOTONIC
 * half of the cached wakeup timestamp. */
2039 _public_ int sd_event_get_now_monotonic(sd_event *e, uint64_t *usec) {
2040 assert_return(e, -EINVAL);
2041 assert_return(usec, -EINVAL);
2042 assert_return(dual_timestamp_is_set(&e->timestamp), -ENODATA);
2043 assert_return(!event_pid_changed(e), -ECHILD);
2045 *usec = e->timestamp.monotonic;
/* Return the per-thread default event loop, creating it on first
 * use. With ret == NULL this is a query: it reports (as a boolean)
 * whether a default loop already exists for this thread, without
 * creating one. */
2049 _public_ int sd_event_default(sd_event **ret) {
/* One default loop per thread, hence __thread storage. */
2051 static __thread sd_event *default_event = NULL;
2056 return !!default_event;
2058 if (default_event) {
/* Existing default: hand out a new reference. */
2059 *ret = sd_event_ref(default_event);
2063 r = sd_event_new(&e);
/* Remember where the thread-local pointer lives so the loop can
 * clear it when the object is destroyed. */
2067 e->default_event_ptr = &default_event;
/* Report the thread ID associated with this event loop. Only the
 * argument validation is visible here; the elided body presumably
 * copies a stored tid into *tid — confirm against full source. */
2075 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
2076 assert_return(e, -EINVAL);
2077 assert_return(tid, -EINVAL);
2078 assert_return(!event_pid_changed(e), -ECHILD);
/* Enable or disable service-manager watchdog integration for this
 * loop. When enabled and the manager asked for keep-alive pings via
 * $WATCHDOG_USEC, an initial ping is sent immediately and a
 * CLOCK_MONOTONIC timerfd is registered with the epoll instance to
 * schedule periodic pings (see arm_watchdog()/process_watchdog()).
 * Disabling tears the timerfd down again. */
2088 _public_ int sd_event_set_watchdog(sd_event *e, int b) {
2091 assert_return(e, -EINVAL);
2092 assert_return(!event_pid_changed(e), -ECHILD);
/* No state change requested: nothing to do. */
2094 if (e->watchdog == !!b)
2098 struct epoll_event ev = {};
/* The manager communicates the required ping interval through the
 * WATCHDOG_USEC environment variable. */
2101 env = getenv("WATCHDOG_USEC");
2105 r = safe_atou64(env, &e->watchdog_period);
/* A zero period means watchdog pings are not wanted. */
2108 if (e->watchdog_period <= 0)
2111 /* Issue first ping immediately */
2112 sd_notify(false, "WATCHDOG=1");
2113 e->watchdog_last = now(CLOCK_MONOTONIC);
2115 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
2116 if (e->watchdog_fd < 0)
2119 r = arm_watchdog(e);
/* Tag the fd so sd_event_run() can recognize watchdog wakeups. */
2123 ev.events = EPOLLIN;
2124 ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG);
2126 r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
/* Disable path: unregister and close the timerfd if it exists. */
2133 if (e->watchdog_fd >= 0) {
2134 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
2135 close_nointr_nofail(e->watchdog_fd);
2136 e->watchdog_fd = -1;
/* Error cleanup: don't leak the timerfd on a failed enable. */
2144 close_nointr_nofail(e->watchdog_fd);
2145 e->watchdog_fd = -1;
2149 _public_ int sd_event_get_watchdog(sd_event *e) {
2150 assert_return(e, -EINVAL);
2151 assert_return(!event_pid_changed(e), -ECHILD);