1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2013 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
22 #include <sys/epoll.h>
23 #include <sys/timerfd.h>
27 #include "sd-daemon.h"
32 #include "time-util.h"
37 #define EPOLL_QUEUE_MAX 64
38 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
40 typedef enum EventSourceType {
51 struct sd_event_source {
56 sd_event_handler_t prepare;
58 EventSourceType type:4;
64 unsigned pending_index;
65 unsigned prepare_index;
66 unsigned pending_iteration;
67 unsigned prepare_iteration;
71 sd_event_io_handler_t callback;
78 sd_event_time_handler_t callback;
79 usec_t next, accuracy;
80 unsigned earliest_index;
81 unsigned latest_index;
84 sd_event_signal_handler_t callback;
85 struct signalfd_siginfo siginfo;
89 sd_event_child_handler_t callback;
95 sd_event_handler_t callback;
98 sd_event_handler_t callback;
116 /* For both clocks we maintain two priority queues each, one
117 * ordered for the earliest times the events may be
118 * dispatched, and one ordered by the latest times they must
119 * have been dispatched. The range between the top entries in
120 * the two prioqs is the time window we can freely schedule
122 Prioq *monotonic_earliest;
123 Prioq *monotonic_latest;
124 Prioq *realtime_earliest;
125 Prioq *realtime_latest;
127 usec_t realtime_next, monotonic_next;
131 sd_event_source **signal_sources;
133 Hashmap *child_sources;
134 unsigned n_enabled_child_sources;
141 dual_timestamp timestamp;
144 bool exit_requested:1;
145 bool need_process_child:1;
151 sd_event **default_event_ptr;
153 usec_t watchdog_last, watchdog_period;
156 static int pending_prioq_compare(const void *a, const void *b) {
157 const sd_event_source *x = a, *y = b;
162 /* Enabled ones first */
163 if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
165 if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
168 /* Lower priority values first */
169 if (x->priority < y->priority)
171 if (x->priority > y->priority)
174 /* Older entries first */
175 if (x->pending_iteration < y->pending_iteration)
177 if (x->pending_iteration > y->pending_iteration)
180 /* Stability for the rest */
189 static int prepare_prioq_compare(const void *a, const void *b) {
190 const sd_event_source *x = a, *y = b;
195 /* Move most recently prepared ones last, so that we can stop
196 * preparing as soon as we hit one that has already been
197 * prepared in the current iteration */
198 if (x->prepare_iteration < y->prepare_iteration)
200 if (x->prepare_iteration > y->prepare_iteration)
203 /* Enabled ones first */
204 if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
206 if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
209 /* Lower priority values first */
210 if (x->priority < y->priority)
212 if (x->priority > y->priority)
215 /* Stability for the rest */
224 static int earliest_time_prioq_compare(const void *a, const void *b) {
225 const sd_event_source *x = a, *y = b;
227 assert(x->type == SOURCE_MONOTONIC || x->type == SOURCE_REALTIME);
228 assert(y->type == SOURCE_MONOTONIC || y->type == SOURCE_REALTIME);
230 /* Enabled ones first */
231 if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
233 if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
236 /* Move the pending ones to the end */
237 if (!x->pending && y->pending)
239 if (x->pending && !y->pending)
243 if (x->time.next < y->time.next)
245 if (x->time.next > y->time.next)
248 /* Stability for the rest */
257 static int latest_time_prioq_compare(const void *a, const void *b) {
258 const sd_event_source *x = a, *y = b;
260 assert((x->type == SOURCE_MONOTONIC && y->type == SOURCE_MONOTONIC) ||
261 (x->type == SOURCE_REALTIME && y->type == SOURCE_REALTIME));
263 /* Enabled ones first */
264 if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
266 if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
269 /* Move the pending ones to the end */
270 if (!x->pending && y->pending)
272 if (x->pending && !y->pending)
276 if (x->time.next + x->time.accuracy < y->time.next + y->time.accuracy)
278 if (x->time.next + x->time.accuracy > y->time.next + y->time.accuracy)
281 /* Stability for the rest */
290 static int exit_prioq_compare(const void *a, const void *b) {
291 const sd_event_source *x = a, *y = b;
293 assert(x->type == SOURCE_EXIT);
294 assert(y->type == SOURCE_EXIT);
296 /* Enabled ones first */
297 if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
299 if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
302 /* Lower priority values first */
303 if (x->priority < y->priority)
305 if (x->priority > y->priority)
308 /* Stability for the rest */
317 static void event_free(sd_event *e) {
320 if (e->default_event_ptr)
321 *(e->default_event_ptr) = NULL;
323 if (e->epoll_fd >= 0)
324 close_nointr_nofail(e->epoll_fd);
326 if (e->signal_fd >= 0)
327 close_nointr_nofail(e->signal_fd);
329 if (e->realtime_fd >= 0)
330 close_nointr_nofail(e->realtime_fd);
332 if (e->monotonic_fd >= 0)
333 close_nointr_nofail(e->monotonic_fd);
335 if (e->watchdog_fd >= 0)
336 close_nointr_nofail(e->watchdog_fd);
338 prioq_free(e->pending);
339 prioq_free(e->prepare);
340 prioq_free(e->monotonic_earliest);
341 prioq_free(e->monotonic_latest);
342 prioq_free(e->realtime_earliest);
343 prioq_free(e->realtime_latest);
346 free(e->signal_sources);
348 hashmap_free(e->child_sources);
352 _public_ int sd_event_new(sd_event** ret) {
356 assert_return(ret, -EINVAL);
358 e = new0(sd_event, 1);
363 e->signal_fd = e->realtime_fd = e->monotonic_fd = e->watchdog_fd = e->epoll_fd = -1;
364 e->realtime_next = e->monotonic_next = (usec_t) -1;
365 e->original_pid = getpid();
367 assert_se(sigemptyset(&e->sigset) == 0);
369 e->pending = prioq_new(pending_prioq_compare);
375 e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
376 if (e->epoll_fd < 0) {
389 _public_ sd_event* sd_event_ref(sd_event *e) {
390 assert_return(e, NULL);
392 assert(e->n_ref >= 1);
398 _public_ sd_event* sd_event_unref(sd_event *e) {
403 assert(e->n_ref >= 1);
412 static bool event_pid_changed(sd_event *e) {
415 /* We don't support people creating am event loop and keeping
416 * it around over a fork(). Let's complain. */
418 return e->original_pid != getpid();
421 static int source_io_unregister(sd_event_source *s) {
425 assert(s->type == SOURCE_IO);
427 if (!s->io.registered)
430 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
434 s->io.registered = false;
438 static int source_io_register(
443 struct epoll_event ev = {};
447 assert(s->type == SOURCE_IO);
448 assert(enabled != SD_EVENT_OFF);
453 if (enabled == SD_EVENT_ONESHOT)
454 ev.events |= EPOLLONESHOT;
456 if (s->io.registered)
457 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
459 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
464 s->io.registered = true;
469 static void source_free(sd_event_source *s) {
477 source_io_unregister(s);
481 case SOURCE_MONOTONIC:
482 prioq_remove(s->event->monotonic_earliest, s, &s->time.earliest_index);
483 prioq_remove(s->event->monotonic_latest, s, &s->time.latest_index);
486 case SOURCE_REALTIME:
487 prioq_remove(s->event->realtime_earliest, s, &s->time.earliest_index);
488 prioq_remove(s->event->realtime_latest, s, &s->time.latest_index);
492 if (s->signal.sig > 0) {
493 if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0)
494 assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0);
496 if (s->event->signal_sources)
497 s->event->signal_sources[s->signal.sig] = NULL;
503 if (s->child.pid > 0) {
504 if (s->enabled != SD_EVENT_OFF) {
505 assert(s->event->n_enabled_child_sources > 0);
506 s->event->n_enabled_child_sources--;
509 if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD])
510 assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0);
512 hashmap_remove(s->event->child_sources, INT_TO_PTR(s->child.pid));
522 prioq_remove(s->event->exit, s, &s->exit.prioq_index);
527 prioq_remove(s->event->pending, s, &s->pending_index);
530 prioq_remove(s->event->prepare, s, &s->prepare_index);
532 sd_event_unref(s->event);
538 static int source_set_pending(sd_event_source *s, bool b) {
542 assert(s->type != SOURCE_EXIT);
550 s->pending_iteration = s->event->iteration;
552 r = prioq_put(s->event->pending, s, &s->pending_index);
558 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
560 if (s->type == SOURCE_REALTIME) {
561 prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
562 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
563 } else if (s->type == SOURCE_MONOTONIC) {
564 prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
565 prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
571 static sd_event_source *source_new(sd_event *e, EventSourceType type) {
576 s = new0(sd_event_source, 1);
581 s->event = sd_event_ref(e);
583 s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;
588 _public_ int sd_event_add_io(
592 sd_event_io_handler_t callback,
594 sd_event_source **ret) {
599 assert_return(e, -EINVAL);
600 assert_return(fd >= 0, -EINVAL);
601 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
602 assert_return(callback, -EINVAL);
603 assert_return(ret, -EINVAL);
604 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
605 assert_return(!event_pid_changed(e), -ECHILD);
607 s = source_new(e, SOURCE_IO);
612 s->io.events = events;
613 s->io.callback = callback;
614 s->userdata = userdata;
615 s->enabled = SD_EVENT_ON;
617 r = source_io_register(s, s->enabled, events);
627 static int event_setup_timer_fd(
629 EventSourceType type,
633 struct epoll_event ev = {};
640 if (_likely_(*timer_fd >= 0))
643 fd = timerfd_create(id, TFD_NONBLOCK|TFD_CLOEXEC);
648 ev.data.ptr = INT_TO_PTR(type);
650 r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
652 close_nointr_nofail(fd);
656 /* When we sleep for longer, we try to realign the wakeup to
657 the same time within each minute/second/250ms, so that
658 events all across the system can be coalesced into a single
659 CPU wakeup. However, let's take some system-specific
660 randomness for this value, so that in a network of systems
661 with synced clocks timer events are distributed a
662 bit. Here, we calculate a perturbation usec offset from the
665 if (sd_id128_get_boot(&bootid) >= 0)
666 e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
672 static int event_add_time_internal(
674 EventSourceType type,
681 sd_event_time_handler_t callback,
683 sd_event_source **ret) {
688 assert_return(e, -EINVAL);
689 assert_return(callback, -EINVAL);
690 assert_return(ret, -EINVAL);
691 assert_return(usec != (uint64_t) -1, -EINVAL);
692 assert_return(accuracy != (uint64_t) -1, -EINVAL);
693 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
694 assert_return(!event_pid_changed(e), -ECHILD);
701 *earliest = prioq_new(earliest_time_prioq_compare);
707 *latest = prioq_new(latest_time_prioq_compare);
713 r = event_setup_timer_fd(e, type, timer_fd, id);
718 s = source_new(e, type);
723 s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
724 s->time.callback = callback;
725 s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
726 s->userdata = userdata;
727 s->enabled = SD_EVENT_ONESHOT;
729 r = prioq_put(*earliest, s, &s->time.earliest_index);
733 r = prioq_put(*latest, s, &s->time.latest_index);
745 _public_ int sd_event_add_monotonic(sd_event *e,
748 sd_event_time_handler_t callback,
750 sd_event_source **ret) {
752 return event_add_time_internal(e, SOURCE_MONOTONIC, &e->monotonic_fd, CLOCK_MONOTONIC, &e->monotonic_earliest, &e->monotonic_latest, usec, accuracy, callback, userdata, ret);
755 _public_ int sd_event_add_realtime(sd_event *e,
758 sd_event_time_handler_t callback,
760 sd_event_source **ret) {
762 return event_add_time_internal(e, SOURCE_REALTIME, &e->realtime_fd, CLOCK_REALTIME, &e->realtime_earliest, &e->monotonic_latest, usec, accuracy, callback, userdata, ret);
765 static int event_update_signal_fd(sd_event *e) {
766 struct epoll_event ev = {};
772 add_to_epoll = e->signal_fd < 0;
774 r = signalfd(e->signal_fd, &e->sigset, SFD_NONBLOCK|SFD_CLOEXEC);
784 ev.data.ptr = INT_TO_PTR(SOURCE_SIGNAL);
786 r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->signal_fd, &ev);
788 close_nointr_nofail(e->signal_fd);
797 _public_ int sd_event_add_signal(
800 sd_event_signal_handler_t callback,
802 sd_event_source **ret) {
807 assert_return(e, -EINVAL);
808 assert_return(sig > 0, -EINVAL);
809 assert_return(sig < _NSIG, -EINVAL);
810 assert_return(callback, -EINVAL);
811 assert_return(ret, -EINVAL);
812 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
813 assert_return(!event_pid_changed(e), -ECHILD);
815 if (!e->signal_sources) {
816 e->signal_sources = new0(sd_event_source*, _NSIG);
817 if (!e->signal_sources)
819 } else if (e->signal_sources[sig])
822 s = source_new(e, SOURCE_SIGNAL);
827 s->signal.callback = callback;
828 s->userdata = userdata;
829 s->enabled = SD_EVENT_ON;
831 e->signal_sources[sig] = s;
832 assert_se(sigaddset(&e->sigset, sig) == 0);
834 if (sig != SIGCHLD || e->n_enabled_child_sources == 0) {
835 r = event_update_signal_fd(e);
846 _public_ int sd_event_add_child(
850 sd_event_child_handler_t callback,
852 sd_event_source **ret) {
857 assert_return(e, -EINVAL);
858 assert_return(pid > 1, -EINVAL);
859 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
860 assert_return(options != 0, -EINVAL);
861 assert_return(callback, -EINVAL);
862 assert_return(ret, -EINVAL);
863 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
864 assert_return(!event_pid_changed(e), -ECHILD);
866 r = hashmap_ensure_allocated(&e->child_sources, trivial_hash_func, trivial_compare_func);
870 if (hashmap_contains(e->child_sources, INT_TO_PTR(pid)))
873 s = source_new(e, SOURCE_CHILD);
878 s->child.options = options;
879 s->child.callback = callback;
880 s->userdata = userdata;
881 s->enabled = SD_EVENT_ONESHOT;
883 r = hashmap_put(e->child_sources, INT_TO_PTR(pid), s);
889 e->n_enabled_child_sources ++;
891 assert_se(sigaddset(&e->sigset, SIGCHLD) == 0);
893 if (!e->signal_sources || !e->signal_sources[SIGCHLD]) {
894 r = event_update_signal_fd(e);
901 e->need_process_child = true;
907 _public_ int sd_event_add_defer(
909 sd_event_handler_t callback,
911 sd_event_source **ret) {
916 assert_return(e, -EINVAL);
917 assert_return(callback, -EINVAL);
918 assert_return(ret, -EINVAL);
919 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
920 assert_return(!event_pid_changed(e), -ECHILD);
922 s = source_new(e, SOURCE_DEFER);
926 s->defer.callback = callback;
927 s->userdata = userdata;
928 s->enabled = SD_EVENT_ONESHOT;
930 r = source_set_pending(s, true);
940 _public_ int sd_event_add_exit(
942 sd_event_handler_t callback,
944 sd_event_source **ret) {
949 assert_return(e, -EINVAL);
950 assert_return(callback, -EINVAL);
951 assert_return(ret, -EINVAL);
952 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
953 assert_return(!event_pid_changed(e), -ECHILD);
956 e->exit = prioq_new(exit_prioq_compare);
961 s = source_new(e, SOURCE_EXIT);
965 s->exit.callback = callback;
966 s->userdata = userdata;
967 s->exit.prioq_index = PRIOQ_IDX_NULL;
968 s->enabled = SD_EVENT_ONESHOT;
970 r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
980 _public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
981 assert_return(s, NULL);
983 assert(s->n_ref >= 1);
989 _public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {
994 assert(s->n_ref >= 1);
998 /* Here's a special hack: when we are called from a
999 * dispatch handler we won't free the event source
1000 * immediately, but we will detach the fd from the
1001 * epoll. This way it is safe for the caller to unref
1002 * the event source and immediately close the fd, but
1003 * we still retain a valid event source object after
1006 if (s->dispatching) {
1007 if (s->type == SOURCE_IO)
1008 source_io_unregister(s);
1016 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
1017 assert_return(s, NULL);
1022 _public_ int sd_event_source_get_pending(sd_event_source *s) {
1023 assert_return(s, -EINVAL);
1024 assert_return(s->type != SOURCE_EXIT, -EDOM);
1025 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1026 assert_return(!event_pid_changed(s->event), -ECHILD);
1031 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
1032 assert_return(s, -EINVAL);
1033 assert_return(s->type == SOURCE_IO, -EDOM);
1034 assert_return(!event_pid_changed(s->event), -ECHILD);
1039 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
1042 assert_return(s, -EINVAL);
1043 assert_return(fd >= 0, -EINVAL);
1044 assert_return(s->type == SOURCE_IO, -EDOM);
1045 assert_return(!event_pid_changed(s->event), -ECHILD);
1050 if (s->enabled == SD_EVENT_OFF) {
1052 s->io.registered = false;
1056 saved_fd = s->io.fd;
1057 assert(s->io.registered);
1060 s->io.registered = false;
1062 r = source_io_register(s, s->enabled, s->io.events);
1064 s->io.fd = saved_fd;
1065 s->io.registered = true;
1069 epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
1075 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
1076 assert_return(s, -EINVAL);
1077 assert_return(events, -EINVAL);
1078 assert_return(s->type == SOURCE_IO, -EDOM);
1079 assert_return(!event_pid_changed(s->event), -ECHILD);
1081 *events = s->io.events;
1085 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
1088 assert_return(s, -EINVAL);
1089 assert_return(s->type == SOURCE_IO, -EDOM);
1090 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1091 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1092 assert_return(!event_pid_changed(s->event), -ECHILD);
1094 if (s->io.events == events)
1097 if (s->enabled != SD_EVENT_OFF) {
1098 r = source_io_register(s, s->enabled, events);
1103 s->io.events = events;
1104 source_set_pending(s, false);
1109 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
1110 assert_return(s, -EINVAL);
1111 assert_return(revents, -EINVAL);
1112 assert_return(s->type == SOURCE_IO, -EDOM);
1113 assert_return(s->pending, -ENODATA);
1114 assert_return(!event_pid_changed(s->event), -ECHILD);
1116 *revents = s->io.revents;
1120 _public_ int sd_event_source_get_signal(sd_event_source *s) {
1121 assert_return(s, -EINVAL);
1122 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
1123 assert_return(!event_pid_changed(s->event), -ECHILD);
1125 return s->signal.sig;
1128 _public_ int sd_event_source_get_priority(sd_event_source *s, int *priority) {
1129 assert_return(s, -EINVAL);
1130 assert_return(!event_pid_changed(s->event), -ECHILD);
1135 _public_ int sd_event_source_set_priority(sd_event_source *s, int priority) {
1136 assert_return(s, -EINVAL);
1137 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1138 assert_return(!event_pid_changed(s->event), -ECHILD);
1140 if (s->priority == priority)
1143 s->priority = priority;
1146 prioq_reshuffle(s->event->pending, s, &s->pending_index);
1149 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
1151 if (s->type == SOURCE_EXIT)
1152 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1157 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
1158 assert_return(s, -EINVAL);
1159 assert_return(m, -EINVAL);
1160 assert_return(!event_pid_changed(s->event), -ECHILD);
1166 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
1169 assert_return(s, -EINVAL);
1170 assert_return(m == SD_EVENT_OFF || m == SD_EVENT_ON || m == SD_EVENT_ONESHOT, -EINVAL);
1171 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1172 assert_return(!event_pid_changed(s->event), -ECHILD);
1174 if (s->enabled == m)
1177 if (m == SD_EVENT_OFF) {
1182 r = source_io_unregister(s);
1189 case SOURCE_MONOTONIC:
1191 prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
1192 prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
1195 case SOURCE_REALTIME:
1197 prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
1198 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
1203 if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0) {
1204 assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0);
1205 event_update_signal_fd(s->event);
1213 assert(s->event->n_enabled_child_sources > 0);
1214 s->event->n_enabled_child_sources--;
1216 if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) {
1217 assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0);
1218 event_update_signal_fd(s->event);
1225 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1237 r = source_io_register(s, m, s->io.events);
1244 case SOURCE_MONOTONIC:
1246 prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
1247 prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
1250 case SOURCE_REALTIME:
1252 prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
1253 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
1259 if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0) {
1260 assert_se(sigaddset(&s->event->sigset, s->signal.sig) == 0);
1261 event_update_signal_fd(s->event);
1268 if (s->enabled == SD_EVENT_OFF) {
1269 s->event->n_enabled_child_sources++;
1271 if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) {
1272 assert_se(sigaddset(&s->event->sigset, SIGCHLD) == 0);
1273 event_update_signal_fd(s->event);
1280 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1290 prioq_reshuffle(s->event->pending, s, &s->pending_index);
1293 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
1298 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
1299 assert_return(s, -EINVAL);
1300 assert_return(usec, -EINVAL);
1301 assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
1302 assert_return(!event_pid_changed(s->event), -ECHILD);
1304 *usec = s->time.next;
1308 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
1309 assert_return(s, -EINVAL);
1310 assert_return(usec != (uint64_t) -1, -EINVAL);
1311 assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
1312 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1313 assert_return(!event_pid_changed(s->event), -ECHILD);
1315 s->time.next = usec;
1317 source_set_pending(s, false);
1319 if (s->type == SOURCE_REALTIME) {
1320 prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
1321 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
1323 prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
1324 prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
1330 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
1331 assert_return(s, -EINVAL);
1332 assert_return(usec, -EINVAL);
1333 assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
1334 assert_return(!event_pid_changed(s->event), -ECHILD);
1336 *usec = s->time.accuracy;
1340 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
1341 assert_return(s, -EINVAL);
1342 assert_return(usec != (uint64_t) -1, -EINVAL);
1343 assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
1344 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1345 assert_return(!event_pid_changed(s->event), -ECHILD);
1348 usec = DEFAULT_ACCURACY_USEC;
1350 s->time.accuracy = usec;
1352 source_set_pending(s, false);
1354 if (s->type == SOURCE_REALTIME)
1355 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
1357 prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
1362 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
1363 assert_return(s, -EINVAL);
1364 assert_return(pid, -EINVAL);
1365 assert_return(s->type == SOURCE_CHILD, -EDOM);
1366 assert_return(!event_pid_changed(s->event), -ECHILD);
1368 *pid = s->child.pid;
1372 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
1375 assert_return(s, -EINVAL);
1376 assert_return(s->type != SOURCE_EXIT, -EDOM);
1377 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1378 assert_return(!event_pid_changed(s->event), -ECHILD);
1380 if (s->prepare == callback)
1383 if (callback && s->prepare) {
1384 s->prepare = callback;
1388 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
1392 s->prepare = callback;
1395 r = prioq_put(s->event->prepare, s, &s->prepare_index);
1399 prioq_remove(s->event->prepare, s, &s->prepare_index);
1404 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
1405 assert_return(s, NULL);
1410 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
1413 assert_return(s, NULL);
1416 s->userdata = userdata;
1421 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
1433 Find a good time to wake up again between times a and b. We
1434 have two goals here:
1436 a) We want to wake up as seldom as possible, hence prefer
1437 later times over earlier times.
1439 b) But if we have to wake up, then let's make sure to
1440 dispatch as much as possible on the entire system.
1442 We implement this by waking up everywhere at the same time
1443 within any given minute if we can, synchronised via the
1444 perturbation value determined from the boot ID. If we can't,
1445 then we try to find the same spot in every 10s, then 1s and
1446 then 250ms step. Otherwise, we pick the last possible time
1450 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
1452 if (_unlikely_(c < USEC_PER_MINUTE))
1455 c -= USEC_PER_MINUTE;
1461 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
1463 if (_unlikely_(c < USEC_PER_SEC*10))
1466 c -= USEC_PER_SEC*10;
1472 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
1474 if (_unlikely_(c < USEC_PER_SEC))
1483 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
1485 if (_unlikely_(c < USEC_PER_MSEC*250))
1488 c -= USEC_PER_MSEC*250;
1497 static int event_arm_timer(
1504 struct itimerspec its = {};
1505 sd_event_source *a, *b;
1512 a = prioq_peek(earliest);
1513 if (!a || a->enabled == SD_EVENT_OFF) {
1518 if (*next == (usec_t) -1)
1522 r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL);
1526 *next = (usec_t) -1;
1531 b = prioq_peek(latest);
1532 assert_se(b && b->enabled != SD_EVENT_OFF);
1534 t = sleep_between(e, a->time.next, b->time.next + b->time.accuracy);
1538 assert_se(timer_fd >= 0);
1541 /* We don't want to disarm here, just mean some time looooong ago. */
1542 its.it_value.tv_sec = 0;
1543 its.it_value.tv_nsec = 1;
1545 timespec_store(&its.it_value, t);
1547 r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL);
1555 static int process_io(sd_event *e, sd_event_source *s, uint32_t events) {
1558 assert(s->type == SOURCE_IO);
1560 s->io.revents = events;
1562 return source_set_pending(s, true);
1565 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
1572 assert_return(events == EPOLLIN, -EIO);
1574 ss = read(fd, &x, sizeof(x));
1576 if (errno == EAGAIN || errno == EINTR)
1582 if (ss != sizeof(x))
1586 *next = (usec_t) -1;
1591 static int process_timer(
1603 s = prioq_peek(earliest);
1606 s->enabled == SD_EVENT_OFF ||
1610 r = source_set_pending(s, true);
1614 prioq_reshuffle(earliest, s, &s->time.earliest_index);
1615 prioq_reshuffle(latest, s, &s->time.latest_index);
1621 static int process_child(sd_event *e) {
1628 e->need_process_child = false;
1631 So, this is ugly. We iteratively invoke waitid() with P_PID
1632 + WNOHANG for each PID we wait for, instead of using
1633 P_ALL. This is because we only want to get child
1634 information of very specific child processes, and not all
1635 of them. We might not have processed the SIGCHLD event of a
1636 previous invocation and we don't want to maintain a
1637 unbounded *per-child* event queue, hence we really don't
1638 want anything flushed out of the kernel's queue that we
1639 don't care about. Since this is O(n) this means that if you
1640 have a lot of processes you probably want to handle SIGCHLD
1643 We do not reap the children here (by using WNOWAIT), this
1644 is only done after the event source is dispatched so that
1645 the callback still sees the process as a zombie.
1648 HASHMAP_FOREACH(s, e->child_sources, i) {
1649 assert(s->type == SOURCE_CHILD);
1654 if (s->enabled == SD_EVENT_OFF)
1657 zero(s->child.siginfo);
1658 r = waitid(P_PID, s->child.pid, &s->child.siginfo,
1659 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
1663 if (s->child.siginfo.si_pid != 0) {
1665 s->child.siginfo.si_code == CLD_EXITED ||
1666 s->child.siginfo.si_code == CLD_KILLED ||
1667 s->child.siginfo.si_code == CLD_DUMPED;
1669 if (!zombie && (s->child.options & WEXITED)) {
1670 /* If the child isn't dead then let's
1671 * immediately remove the state change
1672 * from the queue, since there's no
1673 * benefit in leaving it queued */
1675 assert(s->child.options & (WSTOPPED|WCONTINUED));
1676 waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
1679 r = source_set_pending(s, true);
1688 static int process_signal(sd_event *e, uint32_t events) {
1689 bool read_one = false;
1693 assert(e->signal_sources);
1695 assert_return(events == EPOLLIN, -EIO);
1698 struct signalfd_siginfo si;
1702 ss = read(e->signal_fd, &si, sizeof(si));
1704 if (errno == EAGAIN || errno == EINTR)
1710 if (ss != sizeof(si))
1715 s = e->signal_sources[si.ssi_signo];
1716 if (si.ssi_signo == SIGCHLD) {
1717 r = process_child(e);
1726 s->signal.siginfo = si;
1727 r = source_set_pending(s, true);
1735 static int source_dispatch(sd_event_source *s) {
1739 assert(s->pending || s->type == SOURCE_EXIT);
1741 if (s->type != SOURCE_DEFER && s->type != SOURCE_EXIT) {
1742 r = source_set_pending(s, false);
1747 if (s->enabled == SD_EVENT_ONESHOT) {
1748 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
1753 s->dispatching = true;
1758 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
1761 case SOURCE_MONOTONIC:
1762 r = s->time.callback(s, s->time.next, s->userdata);
1765 case SOURCE_REALTIME:
1766 r = s->time.callback(s, s->time.next, s->userdata);
1770 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
1773 case SOURCE_CHILD: {
1776 zombie = s->child.siginfo.si_code == CLD_EXITED ||
1777 s->child.siginfo.si_code == CLD_KILLED ||
1778 s->child.siginfo.si_code == CLD_DUMPED;
1780 r = s->child.callback(s, &s->child.siginfo, s->userdata);
1782 /* Now, reap the PID for good. */
1784 waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
1790 r = s->defer.callback(s, s->userdata);
1794 r = s->exit.callback(s, s->userdata);
1798 s->dispatching = false;
1801 log_debug("Event source %p returned error, disabling: %s", s, strerror(-r));
1806 sd_event_source_set_enabled(s, SD_EVENT_OFF);
/* Runs the prepare callbacks of event sources queued in the prepare
 * prioq, each at most once per loop iteration (tracked via
 * prepare_iteration), reshuffling the queue as sources are handled.
 * NOTE(review): the surrounding loop and return statements are not
 * visible in this chunk. */
1811 static int event_prepare(sd_event *e) {
/* Stop once the top source was already prepared this iteration or is
 * disabled (queue is ordered so these sort last) */
1819 s = prioq_peek(e->prepare);
1820 if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
1823 s->prepare_iteration = e->iteration;
1824 r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
1830 s->dispatching = true;
1831 r = s->prepare(s, s->userdata);
1832 s->dispatching = false;
/* A failing prepare callback disables the source, same policy as
 * source_dispatch() */
1835 log_debug("Prepare callback of event source %p returned error, disabling: %s", s, strerror(-r));
1840 sd_event_source_set_enabled(s, SD_EVENT_OFF);
/* Dispatches the highest-priority enabled exit source. When no enabled
 * exit source remains, the loop transitions to SD_EVENT_FINISHED.
 * NOTE(review): return statements are not visible in this chunk. */
1846 static int dispatch_exit(sd_event *e) {
1852 p = prioq_peek(e->exit);
1853 if (!p || p->enabled == SD_EVENT_OFF) {
1854 e->state = SD_EVENT_FINISHED;
/* Flag the EXITING state for the duration of the callback, then drop
 * back to PASSIVE */
1860 e->state = SD_EVENT_EXITING;
1862 r = source_dispatch(p);
1864 e->state = SD_EVENT_PASSIVE;
/* Peeks at the top of the pending prioq. A disabled source at the top
 * means nothing is dispatchable (disabled sources order last).
 * NOTE(review): the return statements are not visible here. */
1870 static sd_event_source* event_next_pending(sd_event *e) {
1875 p = prioq_peek(e->pending);
1879 if (p->enabled == SD_EVENT_OFF)
/* Programs the watchdog timerfd (absolute CLOCK_MONOTONIC time) to fire
 * somewhere between 1/2 and 3/4 of the watchdog period after the last
 * ping, letting sleep_between() pick a coalescing-friendly point in
 * that window. */
1885 static int arm_watchdog(sd_event *e) {
1886 struct itimerspec its = {};
1891 assert(e->watchdog_fd >= 0);
1893 t = sleep_between(e,
1894 e->watchdog_last + (e->watchdog_period / 2),
1895 e->watchdog_last + (e->watchdog_period * 3 / 4));
1897 timespec_store(&its.it_value, t);
1899 r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
/* Sends a keep-alive ping to the service manager and re-arms the
 * watchdog timer, but rate-limited: skip the ping if less than a
 * quarter of the period passed since the last one. */
1906 static int process_watchdog(sd_event *e) {
1912 /* Don't notify watchdog too often */
1913 if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
1916 sd_notify(false, "WATCHDOG=1");
1917 e->watchdog_last = e->timestamp.monotonic;
/* Re-arm so the timerfd fires again within the next window */
1919 return arm_watchdog(e);
/* Runs a single iteration of the event loop: run prepare callbacks, arm
 * the timerfds, wait for events (up to `timeout` usec, or forever if
 * (uint64_t) -1), process whatever fired, then dispatch one pending
 * event source.
 * NOTE(review): many interior lines (error checks, the zero-timeout
 * shortcut when work is already pending, returns) are missing from
 * this view; comments cover only visible code. */
1922 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
1923 struct epoll_event ev_queue[EPOLL_QUEUE_MAX];
1927 assert_return(e, -EINVAL);
1928 assert_return(!event_pid_changed(e), -ECHILD);
1929 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1930 assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY);
/* Once an exit was requested, only exit sources are dispatched */
1932 if (e->exit_requested)
1933 return dispatch_exit(e);
1937 e->state = SD_EVENT_RUNNING;
1939 r = event_prepare(e);
/* Arm both timerfds from their earliest/latest prioq windows */
1943 r = event_arm_timer(e, e->monotonic_fd, e->monotonic_earliest, e->monotonic_latest, &e->monotonic_next);
1947 r = event_arm_timer(e, e->realtime_fd, e->realtime_earliest, e->realtime_latest, &e->realtime_next);
/* If something is already pending, presumably the wait below uses a
 * zero timeout — the adjusting line is not visible here */
1951 if (event_next_pending(e) || e->need_process_child)
/* usec timeout is rounded up to the epoll_wait() msec granularity */
1954 m = epoll_wait(e->epoll_fd, ev_queue, EPOLL_QUEUE_MAX,
1955 timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
/* EAGAIN/EINTR are not errors, just an empty iteration */
1957 r = errno == EAGAIN || errno == EINTR ? 0 : -errno;
/* Single timestamp pair for this iteration, used by the timer queues
 * and the sd_event_get_now_*() accessors */
1961 dual_timestamp_get(&e->timestamp);
1963 for (i = 0; i < m; i++) {
/* Internal fds are tagged with their SOURCE_* type in data.ptr;
 * anything else is a user I/O source pointer */
1965 if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_MONOTONIC))
1966 r = flush_timer(e, e->monotonic_fd, ev_queue[i].events, &e->monotonic_next);
1967 else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_REALTIME))
1968 r = flush_timer(e, e->realtime_fd, ev_queue[i].events, &e->realtime_next);
1969 else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_SIGNAL))
1970 r = process_signal(e, ev_queue[i].events);
1971 else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
1972 r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
1974 r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
1980 r = process_watchdog(e);
/* Queue elapsed timer sources for both clocks */
1984 r = process_timer(e, e->timestamp.monotonic, e->monotonic_earliest, e->monotonic_latest);
1988 r = process_timer(e, e->timestamp.realtime, e->realtime_earliest, e->realtime_latest);
1992 if (e->need_process_child) {
1993 r = process_child(e);
/* Dispatch (at most) one pending source per iteration */
1998 p = event_next_pending(e);
2004 r = source_dispatch(p);
2007 e->state = SD_EVENT_PASSIVE;
/* Runs the event loop until it reaches SD_EVENT_FINISHED, iterating
 * sd_event_run() with an infinite timeout.
 * NOTE(review): error handling of the sd_event_run() result and the
 * final return are not visible in this chunk. */
2013 _public_ int sd_event_loop(sd_event *e) {
2016 assert_return(e, -EINVAL);
2017 assert_return(!event_pid_changed(e), -ECHILD);
2018 assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY);
2022 while (e->state != SD_EVENT_FINISHED) {
2023 r = sd_event_run(e, (uint64_t) -1);
/* Returns the loop's current state (the return statement itself is not
 * visible in this chunk). */
2035 _public_ int sd_event_get_state(sd_event *e) {
2036 assert_return(e, -EINVAL);
2037 assert_return(!event_pid_changed(e), -ECHILD);
/* Stores the exit code passed to sd_event_exit() in *code; if no exit
 * was requested yet the code is not written (the early return between
 * the check and the store is not visible here). */
2042 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
2043 assert_return(e, -EINVAL);
2044 assert_return(code, -EINVAL);
2045 assert_return(!event_pid_changed(e), -ECHILD);
2047 if (!e->exit_requested)
2050 *code = e->exit_code;
/* Requests termination of the event loop with the given exit code; the
 * next sd_event_run() will dispatch exit sources instead of regular
 * events (see the exit_requested check there). */
2054 _public_ int sd_event_exit(sd_event *e, int code) {
2055 assert_return(e, -EINVAL);
2056 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2057 assert_return(!event_pid_changed(e), -ECHILD);
2059 e->exit_requested = true;
2060 e->exit_code = code;
/* Returns the CLOCK_REALTIME timestamp taken at the start of the
 * current/last loop iteration — i.e. a cached time, not a fresh clock
 * read. Fails with -ENODATA before the first iteration. */
2065 _public_ int sd_event_get_now_realtime(sd_event *e, uint64_t *usec) {
2066 assert_return(e, -EINVAL);
2067 assert_return(usec, -EINVAL);
2068 assert_return(dual_timestamp_is_set(&e->timestamp), -ENODATA);
2069 assert_return(!event_pid_changed(e), -ECHILD);
2071 *usec = e->timestamp.realtime;
/* Returns the CLOCK_MONOTONIC timestamp taken at the start of the
 * current/last loop iteration — i.e. a cached time, not a fresh clock
 * read. Fails with -ENODATA before the first iteration. */
2075 _public_ int sd_event_get_now_monotonic(sd_event *e, uint64_t *usec) {
2076 assert_return(e, -EINVAL);
2077 assert_return(usec, -EINVAL);
2078 assert_return(dual_timestamp_is_set(&e->timestamp), -ENODATA);
2079 assert_return(!event_pid_changed(e), -ECHILD);
2081 *usec = e->timestamp.monotonic;
/* Returns a per-thread default event loop object, creating it on first
 * use. With ret == NULL it only reports (0/1) whether a default object
 * exists for this thread.
 * NOTE(review): the creation error path and the final ref/store lines
 * are not visible in this chunk. */
2085 _public_ int sd_event_default(sd_event **ret) {
/* Thread-local: every thread gets its own default loop */
2087 static __thread sd_event *default_event = NULL;
2092 return !!default_event;
/* Hand out an additional reference to the existing default object */
2094 if (default_event) {
2095 *ret = sd_event_ref(default_event);
2099 r = sd_event_new(&e);
/* Remember where to clear the pointer when the object is freed */
2103 e->default_event_ptr = &default_event;
/* Reports the thread id the loop is bound to (the actual lookup and
 * return lines are not visible in this chunk). */
2111 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
2112 assert_return(e, -EINVAL);
2113 assert_return(tid, -EINVAL);
2114 assert_return(!event_pid_changed(e), -ECHILD);
/* Enables or disables automatic watchdog pinging for this loop. When
 * enabled and $WATCHDOG_USEC is set, a CLOCK_MONOTONIC timerfd is
 * created, armed, and hooked into the epoll so the loop emits
 * "WATCHDOG=1" sd_notify() messages periodically; disabling (or the
 * error path) tears the fd down again.
 * NOTE(review): several interior lines (branching between enable and
 * disable, error labels, returns) are missing from this view. */
2124 _public_ int sd_event_set_watchdog(sd_event *e, int b) {
2127 assert_return(e, -EINVAL);
2128 assert_return(!event_pid_changed(e), -ECHILD);
/* No-op if the requested state matches the current one */
2130 if (e->watchdog == !!b)
2134 struct epoll_event ev = {};
/* The watchdog period is taken from the service manager environment */
2137 env = getenv("WATCHDOG_USEC");
2141 r = safe_atou64(env, &e->watchdog_period);
2144 if (e->watchdog_period <= 0)
2147 /* Issue first ping immediately */
2148 sd_notify(false, "WATCHDOG=1");
2149 e->watchdog_last = now(CLOCK_MONOTONIC);
2151 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
2152 if (e->watchdog_fd < 0)
2155 r = arm_watchdog(e);
/* Tag the fd so the event loop can recognize watchdog wakeups */
2159 ev.events = EPOLLIN;
2160 ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG);
2162 r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
/* Disable path: unhook and close the timerfd if present */
2169 if (e->watchdog_fd >= 0) {
2170 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
2171 close_nointr_nofail(e->watchdog_fd);
2172 e->watchdog_fd = -1;
/* Error cleanup: close the fd created above */
2180 close_nointr_nofail(e->watchdog_fd);
2181 e->watchdog_fd = -1;
2185 _public_ int sd_event_get_watchdog(sd_event *e) {
2186 assert_return(e, -EINVAL);
2187 assert_return(!event_pid_changed(e), -ECHILD);