1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2013 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
22 #include <sys/epoll.h>
23 #include <sys/timerfd.h>
27 #include "sd-daemon.h"
32 #include "time-util.h"
37 #define EPOLL_QUEUE_MAX 64
38 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
40 typedef enum EventSourceType {
51 struct sd_event_source {
56 sd_event_handler_t prepare;
58 EventSourceType type:4;
64 unsigned pending_index;
65 unsigned prepare_index;
66 unsigned pending_iteration;
67 unsigned prepare_iteration;
71 sd_event_io_handler_t callback;
78 sd_event_time_handler_t callback;
79 usec_t next, accuracy;
80 unsigned earliest_index;
81 unsigned latest_index;
84 sd_event_signal_handler_t callback;
85 struct signalfd_siginfo siginfo;
89 sd_event_child_handler_t callback;
95 sd_event_handler_t callback;
98 sd_event_handler_t callback;
116 /* For both clocks we maintain two priority queues each, one
117 * ordered for the earliest times the events may be
118 * dispatched, and one ordered by the latest times they must
119 * have been dispatched. The range between the top entries in
120 * the two prioqs is the time window we can freely schedule
122 Prioq *monotonic_earliest;
123 Prioq *monotonic_latest;
124 Prioq *realtime_earliest;
125 Prioq *realtime_latest;
127 usec_t realtime_next, monotonic_next;
131 sd_event_source **signal_sources;
133 Hashmap *child_sources;
134 unsigned n_enabled_child_sources;
141 dual_timestamp timestamp;
144 bool exit_requested:1;
145 bool need_process_child:1;
151 sd_event **default_event_ptr;
153 usec_t watchdog_last, watchdog_period;
156 static int pending_prioq_compare(const void *a, const void *b) {
157 const sd_event_source *x = a, *y = b;
162 /* Enabled ones first */
163 if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
165 if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
168 /* Lower priority values first */
169 if (x->priority < y->priority)
171 if (x->priority > y->priority)
174 /* Older entries first */
175 if (x->pending_iteration < y->pending_iteration)
177 if (x->pending_iteration > y->pending_iteration)
180 /* Stability for the rest */
189 static int prepare_prioq_compare(const void *a, const void *b) {
190 const sd_event_source *x = a, *y = b;
195 /* Move most recently prepared ones last, so that we can stop
196 * preparing as soon as we hit one that has already been
197 * prepared in the current iteration */
198 if (x->prepare_iteration < y->prepare_iteration)
200 if (x->prepare_iteration > y->prepare_iteration)
203 /* Enabled ones first */
204 if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
206 if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
209 /* Lower priority values first */
210 if (x->priority < y->priority)
212 if (x->priority > y->priority)
215 /* Stability for the rest */
224 static int earliest_time_prioq_compare(const void *a, const void *b) {
225 const sd_event_source *x = a, *y = b;
227 assert(x->type == SOURCE_MONOTONIC || x->type == SOURCE_REALTIME);
228 assert(y->type == SOURCE_MONOTONIC || y->type == SOURCE_REALTIME);
230 /* Enabled ones first */
231 if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
233 if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
236 /* Move the pending ones to the end */
237 if (!x->pending && y->pending)
239 if (x->pending && !y->pending)
243 if (x->time.next < y->time.next)
245 if (x->time.next > y->time.next)
248 /* Stability for the rest */
257 static int latest_time_prioq_compare(const void *a, const void *b) {
258 const sd_event_source *x = a, *y = b;
260 assert((x->type == SOURCE_MONOTONIC && y->type == SOURCE_MONOTONIC) ||
261 (x->type == SOURCE_REALTIME && y->type == SOURCE_REALTIME));
263 /* Enabled ones first */
264 if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
266 if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
269 /* Move the pending ones to the end */
270 if (!x->pending && y->pending)
272 if (x->pending && !y->pending)
276 if (x->time.next + x->time.accuracy < y->time.next + y->time.accuracy)
278 if (x->time.next + x->time.accuracy > y->time.next + y->time.accuracy)
281 /* Stability for the rest */
290 static int exit_prioq_compare(const void *a, const void *b) {
291 const sd_event_source *x = a, *y = b;
293 assert(x->type == SOURCE_EXIT);
294 assert(y->type == SOURCE_EXIT);
296 /* Enabled ones first */
297 if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
299 if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
302 /* Lower priority values first */
303 if (x->priority < y->priority)
305 if (x->priority > y->priority)
308 /* Stability for the rest */
317 static void event_free(sd_event *e) {
320 if (e->default_event_ptr)
321 *(e->default_event_ptr) = NULL;
323 if (e->epoll_fd >= 0)
324 close_nointr_nofail(e->epoll_fd);
326 if (e->signal_fd >= 0)
327 close_nointr_nofail(e->signal_fd);
329 if (e->realtime_fd >= 0)
330 close_nointr_nofail(e->realtime_fd);
332 if (e->monotonic_fd >= 0)
333 close_nointr_nofail(e->monotonic_fd);
335 if (e->watchdog_fd >= 0)
336 close_nointr_nofail(e->watchdog_fd);
338 prioq_free(e->pending);
339 prioq_free(e->prepare);
340 prioq_free(e->monotonic_earliest);
341 prioq_free(e->monotonic_latest);
342 prioq_free(e->realtime_earliest);
343 prioq_free(e->realtime_latest);
346 free(e->signal_sources);
348 hashmap_free(e->child_sources);
352 _public_ int sd_event_new(sd_event** ret) {
356 assert_return(ret, -EINVAL);
358 e = new0(sd_event, 1);
363 e->signal_fd = e->realtime_fd = e->monotonic_fd = e->watchdog_fd = e->epoll_fd = -1;
364 e->realtime_next = e->monotonic_next = (usec_t) -1;
365 e->original_pid = getpid();
367 assert_se(sigemptyset(&e->sigset) == 0);
369 e->pending = prioq_new(pending_prioq_compare);
375 e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
376 if (e->epoll_fd < 0) {
389 _public_ sd_event* sd_event_ref(sd_event *e) {
390 assert_return(e, NULL);
392 assert(e->n_ref >= 1);
398 _public_ sd_event* sd_event_unref(sd_event *e) {
403 assert(e->n_ref >= 1);
412 static bool event_pid_changed(sd_event *e) {
415 /* We don't support people creating an event loop and keeping
416 * it around over a fork(). Let's complain. */
418 return e->original_pid != getpid();
421 static int source_io_unregister(sd_event_source *s) {
425 assert(s->type == SOURCE_IO);
427 if (!s->io.registered)
430 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
434 s->io.registered = false;
438 static int source_io_register(
443 struct epoll_event ev = {};
447 assert(s->type == SOURCE_IO);
448 assert(enabled != SD_EVENT_OFF);
453 if (enabled == SD_EVENT_ONESHOT)
454 ev.events |= EPOLLONESHOT;
456 if (s->io.registered)
457 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
459 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
464 s->io.registered = true;
469 static void source_free(sd_event_source *s) {
477 source_io_unregister(s);
481 case SOURCE_MONOTONIC:
482 prioq_remove(s->event->monotonic_earliest, s, &s->time.earliest_index);
483 prioq_remove(s->event->monotonic_latest, s, &s->time.latest_index);
486 case SOURCE_REALTIME:
487 prioq_remove(s->event->realtime_earliest, s, &s->time.earliest_index);
488 prioq_remove(s->event->realtime_latest, s, &s->time.latest_index);
492 if (s->signal.sig > 0) {
493 if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0)
494 assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0);
496 if (s->event->signal_sources)
497 s->event->signal_sources[s->signal.sig] = NULL;
503 if (s->child.pid > 0) {
504 if (s->enabled != SD_EVENT_OFF) {
505 assert(s->event->n_enabled_child_sources > 0);
506 s->event->n_enabled_child_sources--;
509 if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD])
510 assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0);
512 hashmap_remove(s->event->child_sources, INT_TO_PTR(s->child.pid));
522 prioq_remove(s->event->exit, s, &s->exit.prioq_index);
527 prioq_remove(s->event->pending, s, &s->pending_index);
530 prioq_remove(s->event->prepare, s, &s->prepare_index);
532 sd_event_unref(s->event);
538 static int source_set_pending(sd_event_source *s, bool b) {
542 assert(s->type != SOURCE_EXIT);
550 s->pending_iteration = s->event->iteration;
552 r = prioq_put(s->event->pending, s, &s->pending_index);
558 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
560 if (s->type == SOURCE_REALTIME) {
561 prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
562 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
563 } else if (s->type == SOURCE_MONOTONIC) {
564 prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
565 prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
571 static sd_event_source *source_new(sd_event *e, EventSourceType type) {
576 s = new0(sd_event_source, 1);
581 s->event = sd_event_ref(e);
583 s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;
588 _public_ int sd_event_add_io(
592 sd_event_io_handler_t callback,
594 sd_event_source **ret) {
599 assert_return(e, -EINVAL);
600 assert_return(fd >= 0, -EINVAL);
601 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
602 assert_return(callback, -EINVAL);
603 assert_return(ret, -EINVAL);
604 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
605 assert_return(!event_pid_changed(e), -ECHILD);
607 s = source_new(e, SOURCE_IO);
612 s->io.events = events;
613 s->io.callback = callback;
614 s->userdata = userdata;
615 s->enabled = SD_EVENT_ON;
617 r = source_io_register(s, s->enabled, events);
627 static int event_setup_timer_fd(
629 EventSourceType type,
633 struct epoll_event ev = {};
640 if (_likely_(*timer_fd >= 0))
643 fd = timerfd_create(id, TFD_NONBLOCK|TFD_CLOEXEC);
648 ev.data.ptr = INT_TO_PTR(type);
650 r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
652 close_nointr_nofail(fd);
656 /* When we sleep for longer, we try to realign the wakeup to
657 the same time within each minute/second/250ms, so that
658 events all across the system can be coalesced into a single
659 CPU wakeup. However, let's take some system-specific
660 randomness for this value, so that in a network of systems
661 with synced clocks timer events are distributed a
662 bit. Here, we calculate a perturbation usec offset from the
665 if (sd_id128_get_boot(&bootid) >= 0)
666 e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
672 static int event_add_time_internal(
674 EventSourceType type,
681 sd_event_time_handler_t callback,
683 sd_event_source **ret) {
688 assert_return(e, -EINVAL);
689 assert_return(callback, -EINVAL);
690 assert_return(ret, -EINVAL);
691 assert_return(usec != (uint64_t) -1, -EINVAL);
692 assert_return(accuracy != (uint64_t) -1, -EINVAL);
693 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
694 assert_return(!event_pid_changed(e), -ECHILD);
701 *earliest = prioq_new(earliest_time_prioq_compare);
707 *latest = prioq_new(latest_time_prioq_compare);
713 r = event_setup_timer_fd(e, type, timer_fd, id);
718 s = source_new(e, type);
723 s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
724 s->time.callback = callback;
725 s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
726 s->userdata = userdata;
727 s->enabled = SD_EVENT_ONESHOT;
729 r = prioq_put(*earliest, s, &s->time.earliest_index);
733 r = prioq_put(*latest, s, &s->time.latest_index);
745 _public_ int sd_event_add_monotonic(sd_event *e,
748 sd_event_time_handler_t callback,
750 sd_event_source **ret) {
752 return event_add_time_internal(e, SOURCE_MONOTONIC, &e->monotonic_fd, CLOCK_MONOTONIC, &e->monotonic_earliest, &e->monotonic_latest, usec, accuracy, callback, userdata, ret);
755 _public_ int sd_event_add_realtime(sd_event *e,
758 sd_event_time_handler_t callback,
760 sd_event_source **ret) {
762 return event_add_time_internal(e, SOURCE_REALTIME, &e->realtime_fd, CLOCK_REALTIME, &e->realtime_earliest, &e->monotonic_latest, usec, accuracy, callback, userdata, ret);
765 static int event_update_signal_fd(sd_event *e) {
766 struct epoll_event ev = {};
772 add_to_epoll = e->signal_fd < 0;
774 r = signalfd(e->signal_fd, &e->sigset, SFD_NONBLOCK|SFD_CLOEXEC);
784 ev.data.ptr = INT_TO_PTR(SOURCE_SIGNAL);
786 r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->signal_fd, &ev);
788 close_nointr_nofail(e->signal_fd);
797 _public_ int sd_event_add_signal(
800 sd_event_signal_handler_t callback,
802 sd_event_source **ret) {
807 assert_return(e, -EINVAL);
808 assert_return(sig > 0, -EINVAL);
809 assert_return(sig < _NSIG, -EINVAL);
810 assert_return(callback, -EINVAL);
811 assert_return(ret, -EINVAL);
812 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
813 assert_return(!event_pid_changed(e), -ECHILD);
815 if (!e->signal_sources) {
816 e->signal_sources = new0(sd_event_source*, _NSIG);
817 if (!e->signal_sources)
819 } else if (e->signal_sources[sig])
822 s = source_new(e, SOURCE_SIGNAL);
827 s->signal.callback = callback;
828 s->userdata = userdata;
829 s->enabled = SD_EVENT_ON;
831 e->signal_sources[sig] = s;
832 assert_se(sigaddset(&e->sigset, sig) == 0);
834 if (sig != SIGCHLD || e->n_enabled_child_sources == 0) {
835 r = event_update_signal_fd(e);
846 _public_ int sd_event_add_child(
850 sd_event_child_handler_t callback,
852 sd_event_source **ret) {
857 assert_return(e, -EINVAL);
858 assert_return(pid > 1, -EINVAL);
859 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
860 assert_return(options != 0, -EINVAL);
861 assert_return(callback, -EINVAL);
862 assert_return(ret, -EINVAL);
863 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
864 assert_return(!event_pid_changed(e), -ECHILD);
866 r = hashmap_ensure_allocated(&e->child_sources, trivial_hash_func, trivial_compare_func);
870 if (hashmap_contains(e->child_sources, INT_TO_PTR(pid)))
873 s = source_new(e, SOURCE_CHILD);
878 s->child.options = options;
879 s->child.callback = callback;
880 s->userdata = userdata;
881 s->enabled = SD_EVENT_ONESHOT;
883 r = hashmap_put(e->child_sources, INT_TO_PTR(pid), s);
889 e->n_enabled_child_sources ++;
891 assert_se(sigaddset(&e->sigset, SIGCHLD) == 0);
893 if (!e->signal_sources || !e->signal_sources[SIGCHLD]) {
894 r = event_update_signal_fd(e);
901 e->need_process_child = true;
907 _public_ int sd_event_add_defer(
909 sd_event_handler_t callback,
911 sd_event_source **ret) {
916 assert_return(e, -EINVAL);
917 assert_return(callback, -EINVAL);
918 assert_return(ret, -EINVAL);
919 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
920 assert_return(!event_pid_changed(e), -ECHILD);
922 s = source_new(e, SOURCE_DEFER);
926 s->defer.callback = callback;
927 s->userdata = userdata;
928 s->enabled = SD_EVENT_ONESHOT;
930 r = source_set_pending(s, true);
940 _public_ int sd_event_add_exit(
942 sd_event_handler_t callback,
944 sd_event_source **ret) {
949 assert_return(e, -EINVAL);
950 assert_return(callback, -EINVAL);
951 assert_return(ret, -EINVAL);
952 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
953 assert_return(!event_pid_changed(e), -ECHILD);
956 e->exit = prioq_new(exit_prioq_compare);
961 s = source_new(e, SOURCE_EXIT);
965 s->exit.callback = callback;
966 s->userdata = userdata;
967 s->exit.prioq_index = PRIOQ_IDX_NULL;
968 s->enabled = SD_EVENT_ONESHOT;
970 r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
980 _public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
981 assert_return(s, NULL);
983 assert(s->n_ref >= 1);
989 _public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {
994 assert(s->n_ref >= 1);
998 /* Here's a special hack: when we are called from a
999 * dispatch handler we won't free the event source
1000 * immediately, but we will detach the fd from the
1001 * epoll. This way it is safe for the caller to unref
1002 * the event source and immediately close the fd, but
1003 * we still retain a valid event source object after
1006 if (s->dispatching) {
1007 if (s->type == SOURCE_IO)
1008 source_io_unregister(s);
1016 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
1017 assert_return(s, NULL);
1022 _public_ int sd_event_source_get_pending(sd_event_source *s) {
1023 assert_return(s, -EINVAL);
1024 assert_return(s->type != SOURCE_EXIT, -EDOM);
1025 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1026 assert_return(!event_pid_changed(s->event), -ECHILD);
1031 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
1032 assert_return(s, -EINVAL);
1033 assert_return(s->type == SOURCE_IO, -EDOM);
1034 assert_return(!event_pid_changed(s->event), -ECHILD);
1039 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
1040 assert_return(s, -EINVAL);
1041 assert_return(events, -EINVAL);
1042 assert_return(s->type == SOURCE_IO, -EDOM);
1043 assert_return(!event_pid_changed(s->event), -ECHILD);
1045 *events = s->io.events;
1049 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
1052 assert_return(s, -EINVAL);
1053 assert_return(s->type == SOURCE_IO, -EDOM);
1054 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1055 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1056 assert_return(!event_pid_changed(s->event), -ECHILD);
1058 if (s->io.events == events)
1061 if (s->enabled != SD_EVENT_OFF) {
1062 r = source_io_register(s, s->enabled, events);
1067 s->io.events = events;
1068 source_set_pending(s, false);
1073 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
1074 assert_return(s, -EINVAL);
1075 assert_return(revents, -EINVAL);
1076 assert_return(s->type == SOURCE_IO, -EDOM);
1077 assert_return(s->pending, -ENODATA);
1078 assert_return(!event_pid_changed(s->event), -ECHILD);
1080 *revents = s->io.revents;
1084 _public_ int sd_event_source_get_signal(sd_event_source *s) {
1085 assert_return(s, -EINVAL);
1086 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
1087 assert_return(!event_pid_changed(s->event), -ECHILD);
1089 return s->signal.sig;
1092 _public_ int sd_event_source_get_priority(sd_event_source *s, int *priority) {
1093 assert_return(s, -EINVAL);
1094 assert_return(!event_pid_changed(s->event), -ECHILD);
1099 _public_ int sd_event_source_set_priority(sd_event_source *s, int priority) {
1100 assert_return(s, -EINVAL);
1101 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1102 assert_return(!event_pid_changed(s->event), -ECHILD);
1104 if (s->priority == priority)
1107 s->priority = priority;
1110 prioq_reshuffle(s->event->pending, s, &s->pending_index);
1113 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
1115 if (s->type == SOURCE_EXIT)
1116 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1121 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
1122 assert_return(s, -EINVAL);
1123 assert_return(m, -EINVAL);
1124 assert_return(!event_pid_changed(s->event), -ECHILD);
1130 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
1133 assert_return(s, -EINVAL);
1134 assert_return(m == SD_EVENT_OFF || m == SD_EVENT_ON || m == SD_EVENT_ONESHOT, -EINVAL);
1135 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1136 assert_return(!event_pid_changed(s->event), -ECHILD);
1138 if (s->enabled == m)
1141 if (m == SD_EVENT_OFF) {
1146 r = source_io_unregister(s);
1153 case SOURCE_MONOTONIC:
1155 prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
1156 prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
1159 case SOURCE_REALTIME:
1161 prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
1162 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
1167 if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0) {
1168 assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0);
1169 event_update_signal_fd(s->event);
1177 assert(s->event->n_enabled_child_sources > 0);
1178 s->event->n_enabled_child_sources--;
1180 if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) {
1181 assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0);
1182 event_update_signal_fd(s->event);
1189 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1201 r = source_io_register(s, m, s->io.events);
1208 case SOURCE_MONOTONIC:
1210 prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
1211 prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
1214 case SOURCE_REALTIME:
1216 prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
1217 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
1223 if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0) {
1224 assert_se(sigaddset(&s->event->sigset, s->signal.sig) == 0);
1225 event_update_signal_fd(s->event);
1232 if (s->enabled == SD_EVENT_OFF) {
1233 s->event->n_enabled_child_sources++;
1235 if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) {
1236 assert_se(sigaddset(&s->event->sigset, SIGCHLD) == 0);
1237 event_update_signal_fd(s->event);
1244 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1254 prioq_reshuffle(s->event->pending, s, &s->pending_index);
1257 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
1262 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
1263 assert_return(s, -EINVAL);
1264 assert_return(usec, -EINVAL);
1265 assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
1266 assert_return(!event_pid_changed(s->event), -ECHILD);
1268 *usec = s->time.next;
1272 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
1273 assert_return(s, -EINVAL);
1274 assert_return(usec != (uint64_t) -1, -EINVAL);
1275 assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
1276 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1277 assert_return(!event_pid_changed(s->event), -ECHILD);
1279 s->time.next = usec;
1281 source_set_pending(s, false);
1283 if (s->type == SOURCE_REALTIME) {
1284 prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
1285 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
1287 prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
1288 prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
1294 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
1295 assert_return(s, -EINVAL);
1296 assert_return(usec, -EINVAL);
1297 assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
1298 assert_return(!event_pid_changed(s->event), -ECHILD);
1300 *usec = s->time.accuracy;
1304 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
1305 assert_return(s, -EINVAL);
1306 assert_return(usec != (uint64_t) -1, -EINVAL);
1307 assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
1308 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1309 assert_return(!event_pid_changed(s->event), -ECHILD);
1312 usec = DEFAULT_ACCURACY_USEC;
1314 s->time.accuracy = usec;
1316 source_set_pending(s, false);
1318 if (s->type == SOURCE_REALTIME)
1319 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
1321 prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
1326 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
1327 assert_return(s, -EINVAL);
1328 assert_return(pid, -EINVAL);
1329 assert_return(s->type == SOURCE_CHILD, -EDOM);
1330 assert_return(!event_pid_changed(s->event), -ECHILD);
1332 *pid = s->child.pid;
1336 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
1339 assert_return(s, -EINVAL);
1340 assert_return(s->type != SOURCE_EXIT, -EDOM);
1341 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1342 assert_return(!event_pid_changed(s->event), -ECHILD);
1344 if (s->prepare == callback)
1347 if (callback && s->prepare) {
1348 s->prepare = callback;
1352 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
1356 s->prepare = callback;
1359 r = prioq_put(s->event->prepare, s, &s->prepare_index);
1363 prioq_remove(s->event->prepare, s, &s->prepare_index);
1368 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
1369 assert_return(s, NULL);
1374 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
1386 Find a good time to wake up again between times a and b. We
1387 have two goals here:
1389 a) We want to wake up as seldom as possible, hence prefer
1390 later times over earlier times.
1392 b) But if we have to wake up, then let's make sure to
1393 dispatch as much as possible on the entire system.
1395 We implement this by waking up everywhere at the same time
1396 within any given minute if we can, synchronised via the
1397 perturbation value determined from the boot ID. If we can't,
1398 then we try to find the same spot in every 10s, then 1s and
1399 then 250ms step. Otherwise, we pick the last possible time
1403 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
1405 if (_unlikely_(c < USEC_PER_MINUTE))
1408 c -= USEC_PER_MINUTE;
1414 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
1416 if (_unlikely_(c < USEC_PER_SEC*10))
1419 c -= USEC_PER_SEC*10;
1425 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
1427 if (_unlikely_(c < USEC_PER_SEC))
1436 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
1438 if (_unlikely_(c < USEC_PER_MSEC*250))
1441 c -= USEC_PER_MSEC*250;
1450 static int event_arm_timer(
1457 struct itimerspec its = {};
1458 sd_event_source *a, *b;
1465 a = prioq_peek(earliest);
1466 if (!a || a->enabled == SD_EVENT_OFF) {
1471 if (*next == (usec_t) -1)
1475 r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL);
1479 *next = (usec_t) -1;
1484 b = prioq_peek(latest);
1485 assert_se(b && b->enabled != SD_EVENT_OFF);
1487 t = sleep_between(e, a->time.next, b->time.next + b->time.accuracy);
1491 assert_se(timer_fd >= 0);
1494 /* We don't want to disarm here, just indicate some time looooong ago. */
1495 its.it_value.tv_sec = 0;
1496 its.it_value.tv_nsec = 1;
1498 timespec_store(&its.it_value, t);
1500 r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL);
1508 static int process_io(sd_event *e, sd_event_source *s, uint32_t events) {
1511 assert(s->type == SOURCE_IO);
1513 s->io.revents = events;
1515 return source_set_pending(s, true);
1518 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
1525 assert_return(events == EPOLLIN, -EIO);
1527 ss = read(fd, &x, sizeof(x));
1529 if (errno == EAGAIN || errno == EINTR)
1535 if (ss != sizeof(x))
1539 *next = (usec_t) -1;
1544 static int process_timer(
1556 s = prioq_peek(earliest);
1559 s->enabled == SD_EVENT_OFF ||
1563 r = source_set_pending(s, true);
1567 prioq_reshuffle(earliest, s, &s->time.earliest_index);
1568 prioq_reshuffle(latest, s, &s->time.latest_index);
1574 static int process_child(sd_event *e) {
1581 e->need_process_child = false;
1584 So, this is ugly. We iteratively invoke waitid() with P_PID
1585 + WNOHANG for each PID we wait for, instead of using
1586 P_ALL. This is because we only want to get child
1587 information of very specific child processes, and not all
1588 of them. We might not have processed the SIGCHLD event of a
1589 previous invocation and we don't want to maintain an
1590 unbounded *per-child* event queue, hence we really don't
1591 want anything flushed out of the kernel's queue that we
1592 don't care about. Since this is O(n) this means that if you
1593 have a lot of processes you probably want to handle SIGCHLD
1596 We do not reap the children here (by using WNOWAIT), this
1597 is only done after the event source is dispatched so that
1598 the callback still sees the process as a zombie.
1601 HASHMAP_FOREACH(s, e->child_sources, i) {
1602 assert(s->type == SOURCE_CHILD);
1607 if (s->enabled == SD_EVENT_OFF)
1610 zero(s->child.siginfo);
1611 r = waitid(P_PID, s->child.pid, &s->child.siginfo,
1612 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
1616 if (s->child.siginfo.si_pid != 0) {
1618 s->child.siginfo.si_code == CLD_EXITED ||
1619 s->child.siginfo.si_code == CLD_KILLED ||
1620 s->child.siginfo.si_code == CLD_DUMPED;
1622 if (!zombie && (s->child.options & WEXITED)) {
1623 /* If the child isn't dead then let's
1624 * immediately remove the state change
1625 * from the queue, since there's no
1626 * benefit in leaving it queued */
1628 assert(s->child.options & (WSTOPPED|WCONTINUED));
1629 waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
1632 r = source_set_pending(s, true);
1641 static int process_signal(sd_event *e, uint32_t events) {
1642 bool read_one = false;
1646 assert(e->signal_sources);
1648 assert_return(events == EPOLLIN, -EIO);
1651 struct signalfd_siginfo si;
1655 ss = read(e->signal_fd, &si, sizeof(si));
1657 if (errno == EAGAIN || errno == EINTR)
1663 if (ss != sizeof(si))
1668 s = e->signal_sources[si.ssi_signo];
1669 if (si.ssi_signo == SIGCHLD) {
1670 r = process_child(e);
1679 s->signal.siginfo = si;
1680 r = source_set_pending(s, true);
1688 static int source_dispatch(sd_event_source *s) {
1692 assert(s->pending || s->type == SOURCE_EXIT);
1694 if (s->type != SOURCE_DEFER && s->type != SOURCE_EXIT) {
1695 r = source_set_pending(s, false);
1700 if (s->enabled == SD_EVENT_ONESHOT) {
1701 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
1706 s->dispatching = true;
1711 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
1714 case SOURCE_MONOTONIC:
1715 r = s->time.callback(s, s->time.next, s->userdata);
1718 case SOURCE_REALTIME:
1719 r = s->time.callback(s, s->time.next, s->userdata);
1723 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
1726 case SOURCE_CHILD: {
1729 zombie = s->child.siginfo.si_code == CLD_EXITED ||
1730 s->child.siginfo.si_code == CLD_KILLED ||
1731 s->child.siginfo.si_code == CLD_DUMPED;
1733 r = s->child.callback(s, &s->child.siginfo, s->userdata);
1735 /* Now, reap the PID for good. */
1737 waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
1743 r = s->defer.callback(s, s->userdata);
1747 r = s->exit.callback(s, s->userdata);
1751 s->dispatching = false;
1754 log_debug("Event source %p returned error, disabling: %s", s, strerror(-r));
1759 sd_event_source_set_enabled(s, SD_EVENT_OFF);
1764 static int event_prepare(sd_event *e) {
1772 s = prioq_peek(e->prepare);
1773 if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
1776 s->prepare_iteration = e->iteration;
1777 r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
1783 s->dispatching = true;
1784 r = s->prepare(s, s->userdata);
1785 s->dispatching = false;
1788 log_debug("Prepare callback of event source %p returned error, disabling: %s", s, strerror(-r));
1793 sd_event_source_set_enabled(s, SD_EVENT_OFF);
/* Dispatch the highest-priority enabled EXIT source, one per call.
 * When no enabled EXIT sources remain the loop is finished. The state
 * is set to SD_EVENT_EXITING around the callback so code running from
 * it can detect shutdown, then restored to PASSIVE. */
1799 static int dispatch_exit(sd_event *e) {
1805 p = prioq_peek(e->exit);
1806 if (!p || p->enabled == SD_EVENT_OFF) {
1807 e->state = SD_EVENT_FINISHED;
1813 e->state = SD_EVENT_EXITING;
1815 r = source_dispatch(p);
1817 e->state = SD_EVENT_PASSIVE;
/* Return the highest-priority pending event source, or NULL if there
 * is none ready to dispatch. A disabled source at the top of the
 * pending queue means nothing enabled is pending (the prioq ordering
 * presumably sorts disabled entries last — ordering function not
 * visible here). */
1823 static sd_event_source* event_next_pending(sd_event *e) {
1828 p = prioq_peek(e->pending);
1832 if (p->enabled == SD_EVENT_OFF)
/* Program the watchdog timerfd (absolute CLOCK_MONOTONIC time) to fire
 * somewhere in the window [last ping + period/2, last ping + 3/4
 * period], letting sleep_between() pick a point that coalesces with
 * other wakeups. This guarantees we ping the service manager well
 * before the watchdog period elapses. */
1838 static int arm_watchdog(sd_event *e) {
/* Zero-initialized: it_interval stays 0, i.e. a one-shot timer that is
 * re-armed after each ping. */
1839 struct itimerspec its = {};
1844 assert(e->watchdog_fd >= 0);
1846 t = sleep_between(e,
1847 e->watchdog_last + (e->watchdog_period / 2),
1848 e->watchdog_last + (e->watchdog_period * 3 / 4));
1850 timespec_store(&its.it_value, t);
/* TFD_TIMER_ABSTIME: 't' is an absolute monotonic deadline, not a
 * relative delay. */
1852 r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
/* Send a keep-alive ping to the service manager and re-arm the
 * watchdog timer. Pings are rate-limited: nothing is sent until at
 * least a quarter of the watchdog period has passed since the last
 * ping, based on the iteration timestamp taken after epoll_wait(). */
1859 static int process_watchdog(sd_event *e) {
1865 /* Don't notify watchdog too often */
1866 if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
1869 sd_notify(false, "WATCHDOG=1");
1870 e->watchdog_last = e->timestamp.monotonic;
1872 return arm_watchdog(e);
/* Run a single iteration of the event loop:
 *   1. honor a pending exit request (dispatch EXIT sources instead),
 *   2. run prepare callbacks and arm both clock timerfds,
 *   3. wait for events via epoll_wait() — 'timeout' is in usec,
 *      (uint64_t) -1 means wait forever, and it is rounded UP to
 *      whole milliseconds for epoll,
 *   4. process the fired fds (timers, signals, watchdog, io),
 *      elapsed timers and SIGCHLD state,
 *   5. dispatch exactly one pending event source.
 * May only be called while the loop is PASSIVE (not re-entrantly from
 * a callback) and in the process that created the event loop. */
1875 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
1876 struct epoll_event ev_queue[EPOLL_QUEUE_MAX];
1880 assert_return(e, -EINVAL);
1881 assert_return(!event_pid_changed(e), -ECHILD);
1882 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1883 assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY);
/* Once exit has been requested we only unwind EXIT sources. */
1885 if (e->exit_requested)
1886 return dispatch_exit(e);
1890 e->state = SD_EVENT_RUNNING;
1892 r = event_prepare(e);
/* Arm one timerfd per clock from its earliest/latest prioq pair, so
 * the epoll wakeup lands inside the allowed accuracy window. */
1896 r = event_arm_timer(e, e->monotonic_fd, e->monotonic_earliest, e->monotonic_latest, &e->monotonic_next);
1900 r = event_arm_timer(e, e->realtime_fd, e->realtime_earliest, e->realtime_latest, &e->realtime_next);
/* If something is already dispatchable, don't block in epoll
 * (NOTE(review): the branch body that shortens the timeout is elided
 * from this view — confirm it forces a zero timeout). */
1904 if (event_next_pending(e) || e->need_process_child)
1907 m = epoll_wait(e->epoll_fd, ev_queue, EPOLL_QUEUE_MAX,
1908 timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
/* EAGAIN/EINTR are not errors: treat as "no events this round". */
1910 r = errno == EAGAIN || errno == EINTR ? 0 : -errno;
/* One timestamp pair per iteration; all timer/watchdog processing
 * below uses this snapshot, and the sd_event_get_now_*() accessors
 * report it. */
1914 dual_timestamp_get(&e->timestamp);
1916 for (i = 0; i < m; i++) {
/* Internal fds are tagged with a small SOURCE_* enum value as the
 * epoll data pointer; anything else is a real io source pointer. */
1918 if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_MONOTONIC))
1919 r = flush_timer(e, e->monotonic_fd, ev_queue[i].events, &e->monotonic_next);
1920 else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_REALTIME))
1921 r = flush_timer(e, e->realtime_fd, ev_queue[i].events, &e->realtime_next);
1922 else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_SIGNAL))
1923 r = process_signal(e, ev_queue[i].events);
1924 else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
1925 r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
1927 r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
1933 r = process_watchdog(e);
/* Mark elapsed timer sources pending, per clock. */
1937 r = process_timer(e, e->timestamp.monotonic, e->monotonic_earliest, e->monotonic_latest);
1941 r = process_timer(e, e->timestamp.realtime, e->realtime_earliest, e->realtime_latest);
1945 if (e->need_process_child) {
1946 r = process_child(e);
/* Dispatch a single source per iteration; remaining pending sources
 * are picked up by subsequent sd_event_run() calls. */
1951 p = event_next_pending(e);
1957 r = source_dispatch(p);
1960 e->state = SD_EVENT_PASSIVE;
/* Run the event loop to completion: iterate sd_event_run() with an
 * infinite timeout until an exit request drives the state to
 * SD_EVENT_FINISHED. Must be entered from the PASSIVE state. */
1966 _public_ int sd_event_loop(sd_event *e) {
1969 assert_return(e, -EINVAL);
1970 assert_return(!event_pid_changed(e), -ECHILD);
1971 assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY);
1975 while (e->state != SD_EVENT_FINISHED) {
1976 r = sd_event_run(e, (uint64_t) -1);
/* Return the loop's current state (SD_EVENT_PASSIVE, _RUNNING,
 * _EXITING, _FINISHED, ...), or a negative errno on misuse. */
1988 _public_ int sd_event_get_state(sd_event *e) {
1989 assert_return(e, -EINVAL);
1990 assert_return(!event_pid_changed(e), -ECHILD);
/* Retrieve the exit code passed to sd_event_exit(). Only valid once
 * an exit has actually been requested; the !exit_requested branch
 * (error return elided from this view) rejects earlier calls. */
1995 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
1996 assert_return(e, -EINVAL);
1997 assert_return(code, -EINVAL);
1998 assert_return(!event_pid_changed(e), -ECHILD);
2000 if (!e->exit_requested)
2003 *code = e->exit_code;
/* Request loop termination with the given exit code. Takes effect on
 * the next sd_event_run() iteration, which then dispatches EXIT
 * sources instead of regular events (see dispatch_exit()). Safe to
 * call from within a callback. */
2007 _public_ int sd_event_exit(sd_event *e, int code) {
2008 assert_return(e, -EINVAL);
2009 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2010 assert_return(!event_pid_changed(e), -ECHILD);
2012 e->exit_requested = true;
2013 e->exit_code = code;
/* Return the CLOCK_REALTIME timestamp of the current loop iteration
 * (snapshot taken right after epoll_wait() in sd_event_run()), in
 * usec. Fails with -ENODATA before the first iteration, when no
 * timestamp has been taken yet. */
2018 _public_ int sd_event_get_now_realtime(sd_event *e, uint64_t *usec) {
2019 assert_return(e, -EINVAL);
2020 assert_return(usec, -EINVAL);
2021 assert_return(dual_timestamp_is_set(&e->timestamp), -ENODATA);
2022 assert_return(!event_pid_changed(e), -ECHILD);
2024 *usec = e->timestamp.realtime;
/* Same as sd_event_get_now_realtime(), but returns the
 * CLOCK_MONOTONIC half of the per-iteration timestamp snapshot. */
2028 _public_ int sd_event_get_now_monotonic(sd_event *e, uint64_t *usec) {
2029 assert_return(e, -EINVAL);
2030 assert_return(usec, -EINVAL);
2031 assert_return(dual_timestamp_is_set(&e->timestamp), -ENODATA);
2032 assert_return(!event_pid_changed(e), -ECHILD);
2034 *usec = e->timestamp.monotonic;
/* Return the calling thread's default event loop, creating it on
 * first use (one loop per thread via __thread storage). With a NULL
 * 'ret' it merely reports whether a default loop already exists.
 * An existing loop is returned with an extra reference; the new loop
 * records &default_event so it can clear the slot when freed
 * (NOTE(review): the clearing code is outside this view — confirm). */
2038 _public_ int sd_event_default(sd_event **ret) {
2040 static __thread sd_event *default_event = NULL;
/* Query mode: 1 if a default loop exists for this thread, else 0. */
2045 return !!default_event;
2047 if (default_event) {
2048 *ret = sd_event_ref(default_event);
2052 r = sd_event_new(&e);
2056 e->default_event_ptr = &default_event;
/* Report the thread ID the event loop is bound to. The success path
 * (returning the stored tid) and the no-tid error path are elided
 * from this view. */
2064 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
2065 assert_return(e, -EINVAL);
2066 assert_return(tid, -EINVAL);
2067 assert_return(!event_pid_changed(e), -ECHILD);
2077 _public_ int sd_event_set_watchdog(sd_event *e, int b) {
2080 assert_return(e, -EINVAL);
2082 if (e->watchdog == !!b)
2086 struct epoll_event ev = {};
2089 env = getenv("WATCHDOG_USEC");
2093 r = safe_atou64(env, &e->watchdog_period);
2096 if (e->watchdog_period <= 0)
2099 /* Issue first ping immediately */
2100 sd_notify(false, "WATCHDOG=1");
2101 e->watchdog_last = now(CLOCK_MONOTONIC);
2103 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
2104 if (e->watchdog_fd < 0)
2107 r = arm_watchdog(e);
2111 ev.events = EPOLLIN;
2112 ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG);
2114 r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
2121 if (e->watchdog_fd >= 0) {
2122 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
2123 close_nointr_nofail(e->watchdog_fd);
2124 e->watchdog_fd = -1;
2132 close_nointr_nofail(e->watchdog_fd);
2133 e->watchdog_fd = -1;