/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2013 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <sys/epoll.h>
#include <sys/timerfd.h>

#include "sd-daemon.h"

#include "time-util.h"

#define EPOLL_QUEUE_MAX 64
#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
typedef enum EventSourceType {

struct sd_event_source {
        sd_event_handler_t prepare;

        EventSourceType type:4;

        unsigned pending_index;
        unsigned prepare_index;
        unsigned pending_iteration;
        unsigned prepare_iteration;

                sd_event_io_handler_t callback;

                sd_event_time_handler_t callback;
                usec_t next, accuracy;
                unsigned earliest_index;
                unsigned latest_index;

                sd_event_signal_handler_t callback;
                struct signalfd_siginfo siginfo;

                sd_event_child_handler_t callback;

                sd_event_handler_t callback;

                sd_event_handler_t callback;
        /* For both clocks we maintain two priority queues each, one
         * ordered for the earliest times the events may be
         * dispatched, and one ordered by the latest times they must
         * have been dispatched. The range between the top entries in
         * the two prioqs is the time window we can freely schedule
         * wakeups in */
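        /* Worked example (editor's illustration, values hypothetical):
         * with two armed sources A (next=100ms, accuracy=50ms) and B
         * (next=120ms, accuracy=300ms), the earliest prioq peeks A at
         * 100ms and the latest prioq also peeks A, at 100+50=150ms
         * (which beats B's 120+300=420ms deadline). Any single wakeup
         * in [100ms, 150ms] therefore meets A's deadline and can take
         * B (due from 120ms) along in the same wakeup. */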
        Prioq *monotonic_earliest;
        Prioq *monotonic_latest;
        Prioq *realtime_earliest;
        Prioq *realtime_latest;

        usec_t realtime_next, monotonic_next;

        sd_event_source **signal_sources;

        Hashmap *child_sources;
        unsigned n_enabled_child_sources;

        dual_timestamp timestamp;

        bool exit_requested:1;
        bool need_process_child:1;

        sd_event **default_event_ptr;

        usec_t watchdog_last, watchdog_period;
static int pending_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;

        /* Enabled ones first */
        if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
                return -1;
        if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
                return 1;

        /* Lower priority values first */
        if (x->priority < y->priority)
                return -1;
        if (x->priority > y->priority)
                return 1;

        /* Older entries first */
        if (x->pending_iteration < y->pending_iteration)
                return -1;
        if (x->pending_iteration > y->pending_iteration)
                return 1;

        /* Stability for the rest */
static int prepare_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;

        /* Move most recently prepared ones last, so that we can stop
         * preparing as soon as we hit one that has already been
         * prepared in the current iteration */
        if (x->prepare_iteration < y->prepare_iteration)
                return -1;
        if (x->prepare_iteration > y->prepare_iteration)
                return 1;

        /* Enabled ones first */
        if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
                return -1;
        if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
                return 1;

        /* Lower priority values first */
        if (x->priority < y->priority)
                return -1;
        if (x->priority > y->priority)
                return 1;

        /* Stability for the rest */
static int earliest_time_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;

        assert(x->type == SOURCE_MONOTONIC || x->type == SOURCE_REALTIME);
        assert(y->type == SOURCE_MONOTONIC || y->type == SOURCE_REALTIME);

        /* Enabled ones first */
        if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
                return -1;
        if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
                return 1;

        /* Move the pending ones to the end */
        if (!x->pending && y->pending)
                return -1;
        if (x->pending && !y->pending)
                return 1;

        if (x->time.next < y->time.next)
                return -1;
        if (x->time.next > y->time.next)
                return 1;

        /* Stability for the rest */
static int latest_time_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;

        assert((x->type == SOURCE_MONOTONIC && y->type == SOURCE_MONOTONIC) ||
               (x->type == SOURCE_REALTIME && y->type == SOURCE_REALTIME));

        /* Enabled ones first */
        if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
                return -1;
        if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
                return 1;

        /* Move the pending ones to the end */
        if (!x->pending && y->pending)
                return -1;
        if (x->pending && !y->pending)
                return 1;

        if (x->time.next + x->time.accuracy < y->time.next + y->time.accuracy)
                return -1;
        if (x->time.next + x->time.accuracy > y->time.next + y->time.accuracy)
                return 1;

        /* Stability for the rest */
static int exit_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;

        assert(x->type == SOURCE_EXIT);
        assert(y->type == SOURCE_EXIT);

        /* Enabled ones first */
        if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
                return -1;
        if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
                return 1;

        /* Lower priority values first */
        if (x->priority < y->priority)
                return -1;
        if (x->priority > y->priority)
                return 1;

        /* Stability for the rest */
static void event_free(sd_event *e) {

        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        if (e->epoll_fd >= 0)
                close_nointr_nofail(e->epoll_fd);

        if (e->signal_fd >= 0)
                close_nointr_nofail(e->signal_fd);

        if (e->realtime_fd >= 0)
                close_nointr_nofail(e->realtime_fd);

        if (e->monotonic_fd >= 0)
                close_nointr_nofail(e->monotonic_fd);

        if (e->watchdog_fd >= 0)
                close_nointr_nofail(e->watchdog_fd);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->monotonic_earliest);
        prioq_free(e->monotonic_latest);
        prioq_free(e->realtime_earliest);
        prioq_free(e->realtime_latest);

        free(e->signal_sources);

        hashmap_free(e->child_sources);
_public_ int sd_event_new(sd_event** ret) {

        assert_return(ret, -EINVAL);

        e = new0(sd_event, 1);

        e->signal_fd = e->realtime_fd = e->monotonic_fd = e->watchdog_fd = e->epoll_fd = -1;
        e->realtime_next = e->monotonic_next = (usec_t) -1;
        e->original_pid = getpid();

        assert_se(sigemptyset(&e->sigset) == 0);

        e->pending = prioq_new(pending_prioq_compare);

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
_public_ sd_event* sd_event_ref(sd_event *e) {
        assert_return(e, NULL);

        assert(e->n_ref >= 1);

_public_ sd_event* sd_event_unref(sd_event *e) {

        assert(e->n_ref >= 1);
static bool event_pid_changed(sd_event *e) {

        /* We don't support people creating an event loop and keeping
         * it around over a fork(). Let's complain. */

        return e->original_pid != getpid();
static int source_io_unregister(sd_event_source *s) {

        assert(s->type == SOURCE_IO);

        if (!s->io.registered)
                return 0;

        r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);

        s->io.registered = false;

static int source_io_register(
                sd_event_source *s,
                int enabled,
                uint32_t events) {

        struct epoll_event ev = {};

        assert(s->type == SOURCE_IO);
        assert(enabled != SD_EVENT_OFF);

        if (enabled == SD_EVENT_ONESHOT)
                ev.events |= EPOLLONESHOT;

        if (s->io.registered)
                r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
        else
                r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);

        s->io.registered = true;
static void source_free(sd_event_source *s) {

        source_io_unregister(s);

        case SOURCE_MONOTONIC:
                prioq_remove(s->event->monotonic_earliest, s, &s->time.earliest_index);
                prioq_remove(s->event->monotonic_latest, s, &s->time.latest_index);

        case SOURCE_REALTIME:
                prioq_remove(s->event->realtime_earliest, s, &s->time.earliest_index);
                prioq_remove(s->event->realtime_latest, s, &s->time.latest_index);

        if (s->signal.sig > 0) {
                if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0)
                        assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0);

                if (s->event->signal_sources)
                        s->event->signal_sources[s->signal.sig] = NULL;

        if (s->child.pid > 0) {
                if (s->enabled != SD_EVENT_OFF) {
                        assert(s->event->n_enabled_child_sources > 0);
                        s->event->n_enabled_child_sources--;
                }

                if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD])
                        assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0);

                hashmap_remove(s->event->child_sources, INT_TO_PTR(s->child.pid));

        prioq_remove(s->event->exit, s, &s->exit.prioq_index);

        prioq_remove(s->event->pending, s, &s->pending_index);

        prioq_remove(s->event->prepare, s, &s->prepare_index);

        sd_event_unref(s->event);
static int source_set_pending(sd_event_source *s, bool b) {

        assert(s->type != SOURCE_EXIT);

        s->pending_iteration = s->event->iteration;

        r = prioq_put(s->event->pending, s, &s->pending_index);

        assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (s->type == SOURCE_REALTIME) {
                prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
                prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
        } else if (s->type == SOURCE_MONOTONIC) {
                prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
                prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
static sd_event_source *source_new(sd_event *e, EventSourceType type) {

        s = new0(sd_event_source, 1);

        s->event = sd_event_ref(e);

        s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;
_public_ int sd_event_add_io(
                sd_event *e,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata,
                sd_event_source **ret) {

        assert_return(e, -EINVAL);
        assert_return(fd >= 0, -EINVAL);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        s = source_new(e, SOURCE_IO);

        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
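/* Editor's sketch (not part of the original file): minimal caller-side use
 * of the IO API above. The handler name and the watched fd are hypothetical;
 * the argument order matches sd_event_add_io() as declared here. Guarded
 * with #if 0 so it cannot interfere with the real code. */
#if 0
static int my_io_handler(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        /* Called once fd becomes readable; returning < 0 disables the source. */
        return 0;
}

static int watch_fd(sd_event *e, int fd) {
        sd_event_source *s;

        /* Registers fd for EPOLLIN; the source starts out SD_EVENT_ON. */
        return sd_event_add_io(e, fd, EPOLLIN, my_io_handler, NULL, &s);
}
#endif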
static int event_setup_timer_fd(
                sd_event *e,
                EventSourceType type,
                int *timer_fd,
                clockid_t id) {

        struct epoll_event ev = {};

        if (_likely_(*timer_fd >= 0))
                return 0;

        fd = timerfd_create(id, TFD_NONBLOCK|TFD_CLOEXEC);

        ev.data.ptr = INT_TO_PTR(type);

        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
        if (r < 0) {
                close_nointr_nofail(fd);

        /* When we sleep for longer, we try to realign the wakeup to
           the same time within each minute/second/250ms, so that
           events all across the system can be coalesced into a single
           CPU wakeup. However, let's take some system-specific
           randomness for this value, so that in a network of systems
           with synced clocks timer events are distributed a
           bit. Here, we calculate a perturbation usec offset from the
           boot ID. */

        if (sd_id128_get_boot(&bootid) >= 0)
                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
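        /* Editor's note, worked example (values hypothetical): if the XOR
         * of the two 64-bit boot-ID halves is 1234567890123, then
         * perturb = 1234567890123 % USEC_PER_MINUTE (60000000) = 7890123,
         * so this machine aims its coalesced wakeups at ~7.89s past each
         * minute, while other machines land on their own boot-ID-derived
         * offsets. */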
static int event_add_time_internal(
                sd_event *e,
                EventSourceType type,
                int *timer_fd,
                clockid_t id,
                Prioq **earliest,
                Prioq **latest,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata,
                sd_event_source **ret) {

        assert_return(e, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(usec != (uint64_t) -1, -EINVAL);
        assert_return(accuracy != (uint64_t) -1, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        *earliest = prioq_new(earliest_time_prioq_compare);

        *latest = prioq_new(latest_time_prioq_compare);

        r = event_setup_timer_fd(e, type, timer_fd, id);

        s = source_new(e, type);

        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = prioq_put(*earliest, s, &s->time.earliest_index);

        r = prioq_put(*latest, s, &s->time.latest_index);
_public_ int sd_event_add_monotonic(sd_event *e,
                                    uint64_t usec,
                                    uint64_t accuracy,
                                    sd_event_time_handler_t callback,
                                    void *userdata,
                                    sd_event_source **ret) {

        return event_add_time_internal(e, SOURCE_MONOTONIC, &e->monotonic_fd, CLOCK_MONOTONIC, &e->monotonic_earliest, &e->monotonic_latest, usec, accuracy, callback, userdata, ret);

_public_ int sd_event_add_realtime(sd_event *e,
                                   uint64_t usec,
                                   uint64_t accuracy,
                                   sd_event_time_handler_t callback,
                                   void *userdata,
                                   sd_event_source **ret) {

        return event_add_time_internal(e, SOURCE_REALTIME, &e->realtime_fd, CLOCK_REALTIME, &e->realtime_earliest, &e->realtime_latest, usec, accuracy, callback, userdata, ret);
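/* Editor's sketch (not part of the original file): arming a one-shot
 * monotonic timer 5s from now. now() is systemd's internal clock helper
 * from time-util.h; the handler name is hypothetical. */
#if 0
static int my_time_handler(sd_event_source *s, uint64_t usec, void *userdata) {
        /* usec is the originally scheduled elapse time, not the dispatch time. */
        return 0;
}

static int arm_timeout(sd_event *e) {
        sd_event_source *s;

        /* accuracy=0 selects DEFAULT_ACCURACY_USEC (250ms), widening the
         * window in which this wakeup may be coalesced with others. */
        return sd_event_add_monotonic(e, now(CLOCK_MONOTONIC) + 5 * USEC_PER_SEC,
                                      0, my_time_handler, NULL, &s);
}
#endif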
static int event_update_signal_fd(sd_event *e) {
        struct epoll_event ev = {};

        add_to_epoll = e->signal_fd < 0;

        r = signalfd(e->signal_fd, &e->sigset, SFD_NONBLOCK|SFD_CLOEXEC);

        ev.data.ptr = INT_TO_PTR(SOURCE_SIGNAL);

        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->signal_fd, &ev);
        if (r < 0) {
                close_nointr_nofail(e->signal_fd);
_public_ int sd_event_add_signal(
                sd_event *e,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata,
                sd_event_source **ret) {

        assert_return(e, -EINVAL);
        assert_return(sig > 0, -EINVAL);
        assert_return(sig < _NSIG, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, SOURCE_SIGNAL);

        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;
        assert_se(sigaddset(&e->sigset, sig) == 0);

        if (sig != SIGCHLD || e->n_enabled_child_sources == 0) {
                r = event_update_signal_fd(e);
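/* Editor's sketch (not part of the original file): signalfd only delivers
 * signals that are blocked for the process, so the caller is expected to
 * block the signal before registering it here. The handler name is
 * hypothetical. */
#if 0
static int my_signal_handler(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        /* Ask the loop to terminate cleanly on SIGTERM. */
        return sd_event_exit(sd_event_source_get_event(s), 0);
}

static int watch_sigterm(sd_event *e) {
        sigset_t ss;
        sd_event_source *s;

        sigemptyset(&ss);
        sigaddset(&ss, SIGTERM);
        if (sigprocmask(SIG_BLOCK, &ss, NULL) < 0)
                return -errno;

        return sd_event_add_signal(e, SIGTERM, my_signal_handler, NULL, &s);
}
#endif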
_public_ int sd_event_add_child(
                sd_event *e,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata,
                sd_event_source **ret) {

        assert_return(e, -EINVAL);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        r = hashmap_ensure_allocated(&e->child_sources, trivial_hash_func, trivial_compare_func);

        if (hashmap_contains(e->child_sources, INT_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, SOURCE_CHILD);

        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = hashmap_put(e->child_sources, INT_TO_PTR(pid), s);

        e->n_enabled_child_sources++;

        assert_se(sigaddset(&e->sigset, SIGCHLD) == 0);

        if (!e->signal_sources || !e->signal_sources[SIGCHLD]) {
                r = event_update_signal_fd(e);

        e->need_process_child = true;
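/* Editor's sketch (not part of the original file): SIGCHLD must be blocked
 * before child sources can be delivered through the signalfd; pid and the
 * handler name are hypothetical. The WNOWAIT logic in process_child() keeps
 * the child a zombie until the callback below has run. */
#if 0
static int my_child_handler(sd_event_source *s, const siginfo_t *si, void *userdata) {
        /* si->si_status carries the exit code when si->si_code == CLD_EXITED. */
        return 0;
}

static int watch_child(sd_event *e, pid_t pid) {
        sigset_t ss;
        sd_event_source *s;

        sigemptyset(&ss);
        sigaddset(&ss, SIGCHLD);
        if (sigprocmask(SIG_BLOCK, &ss, NULL) < 0)
                return -errno;

        return sd_event_add_child(e, pid, WEXITED, my_child_handler, NULL, &s);
}
#endif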
_public_ int sd_event_add_defer(
                sd_event *e,
                sd_event_handler_t callback,
                void *userdata,
                sd_event_source **ret) {

        assert_return(e, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        s = source_new(e, SOURCE_DEFER);

        s->defer.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = source_set_pending(s, true);
_public_ int sd_event_add_exit(
                sd_event *e,
                sd_event_handler_t callback,
                void *userdata,
                sd_event_source **ret) {

        assert_return(e, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        e->exit = prioq_new(exit_prioq_compare);

        s = source_new(e, SOURCE_EXIT);

        s->exit.callback = callback;
        s->userdata = userdata;
        s->exit.prioq_index = PRIOQ_IDX_NULL;
        s->enabled = SD_EVENT_ONESHOT;

        r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
_public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
        assert_return(s, NULL);

        assert(s->n_ref >= 1);

_public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {

        assert(s->n_ref >= 1);

_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
        assert_return(s, NULL);

_public_ int sd_event_source_get_pending(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
        assert_return(s, -EINVAL);
        assert_return(events, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *events = s->io.events;
_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->io.events == events)
                return 0;

        if (s->enabled != SD_EVENT_OFF) {
                r = source_io_register(s, s->enabled, events);

        s->io.events = events;
        source_set_pending(s, false);

_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
        assert_return(s, -EINVAL);
        assert_return(revents, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(s->pending, -ENODATA);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *revents = s->io.revents;
_public_ int sd_event_source_get_signal(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->signal.sig;

_public_ int sd_event_source_get_priority(sd_event_source *s, int *priority) {
        assert_return(s, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

_public_ int sd_event_source_set_priority(sd_event_source *s, int priority) {
        assert_return(s, -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->priority == priority)
                return 0;

        s->priority = priority;

        prioq_reshuffle(s->event->pending, s, &s->pending_index);

        prioq_reshuffle(s->event->prepare, s, &s->prepare_index);

        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

_public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
        assert_return(s, -EINVAL);
        assert_return(m, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);
_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {

        assert_return(s, -EINVAL);
        assert_return(m == SD_EVENT_OFF || m == SD_EVENT_ON || m == SD_EVENT_ONESHOT, -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->enabled == m)
                return 0;

        if (m == SD_EVENT_OFF) {

                r = source_io_unregister(s);

                case SOURCE_MONOTONIC:
                        prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);

                case SOURCE_REALTIME:
                        prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);

                        if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0) {
                                assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0);
                                event_update_signal_fd(s->event);

                        assert(s->event->n_enabled_child_sources > 0);
                        s->event->n_enabled_child_sources--;

                        if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) {
                                assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0);
                                event_update_signal_fd(s->event);

                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

                r = source_io_register(s, m, s->io.events);

                case SOURCE_MONOTONIC:
                        prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);

                case SOURCE_REALTIME:
                        prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);

                        if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0) {
                                assert_se(sigaddset(&s->event->sigset, s->signal.sig) == 0);
                                event_update_signal_fd(s->event);

                        if (s->enabled == SD_EVENT_OFF) {
                                s->event->n_enabled_child_sources++;

                                if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) {
                                        assert_se(sigaddset(&s->event->sigset, SIGCHLD) == 0);
                                        event_update_signal_fd(s->event);

                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        prioq_reshuffle(s->event->pending, s, &s->pending_index);

        prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.next;

_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {

        assert_return(s, -EINVAL);
        assert_return(usec != (uint64_t) -1, -EINVAL);
        assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        s->time.next = usec;

        source_set_pending(s, false);

        if (s->type == SOURCE_REALTIME) {
                prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
                prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
        } else {
                prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
                prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.accuracy;

_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {

        assert_return(s, -EINVAL);
        assert_return(usec != (uint64_t) -1, -EINVAL);
        assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (usec == 0)
                usec = DEFAULT_ACCURACY_USEC;

        s->time.accuracy = usec;

        source_set_pending(s, false);

        if (s->type == SOURCE_REALTIME)
                prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
        else
                prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
        assert_return(s, -EINVAL);
        assert_return(pid, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *pid = s->child.pid;

_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {

        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->prepare == callback)
                return 0;

        if (callback && s->prepare) {
                s->prepare = callback;
                return 0;
        }

        r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);

        s->prepare = callback;

        if (callback) {
                r = prioq_put(s->event->prepare, s, &s->prepare_index);
        } else
                prioq_remove(s->event->prepare, s, &s->prepare_index);

_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
        assert_return(s, NULL);
static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {

        /*
          Find a good time to wake up again between times a and b. We
          have two goals here:

          a) We want to wake up as seldom as possible, hence prefer
             later times over earlier times.

          b) But if we have to wake up, then let's make sure to
             dispatch as much as possible on the entire system.

          We implement this by waking up everywhere at the same time
          within any given minute if we can, synchronised via the
          perturbation value determined from the boot ID. If we can't,
          then we try to find the same spot within every 10s, then
          every 1s and then every 250ms step. Otherwise, we pick the
          last possible time to wake up.
        */

        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;

        if (_unlikely_(c < USEC_PER_MINUTE))
                return b;

        c -= USEC_PER_MINUTE;

        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));

        if (_unlikely_(c < USEC_PER_SEC*10))
                return b;

        c -= USEC_PER_SEC*10;

        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);

        if (_unlikely_(c < USEC_PER_SEC))
                return b;

        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));

        if (_unlikely_(c < USEC_PER_MSEC*250))
                return b;

        c -= USEC_PER_MSEC*250;
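/* Editor's note, worked example (values hypothetical): with perturb = 42s,
 * a = 12.3s and b = 75s, the minute pass computes
 * c = (75s/60s)*60s + 42s = 102s, which overshoots b, so one minute is
 * subtracted, giving c = 42s; that lies within [a, b] and is returned.
 * Every machine that derived the same perturbation from its boot ID thus
 * wakes at second 42 of each minute, coalescing timer wakeups. */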
static int event_arm_timer(
                sd_event *e,
                int timer_fd,
                Prioq *earliest,
                Prioq *latest,
                usec_t *next) {

        struct itimerspec its = {};
        sd_event_source *a, *b;

        a = prioq_peek(earliest);
        if (!a || a->enabled == SD_EVENT_OFF) {

                if (*next == (usec_t) -1)
                        return 0;

                r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL);

                *next = (usec_t) -1;

        b = prioq_peek(latest);
        assert_se(b && b->enabled != SD_EVENT_OFF);

        t = sleep_between(e, a->time.next, b->time.next + b->time.accuracy);

        assert_se(timer_fd >= 0);

        if (t == 0) {
                /* We don't want to disarm here, just set an elapse
                 * time far in the past. */
                its.it_value.tv_sec = 0;
                its.it_value.tv_nsec = 1;
        } else
                timespec_store(&its.it_value, t);

        r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL);
static int process_io(sd_event *e, sd_event_source *s, uint32_t events) {

        assert(s->type == SOURCE_IO);

        s->io.revents = events;

        return source_set_pending(s, true);

static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {

        assert_return(events == EPOLLIN, -EIO);

        ss = read(fd, &x, sizeof(x));
        if (ss < 0) {
                if (errno == EAGAIN || errno == EINTR)
                        return 0;

        if (ss != sizeof(x))
                return -EIO;

        if (next)
                *next = (usec_t) -1;
static int process_timer(
                sd_event *e,
                usec_t n,
                Prioq *earliest,
                Prioq *latest) {

        for (;;) {
                s = prioq_peek(earliest);
                if (!s ||
                    s->time.next > n ||
                    s->enabled == SD_EVENT_OFF ||
                    s->pending)
                        break;

                r = source_set_pending(s, true);

                prioq_reshuffle(earliest, s, &s->time.earliest_index);
                prioq_reshuffle(latest, s, &s->time.latest_index);
static int process_child(sd_event *e) {

        e->need_process_child = false;

        /*
           So, this is ugly. We iteratively invoke waitid() with P_PID
           + WNOHANG for each PID we wait for, instead of using
           P_ALL. This is because we only want to get child
           information of very specific child processes, and not all
           of them. We might not have processed the SIGCHLD event of a
           previous invocation and we don't want to maintain an
           unbounded *per-child* event queue, hence we really don't
           want anything flushed out of the kernel's queue that we
           don't care about. Since this is O(n) this means that if you
           have a lot of processes you probably want to handle SIGCHLD
           yourself.

           We do not reap the children here (by using WNOWAIT), this
           is only done after the event source is dispatched so that
           the callback still sees the process as a zombie.
        */

        HASHMAP_FOREACH(s, e->child_sources, i) {
                assert(s->type == SOURCE_CHILD);

                if (s->enabled == SD_EVENT_OFF)
                        continue;

                zero(s->child.siginfo);
                r = waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);

                if (s->child.siginfo.si_pid != 0) {
                        bool zombie =
                                s->child.siginfo.si_code == CLD_EXITED ||
                                s->child.siginfo.si_code == CLD_KILLED ||
                                s->child.siginfo.si_code == CLD_DUMPED;

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's
                                 * immediately remove the state change
                                 * from the queue, since there's no
                                 * benefit in leaving it queued */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
static int process_signal(sd_event *e, uint32_t events) {
        bool read_one = false;

        assert(e->signal_sources);

        assert_return(events == EPOLLIN, -EIO);

        for (;;) {
                struct signalfd_siginfo si;

                ss = read(e->signal_fd, &si, sizeof(si));
                if (ss < 0) {
                        if (errno == EAGAIN || errno == EINTR)
                                return read_one;

                if (ss != sizeof(si))
                        return -EIO;

                s = e->signal_sources[si.ssi_signo];
                if (si.ssi_signo == SIGCHLD) {
                        r = process_child(e);

                s->signal.siginfo = si;
                r = source_set_pending(s, true);
static int source_dispatch(sd_event_source *s) {

        assert(s->pending || s->type == SOURCE_EXIT);

        if (s->type != SOURCE_DEFER && s->type != SOURCE_EXIT) {
                r = source_set_pending(s, false);

        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);

        sd_event_source_ref(s);

        switch (s->type) {
        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;
        case SOURCE_MONOTONIC:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;
        case SOURCE_REALTIME:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;
        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;
        case SOURCE_CHILD: {
                bool zombie;

                zombie = s->child.siginfo.si_code == CLD_EXITED ||
                         s->child.siginfo.si_code == CLD_KILLED ||
                         s->child.siginfo.si_code == CLD_DUMPED;

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie)
                        waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
                break;
        }
        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;
        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;
        }

        if (r < 0) {
                log_debug("Event source %p returned error, disabling: %s", s, strerror(-r));
                sd_event_source_set_enabled(s, SD_EVENT_OFF);
        }

        sd_event_source_unref(s);
static int event_prepare(sd_event *e) {

        for (;;) {
                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
                        break;

                s->prepare_iteration = e->iteration;
                r = prioq_reshuffle(e->prepare, s, &s->prepare_index);

                r = s->prepare(s, s->userdata);
static int dispatch_exit(sd_event *e) {

        p = prioq_peek(e->exit);
        if (!p || p->enabled == SD_EVENT_OFF) {
                e->state = SD_EVENT_FINISHED;
                return 0;
        }

        e->state = SD_EVENT_EXITING;

        r = source_dispatch(p);

        e->state = SD_EVENT_PASSIVE;
static sd_event_source* event_next_pending(sd_event *e) {

        p = prioq_peek(e->pending);
        if (!p)
                return NULL;

        if (p->enabled == SD_EVENT_OFF)
                return NULL;
static int arm_watchdog(sd_event *e) {
        struct itimerspec its = {};

        assert(e->watchdog_fd >= 0);

        t = sleep_between(e,
                          e->watchdog_last + (e->watchdog_period / 2),
                          e->watchdog_last + (e->watchdog_period * 3 / 4));

        timespec_store(&its.it_value, t);

        r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
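/* Editor's note, worked example (values hypothetical): with
 * watchdog_period = 20s and watchdog_last = T, the next keep-alive ping is
 * scheduled somewhere in [T+10s, T+15s], comfortably before the
 * supervisor's timeout, and sleep_between() aligns it with other wakeups
 * on the system. */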
static int process_watchdog(sd_event *e) {

        /* Don't notify the watchdog too often */
        if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
                return 0;

        sd_notify(false, "WATCHDOG=1");
        e->watchdog_last = e->timestamp.monotonic;

        return arm_watchdog(e);
_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
        struct epoll_event ev_queue[EPOLL_QUEUE_MAX];

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY);

        if (e->exit_requested)
                return dispatch_exit(e);

        e->state = SD_EVENT_RUNNING;

        r = event_prepare(e);

        r = event_arm_timer(e, e->monotonic_fd, e->monotonic_earliest, e->monotonic_latest, &e->monotonic_next);

        r = event_arm_timer(e, e->realtime_fd, e->realtime_earliest, e->realtime_latest, &e->realtime_next);

        if (event_next_pending(e) || e->need_process_child)
                timeout = 0;

        m = epoll_wait(e->epoll_fd, ev_queue, EPOLL_QUEUE_MAX,
                       timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
        if (m < 0) {
                r = errno == EAGAIN || errno == EINTR ? 0 : -errno;
                goto finish;
        }

        dual_timestamp_get(&e->timestamp);

        for (i = 0; i < m; i++) {

                if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_MONOTONIC))
                        r = flush_timer(e, e->monotonic_fd, ev_queue[i].events, &e->monotonic_next);
                else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_REALTIME))
                        r = flush_timer(e, e->realtime_fd, ev_queue[i].events, &e->realtime_next);
                else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_SIGNAL))
                        r = process_signal(e, ev_queue[i].events);
                else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
                else
                        r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);

        r = process_watchdog(e);

        r = process_timer(e, e->timestamp.monotonic, e->monotonic_earliest, e->monotonic_latest);

        r = process_timer(e, e->timestamp.realtime, e->realtime_earliest, e->realtime_latest);

        if (e->need_process_child) {
                r = process_child(e);

        p = event_next_pending(e);

        r = source_dispatch(p);

finish:
        e->state = SD_EVENT_PASSIVE;
_public_ int sd_event_loop(sd_event *e) {

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY);

        while (e->state != SD_EVENT_FINISHED) {
                r = sd_event_run(e, (uint64_t) -1);
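/* Editor's sketch (not part of the original file): the canonical way to
 * drive everything above. Error handling is elided for brevity; source
 * registration would use the hypothetical helpers sketched earlier. */
#if 0
int run_loop(void) {
        sd_event *e = NULL;
        int r;

        /* Returns the per-thread default loop, creating it on first use. */
        r = sd_event_default(&e);
        if (r < 0)
                return r;

        /* ... register IO/timer/signal/child sources here ... */

        /* Iterates sd_event_run() until sd_event_exit() moves the loop
         * into SD_EVENT_FINISHED. */
        r = sd_event_loop(e);

        sd_event_unref(e);
        return r;
}
#endif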
_public_ int sd_event_get_state(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

_public_ int sd_event_get_exit_code(sd_event *e, int *code) {
        assert_return(e, -EINVAL);
        assert_return(code, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!e->exit_requested)
                return -ENODATA;

        *code = e->exit_code;

_public_ int sd_event_exit(sd_event *e, int code) {
        assert_return(e, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        e->exit_requested = true;
        e->exit_code = code;
_public_ int sd_event_get_now_realtime(sd_event *e, uint64_t *usec) {
        assert_return(e, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(dual_timestamp_is_set(&e->timestamp), -ENODATA);
        assert_return(!event_pid_changed(e), -ECHILD);

        *usec = e->timestamp.realtime;

_public_ int sd_event_get_now_monotonic(sd_event *e, uint64_t *usec) {
        assert_return(e, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(dual_timestamp_is_set(&e->timestamp), -ENODATA);
        assert_return(!event_pid_changed(e), -ECHILD);

        *usec = e->timestamp.monotonic;
_public_ int sd_event_default(sd_event **ret) {

        static __thread sd_event *default_event = NULL;

        if (!ret)
                return !!default_event;

        if (default_event) {
                *ret = sd_event_ref(default_event);
                return 0;
        }

        r = sd_event_new(&e);

        e->default_event_ptr = &default_event;
_public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
        assert_return(e, -EINVAL);
        assert_return(tid, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);
_public_ int sd_event_set_watchdog(sd_event *e, int b) {

        assert_return(e, -EINVAL);

        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                struct epoll_event ev = {};

                env = getenv("WATCHDOG_USEC");

                r = safe_atou64(env, &e->watchdog_period);

                if (e->watchdog_period <= 0)
                        return -EIO;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);

                ev.events = EPOLLIN;
                ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG);

                r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
        } else {
                if (e->watchdog_fd >= 0) {
                        epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        close_nointr_nofail(e->watchdog_fd);
                        e->watchdog_fd = -1;
                }

fail:
        close_nointr_nofail(e->watchdog_fd);
        e->watchdog_fd = -1;
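/* Editor's sketch (not part of the original file): under a service manager
 * that sets WATCHDOG_USEC in the environment (e.g. WatchdogSec= in a
 * systemd unit), a single call makes the loop ping the supervisor
 * automatically from its timerfd. */
#if 0
static int enable_watchdog(sd_event *e) {
        /* Reads WATCHDOG_USEC; returns 0 if it is unset, negative on error. */
        return sd_event_set_watchdog(e, true);
}
#endif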