X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~ianmdlvl/git?a=blobdiff_plain;f=src%2Flibsystemd%2Fsd-event.c;fp=src%2Flibsystemd%2Fsd-event.c;h=0b7b71d16e5ed5f893dfe8d4ba402e887a1af6f6;hb=6bb648a16ae4a682ad4784412af706d2e6a3e4da;hp=0000000000000000000000000000000000000000;hpb=883b36908788361a8bb945ce884dc518da83b371;p=elogind.git diff --git a/src/libsystemd/sd-event.c b/src/libsystemd/sd-event.c new file mode 100644 index 000000000..0b7b71d16 --- /dev/null +++ b/src/libsystemd/sd-event.c @@ -0,0 +1,2224 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright 2013 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include +#include +#include +#include + +#include "sd-id128.h" +#include "sd-daemon.h" +#include "macro.h" +#include "prioq.h" +#include "hashmap.h" +#include "util.h" +#include "time-util.h" +#include "missing.h" + +#include "sd-event.h" + +#define EPOLL_QUEUE_MAX 512U +#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC) + +typedef enum EventSourceType { + SOURCE_IO, + SOURCE_MONOTONIC, + SOURCE_REALTIME, + SOURCE_SIGNAL, + SOURCE_CHILD, + SOURCE_DEFER, + SOURCE_EXIT, + SOURCE_WATCHDOG +} EventSourceType; + +struct sd_event_source { + unsigned n_ref; + + sd_event *event; + void *userdata; + sd_event_handler_t prepare; + + EventSourceType type:4; + int enabled:3; + bool pending:1; + bool dispatching:1; + + int priority; + unsigned pending_index; + unsigned prepare_index; + unsigned pending_iteration; + unsigned prepare_iteration; + + union { + struct { + sd_event_io_handler_t callback; + int fd; + uint32_t events; + uint32_t revents; + bool registered:1; + } io; + struct { + sd_event_time_handler_t callback; + usec_t next, accuracy; + unsigned earliest_index; + unsigned latest_index; + } time; + struct { + sd_event_signal_handler_t callback; + struct signalfd_siginfo siginfo; + int sig; + } signal; + struct { + sd_event_child_handler_t callback; + siginfo_t siginfo; + pid_t pid; + int options; + } child; + struct { + sd_event_handler_t callback; + } defer; + struct { + sd_event_handler_t callback; + unsigned prioq_index; + } exit; + }; +}; + +struct sd_event { + unsigned n_ref; + + int epoll_fd; + int signal_fd; + int realtime_fd; + int monotonic_fd; + int watchdog_fd; + + Prioq *pending; + Prioq *prepare; + + /* For both clocks we maintain two priority queues each, one + * ordered for the earliest times the events may be + * dispatched, and one ordered by the latest times they must + * have been dispatched. The range between the top entries in + * the two prioqs is the time window we can freely schedule + * wakeups in */ + Prioq *monotonic_earliest; + Prioq *monotonic_latest; + Prioq *realtime_earliest; + Prioq *realtime_latest; + + usec_t realtime_next, monotonic_next; + usec_t perturb; + + sigset_t sigset; + sd_event_source **signal_sources; + + Hashmap *child_sources; + unsigned n_enabled_child_sources; + + Prioq *exit; + + pid_t original_pid; + + unsigned iteration; + dual_timestamp timestamp; + int state; + + bool exit_requested:1; + bool need_process_child:1; + bool watchdog:1; + + int exit_code; + + pid_t tid; + sd_event **default_event_ptr; + + usec_t watchdog_last, watchdog_period; + + unsigned n_sources; +}; + +static int pending_prioq_compare(const void *a, const void *b) { + const sd_event_source *x = a, *y = b; + + assert(x->pending); + assert(y->pending); + + /* Enabled ones first */ + if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF) + return -1; + if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF) + return 1; + + /* Lower priority values first */ + if (x->priority < y->priority) + return -1; + if (x->priority > y->priority) + return 1; + + /* Older entries first */ + if (x->pending_iteration < y->pending_iteration) + return -1; + if (x->pending_iteration > y->pending_iteration) + return 1; + + /* Stability for the rest */ + if (x < y) + return -1; + if (x > y) + return 1; + + return 0; +} + +static int prepare_prioq_compare(const void *a, const void *b) { + const sd_event_source *x = a, *y = b; + + assert(x->prepare); + assert(y->prepare); + + /* Move most recently prepared ones last, so that we can stop + * preparing as soon as we hit one that has already been + * prepared in the current iteration */ + if (x->prepare_iteration < y->prepare_iteration) + return -1; + if (x->prepare_iteration > y->prepare_iteration) + return 1; + + /* Enabled ones first */ + if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF) + return -1; + if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF) + return 1; + + /* Lower priority values first */ + if (x->priority < y->priority) + return -1; + if (x->priority > y->priority) + return 1; + + /* Stability for the rest */ + if (x < y) + return -1; + if (x > y) + return 1; + + return 0; +} + +static int earliest_time_prioq_compare(const void *a, const void *b) { + const sd_event_source *x = a, *y = b; + + assert(x->type == SOURCE_MONOTONIC || x->type == SOURCE_REALTIME); + assert(y->type == SOURCE_MONOTONIC || y->type == SOURCE_REALTIME); + + /* Enabled ones first */ + if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF) + return -1; + if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF) + return 1; + + /* Move the pending ones to the end */ + if (!x->pending && y->pending) + return -1; + if (x->pending && !y->pending) + return 1; + + /* Order by time */ + if (x->time.next < y->time.next) + return -1; + if (x->time.next > y->time.next) + return 1; + + /* Stability for the rest */ + if (x < y) + return -1; + if (x > y) + return 1; + + return 0; +} + +static int latest_time_prioq_compare(const void *a, const void *b) { + const sd_event_source *x = a, *y = b; + + assert((x->type == SOURCE_MONOTONIC && y->type == SOURCE_MONOTONIC) || + (x->type == SOURCE_REALTIME && y->type == SOURCE_REALTIME)); + + /* Enabled ones first */ + if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF) + return -1; + if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF) + return 1; + + /* Move the pending ones to the end */ + if (!x->pending && y->pending) + return -1; + if (x->pending && !y->pending) + return 1; + + /* Order by time */ + if (x->time.next + x->time.accuracy < y->time.next + y->time.accuracy) + return -1; + if (x->time.next + x->time.accuracy > y->time.next + y->time.accuracy) + return 1; + + /* Stability for the rest */ + if (x < y) + return -1; + if (x > y) + return 1; + + return 0; +} + +static int exit_prioq_compare(const void *a, const void *b) { + const sd_event_source *x = a, *y = b; + + assert(x->type == SOURCE_EXIT); + assert(y->type == SOURCE_EXIT); + + /* Enabled ones first */ + if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF) + return -1; + if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF) + return 1; + + /* Lower priority values first */ + if (x->priority < y->priority) + return -1; + if (x->priority > y->priority) + return 1; + + /* Stability for the rest */ + if (x < y) + return -1; + if (x > y) + return 1; + + return 0; +} + +static void event_free(sd_event *e) { + assert(e); + assert(e->n_sources == 0); + + if (e->default_event_ptr) + *(e->default_event_ptr) = NULL; + + if (e->epoll_fd >= 0) + close_nointr_nofail(e->epoll_fd); + + if (e->signal_fd >= 0) + close_nointr_nofail(e->signal_fd); + + if (e->realtime_fd >= 0) + close_nointr_nofail(e->realtime_fd); + + if (e->monotonic_fd >= 0) + close_nointr_nofail(e->monotonic_fd); + + if (e->watchdog_fd >= 0) + close_nointr_nofail(e->watchdog_fd); + + prioq_free(e->pending); + prioq_free(e->prepare); + prioq_free(e->monotonic_earliest); + prioq_free(e->monotonic_latest); + prioq_free(e->realtime_earliest); + prioq_free(e->realtime_latest); + prioq_free(e->exit); + + free(e->signal_sources); + + hashmap_free(e->child_sources); + free(e); +} + +_public_ int sd_event_new(sd_event** ret) { + sd_event *e; + int r; + + assert_return(ret, -EINVAL); + + e = new0(sd_event, 1); + if (!e) + return -ENOMEM; + + e->n_ref = 1; + e->signal_fd = e->realtime_fd = e->monotonic_fd = e->watchdog_fd = e->epoll_fd = -1; + e->realtime_next = e->monotonic_next = (usec_t) -1; + e->original_pid = getpid(); + + assert_se(sigemptyset(&e->sigset) == 0); + + e->pending = prioq_new(pending_prioq_compare); + if (!e->pending) { + r = -ENOMEM; + goto fail; + } + + e->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (e->epoll_fd < 0) { + r = -errno; + goto fail; + } + + *ret = e; + return 0; + +fail: + event_free(e); + return r; +} + +_public_ sd_event* sd_event_ref(sd_event *e) { + assert_return(e, NULL); + + assert(e->n_ref >= 1); + e->n_ref++; + + return e; +} + +_public_ sd_event* sd_event_unref(sd_event *e) { + + if (!e) + return NULL; + + assert(e->n_ref >= 1); + e->n_ref--; + + if (e->n_ref <= 0) + event_free(e); + + return NULL; +} + +static bool event_pid_changed(sd_event *e) { + assert(e); + + /* We don't support people creating am event loop and keeping + * it around over a fork(). Let's complain. */ + + return e->original_pid != getpid(); +} + +static int source_io_unregister(sd_event_source *s) { + int r; + + assert(s); + assert(s->type == SOURCE_IO); + + if (!s->io.registered) + return 0; + + r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL); + if (r < 0) + return -errno; + + s->io.registered = false; + return 0; +} + +static int source_io_register( + sd_event_source *s, + int enabled, + uint32_t events) { + + struct epoll_event ev = {}; + int r; + + assert(s); + assert(s->type == SOURCE_IO); + assert(enabled != SD_EVENT_OFF); + + ev.events = events; + ev.data.ptr = s; + + if (enabled == SD_EVENT_ONESHOT) + ev.events |= EPOLLONESHOT; + + if (s->io.registered) + r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev); + else + r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev); + + if (r < 0) + return -errno; + + s->io.registered = true; + + return 0; +} + +static void source_free(sd_event_source *s) { + assert(s); + + if (s->event) { + assert(s->event->n_sources > 0); + + switch (s->type) { + + case SOURCE_IO: + if (s->io.fd >= 0) + source_io_unregister(s); + + break; + + case SOURCE_MONOTONIC: + prioq_remove(s->event->monotonic_earliest, s, &s->time.earliest_index); + prioq_remove(s->event->monotonic_latest, s, &s->time.latest_index); + break; + + case SOURCE_REALTIME: + prioq_remove(s->event->realtime_earliest, s, &s->time.earliest_index); + prioq_remove(s->event->realtime_latest, s, &s->time.latest_index); + break; + + case SOURCE_SIGNAL: + if (s->signal.sig > 0) { + if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0) + assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0); + + if (s->event->signal_sources) + s->event->signal_sources[s->signal.sig] = NULL; + } + + break; + + case SOURCE_CHILD: + if (s->child.pid > 0) { + if (s->enabled != SD_EVENT_OFF) { + assert(s->event->n_enabled_child_sources > 0); + s->event->n_enabled_child_sources--; + } + + if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) + assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0); + + hashmap_remove(s->event->child_sources, INT_TO_PTR(s->child.pid)); + } + + break; + + case SOURCE_DEFER: + /* nothing */ + break; + + case SOURCE_EXIT: + prioq_remove(s->event->exit, s, &s->exit.prioq_index); + break; + + case SOURCE_WATCHDOG: + assert_not_reached("Wut? I shouldn't exist."); + } + + if (s->pending) + prioq_remove(s->event->pending, s, &s->pending_index); + + if (s->prepare) + prioq_remove(s->event->prepare, s, &s->prepare_index); + + s->event->n_sources--; + sd_event_unref(s->event); + } + + free(s); +} + +static int source_set_pending(sd_event_source *s, bool b) { + int r; + + assert(s); + assert(s->type != SOURCE_EXIT); + + if (s->pending == b) + return 0; + + s->pending = b; + + if (b) { + s->pending_iteration = s->event->iteration; + + r = prioq_put(s->event->pending, s, &s->pending_index); + if (r < 0) { + s->pending = false; + return r; + } + } else + assert_se(prioq_remove(s->event->pending, s, &s->pending_index)); + + if (s->type == SOURCE_REALTIME) { + prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index); + prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index); + } else if (s->type == SOURCE_MONOTONIC) { + prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index); + prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index); + } + + return 0; +} + +static sd_event_source *source_new(sd_event *e, EventSourceType type) { + sd_event_source *s; + + assert(e); + + s = new0(sd_event_source, 1); + if (!s) + return NULL; + + s->n_ref = 1; + s->event = sd_event_ref(e); + s->type = type; + s->pending_index = s->prepare_index = PRIOQ_IDX_NULL; + + e->n_sources ++; + + return s; +} + +_public_ int sd_event_add_io( + sd_event *e, + int fd, + uint32_t events, + sd_event_io_handler_t callback, + void *userdata, + sd_event_source **ret) { + + sd_event_source *s; + int r; + + assert_return(e, -EINVAL); + assert_return(fd >= 0, -EINVAL); + assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL); + assert_return(callback, -EINVAL); + assert_return(ret, -EINVAL); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(e), -ECHILD); + + s = source_new(e, SOURCE_IO); + if (!s) + return -ENOMEM; + + s->io.fd = fd; + s->io.events = events; + s->io.callback = callback; + s->userdata = userdata; + s->enabled = SD_EVENT_ON; + + r = source_io_register(s, s->enabled, events); + if (r < 0) { + source_free(s); + return -errno; + } + + *ret = s; + return 0; +} + +static int event_setup_timer_fd( + sd_event *e, + EventSourceType type, + int *timer_fd, + clockid_t id) { + + struct epoll_event ev = {}; + int r, fd; + sd_id128_t bootid; + + assert(e); + assert(timer_fd); + + if (_likely_(*timer_fd >= 0)) + return 0; + + fd = timerfd_create(id, TFD_NONBLOCK|TFD_CLOEXEC); + if (fd < 0) + return -errno; + + ev.events = EPOLLIN; + ev.data.ptr = INT_TO_PTR(type); + + r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev); + if (r < 0) { + close_nointr_nofail(fd); + return -errno; + } + + /* When we sleep for longer, we try to realign the wakeup to + the same time wihtin each minute/second/250ms, so that + events all across the system can be coalesced into a single + CPU wakeup. However, let's take some system-specific + randomness for this value, so that in a network of systems + with synced clocks timer events are distributed a + bit. Here, we calculate a perturbation usec offset from the + boot ID. */ + + if (sd_id128_get_boot(&bootid) >= 0) + e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE; + + *timer_fd = fd; + return 0; +} + +static int event_add_time_internal( + sd_event *e, + EventSourceType type, + int *timer_fd, + clockid_t id, + Prioq **earliest, + Prioq **latest, + uint64_t usec, + uint64_t accuracy, + sd_event_time_handler_t callback, + void *userdata, + sd_event_source **ret) { + + sd_event_source *s; + int r; + + assert_return(e, -EINVAL); + assert_return(callback, -EINVAL); + assert_return(ret, -EINVAL); + assert_return(usec != (uint64_t) -1, -EINVAL); + assert_return(accuracy != (uint64_t) -1, -EINVAL); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(e), -ECHILD); + + assert(timer_fd); + assert(earliest); + assert(latest); + + if (!*earliest) { + *earliest = prioq_new(earliest_time_prioq_compare); + if (!*earliest) + return -ENOMEM; + } + + if (!*latest) { + *latest = prioq_new(latest_time_prioq_compare); + if (!*latest) + return -ENOMEM; + } + + if (*timer_fd < 0) { + r = event_setup_timer_fd(e, type, timer_fd, id); + if (r < 0) + return r; + } + + s = source_new(e, type); + if (!s) + return -ENOMEM; + + s->time.next = usec; + s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy; + s->time.callback = callback; + s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL; + s->userdata = userdata; + s->enabled = SD_EVENT_ONESHOT; + + r = prioq_put(*earliest, s, &s->time.earliest_index); + if (r < 0) + goto fail; + + r = prioq_put(*latest, s, &s->time.latest_index); + if (r < 0) + goto fail; + + *ret = s; + return 0; + +fail: + source_free(s); + return r; +} + +_public_ int sd_event_add_monotonic(sd_event *e, + uint64_t usec, + uint64_t accuracy, + sd_event_time_handler_t callback, + void *userdata, + sd_event_source **ret) { + + return event_add_time_internal(e, SOURCE_MONOTONIC, &e->monotonic_fd, CLOCK_MONOTONIC, &e->monotonic_earliest, &e->monotonic_latest, usec, accuracy, callback, userdata, ret); +} + +_public_ int sd_event_add_realtime(sd_event *e, + uint64_t usec, + uint64_t accuracy, + sd_event_time_handler_t callback, + void *userdata, + sd_event_source **ret) { + + return event_add_time_internal(e, SOURCE_REALTIME, &e->realtime_fd, CLOCK_REALTIME, &e->realtime_earliest, &e->monotonic_latest, usec, accuracy, callback, userdata, ret); +} + +static int event_update_signal_fd(sd_event *e) { + struct epoll_event ev = {}; + bool add_to_epoll; + int r; + + assert(e); + + add_to_epoll = e->signal_fd < 0; + + r = signalfd(e->signal_fd, &e->sigset, SFD_NONBLOCK|SFD_CLOEXEC); + if (r < 0) + return -errno; + + e->signal_fd = r; + + if (!add_to_epoll) + return 0; + + ev.events = EPOLLIN; + ev.data.ptr = INT_TO_PTR(SOURCE_SIGNAL); + + r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->signal_fd, &ev); + if (r < 0) { + close_nointr_nofail(e->signal_fd); + e->signal_fd = -1; + + return -errno; + } + + return 0; +} + +_public_ int sd_event_add_signal( + sd_event *e, + int sig, + sd_event_signal_handler_t callback, + void *userdata, + sd_event_source **ret) { + + sd_event_source *s; + sigset_t ss; + int r; + + assert_return(e, -EINVAL); + assert_return(sig > 0, -EINVAL); + assert_return(sig < _NSIG, -EINVAL); + assert_return(callback, -EINVAL); + assert_return(ret, -EINVAL); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(e), -ECHILD); + + r = pthread_sigmask(SIG_SETMASK, NULL, &ss); + if (r < 0) + return -errno; + + if (!sigismember(&ss, sig)) + return -EBUSY; + + if (!e->signal_sources) { + e->signal_sources = new0(sd_event_source*, _NSIG); + if (!e->signal_sources) + return -ENOMEM; + } else if (e->signal_sources[sig]) + return -EBUSY; + + s = source_new(e, SOURCE_SIGNAL); + if (!s) + return -ENOMEM; + + s->signal.sig = sig; + s->signal.callback = callback; + s->userdata = userdata; + s->enabled = SD_EVENT_ON; + + e->signal_sources[sig] = s; + assert_se(sigaddset(&e->sigset, sig) == 0); + + if (sig != SIGCHLD || e->n_enabled_child_sources == 0) { + r = event_update_signal_fd(e); + if (r < 0) { + source_free(s); + return r; + } + } + + *ret = s; + return 0; +} + +_public_ int sd_event_add_child( + sd_event *e, + pid_t pid, + int options, + sd_event_child_handler_t callback, + void *userdata, + sd_event_source **ret) { + + sd_event_source *s; + int r; + + assert_return(e, -EINVAL); + assert_return(pid > 1, -EINVAL); + assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL); + assert_return(options != 0, -EINVAL); + assert_return(callback, -EINVAL); + assert_return(ret, -EINVAL); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(e), -ECHILD); + + r = hashmap_ensure_allocated(&e->child_sources, trivial_hash_func, trivial_compare_func); + if (r < 0) + return r; + + if (hashmap_contains(e->child_sources, INT_TO_PTR(pid))) + return -EBUSY; + + s = source_new(e, SOURCE_CHILD); + if (!s) + return -ENOMEM; + + s->child.pid = pid; + s->child.options = options; + s->child.callback = callback; + s->userdata = userdata; + s->enabled = SD_EVENT_ONESHOT; + + r = hashmap_put(e->child_sources, INT_TO_PTR(pid), s); + if (r < 0) { + source_free(s); + return r; + } + + e->n_enabled_child_sources ++; + + assert_se(sigaddset(&e->sigset, SIGCHLD) == 0); + + if (!e->signal_sources || !e->signal_sources[SIGCHLD]) { + r = event_update_signal_fd(e); + if (r < 0) { + source_free(s); + return -errno; + } + } + + e->need_process_child = true; + + *ret = s; + return 0; +} + +_public_ int sd_event_add_defer( + sd_event *e, + sd_event_handler_t callback, + void *userdata, + sd_event_source **ret) { + + sd_event_source *s; + int r; + + assert_return(e, -EINVAL); + assert_return(callback, -EINVAL); + assert_return(ret, -EINVAL); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(e), -ECHILD); + + s = source_new(e, SOURCE_DEFER); + if (!s) + return -ENOMEM; + + s->defer.callback = callback; + s->userdata = userdata; + s->enabled = SD_EVENT_ONESHOT; + + r = source_set_pending(s, true); + if (r < 0) { + source_free(s); + return r; + } + + *ret = s; + return 0; +} + +_public_ int sd_event_add_exit( + sd_event *e, + sd_event_handler_t callback, + void *userdata, + sd_event_source **ret) { + + sd_event_source *s; + int r; + + assert_return(e, -EINVAL); + assert_return(callback, -EINVAL); + assert_return(ret, -EINVAL); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(e), -ECHILD); + + if (!e->exit) { + e->exit = prioq_new(exit_prioq_compare); + if (!e->exit) + return -ENOMEM; + } + + s = source_new(e, SOURCE_EXIT); + if (!s) + return -ENOMEM; + + s->exit.callback = callback; + s->userdata = userdata; + s->exit.prioq_index = PRIOQ_IDX_NULL; + s->enabled = SD_EVENT_ONESHOT; + + r = prioq_put(s->event->exit, s, &s->exit.prioq_index); + if (r < 0) { + source_free(s); + return r; + } + + *ret = s; + return 0; +} + +_public_ sd_event_source* sd_event_source_ref(sd_event_source *s) { + assert_return(s, NULL); + + assert(s->n_ref >= 1); + s->n_ref++; + + return s; +} + +_public_ sd_event_source* sd_event_source_unref(sd_event_source *s) { + + if (!s) + return NULL; + + assert(s->n_ref >= 1); + s->n_ref--; + + if (s->n_ref <= 0) { + /* Here's a special hack: when we are called from a + * dispatch handler we won't free the event source + * immediately, but we will detach the fd from the + * epoll. This way it is safe for the caller to unref + * the event source and immediately close the fd, but + * we still retain a valid event source object after + * the callback. */ + + if (s->dispatching) { + if (s->type == SOURCE_IO) + source_io_unregister(s); + } else + source_free(s); + } + + return NULL; +} + +_public_ sd_event *sd_event_source_get_event(sd_event_source *s) { + assert_return(s, NULL); + + return s->event; +} + +_public_ int sd_event_source_get_pending(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type != SOURCE_EXIT, -EDOM); + assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(s->event), -ECHILD); + + return s->pending; +} + +_public_ int sd_event_source_get_io_fd(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_IO, -EDOM); + assert_return(!event_pid_changed(s->event), -ECHILD); + + return s->io.fd; +} + +_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) { + int r; + + assert_return(s, -EINVAL); + assert_return(fd >= 0, -EINVAL); + assert_return(s->type == SOURCE_IO, -EDOM); + assert_return(!event_pid_changed(s->event), -ECHILD); + + if (s->io.fd == fd) + return 0; + + if (s->enabled == SD_EVENT_OFF) { + s->io.fd = fd; + s->io.registered = false; + } else { + int saved_fd; + + saved_fd = s->io.fd; + assert(s->io.registered); + + s->io.fd = fd; + s->io.registered = false; + + r = source_io_register(s, s->enabled, s->io.events); + if (r < 0) { + s->io.fd = saved_fd; + s->io.registered = true; + return r; + } + + epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL); + } + + return 0; +} + +_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) { + assert_return(s, -EINVAL); + assert_return(events, -EINVAL); + assert_return(s->type == SOURCE_IO, -EDOM); + assert_return(!event_pid_changed(s->event), -ECHILD); + + *events = s->io.events; + return 0; +} + +_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) { + int r; + + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_IO, -EDOM); + assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL); + assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(s->event), -ECHILD); + + if (s->io.events == events) + return 0; + + if (s->enabled != SD_EVENT_OFF) { + r = source_io_register(s, s->enabled, events); + if (r < 0) + return r; + } + + s->io.events = events; + source_set_pending(s, false); + + return 0; +} + +_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) { + assert_return(s, -EINVAL); + assert_return(revents, -EINVAL); + assert_return(s->type == SOURCE_IO, -EDOM); + assert_return(s->pending, -ENODATA); + assert_return(!event_pid_changed(s->event), -ECHILD); + + *revents = s->io.revents; + return 0; +} + +_public_ int sd_event_source_get_signal(sd_event_source *s) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_SIGNAL, -EDOM); + assert_return(!event_pid_changed(s->event), -ECHILD); + + return s->signal.sig; +} + +_public_ int sd_event_source_get_priority(sd_event_source *s, int *priority) { + assert_return(s, -EINVAL); + assert_return(!event_pid_changed(s->event), -ECHILD); + + return s->priority; +} + +_public_ int sd_event_source_set_priority(sd_event_source *s, int priority) { + assert_return(s, -EINVAL); + assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(s->event), -ECHILD); + + if (s->priority == priority) + return 0; + + s->priority = priority; + + if (s->pending) + prioq_reshuffle(s->event->pending, s, &s->pending_index); + + if (s->prepare) + prioq_reshuffle(s->event->prepare, s, &s->prepare_index); + + if (s->type == SOURCE_EXIT) + prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index); + + return 0; +} + +_public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) { + assert_return(s, -EINVAL); + assert_return(m, -EINVAL); + assert_return(!event_pid_changed(s->event), -ECHILD); + + *m = s->enabled; + return 0; +} + +_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) { + int r; + + assert_return(s, -EINVAL); + assert_return(m == SD_EVENT_OFF || m == SD_EVENT_ON || m == SD_EVENT_ONESHOT, -EINVAL); + assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(s->event), -ECHILD); + + if (s->enabled == m) + return 0; + + if (m == SD_EVENT_OFF) { + + switch (s->type) { + + case SOURCE_IO: + r = source_io_unregister(s); + if (r < 0) + return r; + + s->enabled = m; + break; + + case SOURCE_MONOTONIC: + s->enabled = m; + prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index); + prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index); + break; + + case SOURCE_REALTIME: + s->enabled = m; + prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index); + prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index); + break; + + case SOURCE_SIGNAL: + s->enabled = m; + if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0) { + assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0); + event_update_signal_fd(s->event); + } + + break; + + case SOURCE_CHILD: + s->enabled = m; + + assert(s->event->n_enabled_child_sources > 0); + s->event->n_enabled_child_sources--; + + if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) { + assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0); + event_update_signal_fd(s->event); + } + + break; + + case SOURCE_EXIT: + s->enabled = m; + prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index); + break; + + case SOURCE_DEFER: + s->enabled = m; + break; + + case SOURCE_WATCHDOG: + assert_not_reached("Wut? I shouldn't exist."); + } + + } else { + switch (s->type) { + + case SOURCE_IO: + r = source_io_register(s, m, s->io.events); + if (r < 0) + return r; + + s->enabled = m; + break; + + case SOURCE_MONOTONIC: + s->enabled = m; + prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index); + prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index); + break; + + case SOURCE_REALTIME: + s->enabled = m; + prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index); + prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index); + break; + + case SOURCE_SIGNAL: + s->enabled = m; + + if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0) { + assert_se(sigaddset(&s->event->sigset, s->signal.sig) == 0); + event_update_signal_fd(s->event); + } + break; + + case SOURCE_CHILD: + s->enabled = m; + + if (s->enabled == SD_EVENT_OFF) { + s->event->n_enabled_child_sources++; + + if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) { + assert_se(sigaddset(&s->event->sigset, SIGCHLD) == 0); + event_update_signal_fd(s->event); + } + } + break; + + case SOURCE_EXIT: + s->enabled = m; + prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index); + break; + + case SOURCE_DEFER: + s->enabled = m; + break; + + case SOURCE_WATCHDOG: + assert_not_reached("Wut? I shouldn't exist."); + } + } + + if (s->pending) + prioq_reshuffle(s->event->pending, s, &s->pending_index); + + if (s->prepare) + prioq_reshuffle(s->event->prepare, s, &s->prepare_index); + + return 0; +} + +_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) { + assert_return(s, -EINVAL); + assert_return(usec, -EINVAL); + assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM); + assert_return(!event_pid_changed(s->event), -ECHILD); + + *usec = s->time.next; + return 0; +} + +_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) { + assert_return(s, -EINVAL); + assert_return(usec != (uint64_t) -1, -EINVAL); + assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM); + assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(s->event), -ECHILD); + + s->time.next = usec; + + source_set_pending(s, false); + + if (s->type == SOURCE_REALTIME) { + prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index); + prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index); + } else { + prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index); + prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index); + } + + return 0; +} + +_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) { + assert_return(s, -EINVAL); + assert_return(usec, -EINVAL); + assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM); + assert_return(!event_pid_changed(s->event), -ECHILD); + + *usec = s->time.accuracy; + return 0; +} + +_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) { + assert_return(s, -EINVAL); + assert_return(usec != (uint64_t) -1, -EINVAL); + assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM); + assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(s->event), -ECHILD); + + if (usec == 0) + usec = DEFAULT_ACCURACY_USEC; + + s->time.accuracy = usec; + + source_set_pending(s, false); + + if (s->type == SOURCE_REALTIME) + prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index); + else + prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index); + + return 0; +} + +_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) { + assert_return(s, -EINVAL); + assert_return(pid, -EINVAL); + assert_return(s->type == SOURCE_CHILD, -EDOM); + assert_return(!event_pid_changed(s->event), -ECHILD); + + *pid = s->child.pid; + return 0; +} + +_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) { + int r; + + assert_return(s, -EINVAL); + assert_return(s->type != SOURCE_EXIT, -EDOM); + assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(s->event), -ECHILD); + + if (s->prepare == callback) + return 0; + + if (callback && s->prepare) { + s->prepare = callback; + return 0; + } + + r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare); + if (r < 0) + return r; + + s->prepare = callback; + + if (callback) { + r = prioq_put(s->event->prepare, s, &s->prepare_index); + if (r < 0) + return r; + } else + prioq_remove(s->event->prepare, s, &s->prepare_index); + + return 0; +} + +_public_ void* sd_event_source_get_userdata(sd_event_source *s) { + assert_return(s, NULL); + + return s->userdata; +} + +_public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) { + void *ret; + + assert_return(s, NULL); + + ret = s->userdata; + s->userdata = userdata; + + return ret; +} + +static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) { + usec_t c; + assert(e); + assert(a <= b); + + if (a <= 0) + return 0; + + if (b <= a + 1) + return a; + + /* + Find a good time to wake up again between times a and b. We + have two goals here: + + a) We want to wake up as seldom as possible, hence prefer + later times over earlier times. + + b) But if we have to wake up, then let's make sure to + dispatch as much as possible on the entire system. + + We implement this by waking up everywhere at the same time + within any given minute if we can, synchronised via the + perturbation value determined from the boot ID. If we can't, + then we try to find the same spot in every 10s, then 1s and + then 250ms step. Otherwise, we pick the last possible time + to wake up. + */ + + c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb; + if (c >= b) { + if (_unlikely_(c < USEC_PER_MINUTE)) + return b; + + c -= USEC_PER_MINUTE; + } + + if (c >= a) + return c; + + c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10)); + if (c >= b) { + if (_unlikely_(c < USEC_PER_SEC*10)) + return b; + + c -= USEC_PER_SEC*10; + } + + if (c >= a) + return c; + + c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC); + if (c >= b) { + if (_unlikely_(c < USEC_PER_SEC)) + return b; + + c -= USEC_PER_SEC; + } + + if (c >= a) + return c; + + c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250)); + if (c >= b) { + if (_unlikely_(c < USEC_PER_MSEC*250)) + return b; + + c -= USEC_PER_MSEC*250; + } + + if (c >= a) + return c; + + return b; +} + +static int event_arm_timer( + sd_event *e, + int timer_fd, + Prioq *earliest, + Prioq *latest, + usec_t *next) { + + struct itimerspec its = {}; + sd_event_source *a, *b; + usec_t t; + int r; + + assert(e); + assert(next); + + a = prioq_peek(earliest); + if (!a || a->enabled == SD_EVENT_OFF) { + + if (timer_fd < 0) + return 0; + + if (*next == (usec_t) -1) + return 0; + + /* disarm */ + r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL); + if (r < 0) + return r; + + *next = (usec_t) -1; + + return 0; + } + + b = prioq_peek(latest); + assert_se(b && b->enabled != SD_EVENT_OFF); + + t = sleep_between(e, a->time.next, b->time.next + b->time.accuracy); + if (*next == t) + return 0; + + assert_se(timer_fd >= 0); + + if (t == 0) { + /* We don' want to disarm here, just mean some time looooong ago. */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 1; + } else + timespec_store(&its.it_value, t); + + r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL); + if (r < 0) + return -errno; + + *next = t; + return 0; +} + +static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) { + assert(e); + assert(s); + assert(s->type == SOURCE_IO); + + /* If the event source was already pending, we just OR in the + * new revents, otherwise we reset the value. The ORing is + * necessary to handle EPOLLONESHOT events properly where + * readability might happen independently of writability, and + * we need to keep track of both */ + + if (s->pending) + s->io.revents |= revents; + else + s->io.revents = revents; + + return source_set_pending(s, true); +} + +static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) { + uint64_t x; + ssize_t ss; + + assert(e); + assert(fd >= 0); + + assert_return(events == EPOLLIN, -EIO); + + ss = read(fd, &x, sizeof(x)); + if (ss < 0) { + if (errno == EAGAIN || errno == EINTR) + return 0; + + return -errno; + } + + if (_unlikely_(ss != sizeof(x))) + return -EIO; + + if (next) + *next = (usec_t) -1; + + return 0; +} + +static int process_timer( + sd_event *e, + usec_t n, + Prioq *earliest, + Prioq *latest) { + + sd_event_source *s; + int r; + + assert(e); + + for (;;) { + s = prioq_peek(earliest); + if (!s || + s->time.next > n || + s->enabled == SD_EVENT_OFF || + s->pending) + break; + + r = source_set_pending(s, true); + if (r < 0) + return r; + + prioq_reshuffle(earliest, s, &s->time.earliest_index); + prioq_reshuffle(latest, s, &s->time.latest_index); + } + + return 0; +} + +static int process_child(sd_event *e) { + sd_event_source *s; + Iterator i; + int r; + + assert(e); + + e->need_process_child = false; + + /* + So, this is ugly. We iteratively invoke waitid() with P_PID + + WNOHANG for each PID we wait for, instead of using + P_ALL. This is because we only want to get child + information of very specific child processes, and not all + of them. We might not have processed the SIGCHLD even of a + previous invocation and we don't want to maintain a + unbounded *per-child* event queue, hence we really don't + want anything flushed out of the kernel's queue that we + don't care about. Since this is O(n) this means that if you + have a lot of processes you probably want to handle SIGCHLD + yourself. + + We do not reap the children here (by using WNOWAIT), this + is only done after the event source is dispatched so that + the callback still sees the process as a zombie. + */ + + HASHMAP_FOREACH(s, e->child_sources, i) { + assert(s->type == SOURCE_CHILD); + + if (s->pending) + continue; + + if (s->enabled == SD_EVENT_OFF) + continue; + + zero(s->child.siginfo); + r = waitid(P_PID, s->child.pid, &s->child.siginfo, + WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options); + if (r < 0) + return -errno; + + if (s->child.siginfo.si_pid != 0) { + bool zombie = + s->child.siginfo.si_code == CLD_EXITED || + s->child.siginfo.si_code == CLD_KILLED || + s->child.siginfo.si_code == CLD_DUMPED; + + if (!zombie && (s->child.options & WEXITED)) { + /* If the child isn't dead then let's + * immediately remove the state change + * from the queue, since there's no + * benefit in leaving it queued */ + + assert(s->child.options & (WSTOPPED|WCONTINUED)); + waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED))); + } + + r = source_set_pending(s, true); + if (r < 0) + return r; + } + } + + return 0; +} + +static int process_signal(sd_event *e, uint32_t events) { + bool read_one = false; + int r; + + assert(e); + assert(e->signal_sources); + + assert_return(events == EPOLLIN, -EIO); + + for (;;) { + struct signalfd_siginfo si; + ssize_t ss; + sd_event_source *s; + + ss = read(e->signal_fd, &si, sizeof(si)); + if (ss < 0) { + if (errno == EAGAIN || errno == EINTR) + return read_one; + + return -errno; + } + + if (_unlikely_(ss != sizeof(si))) + return -EIO; + + read_one = true; + + s = e->signal_sources[si.ssi_signo]; + if (si.ssi_signo == SIGCHLD) { + r = process_child(e); + if (r < 0) + return r; + if (r > 0 || !s) + continue; + } else + if (!s) + return -EIO; + + s->signal.siginfo = si; + r = source_set_pending(s, true); + if (r < 0) + return r; + } + + return 0; +} + +static int source_dispatch(sd_event_source *s) { + int r = 0; + + assert(s); + assert(s->pending || s->type == SOURCE_EXIT); + + if (s->type != SOURCE_DEFER && s->type != SOURCE_EXIT) { + r = source_set_pending(s, false); + if (r < 0) + return r; + } + + if (s->enabled == SD_EVENT_ONESHOT) { + r = sd_event_source_set_enabled(s, SD_EVENT_OFF); + if (r < 0) + return r; + } + + s->dispatching = true; + + switch (s->type) { + + case SOURCE_IO: + r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata); + break; + + case SOURCE_MONOTONIC: + r = s->time.callback(s, s->time.next, s->userdata); + break; + + case SOURCE_REALTIME: + r = s->time.callback(s, s->time.next, s->userdata); + break; + + case SOURCE_SIGNAL: + r = s->signal.callback(s, &s->signal.siginfo, s->userdata); + break; + + case SOURCE_CHILD: { + bool zombie; + + zombie = s->child.siginfo.si_code == CLD_EXITED || + s->child.siginfo.si_code == CLD_KILLED || + s->child.siginfo.si_code == CLD_DUMPED; + + r = s->child.callback(s, &s->child.siginfo, s->userdata); + + /* Now, reap the PID for good. */ + if (zombie) + waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED); + + break; + } + + case SOURCE_DEFER: + r = s->defer.callback(s, s->userdata); + break; + + case SOURCE_EXIT: + r = s->exit.callback(s, s->userdata); + break; + + case SOURCE_WATCHDOG: + assert_not_reached("Wut? I shouldn't exist."); + } + + s->dispatching = false; + + if (r < 0) + log_debug("Event source %p returned error, disabling: %s", s, strerror(-r)); + + if (s->n_ref == 0) + source_free(s); + else if (r < 0) + sd_event_source_set_enabled(s, SD_EVENT_OFF); + + return 1; +} + +static int event_prepare(sd_event *e) { + int r; + + assert(e); + + for (;;) { + sd_event_source *s; + + s = prioq_peek(e->prepare); + if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF) + break; + + s->prepare_iteration = e->iteration; + r = prioq_reshuffle(e->prepare, s, &s->prepare_index); + if (r < 0) + return r; + + assert(s->prepare); + + s->dispatching = true; + r = s->prepare(s, s->userdata); + s->dispatching = false; + + if (r < 0) + log_debug("Prepare callback of event source %p returned error, disabling: %s", s, strerror(-r)); + + if (s->n_ref == 0) + source_free(s); + else if (r < 0) + sd_event_source_set_enabled(s, SD_EVENT_OFF); + } + + return 0; +} + +static int dispatch_exit(sd_event *e) { + sd_event_source *p; + int r; + + assert(e); + + p = prioq_peek(e->exit); + if (!p || p->enabled == SD_EVENT_OFF) { + e->state = SD_EVENT_FINISHED; + return 0; + } + + sd_event_ref(e); + e->iteration++; + e->state = SD_EVENT_EXITING; + + r = source_dispatch(p); + + e->state = SD_EVENT_PASSIVE; + sd_event_unref(e); + + return r; +} + +static sd_event_source* event_next_pending(sd_event *e) { + sd_event_source *p; + + assert(e); + + p = prioq_peek(e->pending); + if (!p) + return NULL; + + if (p->enabled == SD_EVENT_OFF) + return NULL; + + return p; +} + +static int arm_watchdog(sd_event *e) { + struct itimerspec its = {}; + usec_t t; + int r; + + assert(e); + assert(e->watchdog_fd >= 0); + + t = sleep_between(e, + e->watchdog_last + (e->watchdog_period / 2), + e->watchdog_last + (e->watchdog_period * 3 / 4)); + + timespec_store(&its.it_value, t); + + r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL); + if (r < 0) + return -errno; + + return 0; +} + +static int process_watchdog(sd_event *e) { + assert(e); + + if (!e->watchdog) + return 0; + + /* Don't notify watchdog too often */ + if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic) + return 0; + + sd_notify(false, "WATCHDOG=1"); + e->watchdog_last = e->timestamp.monotonic; + + return arm_watchdog(e); +} + +_public_ int sd_event_run(sd_event *e, uint64_t timeout) { + struct epoll_event *ev_queue; + unsigned ev_queue_max; + sd_event_source *p; + int r, i, m; + + assert_return(e, -EINVAL); + assert_return(!event_pid_changed(e), -ECHILD); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY); + + if (e->exit_requested) + return dispatch_exit(e); + + sd_event_ref(e); + e->iteration++; + e->state = SD_EVENT_RUNNING; + + r = event_prepare(e); + if (r < 0) + goto finish; + + r = event_arm_timer(e, e->monotonic_fd, e->monotonic_earliest, e->monotonic_latest, &e->monotonic_next); + if (r < 0) + goto finish; + + r = event_arm_timer(e, e->realtime_fd, e->realtime_earliest, e->realtime_latest, &e->realtime_next); + if (r < 0) + goto finish; + + if (event_next_pending(e) || e->need_process_child) + timeout = 0; + ev_queue_max = CLAMP(e->n_sources, 1U, EPOLL_QUEUE_MAX); + ev_queue = newa(struct epoll_event, ev_queue_max); + + m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max, + timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC)); + if (m < 0) { + r = errno == EAGAIN || errno == EINTR ? 1 : -errno; + goto finish; + } + + dual_timestamp_get(&e->timestamp); + + for (i = 0; i < m; i++) { + + if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_MONOTONIC)) + r = flush_timer(e, e->monotonic_fd, ev_queue[i].events, &e->monotonic_next); + else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_REALTIME)) + r = flush_timer(e, e->realtime_fd, ev_queue[i].events, &e->realtime_next); + else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_SIGNAL)) + r = process_signal(e, ev_queue[i].events); + else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG)) + r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL); + else + r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events); + + if (r < 0) + goto finish; + } + + r = process_watchdog(e); + if (r < 0) + goto finish; + + r = process_timer(e, e->timestamp.monotonic, e->monotonic_earliest, e->monotonic_latest); + if (r < 0) + goto finish; + + r = process_timer(e, e->timestamp.realtime, e->realtime_earliest, e->realtime_latest); + if (r < 0) + goto finish; + + if (e->need_process_child) { + r = process_child(e); + if (r < 0) + goto finish; + } + + p = event_next_pending(e); + if (!p) { + r = 1; + goto finish; + } + + r = source_dispatch(p); + +finish: + e->state = SD_EVENT_PASSIVE; + sd_event_unref(e); + + return r; +} + +_public_ int sd_event_loop(sd_event *e) { + int r; + + assert_return(e, -EINVAL); + assert_return(!event_pid_changed(e), -ECHILD); + assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY); + + sd_event_ref(e); + + while (e->state != SD_EVENT_FINISHED) { + r = sd_event_run(e, (uint64_t) -1); + if (r < 0) + goto finish; + } + + r = e->exit_code; + +finish: + sd_event_unref(e); + return r; +} + +_public_ int sd_event_get_state(sd_event *e) { + assert_return(e, -EINVAL); + assert_return(!event_pid_changed(e), -ECHILD); + + return e->state; +} + +_public_ int sd_event_get_exit_code(sd_event *e, int *code) { + assert_return(e, -EINVAL); + assert_return(code, -EINVAL); + assert_return(!event_pid_changed(e), -ECHILD); + + if (!e->exit_requested) + return -ENODATA; + + *code = e->exit_code; + return 0; +} + +_public_ int sd_event_exit(sd_event *e, int code) { + assert_return(e, -EINVAL); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(e), -ECHILD); + + e->exit_requested = true; + e->exit_code = code; + + return 0; +} + +_public_ int sd_event_get_now_realtime(sd_event *e, uint64_t *usec) { + assert_return(e, -EINVAL); + assert_return(usec, -EINVAL); + assert_return(dual_timestamp_is_set(&e->timestamp), -ENODATA); + assert_return(!event_pid_changed(e), -ECHILD); + + *usec = e->timestamp.realtime; + return 0; +} + +_public_ int sd_event_get_now_monotonic(sd_event *e, uint64_t *usec) { + assert_return(e, -EINVAL); + assert_return(usec, -EINVAL); + assert_return(dual_timestamp_is_set(&e->timestamp), -ENODATA); + assert_return(!event_pid_changed(e), -ECHILD); + + *usec = e->timestamp.monotonic; + return 0; +} + +_public_ int sd_event_default(sd_event **ret) { + + static thread_local sd_event *default_event = NULL; + sd_event *e; + int r; + + if (!ret) + return !!default_event; + + if (default_event) { + *ret = sd_event_ref(default_event); + return 0; + } + + r = sd_event_new(&e); + if (r < 0) + return r; + + e->default_event_ptr = &default_event; + e->tid = gettid(); + default_event = e; + + *ret = e; + return 1; +} + +_public_ int sd_event_get_tid(sd_event *e, pid_t *tid) { + assert_return(e, -EINVAL); + assert_return(tid, -EINVAL); + assert_return(!event_pid_changed(e), -ECHILD); + + if (e->tid != 0) { + *tid = e->tid; + return 0; + } + + return -ENXIO; +} + +_public_ int sd_event_set_watchdog(sd_event *e, int b) { + int r; + + assert_return(e, -EINVAL); + assert_return(!event_pid_changed(e), -ECHILD); + + if (e->watchdog == !!b) + return e->watchdog; + + if (b) { + struct epoll_event ev = {}; + + r = sd_watchdog_enabled(false, &e->watchdog_period); + if (r <= 0) + return r; + + /* Issue first ping immediately */ + sd_notify(false, "WATCHDOG=1"); + e->watchdog_last = now(CLOCK_MONOTONIC); + + e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC); + if (e->watchdog_fd < 0) + return -errno; + + r = arm_watchdog(e); + if (r < 0) + goto fail; + + ev.events = EPOLLIN; + ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG); + + r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev); + if (r < 0) { + r = -errno; + goto fail; + } + + } else { + if (e->watchdog_fd >= 0) { + epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL); + close_nointr_nofail(e->watchdog_fd); + e->watchdog_fd = -1; + } + } + + e->watchdog = !!b; + return e->watchdog; + +fail: + close_nointr_nofail(e->watchdog_fd); + e->watchdog_fd = -1; + return r; +} + +_public_ int sd_event_get_watchdog(sd_event *e) { + assert_return(e, -EINVAL); + assert_return(!event_pid_changed(e), -ECHILD); + + return e->watchdog; +}