libsystemd-bus: rename to libsystemd
[elogind.git] / src / libsystemd / sd-event.c
diff --git a/src/libsystemd/sd-event.c b/src/libsystemd/sd-event.c
new file mode 100644 (file)
index 0000000..0b7b71d
--- /dev/null
@@ -0,0 +1,2224 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+/***
+  This file is part of systemd.
+
+  Copyright 2013 Lennart Poettering
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <sys/epoll.h>
+#include <sys/timerfd.h>
+#include <sys/wait.h>
+#include <pthread.h>
+
+#include "sd-id128.h"
+#include "sd-daemon.h"
+#include "macro.h"
+#include "prioq.h"
+#include "hashmap.h"
+#include "util.h"
+#include "time-util.h"
+#include "missing.h"
+
+#include "sd-event.h"
+
+#define EPOLL_QUEUE_MAX 512U
+#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
+
+typedef enum EventSourceType {
+        SOURCE_IO,
+        SOURCE_MONOTONIC,
+        SOURCE_REALTIME,
+        SOURCE_SIGNAL,
+        SOURCE_CHILD,
+        SOURCE_DEFER,
+        SOURCE_EXIT,
+        SOURCE_WATCHDOG
+} EventSourceType;
+
+struct sd_event_source {
+        unsigned n_ref;
+
+        sd_event *event;
+        void *userdata;
+        sd_event_handler_t prepare;
+
+        EventSourceType type:4;
+        int enabled:3;
+        bool pending:1;
+        bool dispatching:1;
+
+        int priority;
+        unsigned pending_index;
+        unsigned prepare_index;
+        unsigned pending_iteration;
+        unsigned prepare_iteration;
+
+        union {
+                struct {
+                        sd_event_io_handler_t callback;
+                        int fd;
+                        uint32_t events;
+                        uint32_t revents;
+                        bool registered:1;
+                } io;
+                struct {
+                        sd_event_time_handler_t callback;
+                        usec_t next, accuracy;
+                        unsigned earliest_index;
+                        unsigned latest_index;
+                } time;
+                struct {
+                        sd_event_signal_handler_t callback;
+                        struct signalfd_siginfo siginfo;
+                        int sig;
+                } signal;
+                struct {
+                        sd_event_child_handler_t callback;
+                        siginfo_t siginfo;
+                        pid_t pid;
+                        int options;
+                } child;
+                struct {
+                        sd_event_handler_t callback;
+                } defer;
+                struct {
+                        sd_event_handler_t callback;
+                        unsigned prioq_index;
+                } exit;
+        };
+};
+
+struct sd_event {
+        unsigned n_ref;
+
+        int epoll_fd;
+        int signal_fd;
+        int realtime_fd;
+        int monotonic_fd;
+        int watchdog_fd;
+
+        Prioq *pending;
+        Prioq *prepare;
+
+        /* For both clocks we maintain two priority queues each, one
+         * ordered by the earliest time the events may be dispatched,
+         * and one ordered by the latest time they must have been
+         * dispatched. The range between the top entries in the two
+         * prioqs is the time window within which we can freely
+         * schedule wakeups. */
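+
+        /* As an illustrative example (made-up numbers): a monotonic
+         * timer due at t = 10.000s with the default 250ms accuracy is
+         * keyed by 10.000s in monotonic_earliest and by 10.250s in
+         * monotonic_latest, so any wakeup that falls within
+         * [10.000s, 10.250s] satisfies it and may be shared with
+         * other timers whose windows overlap. */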
+        Prioq *monotonic_earliest;
+        Prioq *monotonic_latest;
+        Prioq *realtime_earliest;
+        Prioq *realtime_latest;
+
+        usec_t realtime_next, monotonic_next;
+        usec_t perturb;
+
+        sigset_t sigset;
+        sd_event_source **signal_sources;
+
+        Hashmap *child_sources;
+        unsigned n_enabled_child_sources;
+
+        Prioq *exit;
+
+        pid_t original_pid;
+
+        unsigned iteration;
+        dual_timestamp timestamp;
+        int state;
+
+        bool exit_requested:1;
+        bool need_process_child:1;
+        bool watchdog:1;
+
+        int exit_code;
+
+        pid_t tid;
+        sd_event **default_event_ptr;
+
+        usec_t watchdog_last, watchdog_period;
+
+        unsigned n_sources;
+};
+
+static int pending_prioq_compare(const void *a, const void *b) {
+        const sd_event_source *x = a, *y = b;
+
+        assert(x->pending);
+        assert(y->pending);
+
+        /* Enabled ones first */
+        if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
+                return -1;
+        if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
+                return 1;
+
+        /* Lower priority values first */
+        if (x->priority < y->priority)
+                return -1;
+        if (x->priority > y->priority)
+                return 1;
+
+        /* Older entries first */
+        if (x->pending_iteration < y->pending_iteration)
+                return -1;
+        if (x->pending_iteration > y->pending_iteration)
+                return 1;
+
+        /* Stability for the rest */
+        if (x < y)
+                return -1;
+        if (x > y)
+                return 1;
+
+        return 0;
+}
+
+static int prepare_prioq_compare(const void *a, const void *b) {
+        const sd_event_source *x = a, *y = b;
+
+        assert(x->prepare);
+        assert(y->prepare);
+
+        /* Move most recently prepared ones last, so that we can stop
+         * preparing as soon as we hit one that has already been
+         * prepared in the current iteration */
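+        /* Illustrative example (hypothetical iterations): with sources
+         * last prepared in iterations 3, 5 and 5, the iteration-3 one
+         * sorts first and gets prepared first; as soon as
+         * event_prepare() peeks a source whose prepare_iteration
+         * equals the current iteration it can stop, since everything
+         * behind it was prepared just as recently. */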
+        if (x->prepare_iteration < y->prepare_iteration)
+                return -1;
+        if (x->prepare_iteration > y->prepare_iteration)
+                return 1;
+
+        /* Enabled ones first */
+        if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
+                return -1;
+        if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
+                return 1;
+
+        /* Lower priority values first */
+        if (x->priority < y->priority)
+                return -1;
+        if (x->priority > y->priority)
+                return 1;
+
+        /* Stability for the rest */
+        if (x < y)
+                return -1;
+        if (x > y)
+                return 1;
+
+        return 0;
+}
+
+static int earliest_time_prioq_compare(const void *a, const void *b) {
+        const sd_event_source *x = a, *y = b;
+
+        assert(x->type == SOURCE_MONOTONIC || x->type == SOURCE_REALTIME);
+        assert(y->type == SOURCE_MONOTONIC || y->type == SOURCE_REALTIME);
+
+        /* Enabled ones first */
+        if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
+                return -1;
+        if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
+                return 1;
+
+        /* Move the pending ones to the end */
+        if (!x->pending && y->pending)
+                return -1;
+        if (x->pending && !y->pending)
+                return 1;
+
+        /* Order by time */
+        if (x->time.next < y->time.next)
+                return -1;
+        if (x->time.next > y->time.next)
+                return 1;
+
+        /* Stability for the rest */
+        if (x < y)
+                return -1;
+        if (x > y)
+                return 1;
+
+        return 0;
+}
+
+static int latest_time_prioq_compare(const void *a, const void *b) {
+        const sd_event_source *x = a, *y = b;
+
+        assert((x->type == SOURCE_MONOTONIC && y->type == SOURCE_MONOTONIC) ||
+               (x->type == SOURCE_REALTIME && y->type == SOURCE_REALTIME));
+
+        /* Enabled ones first */
+        if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
+                return -1;
+        if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
+                return 1;
+
+        /* Move the pending ones to the end */
+        if (!x->pending && y->pending)
+                return -1;
+        if (x->pending && !y->pending)
+                return 1;
+
+        /* Order by time */
+        if (x->time.next + x->time.accuracy < y->time.next + y->time.accuracy)
+                return -1;
+        if (x->time.next + x->time.accuracy > y->time.next + y->time.accuracy)
+                return 1;
+
+        /* Stability for the rest */
+        if (x < y)
+                return -1;
+        if (x > y)
+                return 1;
+
+        return 0;
+}
+
+static int exit_prioq_compare(const void *a, const void *b) {
+        const sd_event_source *x = a, *y = b;
+
+        assert(x->type == SOURCE_EXIT);
+        assert(y->type == SOURCE_EXIT);
+
+        /* Enabled ones first */
+        if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
+                return -1;
+        if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
+                return 1;
+
+        /* Lower priority values first */
+        if (x->priority < y->priority)
+                return -1;
+        if (x->priority > y->priority)
+                return 1;
+
+        /* Stability for the rest */
+        if (x < y)
+                return -1;
+        if (x > y)
+                return 1;
+
+        return 0;
+}
+
+static void event_free(sd_event *e) {
+        assert(e);
+        assert(e->n_sources == 0);
+
+        if (e->default_event_ptr)
+                *(e->default_event_ptr) = NULL;
+
+        if (e->epoll_fd >= 0)
+                close_nointr_nofail(e->epoll_fd);
+
+        if (e->signal_fd >= 0)
+                close_nointr_nofail(e->signal_fd);
+
+        if (e->realtime_fd >= 0)
+                close_nointr_nofail(e->realtime_fd);
+
+        if (e->monotonic_fd >= 0)
+                close_nointr_nofail(e->monotonic_fd);
+
+        if (e->watchdog_fd >= 0)
+                close_nointr_nofail(e->watchdog_fd);
+
+        prioq_free(e->pending);
+        prioq_free(e->prepare);
+        prioq_free(e->monotonic_earliest);
+        prioq_free(e->monotonic_latest);
+        prioq_free(e->realtime_earliest);
+        prioq_free(e->realtime_latest);
+        prioq_free(e->exit);
+
+        free(e->signal_sources);
+
+        hashmap_free(e->child_sources);
+        free(e);
+}
+
+_public_ int sd_event_new(sd_event** ret) {
+        sd_event *e;
+        int r;
+
+        assert_return(ret, -EINVAL);
+
+        e = new0(sd_event, 1);
+        if (!e)
+                return -ENOMEM;
+
+        e->n_ref = 1;
+        e->signal_fd = e->realtime_fd = e->monotonic_fd = e->watchdog_fd = e->epoll_fd = -1;
+        e->realtime_next = e->monotonic_next = (usec_t) -1;
+        e->original_pid = getpid();
+
+        assert_se(sigemptyset(&e->sigset) == 0);
+
+        e->pending = prioq_new(pending_prioq_compare);
+        if (!e->pending) {
+                r = -ENOMEM;
+                goto fail;
+        }
+
+        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+        if (e->epoll_fd < 0) {
+                r = -errno;
+                goto fail;
+        }
+
+        *ret = e;
+        return 0;
+
+fail:
+        event_free(e);
+        return r;
+}
+
+_public_ sd_event* sd_event_ref(sd_event *e) {
+        assert_return(e, NULL);
+
+        assert(e->n_ref >= 1);
+        e->n_ref++;
+
+        return e;
+}
+
+_public_ sd_event* sd_event_unref(sd_event *e) {
+
+        if (!e)
+                return NULL;
+
+        assert(e->n_ref >= 1);
+        e->n_ref--;
+
+        if (e->n_ref <= 0)
+                event_free(e);
+
+        return NULL;
+}
+
+static bool event_pid_changed(sd_event *e) {
+        assert(e);
+
+        /* We don't support people creating an event loop and keeping
+         * it around over a fork(). Let's complain. */
+
+        return e->original_pid != getpid();
+}
+
+static int source_io_unregister(sd_event_source *s) {
+        int r;
+
+        assert(s);
+        assert(s->type == SOURCE_IO);
+
+        if (!s->io.registered)
+                return 0;
+
+        r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
+        if (r < 0)
+                return -errno;
+
+        s->io.registered = false;
+        return 0;
+}
+
+static int source_io_register(
+                sd_event_source *s,
+                int enabled,
+                uint32_t events) {
+
+        struct epoll_event ev = {};
+        int r;
+
+        assert(s);
+        assert(s->type == SOURCE_IO);
+        assert(enabled != SD_EVENT_OFF);
+
+        ev.events = events;
+        ev.data.ptr = s;
+
+        if (enabled == SD_EVENT_ONESHOT)
+                ev.events |= EPOLLONESHOT;
+
+        if (s->io.registered)
+                r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
+        else
+                r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
+
+        if (r < 0)
+                return -errno;
+
+        s->io.registered = true;
+
+        return 0;
+}
+
+static void source_free(sd_event_source *s) {
+        assert(s);
+
+        if (s->event) {
+                assert(s->event->n_sources > 0);
+
+                switch (s->type) {
+
+                case SOURCE_IO:
+                        if (s->io.fd >= 0)
+                                source_io_unregister(s);
+
+                        break;
+
+                case SOURCE_MONOTONIC:
+                        prioq_remove(s->event->monotonic_earliest, s, &s->time.earliest_index);
+                        prioq_remove(s->event->monotonic_latest, s, &s->time.latest_index);
+                        break;
+
+                case SOURCE_REALTIME:
+                        prioq_remove(s->event->realtime_earliest, s, &s->time.earliest_index);
+                        prioq_remove(s->event->realtime_latest, s, &s->time.latest_index);
+                        break;
+
+                case SOURCE_SIGNAL:
+                        if (s->signal.sig > 0) {
+                                if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0)
+                                        assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0);
+
+                                if (s->event->signal_sources)
+                                        s->event->signal_sources[s->signal.sig] = NULL;
+                        }
+
+                        break;
+
+                case SOURCE_CHILD:
+                        if (s->child.pid > 0) {
+                                if (s->enabled != SD_EVENT_OFF) {
+                                        assert(s->event->n_enabled_child_sources > 0);
+                                        s->event->n_enabled_child_sources--;
+                                }
+
+                                if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD])
+                                        assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0);
+
+                                hashmap_remove(s->event->child_sources, INT_TO_PTR(s->child.pid));
+                        }
+
+                        break;
+
+                case SOURCE_DEFER:
+                        /* nothing */
+                        break;
+
+                case SOURCE_EXIT:
+                        prioq_remove(s->event->exit, s, &s->exit.prioq_index);
+                        break;
+
+                case SOURCE_WATCHDOG:
+                        assert_not_reached("Wut? I shouldn't exist.");
+                }
+
+                if (s->pending)
+                        prioq_remove(s->event->pending, s, &s->pending_index);
+
+                if (s->prepare)
+                        prioq_remove(s->event->prepare, s, &s->prepare_index);
+
+                s->event->n_sources--;
+                sd_event_unref(s->event);
+        }
+
+        free(s);
+}
+
+static int source_set_pending(sd_event_source *s, bool b) {
+        int r;
+
+        assert(s);
+        assert(s->type != SOURCE_EXIT);
+
+        if (s->pending == b)
+                return 0;
+
+        s->pending = b;
+
+        if (b) {
+                s->pending_iteration = s->event->iteration;
+
+                r = prioq_put(s->event->pending, s, &s->pending_index);
+                if (r < 0) {
+                        s->pending = false;
+                        return r;
+                }
+        } else
+                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
+
+        if (s->type == SOURCE_REALTIME) {
+                prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
+                prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
+        } else if (s->type == SOURCE_MONOTONIC) {
+                prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
+                prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
+        }
+
+        return 0;
+}
+
+static sd_event_source *source_new(sd_event *e, EventSourceType type) {
+        sd_event_source *s;
+
+        assert(e);
+
+        s = new0(sd_event_source, 1);
+        if (!s)
+                return NULL;
+
+        s->n_ref = 1;
+        s->event = sd_event_ref(e);
+        s->type = type;
+        s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;
+
+        e->n_sources ++;
+
+        return s;
+}
+
+_public_ int sd_event_add_io(
+                sd_event *e,
+                int fd,
+                uint32_t events,
+                sd_event_io_handler_t callback,
+                void *userdata,
+                sd_event_source **ret) {
+
+        sd_event_source *s;
+        int r;
+
+        assert_return(e, -EINVAL);
+        assert_return(fd >= 0, -EINVAL);
+        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
+        assert_return(callback, -EINVAL);
+        assert_return(ret, -EINVAL);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        s = source_new(e, SOURCE_IO);
+        if (!s)
+                return -ENOMEM;
+
+        s->io.fd = fd;
+        s->io.events = events;
+        s->io.callback = callback;
+        s->userdata = userdata;
+        s->enabled = SD_EVENT_ON;
+
+        r = source_io_register(s, s->enabled, events);
+        if (r < 0) {
+                source_free(s);
+                return r;
+        }
+
+        *ret = s;
+        return 0;
+}
+
+static int event_setup_timer_fd(
+                sd_event *e,
+                EventSourceType type,
+                int *timer_fd,
+                clockid_t id) {
+
+        struct epoll_event ev = {};
+        int r, fd;
+        sd_id128_t bootid;
+
+        assert(e);
+        assert(timer_fd);
+
+        if (_likely_(*timer_fd >= 0))
+                return 0;
+
+        fd = timerfd_create(id, TFD_NONBLOCK|TFD_CLOEXEC);
+        if (fd < 0)
+                return -errno;
+
+        ev.events = EPOLLIN;
+        ev.data.ptr = INT_TO_PTR(type);
+
+        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
+        if (r < 0) {
+                close_nointr_nofail(fd);
+                return -errno;
+        }
+
+        /* When we sleep for longer, we try to realign the wakeup to
+           the same time within each minute/second/250ms, so that
+           events all across the system can be coalesced into a single
+           CPU wakeup. However, let's take some system-specific
+           randomness for this value, so that in a network of systems
+           with synced clocks timer events are distributed a
+           bit. Here, we calculate a perturbation usec offset from the
+           boot ID. */
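+
+        /* A rough example with assumed values: if the boot ID hashes
+           to a perturbation of 17300000 usec (17.3s), minute-aligned
+           wakeups on this machine land at second 17.3 of each minute,
+           while machines with different boot IDs land elsewhere in
+           the minute. */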
+
+        if (sd_id128_get_boot(&bootid) >= 0)
+                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
+
+        *timer_fd = fd;
+        return 0;
+}
+
+static int event_add_time_internal(
+                sd_event *e,
+                EventSourceType type,
+                int *timer_fd,
+                clockid_t id,
+                Prioq **earliest,
+                Prioq **latest,
+                uint64_t usec,
+                uint64_t accuracy,
+                sd_event_time_handler_t callback,
+                void *userdata,
+                sd_event_source **ret) {
+
+        sd_event_source *s;
+        int r;
+
+        assert_return(e, -EINVAL);
+        assert_return(callback, -EINVAL);
+        assert_return(ret, -EINVAL);
+        assert_return(usec != (uint64_t) -1, -EINVAL);
+        assert_return(accuracy != (uint64_t) -1, -EINVAL);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        assert(timer_fd);
+        assert(earliest);
+        assert(latest);
+
+        if (!*earliest) {
+                *earliest = prioq_new(earliest_time_prioq_compare);
+                if (!*earliest)
+                        return -ENOMEM;
+        }
+
+        if (!*latest) {
+                *latest = prioq_new(latest_time_prioq_compare);
+                if (!*latest)
+                        return -ENOMEM;
+        }
+
+        if (*timer_fd < 0) {
+                r = event_setup_timer_fd(e, type, timer_fd, id);
+                if (r < 0)
+                        return r;
+        }
+
+        s = source_new(e, type);
+        if (!s)
+                return -ENOMEM;
+
+        s->time.next = usec;
+        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
+        s->time.callback = callback;
+        s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
+        s->userdata = userdata;
+        s->enabled = SD_EVENT_ONESHOT;
+
+        r = prioq_put(*earliest, s, &s->time.earliest_index);
+        if (r < 0)
+                goto fail;
+
+        r = prioq_put(*latest, s, &s->time.latest_index);
+        if (r < 0)
+                goto fail;
+
+        *ret = s;
+        return 0;
+
+fail:
+        source_free(s);
+        return r;
+}
+
+_public_ int sd_event_add_monotonic(sd_event *e,
+                                    uint64_t usec,
+                                    uint64_t accuracy,
+                                    sd_event_time_handler_t callback,
+                                    void *userdata,
+                                    sd_event_source **ret) {
+
+        return event_add_time_internal(e, SOURCE_MONOTONIC, &e->monotonic_fd, CLOCK_MONOTONIC, &e->monotonic_earliest, &e->monotonic_latest, usec, accuracy, callback, userdata, ret);
+}
+
+_public_ int sd_event_add_realtime(sd_event *e,
+                                   uint64_t usec,
+                                   uint64_t accuracy,
+                                   sd_event_time_handler_t callback,
+                                   void *userdata,
+                                   sd_event_source **ret) {
+
+        return event_add_time_internal(e, SOURCE_REALTIME, &e->realtime_fd, CLOCK_REALTIME, &e->realtime_earliest, &e->realtime_latest, usec, accuracy, callback, userdata, ret);
+}
+
+static int event_update_signal_fd(sd_event *e) {
+        struct epoll_event ev = {};
+        bool add_to_epoll;
+        int r;
+
+        assert(e);
+
+        add_to_epoll = e->signal_fd < 0;
+
+        r = signalfd(e->signal_fd, &e->sigset, SFD_NONBLOCK|SFD_CLOEXEC);
+        if (r < 0)
+                return -errno;
+
+        e->signal_fd = r;
+
+        if (!add_to_epoll)
+                return 0;
+
+        ev.events = EPOLLIN;
+        ev.data.ptr = INT_TO_PTR(SOURCE_SIGNAL);
+
+        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->signal_fd, &ev);
+        if (r < 0) {
+                close_nointr_nofail(e->signal_fd);
+                e->signal_fd = -1;
+
+                return -errno;
+        }
+
+        return 0;
+}
+
+_public_ int sd_event_add_signal(
+                sd_event *e,
+                int sig,
+                sd_event_signal_handler_t callback,
+                void *userdata,
+                sd_event_source **ret) {
+
+        sd_event_source *s;
+        sigset_t ss;
+        int r;
+
+        assert_return(e, -EINVAL);
+        assert_return(sig > 0, -EINVAL);
+        assert_return(sig < _NSIG, -EINVAL);
+        assert_return(callback, -EINVAL);
+        assert_return(ret, -EINVAL);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        /* Note that pthread_sigmask() returns a positive error number
+         * rather than setting errno */
+        r = pthread_sigmask(SIG_SETMASK, NULL, &ss);
+        if (r != 0)
+                return -r;
+
+        if (!sigismember(&ss, sig))
+                return -EBUSY;
+
+        if (!e->signal_sources) {
+                e->signal_sources = new0(sd_event_source*, _NSIG);
+                if (!e->signal_sources)
+                        return -ENOMEM;
+        } else if (e->signal_sources[sig])
+                return -EBUSY;
+
+        s = source_new(e, SOURCE_SIGNAL);
+        if (!s)
+                return -ENOMEM;
+
+        s->signal.sig = sig;
+        s->signal.callback = callback;
+        s->userdata = userdata;
+        s->enabled = SD_EVENT_ON;
+
+        e->signal_sources[sig] = s;
+        assert_se(sigaddset(&e->sigset, sig) == 0);
+
+        if (sig != SIGCHLD || e->n_enabled_child_sources == 0) {
+                r = event_update_signal_fd(e);
+                if (r < 0) {
+                        source_free(s);
+                        return r;
+                }
+        }
+
+        *ret = s;
+        return 0;
+}
+
+_public_ int sd_event_add_child(
+                sd_event *e,
+                pid_t pid,
+                int options,
+                sd_event_child_handler_t callback,
+                void *userdata,
+                sd_event_source **ret) {
+
+        sd_event_source *s;
+        int r;
+
+        assert_return(e, -EINVAL);
+        assert_return(pid > 1, -EINVAL);
+        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
+        assert_return(options != 0, -EINVAL);
+        assert_return(callback, -EINVAL);
+        assert_return(ret, -EINVAL);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        r = hashmap_ensure_allocated(&e->child_sources, trivial_hash_func, trivial_compare_func);
+        if (r < 0)
+                return r;
+
+        if (hashmap_contains(e->child_sources, INT_TO_PTR(pid)))
+                return -EBUSY;
+
+        s = source_new(e, SOURCE_CHILD);
+        if (!s)
+                return -ENOMEM;
+
+        s->child.pid = pid;
+        s->child.options = options;
+        s->child.callback = callback;
+        s->userdata = userdata;
+        s->enabled = SD_EVENT_ONESHOT;
+
+        r = hashmap_put(e->child_sources, INT_TO_PTR(pid), s);
+        if (r < 0) {
+                source_free(s);
+                return r;
+        }
+
+        e->n_enabled_child_sources ++;
+
+        assert_se(sigaddset(&e->sigset, SIGCHLD) == 0);
+
+        if (!e->signal_sources || !e->signal_sources[SIGCHLD]) {
+                r = event_update_signal_fd(e);
+                if (r < 0) {
+                        source_free(s);
+                        return r;
+                }
+        }
+
+        e->need_process_child = true;
+
+        *ret = s;
+        return 0;
+}
+
+_public_ int sd_event_add_defer(
+                sd_event *e,
+                sd_event_handler_t callback,
+                void *userdata,
+                sd_event_source **ret) {
+
+        sd_event_source *s;
+        int r;
+
+        assert_return(e, -EINVAL);
+        assert_return(callback, -EINVAL);
+        assert_return(ret, -EINVAL);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        s = source_new(e, SOURCE_DEFER);
+        if (!s)
+                return -ENOMEM;
+
+        s->defer.callback = callback;
+        s->userdata = userdata;
+        s->enabled = SD_EVENT_ONESHOT;
+
+        r = source_set_pending(s, true);
+        if (r < 0) {
+                source_free(s);
+                return r;
+        }
+
+        *ret = s;
+        return 0;
+}
+
+_public_ int sd_event_add_exit(
+                sd_event *e,
+                sd_event_handler_t callback,
+                void *userdata,
+                sd_event_source **ret) {
+
+        sd_event_source *s;
+        int r;
+
+        assert_return(e, -EINVAL);
+        assert_return(callback, -EINVAL);
+        assert_return(ret, -EINVAL);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        if (!e->exit) {
+                e->exit = prioq_new(exit_prioq_compare);
+                if (!e->exit)
+                        return -ENOMEM;
+        }
+
+        s = source_new(e, SOURCE_EXIT);
+        if (!s)
+                return -ENOMEM;
+
+        s->exit.callback = callback;
+        s->userdata = userdata;
+        s->exit.prioq_index = PRIOQ_IDX_NULL;
+        s->enabled = SD_EVENT_ONESHOT;
+
+        r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
+        if (r < 0) {
+                source_free(s);
+                return r;
+        }
+
+        *ret = s;
+        return 0;
+}
+
+_public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
+        assert_return(s, NULL);
+
+        assert(s->n_ref >= 1);
+        s->n_ref++;
+
+        return s;
+}
+
+_public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {
+
+        if (!s)
+                return NULL;
+
+        assert(s->n_ref >= 1);
+        s->n_ref--;
+
+        if (s->n_ref <= 0) {
+                /* Here's a special hack: when we are called from a
+                 * dispatch handler we won't free the event source
+                 * immediately, but we will detach the fd from the
+                 * epoll. This way it is safe for the caller to unref
+                 * the event source and immediately close the fd, but
+                 * we still retain a valid event source object after
+                 * the callback. */
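+
+                /* A caller-side sketch of what this allows (hypothetical
+                 * handler, not part of this file):
+                 *
+                 *   static int on_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+                 *           sd_event_source_unref(s);    <- drops the last reference while dispatching
+                 *           close(fd);                   <- safe, the fd was already removed from epoll
+                 *           return 0;
+                 *   }
+                 */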
+
+                if (s->dispatching) {
+                        if (s->type == SOURCE_IO)
+                                source_io_unregister(s);
+                } else
+                        source_free(s);
+        }
+
+        return NULL;
+}
+
+_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
+        assert_return(s, NULL);
+
+        return s->event;
+}
+
+_public_ int sd_event_source_get_pending(sd_event_source *s) {
+        assert_return(s, -EINVAL);
+        assert_return(s->type != SOURCE_EXIT, -EDOM);
+        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        return s->pending;
+}
+
+_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
+        assert_return(s, -EINVAL);
+        assert_return(s->type == SOURCE_IO, -EDOM);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        return s->io.fd;
+}
+
+_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
+        int r;
+
+        assert_return(s, -EINVAL);
+        assert_return(fd >= 0, -EINVAL);
+        assert_return(s->type == SOURCE_IO, -EDOM);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        if (s->io.fd == fd)
+                return 0;
+
+        if (s->enabled == SD_EVENT_OFF) {
+                s->io.fd = fd;
+                s->io.registered = false;
+        } else {
+                int saved_fd;
+
+                saved_fd = s->io.fd;
+                assert(s->io.registered);
+
+                s->io.fd = fd;
+                s->io.registered = false;
+
+                r = source_io_register(s, s->enabled, s->io.events);
+                if (r < 0) {
+                        s->io.fd = saved_fd;
+                        s->io.registered = true;
+                        return r;
+                }
+
+                epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
+        }
+
+        return 0;
+}
+
+_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
+        assert_return(s, -EINVAL);
+        assert_return(events, -EINVAL);
+        assert_return(s->type == SOURCE_IO, -EDOM);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        *events = s->io.events;
+        return 0;
+}
+
+_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
+        int r;
+
+        assert_return(s, -EINVAL);
+        assert_return(s->type == SOURCE_IO, -EDOM);
+        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
+        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        if (s->io.events == events)
+                return 0;
+
+        if (s->enabled != SD_EVENT_OFF) {
+                r = source_io_register(s, s->enabled, events);
+                if (r < 0)
+                        return r;
+        }
+
+        s->io.events = events;
+        source_set_pending(s, false);
+
+        return 0;
+}
+
+_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
+        assert_return(s, -EINVAL);
+        assert_return(revents, -EINVAL);
+        assert_return(s->type == SOURCE_IO, -EDOM);
+        assert_return(s->pending, -ENODATA);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        *revents = s->io.revents;
+        return 0;
+}
+
+_public_ int sd_event_source_get_signal(sd_event_source *s) {
+        assert_return(s, -EINVAL);
+        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        return s->signal.sig;
+}
+
+_public_ int sd_event_source_get_priority(sd_event_source *s, int *priority) {
+        assert_return(s, -EINVAL);
+        assert_return(priority, -EINVAL);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        *priority = s->priority;
+        return 0;
+}
+
+_public_ int sd_event_source_set_priority(sd_event_source *s, int priority) {
+        assert_return(s, -EINVAL);
+        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        if (s->priority == priority)
+                return 0;
+
+        s->priority = priority;
+
+        if (s->pending)
+                prioq_reshuffle(s->event->pending, s, &s->pending_index);
+
+        if (s->prepare)
+                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
+
+        if (s->type == SOURCE_EXIT)
+                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
+
+        return 0;
+}
+
+_public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
+        assert_return(s, -EINVAL);
+        assert_return(m, -EINVAL);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        *m = s->enabled;
+        return 0;
+}
+
+_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
+        int r;
+
+        assert_return(s, -EINVAL);
+        assert_return(m == SD_EVENT_OFF || m == SD_EVENT_ON || m == SD_EVENT_ONESHOT, -EINVAL);
+        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        if (s->enabled == m)
+                return 0;
+
+        if (m == SD_EVENT_OFF) {
+
+                switch (s->type) {
+
+                case SOURCE_IO:
+                        r = source_io_unregister(s);
+                        if (r < 0)
+                                return r;
+
+                        s->enabled = m;
+                        break;
+
+                case SOURCE_MONOTONIC:
+                        s->enabled = m;
+                        prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
+                        prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
+                        break;
+
+                case SOURCE_REALTIME:
+                        s->enabled = m;
+                        prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
+                        prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
+                        break;
+
+                case SOURCE_SIGNAL:
+                        s->enabled = m;
+                        if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0) {
+                                assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0);
+                                event_update_signal_fd(s->event);
+                        }
+
+                        break;
+
+                case SOURCE_CHILD:
+                        s->enabled = m;
+
+                        assert(s->event->n_enabled_child_sources > 0);
+                        s->event->n_enabled_child_sources--;
+
+                        if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) {
+                                assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0);
+                                event_update_signal_fd(s->event);
+                        }
+
+                        break;
+
+                case SOURCE_EXIT:
+                        s->enabled = m;
+                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
+                        break;
+
+                case SOURCE_DEFER:
+                        s->enabled = m;
+                        break;
+
+                case SOURCE_WATCHDOG:
+                        assert_not_reached("Wut? I shouldn't exist.");
+                }
+
+        } else {
+                switch (s->type) {
+
+                case SOURCE_IO:
+                        r = source_io_register(s, m, s->io.events);
+                        if (r < 0)
+                                return r;
+
+                        s->enabled = m;
+                        break;
+
+                case SOURCE_MONOTONIC:
+                        s->enabled = m;
+                        prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
+                        prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
+                        break;
+
+                case SOURCE_REALTIME:
+                        s->enabled = m;
+                        prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
+                        prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
+                        break;
+
+                case SOURCE_SIGNAL:
+                        s->enabled = m;
+
+                        if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0)  {
+                                assert_se(sigaddset(&s->event->sigset, s->signal.sig) == 0);
+                                event_update_signal_fd(s->event);
+                        }
+                        break;
+
+                case SOURCE_CHILD:
+                        if (s->enabled == SD_EVENT_OFF) {
+                                s->event->n_enabled_child_sources++;
+
+                                if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) {
+                                        assert_se(sigaddset(&s->event->sigset, SIGCHLD) == 0);
+                                        event_update_signal_fd(s->event);
+                                }
+                        }
+
+                        s->enabled = m;
+                        break;
+
+                case SOURCE_EXIT:
+                        s->enabled = m;
+                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
+                        break;
+
+                case SOURCE_DEFER:
+                        s->enabled = m;
+                        break;
+
+                case SOURCE_WATCHDOG:
+                        assert_not_reached("Wut? I shouldn't exist.");
+                }
+        }
+
+        if (s->pending)
+                prioq_reshuffle(s->event->pending, s, &s->pending_index);
+
+        if (s->prepare)
+                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
+
+        return 0;
+}
+
+_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
+        assert_return(s, -EINVAL);
+        assert_return(usec, -EINVAL);
+        assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        *usec = s->time.next;
+        return 0;
+}
+
+_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
+        assert_return(s, -EINVAL);
+        assert_return(usec != (uint64_t) -1, -EINVAL);
+        assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
+        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        s->time.next = usec;
+
+        source_set_pending(s, false);
+
+        if (s->type == SOURCE_REALTIME) {
+                prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
+                prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
+        } else {
+                prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
+                prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
+        }
+
+        return 0;
+}
+
+_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
+        assert_return(s, -EINVAL);
+        assert_return(usec, -EINVAL);
+        assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        *usec = s->time.accuracy;
+        return 0;
+}
+
+_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
+        assert_return(s, -EINVAL);
+        assert_return(usec != (uint64_t) -1, -EINVAL);
+        assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
+        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        if (usec == 0)
+                usec = DEFAULT_ACCURACY_USEC;
+
+        s->time.accuracy = usec;
+
+        source_set_pending(s, false);
+
+        if (s->type == SOURCE_REALTIME)
+                prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
+        else
+                prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
+
+        return 0;
+}
+
+_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
+        assert_return(s, -EINVAL);
+        assert_return(pid, -EINVAL);
+        assert_return(s->type == SOURCE_CHILD, -EDOM);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        *pid = s->child.pid;
+        return 0;
+}
+
+_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
+        int r;
+
+        assert_return(s, -EINVAL);
+        assert_return(s->type != SOURCE_EXIT, -EDOM);
+        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        if (s->prepare == callback)
+                return 0;
+
+        if (callback && s->prepare) {
+                s->prepare = callback;
+                return 0;
+        }
+
+        r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
+        if (r < 0)
+                return r;
+
+        s->prepare = callback;
+
+        if (callback) {
+                r = prioq_put(s->event->prepare, s, &s->prepare_index);
+                if (r < 0)
+                        return r;
+        } else
+                prioq_remove(s->event->prepare, s, &s->prepare_index);
+
+        return 0;
+}
+
+_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
+        assert_return(s, NULL);
+
+        return s->userdata;
+}
+
+_public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
+        void *ret;
+
+        assert_return(s, NULL);
+
+        ret = s->userdata;
+        s->userdata = userdata;
+
+        return ret;
+}
+
+static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
+        usec_t c;
+        assert(e);
+        assert(a <= b);
+
+        if (a <= 0)
+                return 0;
+
+        if (b <= a + 1)
+                return a;
+
+        /*
+          Find a good time to wake up again between times a and b. We
+          have two goals here:
+
+          a) We want to wake up as seldom as possible, hence prefer
+             later times over earlier times.
+
+          b) But if we have to wake up, then let's make sure to
+             dispatch as much as possible on the entire system.
+
+          We implement this by waking up everywhere at the same time
+          within any given minute if we can, synchronised via the
+          perturbation value determined from the boot ID. If we can't,
+          we try to find the same spot within every 10s interval, then
+          every 1s, and then every 250ms. Otherwise, we pick the last
+          possible time to wake up.
+        */
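+
+        /* A worked example with illustrative numbers: a = 62.0s,
+           b = 62.4s, perturb = 17.3s. The minute-aligned candidate is
+           60.0s + 17.3s = 77.3s >= b; minus one minute gives 17.3s,
+           which is < a. The 10s-aligned candidate is 60.0s + 7.3s =
+           67.3s >= b; minus 10s gives 57.3s, still < a. The 1s-aligned
+           candidate is 62.0s + 0.3s = 62.3s, which lies within [a, b)
+           and is returned. */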
+
+        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
+        if (c >= b) {
+                if (_unlikely_(c < USEC_PER_MINUTE))
+                        return b;
+
+                c -= USEC_PER_MINUTE;
+        }
+
+        if (c >= a)
+                return c;
+
+        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
+        if (c >= b) {
+                if (_unlikely_(c < USEC_PER_SEC*10))
+                        return b;
+
+                c -= USEC_PER_SEC*10;
+        }
+
+        if (c >= a)
+                return c;
+
+        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
+        if (c >= b) {
+                if (_unlikely_(c < USEC_PER_SEC))
+                        return b;
+
+                c -= USEC_PER_SEC;
+        }
+
+        if (c >= a)
+                return c;
+
+        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
+        if (c >= b) {
+                if (_unlikely_(c < USEC_PER_MSEC*250))
+                        return b;
+
+                c -= USEC_PER_MSEC*250;
+        }
+
+        if (c >= a)
+                return c;
+
+        return b;
+}
+
+static int event_arm_timer(
+                sd_event *e,
+                int timer_fd,
+                Prioq *earliest,
+                Prioq *latest,
+                usec_t *next) {
+
+        struct itimerspec its = {};
+        sd_event_source *a, *b;
+        usec_t t;
+        int r;
+
+        assert(e);
+        assert(next);
+
+        a = prioq_peek(earliest);
+        if (!a || a->enabled == SD_EVENT_OFF) {
+
+                if (timer_fd < 0)
+                        return 0;
+
+                if (*next == (usec_t) -1)
+                        return 0;
+
+                /* disarm */
+                r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL);
+                if (r < 0)
+                        return r;
+
+                *next = (usec_t) -1;
+
+                return 0;
+        }
+
+        b = prioq_peek(latest);
+        assert_se(b && b->enabled != SD_EVENT_OFF);
+
+        t = sleep_between(e, a->time.next, b->time.next + b->time.accuracy);
+        if (*next == t)
+                return 0;
+
+        assert_se(timer_fd >= 0);
+
+        if (t == 0) {
+                /* We don't want to disarm here (that's what an all-zero
+                 * it_value would do), we just want a time looooong ago. */
+                its.it_value.tv_sec = 0;
+                its.it_value.tv_nsec = 1;
+        } else
+                timespec_store(&its.it_value, t);
+
+        r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL);
+        if (r < 0)
+                return -errno;
+
+        *next = t;
+        return 0;
+}
+
+static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
+        assert(e);
+        assert(s);
+        assert(s->type == SOURCE_IO);
+
+        /* If the event source was already pending, we just OR in the
+         * new revents, otherwise we reset the value. The ORing is
+         * necessary to handle EPOLLONESHOT events properly where
+         * readability might happen independently of writability, and
+         * we need to keep track of both */
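+
+        /* Concretely (a hypothetical sequence): an EPOLLONESHOT source
+         * watching EPOLLIN|EPOLLOUT may first become pending with only
+         * EPOLLIN; if it is re-armed and a later wakeup reports
+         * EPOLLOUT before the first event was dispatched, the two must
+         * be merged into EPOLLIN|EPOLLOUT rather than the second
+         * overwriting the first. */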
+
+        if (s->pending)
+                s->io.revents |= revents;
+        else
+                s->io.revents = revents;
+
+        return source_set_pending(s, true);
+}
+
+static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
+        uint64_t x;
+        ssize_t ss;
+
+        assert(e);
+        assert(fd >= 0);
+
+        assert_return(events == EPOLLIN, -EIO);
+
+        ss = read(fd, &x, sizeof(x));
+        if (ss < 0) {
+                if (errno == EAGAIN || errno == EINTR)
+                        return 0;
+
+                return -errno;
+        }
+
+        if (_unlikely_(ss != sizeof(x)))
+                return -EIO;
+
+        if (next)
+                *next = (usec_t) -1;
+
+        return 0;
+}
+
+static int process_timer(
+                sd_event *e,
+                usec_t n,
+                Prioq *earliest,
+                Prioq *latest) {
+
+        sd_event_source *s;
+        int r;
+
+        assert(e);
+
+        for (;;) {
+                s = prioq_peek(earliest);
+                if (!s ||
+                    s->time.next > n ||
+                    s->enabled == SD_EVENT_OFF ||
+                    s->pending)
+                        break;
+
+                r = source_set_pending(s, true);
+                if (r < 0)
+                        return r;
+
+                prioq_reshuffle(earliest, s, &s->time.earliest_index);
+                prioq_reshuffle(latest, s, &s->time.latest_index);
+        }
+
+        return 0;
+}
+
+static int process_child(sd_event *e) {
+        sd_event_source *s;
+        Iterator i;
+        int r;
+
+        assert(e);
+
+        e->need_process_child = false;
+
+        /*
+           So, this is ugly. We iteratively invoke waitid() with P_PID
+           + WNOHANG for each PID we wait for, instead of using
+           P_ALL. This is because we only want to get child
+           information of very specific child processes, and not all
+           of them. We might not have processed the SIGCHLD event of a
+           previous invocation and we don't want to maintain an
+           unbounded *per-child* event queue, hence we really don't
+           want anything flushed out of the kernel's queue that we
+           don't care about. Since this is O(n) this means that if you
+           have a lot of processes you probably want to handle SIGCHLD
+           yourself.
+
+           We do not reap the children here (by using WNOWAIT); reaping
+           is only done after the event source is dispatched, so that
+           the callback still sees the process as a zombie.
+        */
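+
+        /*
+           For instance, a source created with options == WEXITED
+           results in waitid(P_PID, pid, &si, WNOHANG|WNOWAIT|WEXITED):
+           WNOHANG so we never block here, WNOWAIT so the process stays
+           a zombie until the callback in source_dispatch() has seen it.
+        */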
+
+        HASHMAP_FOREACH(s, e->child_sources, i) {
+                assert(s->type == SOURCE_CHILD);
+
+                if (s->pending)
+                        continue;
+
+                if (s->enabled == SD_EVENT_OFF)
+                        continue;
+
+                zero(s->child.siginfo);
+                r = waitid(P_PID, s->child.pid, &s->child.siginfo,
+                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
+                if (r < 0)
+                        return -errno;
+
+                if (s->child.siginfo.si_pid != 0) {
+                        bool zombie =
+                                s->child.siginfo.si_code == CLD_EXITED ||
+                                s->child.siginfo.si_code == CLD_KILLED ||
+                                s->child.siginfo.si_code == CLD_DUMPED;
+
+                        if (!zombie && (s->child.options & WEXITED)) {
+                                /* If the child isn't dead then let's
+                                 * immediately remove the state change
+                                 * from the queue, since there's no
+                                 * benefit in leaving it queued */
+
+                                assert(s->child.options & (WSTOPPED|WCONTINUED));
+                                waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
+                        }
+
+                        r = source_set_pending(s, true);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        return 0;
+}
+
+static int process_signal(sd_event *e, uint32_t events) {
+        bool read_one = false;
+        int r;
+
+        assert(e);
+        assert(e->signal_sources);
+
+        assert_return(events == EPOLLIN, -EIO);
+
+        for (;;) {
+                struct signalfd_siginfo si;
+                ssize_t ss;
+                sd_event_source *s;
+
+                ss = read(e->signal_fd, &si, sizeof(si));
+                if (ss < 0) {
+                        if (errno == EAGAIN || errno == EINTR)
+                                return read_one;
+
+                        return -errno;
+                }
+
+                if (_unlikely_(ss != sizeof(si)))
+                        return -EIO;
+
+                read_one = true;
+
+                s = e->signal_sources[si.ssi_signo];
+                if (si.ssi_signo == SIGCHLD) {
+                        r = process_child(e);
+                        if (r < 0)
+                                return r;
+                        if (r > 0 || !s)
+                                continue;
+                } else if (!s)
+                        return -EIO;
+
+                s->signal.siginfo = si;
+                r = source_set_pending(s, true);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static int source_dispatch(sd_event_source *s) {
+        int r = 0;
+
+        assert(s);
+        assert(s->pending || s->type == SOURCE_EXIT);
+
+        if (s->type != SOURCE_DEFER && s->type != SOURCE_EXIT) {
+                r = source_set_pending(s, false);
+                if (r < 0)
+                        return r;
+        }
+
+        if (s->enabled == SD_EVENT_ONESHOT) {
+                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
+                if (r < 0)
+                        return r;
+        }
+
+        s->dispatching = true;
+
+        switch (s->type) {
+
+        case SOURCE_IO:
+                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
+                break;
+
+        case SOURCE_MONOTONIC:
+                r = s->time.callback(s, s->time.next, s->userdata);
+                break;
+
+        case SOURCE_REALTIME:
+                r = s->time.callback(s, s->time.next, s->userdata);
+                break;
+
+        case SOURCE_SIGNAL:
+                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
+                break;
+
+        case SOURCE_CHILD: {
+                bool zombie;
+
+                zombie = s->child.siginfo.si_code == CLD_EXITED ||
+                         s->child.siginfo.si_code == CLD_KILLED ||
+                         s->child.siginfo.si_code == CLD_DUMPED;
+
+                r = s->child.callback(s, &s->child.siginfo, s->userdata);
+
+                /* Now, reap the PID for good. */
+                if (zombie)
+                        waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
+
+                break;
+        }
+
+        case SOURCE_DEFER:
+                r = s->defer.callback(s, s->userdata);
+                break;
+
+        case SOURCE_EXIT:
+                r = s->exit.callback(s, s->userdata);
+                break;
+
+        case SOURCE_WATCHDOG:
+                assert_not_reached("Wut? I shouldn't exist.");
+        }
+
+        s->dispatching = false;
+
+        if (r < 0)
+                log_debug("Event source %p returned error, disabling: %s", s, strerror(-r));
+
+        if (s->n_ref == 0)
+                source_free(s);
+        else if (r < 0)
+                sd_event_source_set_enabled(s, SD_EVENT_OFF);
+
+        return 1;
+}
+
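+/* Run the prepare callbacks of all enabled sources that registered one,
+ * at most once per event loop iteration, just before we go to sleep. */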
+static int event_prepare(sd_event *e) {
+        int r;
+
+        assert(e);
+
+        for (;;) {
+                sd_event_source *s;
+
+                s = prioq_peek(e->prepare);
+                if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
+                        break;
+
+                s->prepare_iteration = e->iteration;
+                r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
+                if (r < 0)
+                        return r;
+
+                assert(s->prepare);
+
+                s->dispatching = true;
+                r = s->prepare(s, s->userdata);
+                s->dispatching = false;
+
+                if (r < 0)
+                        log_debug("Prepare callback of event source %p returned error, disabling: %s", s, strerror(-r));
+
+                if (s->n_ref == 0)
+                        source_free(s);
+                else if (r < 0)
+                        sd_event_source_set_enabled(s, SD_EVENT_OFF);
+        }
+
+        return 0;
+}
+
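+/* Dispatch the next enabled exit source, or mark the event loop as
+ * finished if none are left. */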
+static int dispatch_exit(sd_event *e) {
+        sd_event_source *p;
+        int r;
+
+        assert(e);
+
+        p = prioq_peek(e->exit);
+        if (!p || p->enabled == SD_EVENT_OFF) {
+                e->state = SD_EVENT_FINISHED;
+                return 0;
+        }
+
+        sd_event_ref(e);
+        e->iteration++;
+        e->state = SD_EVENT_EXITING;
+
+        r = source_dispatch(p);
+
+        e->state = SD_EVENT_PASSIVE;
+        sd_event_unref(e);
+
+        return r;
+}
+
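+/* Peek at the pending queue and return the next source to dispatch, or
+ * NULL if the queue is empty or its head is disabled. */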
+static sd_event_source* event_next_pending(sd_event *e) {
+        sd_event_source *p;
+
+        assert(e);
+
+        p = prioq_peek(e->pending);
+        if (!p)
+                return NULL;
+
+        if (p->enabled == SD_EVENT_OFF)
+                return NULL;
+
+        return p;
+}
+
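+/* Program the watchdog timerfd to fire between 1/2 and 3/4 of the
+ * watchdog period after the last notification we sent. */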
+static int arm_watchdog(sd_event *e) {
+        struct itimerspec its = {};
+        usec_t t;
+        int r;
+
+        assert(e);
+        assert(e->watchdog_fd >= 0);
+
+        t = sleep_between(e,
+                          e->watchdog_last + (e->watchdog_period / 2),
+                          e->watchdog_last + (e->watchdog_period * 3 / 4));
+
+        timespec_store(&its.it_value, t);
+
+        r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
+        if (r < 0)
+                return -errno;
+
+        return 0;
+}
+
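+/* Send WATCHDOG=1 to the service manager if at least a quarter of the
+ * watchdog period has passed since the last ping, then re-arm the
+ * watchdog timer. */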
+static int process_watchdog(sd_event *e) {
+        assert(e);
+
+        if (!e->watchdog)
+                return 0;
+
+        /* Don't notify watchdog too often */
+        if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
+                return 0;
+
+        sd_notify(false, "WATCHDOG=1");
+        e->watchdog_last = e->timestamp.monotonic;
+
+        return arm_watchdog(e);
+}
+
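+/* Run one iteration of the event loop: run the prepare callbacks, arm the
+ * timer fds, wait in epoll_wait() for up to 'timeout' microseconds (pass
+ * (uint64_t) -1 to wait indefinitely), process what the kernel reported,
+ * and dispatch at most one pending event source. */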
+_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
+        struct epoll_event *ev_queue;
+        unsigned ev_queue_max;
+        sd_event_source *p;
+        int r, i, m;
+
+        assert_return(e, -EINVAL);
+        assert_return(!event_pid_changed(e), -ECHILD);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY);
+
+        if (e->exit_requested)
+                return dispatch_exit(e);
+
+        sd_event_ref(e);
+        e->iteration++;
+        e->state = SD_EVENT_RUNNING;
+
+        r = event_prepare(e);
+        if (r < 0)
+                goto finish;
+
+        r = event_arm_timer(e, e->monotonic_fd, e->monotonic_earliest, e->monotonic_latest, &e->monotonic_next);
+        if (r < 0)
+                goto finish;
+
+        r = event_arm_timer(e, e->realtime_fd, e->realtime_earliest, e->realtime_latest, &e->realtime_next);
+        if (r < 0)
+                goto finish;
+
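+        /* If there is already something pending to dispatch, or child
+         * processes need to be reaped, don't block in epoll_wait(). */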
+        if (event_next_pending(e) || e->need_process_child)
+                timeout = 0;
+
+        ev_queue_max = CLAMP(e->n_sources, 1U, EPOLL_QUEUE_MAX);
+        ev_queue = newa(struct epoll_event, ev_queue_max);
+
+        m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max,
+                       timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
+        if (m < 0) {
+                r = errno == EAGAIN || errno == EINTR ? 1 : -errno;
+                goto finish;
+        }
+
+        dual_timestamp_get(&e->timestamp);
+
+        for (i = 0; i < m; i++) {
+
+                if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_MONOTONIC))
+                        r = flush_timer(e, e->monotonic_fd, ev_queue[i].events, &e->monotonic_next);
+                else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_REALTIME))
+                        r = flush_timer(e, e->realtime_fd, ev_queue[i].events, &e->realtime_next);
+                else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_SIGNAL))
+                        r = process_signal(e, ev_queue[i].events);
+                else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
+                        r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
+                else
+                        r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
+
+                if (r < 0)
+                        goto finish;
+        }
+
+        r = process_watchdog(e);
+        if (r < 0)
+                goto finish;
+
+        r = process_timer(e, e->timestamp.monotonic, e->monotonic_earliest, e->monotonic_latest);
+        if (r < 0)
+                goto finish;
+
+        r = process_timer(e, e->timestamp.realtime, e->realtime_earliest, e->realtime_latest);
+        if (r < 0)
+                goto finish;
+
+        if (e->need_process_child) {
+                r = process_child(e);
+                if (r < 0)
+                        goto finish;
+        }
+
+        p = event_next_pending(e);
+        if (!p) {
+                r = 1;
+                goto finish;
+        }
+
+        r = source_dispatch(p);
+
+finish:
+        e->state = SD_EVENT_PASSIVE;
+        sd_event_unref(e);
+
+        return r;
+}
+
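+/* Run the event loop until sd_event_exit() has been called and all exit
+ * sources have been dispatched; returns the exit code passed to
+ * sd_event_exit(), or a negative errno-style error code. */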
+_public_ int sd_event_loop(sd_event *e) {
+        int r;
+
+        assert_return(e, -EINVAL);
+        assert_return(!event_pid_changed(e), -ECHILD);
+        assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY);
+
+        sd_event_ref(e);
+
+        while (e->state != SD_EVENT_FINISHED) {
+                r = sd_event_run(e, (uint64_t) -1);
+                if (r < 0)
+                        goto finish;
+        }
+
+        r = e->exit_code;
+
+finish:
+        sd_event_unref(e);
+        return r;
+}
+
+_public_ int sd_event_get_state(sd_event *e) {
+        assert_return(e, -EINVAL);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        return e->state;
+}
+
+_public_ int sd_event_get_exit_code(sd_event *e, int *code) {
+        assert_return(e, -EINVAL);
+        assert_return(code, -EINVAL);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        if (!e->exit_requested)
+                return -ENODATA;
+
+        *code = e->exit_code;
+        return 0;
+}
+
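+/* Request the event loop to terminate: from now on sd_event_run() will
+ * dispatch the registered exit sources and then enter the FINISHED state;
+ * 'code' becomes the return value of sd_event_loop(). */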
+_public_ int sd_event_exit(sd_event *e, int code) {
+        assert_return(e, -EINVAL);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        e->exit_requested = true;
+        e->exit_code = code;
+
+        return 0;
+}
+
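+/* Return the CLOCK_REALTIME timestamp taken when the event loop last woke
+ * up; sd_event_get_now_monotonic() below does the same for CLOCK_MONOTONIC.
+ * This is cheaper than querying the clock and consistent across all sources
+ * dispatched in the same iteration. */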
+_public_ int sd_event_get_now_realtime(sd_event *e, uint64_t *usec) {
+        assert_return(e, -EINVAL);
+        assert_return(usec, -EINVAL);
+        assert_return(dual_timestamp_is_set(&e->timestamp), -ENODATA);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        *usec = e->timestamp.realtime;
+        return 0;
+}
+
+_public_ int sd_event_get_now_monotonic(sd_event *e, uint64_t *usec) {
+        assert_return(e, -EINVAL);
+        assert_return(usec, -EINVAL);
+        assert_return(dual_timestamp_is_set(&e->timestamp), -ENODATA);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        *usec = e->timestamp.monotonic;
+        return 0;
+}
+
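+/* Return the calling thread's default event loop, allocating it on first
+ * use. If 'ret' is NULL, only report whether a default event loop has been
+ * allocated already. */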
+_public_ int sd_event_default(sd_event **ret) {
+
+        static thread_local sd_event *default_event = NULL;
+        sd_event *e;
+        int r;
+
+        if (!ret)
+                return !!default_event;
+
+        if (default_event) {
+                *ret = sd_event_ref(default_event);
+                return 0;
+        }
+
+        r = sd_event_new(&e);
+        if (r < 0)
+                return r;
+
+        e->default_event_ptr = &default_event;
+        e->tid = gettid();
+        default_event = e;
+
+        *ret = e;
+        return 1;
+}
+
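+/* Return the TID of the thread a default event loop is bound to; fails
+ * with -ENXIO for event loops not allocated via sd_event_default(). */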
+_public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
+        assert_return(e, -EINVAL);
+        assert_return(tid, -EINVAL);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        if (e->tid != 0) {
+                *tid = e->tid;
+                return 0;
+        }
+
+        return -ENXIO;
+}
+
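+/* Enable or disable automatic watchdog notification: if the service
+ * manager asked for keep-alive pings (sd_watchdog_enabled() returns
+ * positive), an internal timerfd source periodically triggers WATCHDOG=1
+ * notifications. Returns whether the logic is enabled after the call, or
+ * a negative error. */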
+_public_ int sd_event_set_watchdog(sd_event *e, int b) {
+        int r;
+
+        assert_return(e, -EINVAL);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        if (e->watchdog == !!b)
+                return e->watchdog;
+
+        if (b) {
+                struct epoll_event ev = {};
+
+                r = sd_watchdog_enabled(false, &e->watchdog_period);
+                if (r <= 0)
+                        return r;
+
+                /* Issue first ping immediately */
+                sd_notify(false, "WATCHDOG=1");
+                e->watchdog_last = now(CLOCK_MONOTONIC);
+
+                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
+                if (e->watchdog_fd < 0)
+                        return -errno;
+
+                r = arm_watchdog(e);
+                if (r < 0)
+                        goto fail;
+
+                ev.events = EPOLLIN;
+                ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG);
+
+                r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
+                if (r < 0) {
+                        r = -errno;
+                        goto fail;
+                }
+
+        } else {
+                if (e->watchdog_fd >= 0) {
+                        epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
+                        close_nointr_nofail(e->watchdog_fd);
+                        e->watchdog_fd = -1;
+                }
+        }
+
+        e->watchdog = !!b;
+        return e->watchdog;
+
+fail:
+        close_nointr_nofail(e->watchdog_fd);
+        e->watchdog_fd = -1;
+        return r;
+}
+
+_public_ int sd_event_get_watchdog(sd_event *e) {
+        assert_return(e, -EINVAL);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        return e->watchdog;
+}