chiark / gitweb /
event: clear pending-state when re-arming timers
[elogind.git] / src / libsystemd-bus / sd-event.c
index de96fde..d01e82d 100644 (file)
 #include <sys/timerfd.h>
 #include <sys/wait.h>
 
+#include "sd-id128.h"
 #include "macro.h"
-#include "refcnt.h"
 #include "prioq.h"
 #include "hashmap.h"
 #include "util.h"
 #include "time-util.h"
+#include "missing.h"
 
 #include "sd-event.h"
 
 #define EPOLL_QUEUE_MAX 64
+#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
 
 typedef enum EventSourceType {
         SOURCE_IO,
@@ -40,18 +42,19 @@ typedef enum EventSourceType {
         SOURCE_REALTIME,
         SOURCE_SIGNAL,
         SOURCE_CHILD,
-        SOURCE_DEFER
+        SOURCE_DEFER,
+        SOURCE_QUIT
 } EventSourceType;
 
 struct sd_event_source {
-        RefCount n_ref;
+        unsigned n_ref;
 
         sd_event *event;
         void *userdata;
         sd_prepare_handler_t prepare;
 
         EventSourceType type:4;
-        sd_event_mute_t mute:3;
+        int enabled:3;
         bool pending:1;
 
         int priority;
@@ -70,8 +73,9 @@ struct sd_event_source {
                 } io;
                 struct {
                         sd_time_handler_t callback;
-                        usec_t next;
-                        unsigned prioq_index;
+                        usec_t next, accuracy;
+                        unsigned earliest_index;
+                        unsigned latest_index;
                 } time;
                 struct {
                         sd_signal_handler_t callback;
@@ -87,11 +91,15 @@ struct sd_event_source {
                 struct {
                         sd_defer_handler_t callback;
                 } defer;
+                struct {
+                        sd_quit_handler_t callback;
+                        unsigned prioq_index;
+                } quit;
         };
 };
 
 struct sd_event {
-        RefCount n_ref;
+        unsigned n_ref;
 
         int epoll_fd;
         int signal_fd;
@@ -100,21 +108,40 @@ struct sd_event {
 
         Prioq *pending;
         Prioq *prepare;
-        Prioq *monotonic;
-        Prioq *realtime;
+
+        /* For both clocks we maintain two priority queues each, one
+         * ordered for the earliest times the events may be
+         * dispatched, and one ordered by the latest times they must
+         * have been dispatched. The range between the top entries in
+         * the two prioqs is the time window we can freely schedule
+         * wakeups in */
+        Prioq *monotonic_earliest;
+        Prioq *monotonic_latest;
+        Prioq *realtime_earliest;
+        Prioq *realtime_latest;
+
+        usec_t realtime_next, monotonic_next;
+        usec_t perturb;
 
         sigset_t sigset;
         sd_event_source **signal_sources;
 
         Hashmap *child_sources;
-        unsigned n_unmuted_child_sources;
+        unsigned n_enabled_child_sources;
+
+        Prioq *quit;
+
+        pid_t original_pid;
 
         unsigned iteration;
-        unsigned processed_children;
+        dual_timestamp timestamp;
+        int state;
 
-        usec_t realtime_next, monotonic_next;
+        bool quit_requested:1;
+        bool need_process_child:1;
 
-        bool quit;
+        pid_t tid;
+        sd_event **default_event_ptr;
 };
 
 static int pending_prioq_compare(const void *a, const void *b) {
@@ -123,10 +150,10 @@ static int pending_prioq_compare(const void *a, const void *b) {
         assert(x->pending);
         assert(y->pending);
 
-        /* Unmuted ones first */
-        if (x->mute != SD_EVENT_MUTED && y->mute == SD_EVENT_MUTED)
+        /* Enabled ones first */
+        if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
                 return -1;
-        if (x->mute == SD_EVENT_MUTED && y->mute != SD_EVENT_MUTED)
+        if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
                 return 1;
 
         /* Lower priority values first */
@@ -164,10 +191,10 @@ static int prepare_prioq_compare(const void *a, const void *b) {
         if (x->prepare_iteration > y->prepare_iteration)
                 return 1;
 
-        /* Unmuted ones first */
-        if (x->mute != SD_EVENT_MUTED && y->mute == SD_EVENT_MUTED)
+        /* Enabled ones first */
+        if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
                 return -1;
-        if (x->mute == SD_EVENT_MUTED && y->mute != SD_EVENT_MUTED)
+        if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
                 return 1;
 
         /* Lower priority values first */
@@ -185,16 +212,16 @@ static int prepare_prioq_compare(const void *a, const void *b) {
         return 0;
 }
 
-static int time_prioq_compare(const void *a, const void *b) {
+static int earliest_time_prioq_compare(const void *a, const void *b) {
         const sd_event_source *x = a, *y = b;
 
         assert(x->type == SOURCE_MONOTONIC || x->type == SOURCE_REALTIME);
         assert(y->type == SOURCE_MONOTONIC || y->type == SOURCE_REALTIME);
 
-        /* Unmuted ones first */
-        if (x->mute != SD_EVENT_MUTED && y->mute == SD_EVENT_MUTED)
+        /* Enabled ones first */
+        if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
                 return -1;
-        if (x->mute == SD_EVENT_MUTED && y->mute != SD_EVENT_MUTED)
+        if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
                 return 1;
 
         /* Move the pending ones to the end */
@@ -218,9 +245,72 @@ static int time_prioq_compare(const void *a, const void *b) {
         return 0;
 }
 
+/* Comparison function for the "latest" timer priority queues. Both
+ * sources must be timer sources of the same clock. Ordering criteria,
+ * in this sequence: enabled sources before disabled ones, non-pending
+ * before pending ones, then ascending by the latest time the source
+ * must have been dispatched (time.next + time.accuracy), and finally
+ * by object address. Returns -1, 0 or 1. */
+static int latest_time_prioq_compare(const void *a, const void *b) {
+        const sd_event_source *x = a, *y = b;
+
+        assert((x->type == SOURCE_MONOTONIC && y->type == SOURCE_MONOTONIC) ||
+               (x->type == SOURCE_REALTIME && y->type == SOURCE_REALTIME));
+
+        /* Enabled ones first */
+        if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
+                return -1;
+        if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
+                return 1;
+
+        /* Move the pending ones to the end */
+        if (!x->pending && y->pending)
+                return -1;
+        if (x->pending && !y->pending)
+                return 1;
+
+        /* Order by time. The "greater" case has to return a positive
+         * value, otherwise the comparison is not antisymmetric and
+         * the queue ends up mis-ordered. */
+        if (x->time.next + x->time.accuracy < y->time.next + y->time.accuracy)
+                return -1;
+        if (x->time.next + x->time.accuracy > y->time.next + y->time.accuracy)
+                return 1;
+
+        /* Stability for the rest */
+        if (x < y)
+                return -1;
+        if (x > y)
+                return 1;
+
+        return 0;
+}
+
+/* Comparison function for the queue of quit sources: enabled sources
+ * sort before disabled ones, then lower priority values sort first,
+ * and the object address breaks any remaining tie. */
+static int quit_prioq_compare(const void *a, const void *b) {
+        const sd_event_source *x = a, *y = b;
+        bool x_on, y_on;
+
+        assert(x->type == SOURCE_QUIT);
+        assert(y->type == SOURCE_QUIT);
+
+        /* A source that is switched off goes behind one that is not */
+        x_on = x->enabled != SD_EVENT_OFF;
+        y_on = y->enabled != SD_EVENT_OFF;
+        if (x_on != y_on)
+                return x_on ? -1 : 1;
+
+        /* The numerically smaller priority goes first */
+        if (x->priority != y->priority)
+                return x->priority < y->priority ? -1 : 1;
+
+        /* Fall back to the addresses so that the result is stable */
+        if (x == y)
+                return 0;
+
+        return x < y ? -1 : 1;
+}
+
 static void event_free(sd_event *e) {
         assert(e);
 
+        if (e->default_event_ptr)
+                *(e->default_event_ptr) = NULL;
+
         if (e->epoll_fd >= 0)
                 close_nointr_nofail(e->epoll_fd);
 
@@ -235,8 +325,11 @@ static void event_free(sd_event *e) {
 
         prioq_free(e->pending);
         prioq_free(e->prepare);
-        prioq_free(e->monotonic);
-        prioq_free(e->realtime);
+        prioq_free(e->monotonic_earliest);
+        prioq_free(e->monotonic_latest);
+        prioq_free(e->realtime_earliest);
+        prioq_free(e->realtime_latest);
+        prioq_free(e->quit);
 
         free(e->signal_sources);
 
@@ -244,20 +337,20 @@ static void event_free(sd_event *e) {
         free(e);
 }
 
-int sd_event_new(sd_event** ret) {
+_public_ int sd_event_new(sd_event** ret) {
         sd_event *e;
         int r;
 
-        if (!ret)
-                return -EINVAL;
+        assert_return(ret, -EINVAL);
 
         e = new0(sd_event, 1);
         if (!e)
                 return -ENOMEM;
 
-        e->n_ref = REFCNT_INIT;
+        e->n_ref = 1;
         e->signal_fd = e->realtime_fd = e->monotonic_fd = e->epoll_fd = -1;
         e->realtime_next = e->monotonic_next = (usec_t) -1;
+        e->original_pid = getpid();
 
         assert_se(sigemptyset(&e->sigset) == 0);
 
@@ -281,25 +374,36 @@ fail:
         return r;
 }
 
-sd_event* sd_event_ref(sd_event *e) {
-        if (!e)
-                return NULL;
+_public_ sd_event* sd_event_ref(sd_event *e) {
+        assert_return(e, NULL);
 
-        assert_se(REFCNT_INC(e->n_ref) >= 2);
+        assert(e->n_ref >= 1);
+        e->n_ref++;
 
         return e;
 }
 
-sd_event* sd_event_unref(sd_event *e) {
-        if (!e)
-                return NULL;
+_public_ sd_event* sd_event_unref(sd_event *e) {
+        assert_return(e, NULL);
 
-        if (REFCNT_DEC(e->n_ref) <= 0)
+        assert(e->n_ref >= 1);
+        e->n_ref--;
+
+        if (e->n_ref <= 0)
                 event_free(e);
 
         return NULL;
 }
 
+static bool event_pid_changed(sd_event *e) {
+        pid_t now;
+
+        assert(e);
+
+        /* Keeping an event loop around across a fork() is not
+         * supported. Report whether the calling process differs
+         * from the one recorded in original_pid, so that callers
+         * can complain. */
+        now = getpid();
+
+        return now != e->original_pid;
+}
+
 static int source_io_unregister(sd_event_source *s) {
         int r;
 
@@ -317,18 +421,22 @@ static int source_io_unregister(sd_event_source *s) {
         return 0;
 }
 
-static int source_io_register(sd_event_source *s, sd_event_mute_t m, uint32_t events) {
+static int source_io_register(
+                sd_event_source *s,
+                int enabled,
+                uint32_t events) {
+
         struct epoll_event ev = {};
         int r;
 
         assert(s);
         assert(s->type == SOURCE_IO);
-        assert(m != SD_EVENT_MUTED);
+        assert(enabled != SD_EVENT_OFF);
 
         ev.events = events;
         ev.data.ptr = s;
 
-        if (m == SD_EVENT_ONESHOT)
+        if (enabled == SD_EVENT_ONESHOT)
                 ev.events |= EPOLLONESHOT;
 
         if (s->io.registered)
@@ -357,16 +465,18 @@ static void source_free(sd_event_source *s) {
                         break;
 
                 case SOURCE_MONOTONIC:
-                        prioq_remove(s->event->monotonic, s, &s->time.prioq_index);
+                        prioq_remove(s->event->monotonic_earliest, s, &s->time.earliest_index);
+                        prioq_remove(s->event->monotonic_latest, s, &s->time.latest_index);
                         break;
 
                 case SOURCE_REALTIME:
-                        prioq_remove(s->event->realtime, s, &s->time.prioq_index);
+                        prioq_remove(s->event->realtime_earliest, s, &s->time.earliest_index);
+                        prioq_remove(s->event->realtime_latest, s, &s->time.latest_index);
                         break;
 
                 case SOURCE_SIGNAL:
                         if (s->signal.sig > 0) {
-                                if (s->signal.sig != SIGCHLD || s->event->n_unmuted_child_sources == 0)
+                                if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0)
                                         assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0);
 
                                 if (s->event->signal_sources)
@@ -377,9 +487,9 @@ static void source_free(sd_event_source *s) {
 
                 case SOURCE_CHILD:
                         if (s->child.pid > 0) {
-                                if (s->mute != SD_EVENT_MUTED) {
-                                        assert(s->event->n_unmuted_child_sources > 0);
-                                        s->event->n_unmuted_child_sources--;
+                                if (s->enabled != SD_EVENT_OFF) {
+                                        assert(s->event->n_enabled_child_sources > 0);
+                                        s->event->n_enabled_child_sources--;
                                 }
 
                                 if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD])
@@ -389,6 +499,14 @@ static void source_free(sd_event_source *s) {
                         }
 
                         break;
+
+                case SOURCE_DEFER:
+                        /* nothing */
+                        break;
+
+                case SOURCE_QUIT:
+                        prioq_remove(s->event->quit, s, &s->quit.prioq_index);
+                        break;
                 }
 
                 if (s->pending)
@@ -407,6 +525,7 @@ static int source_set_pending(sd_event_source *s, bool b) {
         int r;
 
         assert(s);
+        assert(s->type != SOURCE_QUIT);
 
         if (s->pending == b)
                 return 0;
@@ -436,16 +555,15 @@ static sd_event_source *source_new(sd_event *e, EventSourceType type) {
         if (!s)
                 return NULL;
 
-        s->n_ref = REFCNT_INIT;
+        s->n_ref = 1;
         s->event = sd_event_ref(e);
         s->type = type;
-        s->mute = SD_EVENT_UNMUTED;
         s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;
 
         return s;
 }
 
-int sd_event_add_io(
+_public_ int sd_event_add_io(
                 sd_event *e,
                 int fd,
                 uint32_t events,
@@ -456,16 +574,13 @@ int sd_event_add_io(
         sd_event_source *s;
         int r;
 
-        if (!e)
-                return -EINVAL;
-        if (fd < 0)
-                return -EINVAL;
-        if (events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP))
-                return -EINVAL;
-        if (!callback)
-                return -EINVAL;
-        if (!ret)
-                return -EINVAL;
+        assert_return(e, -EINVAL);
+        assert_return(fd >= 0, -EINVAL);
+        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP)), -EINVAL);
+        assert_return(callback, -EINVAL);
+        assert_return(ret, -EINVAL);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(e), -ECHILD);
 
         s = source_new(e, SOURCE_IO);
         if (!s)
@@ -475,8 +590,9 @@ int sd_event_add_io(
         s->io.events = events;
         s->io.callback = callback;
         s->userdata = userdata;
+        s->enabled = SD_EVENT_ON;
 
-        r = source_io_register(s, s->mute, events);
+        r = source_io_register(s, s->enabled, events);
         if (r < 0) {
                 source_free(s);
                 return -errno;
@@ -494,6 +610,7 @@ static int event_setup_timer_fd(
 
         struct epoll_event ev = {};
         int r, fd;
+        sd_id128_t bootid;
 
         assert(e);
         assert(timer_fd);
@@ -514,6 +631,17 @@ static int event_setup_timer_fd(
                 return -errno;
         }
 
+        /* When we sleep for longer, we try to realign the wakeup to
+           the same time within each second, so that events all across
+           the system can be coalesced into a single CPU
+           wakeup. However, let's take some system-specific randomness
+           for this value, so that in a network of systems with synced
+           clocks timer events are distributed a bit. Here, we
+           calculate a perturbation usec offset from the boot ID. */
+
+        if (sd_id128_get_boot(&bootid) >= 0)
+                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_SEC;
+
         *timer_fd = fd;
         return 0;
 }
@@ -523,8 +651,10 @@ static int event_add_time_internal(
                 EventSourceType type,
                 int *timer_fd,
                 clockid_t id,
-                Prioq **prioq,
+                Prioq **earliest,
+                Prioq **latest,
                 uint64_t usec,
+                uint64_t accuracy,
                 sd_time_handler_t callback,
                 void *userdata,
                 sd_event_source **ret) {
@@ -532,19 +662,27 @@ static int event_add_time_internal(
         sd_event_source *s;
         int r;
 
-        if (!e)
-                return -EINVAL;
-        if (!callback)
-                return -EINVAL;
-        if (!ret)
-                return -EINVAL;
+        assert_return(e, -EINVAL);
+        assert_return(callback, -EINVAL);
+        assert_return(ret, -EINVAL);
+        assert_return(usec != (uint64_t) -1, -EINVAL);
+        assert_return(accuracy != (uint64_t) -1, -EINVAL);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(e), -ECHILD);
 
         assert(timer_fd);
-        assert(prioq);
+        assert(earliest);
+        assert(latest);
 
-        if (!*prioq) {
-                *prioq = prioq_new(time_prioq_compare);
-                if (!*prioq)
+        if (!*earliest) {
+                *earliest = prioq_new(earliest_time_prioq_compare);
+                if (!*earliest)
+                        return -ENOMEM;
+        }
+
+        if (!*latest) {
+                *latest = prioq_new(latest_time_prioq_compare);
+                if (!*latest)
                         return -ENOMEM;
         }
 
@@ -559,26 +697,46 @@ static int event_add_time_internal(
                 return -ENOMEM;
 
         s->time.next = usec;
+        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
         s->time.callback = callback;
-        s->time.prioq_index = PRIOQ_IDX_NULL;
+        s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
         s->userdata = userdata;
+        s->enabled = SD_EVENT_ONESHOT;
 
-        r = prioq_put(*prioq, s, &s->time.prioq_index);
-        if (r < 0) {
-                source_free(s);
-                return r;
-        }
+        r = prioq_put(*earliest, s, &s->time.earliest_index);
+        if (r < 0)
+                goto fail;
+
+        r = prioq_put(*latest, s, &s->time.latest_index);
+        if (r < 0)
+                goto fail;
 
         *ret = s;
         return 0;
+
+fail:
+        source_free(s);
+        return r;
 }
 
-int sd_event_add_monotonic(sd_event *e, uint64_t usec, sd_time_handler_t callback, void *userdata, sd_event_source **ret) {
-        return event_add_time_internal(e, SOURCE_MONOTONIC, &e->monotonic_fd, CLOCK_MONOTONIC, &e->monotonic, usec, callback, userdata, ret);
+_public_ int sd_event_add_monotonic(sd_event *e,
+                                    uint64_t usec,
+                                    uint64_t accuracy,
+                                    sd_time_handler_t callback,
+                                    void *userdata,
+                                    sd_event_source **ret) {
+
+        return event_add_time_internal(e, SOURCE_MONOTONIC, &e->monotonic_fd, CLOCK_MONOTONIC, &e->monotonic_earliest, &e->monotonic_latest, usec, accuracy, callback, userdata, ret);
 }
 
-int sd_event_add_realtime(sd_event *e, uint64_t usec, sd_time_handler_t callback, void *userdata, sd_event_source **ret) {
-        return event_add_time_internal(e, SOURCE_REALTIME, &e->realtime_fd, CLOCK_REALTIME, &e->realtime, usec, callback, userdata, ret);
+/* Adds a CLOCK_REALTIME timer source to the event loop. Thin wrapper
+ * around event_add_time_internal(); all arguments are forwarded and
+ * its return value is passed through. Both realtime priority queues
+ * are handed over here: source_free() removes SOURCE_REALTIME sources
+ * from realtime_earliest and realtime_latest, so that is where they
+ * must have been inserted. */
+_public_ int sd_event_add_realtime(sd_event *e,
+                                   uint64_t usec,
+                                   uint64_t accuracy,
+                                   sd_time_handler_t callback,
+                                   void *userdata,
+                                   sd_event_source **ret) {
+
+        return event_add_time_internal(e, SOURCE_REALTIME, &e->realtime_fd, CLOCK_REALTIME, &e->realtime_earliest, &e->realtime_latest, usec, accuracy, callback, userdata, ret);
 }
 
 static int event_update_signal_fd(sd_event *e) {
@@ -613,20 +771,23 @@ static int event_update_signal_fd(sd_event *e) {
         return 0;
 }
 
-int sd_event_add_signal(sd_event *e, int sig, sd_signal_handler_t callback, void *userdata, sd_event_source **ret) {
+_public_ int sd_event_add_signal(
+                sd_event *e,
+                int sig,
+                sd_signal_handler_t callback,
+                void *userdata,
+                sd_event_source **ret) {
+
         sd_event_source *s;
         int r;
 
-        if (!e)
-                return -EINVAL;
-        if (sig <= 0)
-                return -EINVAL;
-        if (sig >= _NSIG)
-                return -EINVAL;
-        if (!callback)
-                return -EINVAL;
-        if (!ret)
-                return -EINVAL;
+        assert_return(e, -EINVAL);
+        assert_return(sig > 0, -EINVAL);
+        assert_return(sig < _NSIG, -EINVAL);
+        assert_return(callback, -EINVAL);
+        assert_return(ret, -EINVAL);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(e), -ECHILD);
 
         if (!e->signal_sources) {
                 e->signal_sources = new0(sd_event_source*, _NSIG);
@@ -642,11 +803,12 @@ int sd_event_add_signal(sd_event *e, int sig, sd_signal_handler_t callback, void
         s->signal.sig = sig;
         s->signal.callback = callback;
         s->userdata = userdata;
+        s->enabled = SD_EVENT_ON;
 
         e->signal_sources[sig] = s;
         assert_se(sigaddset(&e->sigset, sig) == 0);
 
-        if (sig != SIGCHLD || e->n_unmuted_child_sources == 0) {
+        if (sig != SIGCHLD || e->n_enabled_child_sources == 0) {
                 r = event_update_signal_fd(e);
                 if (r < 0) {
                         source_free(s);
@@ -658,20 +820,25 @@ int sd_event_add_signal(sd_event *e, int sig, sd_signal_handler_t callback, void
         return 0;
 }
 
-int sd_event_add_child(sd_event *e, pid_t pid, int options, sd_child_handler_t callback, void *userdata, sd_event_source **ret) {
+_public_ int sd_event_add_child(
+                sd_event *e,
+                pid_t pid,
+                int options,
+                sd_child_handler_t callback,
+                void *userdata,
+                sd_event_source **ret) {
+
         sd_event_source *s;
         int r;
 
-        if (!e)
-                return -EINVAL;
-        if (pid <= 1)
-                return -EINVAL;
-        if (options & ~(WEXITED|WSTOPPED|WCONTINUED))
-                return -EINVAL;
-        if (!callback)
-                return -EINVAL;
-        if (!ret)
-                return -EINVAL;
+        assert_return(e, -EINVAL);
+        assert_return(pid > 1, -EINVAL);
+        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
+        assert_return(options != 0, -EINVAL);
+        assert_return(callback, -EINVAL);
+        assert_return(ret, -EINVAL);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(e), -ECHILD);
 
         r = hashmap_ensure_allocated(&e->child_sources, trivial_hash_func, trivial_compare_func);
         if (r < 0)
@@ -688,6 +855,7 @@ int sd_event_add_child(sd_event *e, pid_t pid, int options, sd_child_handler_t c
         s->child.options = options;
         s->child.callback = callback;
         s->userdata = userdata;
+        s->enabled = SD_EVENT_ONESHOT;
 
         r = hashmap_put(e->child_sources, INT_TO_PTR(pid), s);
         if (r < 0) {
@@ -695,7 +863,7 @@ int sd_event_add_child(sd_event *e, pid_t pid, int options, sd_child_handler_t c
                 return r;
         }
 
-        e->n_unmuted_child_sources ++;
+        e->n_enabled_child_sources ++;
 
         assert_se(sigaddset(&e->sigset, SIGCHLD) == 0);
 
@@ -707,18 +875,26 @@ int sd_event_add_child(sd_event *e, pid_t pid, int options, sd_child_handler_t c
                 }
         }
 
+        e->need_process_child = true;
+
         *ret = s;
         return 0;
 }
 
-int sd_event_add_defer(sd_event *e, sd_defer_handler_t callback, void *userdata, sd_event_source **ret) {
+_public_ int sd_event_add_defer(
+                sd_event *e,
+                sd_defer_handler_t callback,
+                void *userdata,
+                sd_event_source **ret) {
+
         sd_event_source *s;
         int r;
 
-        if (!e)
-                return -EINVAL;
-        if (!ret)
-                return -EINVAL;
+        assert_return(e, -EINVAL);
+        assert_return(callback, -EINVAL);
+        assert_return(ret, -EINVAL);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(e), -ECHILD);
 
         s = source_new(e, SOURCE_DEFER);
         if (!s)
@@ -726,6 +902,7 @@ int sd_event_add_defer(sd_event *e, sd_defer_handler_t callback, void *userdata,
 
         s->defer.callback = callback;
         s->userdata = userdata;
+        s->enabled = SD_EVENT_ONESHOT;
 
         r = source_set_pending(s, true);
         if (r < 0) {
@@ -737,68 +914,114 @@ int sd_event_add_defer(sd_event *e, sd_defer_handler_t callback, void *userdata,
         return 0;
 }
 
-sd_event_source* sd_event_source_ref(sd_event_source *s) {
+_public_ int sd_event_add_quit(
+                sd_event *e,
+                sd_quit_handler_t callback,
+                void *userdata,
+                sd_event_source **ret) {
+
+        sd_event_source *s;
+        int r;
+
+        assert_return(e, -EINVAL);
+        assert_return(callback, -EINVAL);
+        assert_return(ret, -EINVAL);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        if (!e->quit) {
+                e->quit = prioq_new(quit_prioq_compare);
+                if (!e->quit)
+                        return -ENOMEM;
+        }
+
+        s = source_new(e, SOURCE_QUIT);
         if (!s)
-                return NULL;
+                return -ENOMEM;
 
-        assert_se(REFCNT_INC(s->n_ref) >= 2);
+        s->quit.callback = callback;
+        s->userdata = userdata;
+        s->quit.prioq_index = PRIOQ_IDX_NULL;
+        s->enabled = SD_EVENT_ONESHOT;
+
+        r = prioq_put(s->event->quit, s, &s->quit.prioq_index);
+        if (r < 0) {
+                source_free(s);
+                return r;
+        }
+
+        *ret = s;
+        return 0;
+}
+
+_public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
+        assert_return(s, NULL);
+
+        assert(s->n_ref >= 1);
+        s->n_ref++;
 
         return s;
 }
 
-sd_event_source* sd_event_source_unref(sd_event_source *s) {
-        if (!s)
-                return NULL;
+_public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {
+        assert_return(s, NULL);
+
+        assert(s->n_ref >= 1);
+        s->n_ref--;
 
-        if (REFCNT_DEC(s->n_ref) <= 0)
+        if (s->n_ref <= 0)
                 source_free(s);
 
         return NULL;
 }
 
-int sd_event_source_get_pending(sd_event_source *s) {
-        if (!s)
-                return -EINVAL;
+_public_ sd_event *sd_event_get(sd_event_source *s) {
+        assert_return(s, NULL);
+
+        return s->event;
+}
+
+_public_ int sd_event_source_get_pending(sd_event_source *s) {
+        assert_return(s, -EINVAL);
+        assert_return(s->type != SOURCE_QUIT, -EDOM);
+        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
 
         return s->pending;
 }
 
-int sd_event_source_get_io_fd(sd_event_source *s) {
-        if (!s)
-                return -EINVAL;
-        if (s->type != SOURCE_IO)
-                return -EDOM;
+_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
+        assert_return(s, -EINVAL);
+        assert_return(s->type == SOURCE_IO, -EDOM);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
 
         return s->io.fd;
 }
 
-int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
-        if (!s)
-                return -EINVAL;
-        if (s->type != SOURCE_IO)
-                return -EDOM;
-        if (!events)
-                return -EINVAL;
+_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
+        assert_return(s, -EINVAL);
+        assert_return(events, -EINVAL);
+        assert_return(s->type == SOURCE_IO, -EDOM);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
 
         *events = s->io.events;
         return 0;
 }
 
-int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
+_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
         int r;
 
-        if (!s)
-                return -EINVAL;
-        if (!s->type != SOURCE_IO)
-                return -EDOM;
-        if (events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP))
-                return -EINVAL;
+        assert_return(s, -EINVAL);
+        assert_return(s->type == SOURCE_IO, -EDOM);
+        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP)), -EINVAL);
+        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
 
         if (s->io.events == events)
                 return 0;
 
-        if (s->mute != SD_EVENT_MUTED) {
-                r = source_io_register(s, s->io.events, events);
+        if (s->enabled != SD_EVENT_OFF) {
+                r = source_io_register(s, s->enabled, events);
                 if (r < 0)
                         return r;
         }
@@ -808,39 +1031,36 @@ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
         return 0;
 }
 
-int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
-        if (!s)
-                return -EINVAL;
-        if (s->type != SOURCE_IO)
-                return -EDOM;
-        if (!revents)
-                return -EINVAL;
-        if (!s->pending)
-                return -ENODATA;
+_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
+        assert_return(s, -EINVAL);
+        assert_return(revents, -EINVAL);
+        assert_return(s->type == SOURCE_IO, -EDOM);
+        assert_return(s->pending, -ENODATA);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
 
         *revents = s->io.revents;
         return 0;
 }
 
-int sd_event_source_get_signal(sd_event_source *s) {
-        if (!s)
-                return -EINVAL;
-        if (s->type != SOURCE_SIGNAL)
-                return -EDOM;
+_public_ int sd_event_source_get_signal(sd_event_source *s) {
+        assert_return(s, -EINVAL);
+        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
 
         return s->signal.sig;
 }
 
-int sd_event_source_get_priority(sd_event_source *s, int *priority) {
-        if (!s)
-                return -EINVAL;
+/* Stores the priority of the event source in *priority. Returns 0 on
+ * success, -EINVAL if s or priority is NULL, and -ECHILD if the event
+ * loop is used from a different process than the one that created
+ * it. The value is handed out via the out-parameter rather than the
+ * return value, since a negative priority returned directly could not
+ * be told apart from an error code. */
+_public_ int sd_event_source_get_priority(sd_event_source *s, int *priority) {
+        assert_return(s, -EINVAL);
+        assert_return(priority, -EINVAL);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
 
-        return s->priority;
+        *priority = s->priority;
+        return 0;
 }
 
-int sd_event_source_set_priority(sd_event_source *s, int priority) {
-        if (!s)
-                return -EINVAL;
+_public_ int sd_event_source_set_priority(sd_event_source *s, int priority) {
+        assert_return(s, -EINVAL);
+        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
 
         if (s->priority == priority)
                 return 0;
@@ -848,36 +1068,38 @@ int sd_event_source_set_priority(sd_event_source *s, int priority) {
         s->priority = priority;
 
         if (s->pending)
-                assert_se(prioq_reshuffle(s->event->pending, s, &s->pending_index) == 0);
+                prioq_reshuffle(s->event->pending, s, &s->pending_index);
 
         if (s->prepare)
-                assert_se(prioq_reshuffle(s->event->prepare, s, &s->prepare_index) == 0);
+                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
+
+        if (s->type == SOURCE_QUIT)
+                prioq_reshuffle(s->event->quit, s, &s->quit.prioq_index);
 
         return 0;
 }
 
-int sd_event_source_get_mute(sd_event_source *s, sd_event_mute_t *m) {
-        if (!s)
-                return -EINVAL;
-        if (!m)
-                return -EINVAL;
+_public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) { /* Stores the source's enabled state in *m; returns 0 on success */
+        assert_return(s, -EINVAL);
+        assert_return(m, -EINVAL);
+        assert_return(!event_pid_changed(s->event), -ECHILD); /* fails with -ECHILD when event_pid_changed() reports a change */
 
-        *m = s->mute;
+        *m = s->enabled; /* one of the SD_EVENT_OFF / SD_EVENT_ON / SD_EVENT_ONESHOT values accepted by sd_event_source_set_enabled() */
         return 0;
 }
 
-int sd_event_source_set_mute(sd_event_source *s, sd_event_mute_t m) {
+_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
         int r;
 
-        if (!s)
-                return -EINVAL;
-        if (m != SD_EVENT_MUTED && m != SD_EVENT_UNMUTED && !SD_EVENT_ONESHOT)
-                return -EINVAL;
+        assert_return(s, -EINVAL);
+        assert_return(m == SD_EVENT_OFF || m == SD_EVENT_ON || m == SD_EVENT_ONESHOT, -EINVAL);
+        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
 
-        if (s->mute == m)
+        if (s->enabled == m)
                 return 0;
 
-        if (m == SD_EVENT_MUTED) {
+        if (m == SD_EVENT_OFF) {
 
                 switch (s->type) {
 
@@ -886,22 +1108,24 @@ int sd_event_source_set_mute(sd_event_source *s, sd_event_mute_t m) {
                         if (r < 0)
                                 return r;
 
-                        s->mute = m;
+                        s->enabled = m;
                         break;
 
                 case SOURCE_MONOTONIC:
-                        s->mute = m;
-                        prioq_reshuffle(s->event->monotonic, s, &s->time.prioq_index);
+                        s->enabled = m;
+                        prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
+                        prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
                         break;
 
                 case SOURCE_REALTIME:
-                        s->mute = m;
-                        prioq_reshuffle(s->event->realtime, s, &s->time.prioq_index);
+                        s->enabled = m;
+                        prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
+                        prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
                         break;
 
                 case SOURCE_SIGNAL:
-                        s->mute = m;
-                        if (s->signal.sig != SIGCHLD || s->event->n_unmuted_child_sources == 0) {
+                        s->enabled = m;
+                        if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0) {
                                 assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0);
                                 event_update_signal_fd(s->event);
                         }
@@ -909,10 +1133,10 @@ int sd_event_source_set_mute(sd_event_source *s, sd_event_mute_t m) {
                         break;
 
                 case SOURCE_CHILD:
-                        s->mute = m;
+                        s->enabled = m;
 
-                        assert(s->event->n_unmuted_child_sources > 0);
-                        s->event->n_unmuted_child_sources--;
+                        assert(s->event->n_enabled_child_sources > 0);
+                        s->event->n_enabled_child_sources--;
 
                         if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) {
                                 assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0);
@@ -921,8 +1145,13 @@ int sd_event_source_set_mute(sd_event_source *s, sd_event_mute_t m) {
 
                         break;
 
+                case SOURCE_QUIT:
+                        s->enabled = m;
+                        prioq_reshuffle(s->event->quit, s, &s->quit.prioq_index);
+                        break;
+
                 case SOURCE_DEFER:
-                        s->mute = m;
+                        s->enabled = m;
                         break;
                 }
 
@@ -934,33 +1163,35 @@ int sd_event_source_set_mute(sd_event_source *s, sd_event_mute_t m) {
                         if (r < 0)
                                 return r;
 
-                        s->mute = m;
+                        s->enabled = m;
                         break;
 
                 case SOURCE_MONOTONIC:
-                        s->mute = m;
-                        prioq_reshuffle(s->event->monotonic, s, &s->time.prioq_index);
+                        s->enabled = m;
+                        prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
+                        prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
                         break;
 
                 case SOURCE_REALTIME:
-                        s->mute = m;
-                        prioq_reshuffle(s->event->realtime, s, &s->time.prioq_index);
+                        s->enabled = m;
+                        prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
+                        prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
                         break;
 
                 case SOURCE_SIGNAL:
-                        s->mute = m;
+                        s->enabled = m;
 
-                        if (s->signal.sig != SIGCHLD || s->event->n_unmuted_child_sources == 0)  {
+                        if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0)  {
                                 assert_se(sigaddset(&s->event->sigset, s->signal.sig) == 0);
                                 event_update_signal_fd(s->event);
                         }
                         break;
 
                 case SOURCE_CHILD:
-                        s->mute = m;
+                        s->enabled = m;
 
-                        if (s->mute == SD_EVENT_MUTED) {
-                                s->event->n_unmuted_child_sources++;
+                        if (s->enabled == SD_EVENT_OFF) {
+                                s->event->n_enabled_child_sources++;
 
                                 if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) {
                                         assert_se(sigaddset(&s->event->sigset, SIGCHLD) == 0);
@@ -969,8 +1200,13 @@ int sd_event_source_set_mute(sd_event_source *s, sd_event_mute_t m) {
                         }
                         break;
 
+                case SOURCE_QUIT:
+                        s->enabled = m;
+                        prioq_reshuffle(s->event->quit, s, &s->quit.prioq_index);
+                        break;
+
                 case SOURCE_DEFER:
-                        s->mute = m;
+                        s->enabled = m;
                         break;
                 }
         }
@@ -984,42 +1220,90 @@ int sd_event_source_set_mute(sd_event_source *s, sd_event_mute_t m) {
         return 0;
 }
 
-int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
-        if (!s)
-                return -EINVAL;
-        if (!usec)
-                return -EINVAL;
-        if (s->type != SOURCE_REALTIME && s->type != SOURCE_MONOTONIC)
-                return -EDOM;
+_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) { /* Stores the elapse time (s->time.next) of a timer source in *usec; returns 0 on success */
+        assert_return(s, -EINVAL);
+        assert_return(usec, -EINVAL);
+        assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM); /* only the two timer source types have a time member */
+        assert_return(!event_pid_changed(s->event), -ECHILD); /* fails with -ECHILD when event_pid_changed() reports a change */
 
         *usec = s->time.next;
         return 0;
 }
 
-int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
-        if (!s)
-                return -EINVAL;
-        if (s->type != SOURCE_REALTIME && s->type != SOURCE_MONOTONIC)
-                return -EDOM;
+_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) { /* Changes the elapse time of a timer source and re-sorts it in both timer queues; returns 0 on success */
+        assert_return(s, -EINVAL);
+        assert_return(usec != (uint64_t) -1, -EINVAL); /* (usec_t) -1 is what event_arm_timer() stores to mean "timer disarmed", so it is not accepted as a deadline */
+        assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
+        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE); /* no changes once the loop has finished */
+        assert_return(!event_pid_changed(s->event), -ECHILD);
 
         if (s->time.next == usec)
                 return 0;
 
         s->time.next = usec;
+        source_set_pending(s, false); /* re-arming drops any pending state left over from the previous deadline */
+
+        if (s->type == SOURCE_REALTIME) { /* the deadline changed, so the source's position in both the earliest and the latest queue is re-evaluated */
+                prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
+                prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
+        } else {
+                prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
+                prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
+        }
+
+        return 0;
+}
+
+_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) { /* Stores the accuracy (s->time.accuracy) of a timer source in *usec; returns 0 on success */
+        assert_return(s, -EINVAL);
+        assert_return(usec, -EINVAL);
+        assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        *usec = s->time.accuracy;
+        return 0;
+}
+
+_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) { /* Changes the accuracy of a timer source; event_arm_timer() adds it to the deadline to get the latest acceptable wake-up */
+        assert_return(s, -EINVAL);
+        assert_return(usec != (uint64_t) -1, -EINVAL);
+        assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
+        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
+
+        if (usec == 0) /* 0 selects the default accuracy */
+                usec = DEFAULT_ACCURACY_USEC;
+
+        if (s->time.accuracy == usec)
+                return 0;
+
+        s->time.accuracy = usec;
 
         if (s->type == SOURCE_REALTIME)
-                prioq_reshuffle(s->event->realtime, s, &s->time.prioq_index);
+                prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index); /* NOTE(review): only the latest queue is reshuffled — presumably only its ordering depends on accuracy; confirm against the queue comparators */
         else
-                prioq_reshuffle(s->event->monotonic, s, &s->time.prioq_index);
+                prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
+
+        return 0;
+}
+
+_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) { /* Stores the PID held by a child source in *pid; returns 0 on success */
+        assert_return(s, -EINVAL);
+        assert_return(pid, -EINVAL);
+        assert_return(s->type == SOURCE_CHILD, -EDOM); /* the child member is only valid for SOURCE_CHILD sources */
+        assert_return(!event_pid_changed(s->event), -ECHILD);
 
+        *pid = s->child.pid;
         return 0;
 }
 
-int sd_event_source_set_prepare(sd_event_source *s, sd_prepare_handler_t callback) {
+_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_prepare_handler_t callback) {
         int r;
 
-        if (!s)
-                return -EINVAL;
+        assert_return(s, -EINVAL);
+        assert_return(s->type != SOURCE_QUIT, -EDOM);
+        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(!event_pid_changed(s->event), -ECHILD);
 
         if (s->prepare == callback)
                 return 0;
@@ -1045,47 +1329,117 @@ int sd_event_source_set_prepare(sd_event_source *s, sd_prepare_handler_t callbac
         return 0;
 }
 
-void* sd_event_source_get_userdata(sd_event_source *s) {
-        if (!s)
-                return NULL;
+_public_ void* sd_event_source_get_userdata(sd_event_source *s) { /* Returns the userdata pointer stored in the source */
+        assert_return(s, NULL); /* a NULL source yields NULL, which callers cannot tell apart from a NULL userdata pointer */
 
         return s->userdata;
 }
 
+static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) { /* Picks a wake-up time within [a, b]; the strategy is described in the comment below */
+        usec_t c;
+        assert(e);
+        assert(a <= b);
+
+        if (a <= 0) /* a == 0: hand back 0, which event_arm_timer() programs as "some time long ago" */
+                return 0;
+
+        if (b <= a + 1) /* window at most 1 usec wide: nothing to choose from */
+                return a;
+
+        /*
+          Find a good time to wake up again between times a and b. We
+          have two goals here:
+
+          a) We want to wake up as seldom as possible, hence prefer
+             later times over earlier times.
+
+          b) But if we have to wake up, then let's make sure to
+             dispatch as much as possible on the entire system.
+
+          We implement this by waking up everywhere at the same time
+          within any given second if we can, synchronised via the
+          perturbation value determined from the boot ID. If we can't,
+          then we try to find the same spot in every 250ms
+          step. Otherwise, we pick the last possible time to wake up.
+        */
+
+        c = (b / USEC_PER_SEC) * USEC_PER_SEC + e->perturb; /* start of the second containing b, plus the perturbation offset */
+        if (c >= b) { /* candidate lies at or after b: fall back to the same spot one second earlier */
+                if (_unlikely_(c < USEC_PER_SEC)) /* stepping back a full second would underflow */
+                        return b;
+
+                c -= USEC_PER_SEC;
+        }
+
+        if (c >= a) /* candidate falls inside the window */
+                return c;
+
+        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250)); /* same scheme on a 250ms grid */
+        if (c >= b) {
+                if (_unlikely_(c < USEC_PER_MSEC*250)) /* stepping back 250ms would underflow */
+                        return b;
+
+                c -= USEC_PER_MSEC*250;
+        }
+
+        if (c >= a)
+                return c;
+
+        return b; /* no grid point fits: wake up as late as allowed */
+}
+
+/*
+ * Programs timer_fd for the next wake-up required by one clock's pair of
+ * timer queues ("earliest" and "latest").
+ *
+ * *next caches the absolute time currently programmed into timer_fd, with
+ * (usec_t) -1 meaning "disarmed"; the timerfd is only touched when the
+ * desired time differs from that cached value.
+ *
+ * Returns 0 on success, or a negative errno-style code if timerfd_settime()
+ * fails. timerfd_settime() reports failure as -1 with the reason in errno,
+ * so -errno is returned rather than the raw return value.
+ */
 static int event_arm_timer(
                 sd_event *e,
                 int timer_fd,
-                Prioq *prioq,
+                Prioq *earliest,
+                Prioq *latest,
                 usec_t *next) {
 
         struct itimerspec its = {};
-        sd_event_source *s;
+        sd_event_source *a, *b;
+        usec_t t;
         int r;
 
         assert_se(e);
         assert_se(next);
 
-        s = prioq_peek(prioq);
-        if (!s || s->mute == SD_EVENT_MUTED)
+        a = prioq_peek(earliest);
+        if (!a || a->enabled == SD_EVENT_OFF) {
+
+                /* Nothing to wait for. If the timer is already disarmed we are done. */
+                if (*next == (usec_t) -1)
+                        return 0;
+
+                /* disarm: its is still all-zero here */
+                r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL);
+                if (r < 0)
+                        return -errno;
+
+                *next = (usec_t) -1;
+
                 return 0;
+        }
+
+        b = prioq_peek(latest);
+        assert_se(b && b->enabled != SD_EVENT_OFF);
 
-        if (*next == s->time.next)
+        /* Wake up somewhere between the earliest deadline and the latest
+         * acceptable time (deadline plus accuracy) of the head of "latest". */
+        t = sleep_between(e, a->time.next, b->time.next + b->time.accuracy);
+        if (*next == t)
                 return 0;
 
         assert_se(timer_fd >= 0);
 
-        if (s->time.next == 0) {
-                /* We don' want to disarm here, just mean some time looooong ago. */
+        if (t == 0) {
+                /* We don't want to disarm here (an all-zero it_value would do
+                 * that), we just mean some time looooong ago. */
                 its.it_value.tv_sec = 0;
                 its.it_value.tv_nsec = 1;
         } else
-                timespec_store(&its.it_value, s->time.next);
+                timespec_store(&its.it_value, t);
 
         r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL);
         if (r < 0)
-                return r;
+                return -errno;
 
-        *next = s->time.next;
+        *next = t;
         return 0;
 }
 
@@ -1096,26 +1450,18 @@ static int process_io(sd_event *e, sd_event_source *s, uint32_t events) {
 
         s->io.revents = events;
 
-        /*
-           If this is a oneshot event source, then we added it to the
-           epoll with EPOLLONESHOT, hence we know it's not registered
-           anymore. We can save a syscall here...
-        */
-
-        if (s->mute == SD_EVENT_ONESHOT)
-                s->io.registered = false;
-
         return source_set_pending(s, true);
 }
 
-static int flush_timer(sd_event *e, int fd, uint32_t events) {
+static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
         uint64_t x;
         ssize_t ss;
 
         assert(e);
+        assert(fd >= 0);
+        assert(next);
 
-        if (events != EPOLLIN)
-                return -EIO;
+        assert_return(events == EPOLLIN, -EIO);
 
         ss = read(fd, &x, sizeof(x));
         if (ss < 0) {
@@ -1128,20 +1474,27 @@ static int flush_timer(sd_event *e, int fd, uint32_t events) {
         if (ss != sizeof(x))
                 return -EIO;
 
+        *next = (usec_t) -1;
+
         return 0;
 }
 
-static int process_timer(sd_event *e, usec_t n, Prioq *prioq) {
+static int process_timer(
+                sd_event *e,
+                usec_t n,
+                Prioq *earliest,
+                Prioq *latest) {
+
         sd_event_source *s;
         int r;
 
         assert(e);
 
         for (;;) {
-                s = prioq_peek(prioq);
+                s = prioq_peek(earliest);
                 if (!s ||
                     s->time.next > n ||
-                    s->mute == SD_EVENT_MUTED ||
+                    s->enabled == SD_EVENT_OFF ||
                     s->pending)
                         break;
 
@@ -1149,9 +1502,8 @@ static int process_timer(sd_event *e, usec_t n, Prioq *prioq) {
                 if (r < 0)
                         return r;
 
-                r = prioq_reshuffle(prioq, s, &s->time.prioq_index);
-                if (r < 0)
-                        return r;
+                prioq_reshuffle(earliest, s, &s->time.earliest_index);
+                prioq_reshuffle(latest, s, &s->time.latest_index);
         }
 
         return 0;
@@ -1164,6 +1516,8 @@ static int process_child(sd_event *e) {
 
         assert(e);
 
+        e->need_process_child = false;
+
         /*
            So, this is ugly. We iteratively invoke waitid() with P_PID
            + WNOHANG for each PID we wait for, instead of using
@@ -1184,7 +1538,7 @@ static int process_child(sd_event *e) {
                 if (s->pending)
                         continue;
 
-                if (s->mute == SD_EVENT_MUTED)
+                if (s->enabled == SD_EVENT_OFF)
                         continue;
 
                 zero(s->child.siginfo);
@@ -1199,20 +1553,21 @@ static int process_child(sd_event *e) {
                 }
         }
 
-        e->processed_children = e->iteration;
         return 0;
 }
 
 static int process_signal(sd_event *e, uint32_t events) {
-        struct signalfd_siginfo si;
         bool read_one = false;
-        ssize_t ss;
         int r;
 
-        if (events != EPOLLIN)
-                return -EIO;
+        assert(e);
+        assert(e->signal_sources);
+
+        assert_return(events == EPOLLIN, -EIO);
 
         for (;;) {
+                struct signalfd_siginfo si;
+                ssize_t ss;
                 sd_event_source *s;
 
                 ss = read(e->signal_fd, &si, sizeof(si));
@@ -1228,17 +1583,16 @@ static int process_signal(sd_event *e, uint32_t events) {
 
                 read_one = true;
 
+                s = e->signal_sources[si.ssi_signo];
                 if (si.ssi_signo == SIGCHLD) {
                         r = process_child(e);
                         if (r < 0)
                                 return r;
-                        if (r > 0 || !e->signal_sources[si.ssi_signo])
+                        if (r > 0 || !s)
                                 continue;
-                } else {
-                        s = e->signal_sources[si.ssi_signo];
+                } else
                         if (!s)
                                 return -EIO;
-                }
 
                 s->signal.siginfo = si;
                 r = source_set_pending(s, true);
@@ -1251,21 +1605,25 @@ static int process_signal(sd_event *e, uint32_t events) {
 }
 
 static int source_dispatch(sd_event_source *s) {
-        int r;
+        int r = 0;
 
         assert(s);
-        assert(s->pending);
+        assert(s->pending || s->type == SOURCE_QUIT);
 
-        r = source_set_pending(s, false);
-        if (r < 0)
-                return r;
+        if (s->type != SOURCE_DEFER && s->type != SOURCE_QUIT) {
+                r = source_set_pending(s, false);
+                if (r < 0)
+                        return r;
+        }
 
-        if (s->mute == SD_EVENT_ONESHOT) {
-                r = sd_event_source_set_mute(s, SD_EVENT_MUTED);
+        if (s->enabled == SD_EVENT_ONESHOT) {
+                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                 if (r < 0)
                         return r;
         }
 
+        sd_event_source_ref(s);
+
         switch (s->type) {
 
         case SOURCE_IO:
@@ -1291,8 +1649,14 @@ static int source_dispatch(sd_event_source *s) {
         case SOURCE_DEFER:
                 r = s->defer.callback(s, s->userdata);
                 break;
+
+        case SOURCE_QUIT:
+                r = s->quit.callback(s, s->userdata);
+                break;
         }
 
+        sd_event_source_unref(s);
+
         return r;
 }
 
@@ -1305,7 +1669,7 @@ static int event_prepare(sd_event *e) {
                 sd_event_source *s;
 
                 s = prioq_peek(e->prepare);
-                if (!s || s->prepare_iteration == e->iteration || s->mute == SD_EVENT_MUTED)
+                if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
                         break;
 
                 s->prepare_iteration = e->iteration;
@@ -1323,118 +1687,232 @@ static int event_prepare(sd_event *e) {
         return 0;
 }
 
-int sd_event_run(sd_event *e, uint64_t timeout) {
+static int dispatch_quit(sd_event *e) { /* Dispatches the quit source at the head of e->quit, or marks the loop finished when there is none to run */
+        sd_event_source *p;
+        int r;
+
+        assert(e);
+
+        p = prioq_peek(e->quit);
+        if (!p || p->enabled == SD_EVENT_OFF) { /* no quit source, or the head of the queue is disabled: the loop is done */
+                e->state = SD_EVENT_FINISHED;
+                return 0;
+        }
+
+        sd_event_ref(e); /* keep e referenced across the callback; dropped again below */
+        e->iteration++;
+        e->state = SD_EVENT_QUITTING;
+
+        r = source_dispatch(p); /* r is the quit callback's return value, or an error from source_dispatch() itself */
+
+        e->state = SD_EVENT_PASSIVE; /* back to passive so that sd_event_run() can be entered again */
+        sd_event_unref(e);
+
+        return r;
+}
+
+static sd_event_source* event_next_pending(sd_event *e) { /* Returns the head of the pending queue if it may be dispatched, NULL otherwise */
+        sd_event_source *p;
+
+        assert(e);
+
+        p = prioq_peek(e->pending);
+        if (!p) /* pending queue is empty */
+                return NULL;
+
+        if (p->enabled == SD_EVENT_OFF) /* NOTE(review): presumably the queue orders disabled sources last, so a disabled head means nothing dispatchable — confirm in the pending comparator */
+                return NULL;
+
+        return p;
+}
+
+_public_ int sd_event_run(sd_event *e, uint64_t timeout) { /* Runs one loop iteration: prepare, wait up to timeout usec ((uint64_t) -1 = no limit), collect events, dispatch at most one pending source */
         struct epoll_event ev_queue[EPOLL_QUEUE_MAX];
         sd_event_source *p;
         int r, i, m;
-        dual_timestamp n;
 
-        if (!e)
-                return -EINVAL;
-        if (e->quit)
-                return -ESTALE;
+        assert_return(e, -EINVAL);
+        assert_return(!event_pid_changed(e), -ECHILD);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+        assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY); /* refuses to run unless the loop is passive, e.g. not from within a dispatch */
+
+        if (e->quit_requested) /* once quit has been requested, each call only runs quit handling */
+                return dispatch_quit(e);
 
+        sd_event_ref(e); /* keep e referenced for the whole iteration; dropped at finish */
         e->iteration++;
+        e->state = SD_EVENT_RUNNING;
 
         r = event_prepare(e);
         if (r < 0)
-                return r;
+                goto finish;
 
-        r = event_arm_timer(e, e->monotonic_fd, e->monotonic, &e->monotonic_next);
-        if (r < 0)
-                return r;
+        if (event_next_pending(e) || e->need_process_child) /* work is already waiting: poll without blocking */
+                timeout = 0;
 
-        r = event_arm_timer(e, e->realtime_fd, e->realtime, &e->realtime_next);
-        if (r < 0)
-                return r;
+        if (timeout > 0) { /* timers are only (re)armed when we may actually block */
+                r = event_arm_timer(e, e->monotonic_fd, e->monotonic_earliest, e->monotonic_latest, &e->monotonic_next);
+                if (r < 0)
+                        goto finish;
 
-        if (e->iteration == 1 && !hashmap_isempty(e->child_sources))
-                /* On the first iteration, there might be already some
-                 * zombies for us to care for, hence, don't wait */
-                timeout = 0;
-        else {
-                p = prioq_peek(e->pending);
-                if (p && p->mute != SD_EVENT_MUTED)
-                        timeout = 0;
+                r = event_arm_timer(e, e->realtime_fd, e->realtime_earliest, e->realtime_latest, &e->realtime_next);
+                if (r < 0)
+                        goto finish;
         }
 
-        m = epoll_wait(e->epoll_fd, ev_queue, EPOLL_QUEUE_MAX, timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
-        if (m < 0)
-                return m;
+        m = epoll_wait(e->epoll_fd, ev_queue, EPOLL_QUEUE_MAX,
+                       timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC)); /* usec rounded up to msec; NOTE(review): very large timeouts look like they can wrap in the addition or be truncated by the int cast — consider clamping */
+        if (m < 0) {
+                r = errno == EAGAIN || errno == EINTR ? 0 : -errno; /* EAGAIN and EINTR count as an uneventful iteration, not an error */
+                goto finish;
+        }
 
-        dual_timestamp_get(&n);
+        dual_timestamp_get(&e->timestamp); /* wake-up time, kept in e for the sd_event_get_now_*() accessors */
 
         for (i = 0; i < m; i++) {
 
                 if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_MONOTONIC))
-                        r = flush_timer(e, e->monotonic_fd, ev_queue[i].events);
+                        r = flush_timer(e, e->monotonic_fd, ev_queue[i].events, &e->monotonic_next); /* the cached "next" value is handed over so flush_timer() can update it */
                 else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_REALTIME))
-                        r = flush_timer(e, e->realtime_fd, ev_queue[i].events);
+                        r = flush_timer(e, e->realtime_fd, ev_queue[i].events, &e->realtime_next);
                 else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_SIGNAL))
                         r = process_signal(e, ev_queue[i].events);
                 else
                         r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
 
                 if (r < 0)
-                        return r;
+                        goto finish;
         }
 
-        r = process_timer(e, n.monotonic, e->monotonic);
+        r = process_timer(e, e->timestamp.monotonic, e->monotonic_earliest, e->monotonic_latest);
         if (r < 0)
-                return r;
+                goto finish;
 
-        r = process_timer(e, n.realtime, e->realtime);
+        r = process_timer(e, e->timestamp.realtime, e->realtime_earliest, e->realtime_latest);
         if (r < 0)
-                return r;
+                goto finish;
 
-        if (e->iteration == 1 && e->processed_children != 1) {
-                /* On the first iteration, make sure we really process
-                 * all children which might already be zombies. */
+        if (e->need_process_child) { /* flag is cleared again inside process_child() */
                 r = process_child(e);
                 if (r < 0)
-                        return r;
+                        goto finish;
         }
 
-        p = prioq_peek(e->pending);
-        if (!p || p->mute == SD_EVENT_MUTED)
-                return 0;
+        p = event_next_pending(e);
+        if (!p) { /* nothing became dispatchable in this iteration */
+                r = 0;
+                goto finish;
+        }
+
+        r = source_dispatch(p); /* exactly one source is dispatched per iteration */
 
-        return source_dispatch(p);
+finish: /* common exit: return to passive state and drop the iteration's reference */
+        e->state = SD_EVENT_PASSIVE;
+        sd_event_unref(e);
+
+        return r;
 }
 
-int sd_event_loop(sd_event *e) {
+_public_ int sd_event_loop(sd_event *e) { /* Calls sd_event_run() without a timeout until the loop reaches SD_EVENT_FINISHED or an iteration fails */
         int r;
 
-        if (!e)
-                return -EINVAL;
+        assert_return(e, -EINVAL);
+        assert_return(!event_pid_changed(e), -ECHILD);
+        assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY); /* the loop must not already be running or quitting */
 
-        while (!e->quit) {
+        sd_event_ref(e); /* keep e referenced for the whole loop; dropped at finish */
+
+        while (e->state != SD_EVENT_FINISHED) {
                 r = sd_event_run(e, (uint64_t) -1);
                 if (r < 0)
-                        return r;
+                        goto finish;
         }
 
+        r = 0; /* finished normally; non-negative results of individual iterations are not propagated */
+
+finish:
+        sd_event_unref(e);
+        return r;
+}
+
+_public_ int sd_event_get_state(sd_event *e) { /* Returns the loop's current state (e.g. SD_EVENT_PASSIVE, SD_EVENT_RUNNING, SD_EVENT_QUITTING, SD_EVENT_FINISHED), or a negative code */
+        assert_return(e, -EINVAL);
+        assert_return(!event_pid_changed(e), -ECHILD); /* fails with -ECHILD when event_pid_changed() reports a change */
+
+        return e->state;
+}
+
+_public_ int sd_event_get_quit(sd_event *e) { /* Returns the quit_requested flag set by sd_event_request_quit(), or a negative code */
+        assert_return(e, -EINVAL);
+        assert_return(!event_pid_changed(e), -ECHILD); /* fails with -ECHILD when event_pid_changed() reports a change */
+
+        return e->quit_requested;
+}
+
+_public_ int sd_event_request_quit(sd_event *e) { /* Flags the loop for quitting; returns 0 on success */
+        assert_return(e, -EINVAL);
+        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); /* a finished loop cannot be asked to quit again */
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        e->quit_requested = true; /* only sets the flag; sd_event_run() checks it on its next call and switches to dispatch_quit() */
         return 0;
 }
 
-int sd_event_quit(sd_event *e) {
-        if (!e)
-                return EINVAL;
+_public_ int sd_event_get_now_realtime(sd_event *e, uint64_t *usec) { /* Stores the realtime half of the loop's cached timestamp in *usec; returns 0 on success */
+        assert_return(e, -EINVAL);
+        assert_return(usec, -EINVAL);
+        assert_return(dual_timestamp_is_set(&e->timestamp), -ENODATA); /* the timestamp is taken in sd_event_run() after epoll_wait(); unset before that */
+        assert_return(!event_pid_changed(e), -ECHILD);
 
-        return e->quit;
+        *usec = e->timestamp.realtime;
+        return 0;
 }
 
-int sd_event_request_quit(sd_event *e) {
-        if (!e)
-                return -EINVAL;
+_public_ int sd_event_get_now_monotonic(sd_event *e, uint64_t *usec) { /* Stores the monotonic half of the loop's cached timestamp in *usec; returns 0 on success */
+        assert_return(e, -EINVAL);
+        assert_return(usec, -EINVAL);
+        assert_return(dual_timestamp_is_set(&e->timestamp), -ENODATA); /* the timestamp is taken in sd_event_run() after epoll_wait(); unset before that */
+        assert_return(!event_pid_changed(e), -ECHILD);
 
-        e->quit = true;
+        *usec = e->timestamp.monotonic;
         return 0;
 }
 
-sd_event *sd_event_get(sd_event_source *s) {
-        if (!s)
-                return NULL;
+_public_ int sd_event_default(sd_event **ret) { /* Hands out the calling thread's default event loop, creating it on first use */
 
-        return s->event;
+        static __thread sd_event *default_event = NULL; /* one default loop per thread */
+        sd_event *e;
+        int r;
+
+        if (!ret) /* query mode: 1 if this thread already has a default loop, 0 if not */
+                return !!default_event;
+
+        if (default_event) { /* existing loop: return a new reference and 0 */
+                *ret = sd_event_ref(default_event);
+                return 0;
+        }
+
+        r = sd_event_new(&e);
+        if (r < 0)
+                return r;
+
+        e->default_event_ptr = &default_event; /* NOTE(review): presumably lets the loop reset the thread-local pointer when it is freed — confirm in the free path */
+        e->tid = gettid(); /* records the creating thread; reported by sd_event_get_tid() */
+        default_event = e;
+
+        *ret = e;
+        return 1; /* 1 = a new loop was created; the reference from sd_event_new() goes to the caller */
+}
+
+_public_ int sd_event_get_tid(sd_event *e, pid_t *tid) { /* Stores the thread ID recorded in the loop in *tid; returns 0 on success */
+        assert_return(e, -EINVAL);
+        assert_return(tid, -EINVAL);
+        assert_return(!event_pid_changed(e), -ECHILD);
+
+        if (e->tid != 0) { /* tid is set by sd_event_default(); 0 means no thread was recorded */
+                *tid = e->tid;
+                return 0;
+        }
+
+        return -ENXIO; /* no thread ID recorded for this loop */
 }