chiark / gitweb /
event: hook up sd-event with the service watchdog logic
[elogind.git] / src / libsystemd-bus / sd-event.c
index eab92bded0e0ad67ae27ddf3cfd22a3edd594e4b..9fceb7b13edd794b99655e0d644244047678d71b 100644 (file)
@@ -24,6 +24,7 @@
 #include <sys/wait.h>
 
 #include "sd-id128.h"
 #include <sys/wait.h>
 
 #include "sd-id128.h"
+#include "sd-daemon.h"
 #include "macro.h"
 #include "prioq.h"
 #include "hashmap.h"
 #include "macro.h"
 #include "prioq.h"
 #include "hashmap.h"
@@ -43,7 +44,8 @@ typedef enum EventSourceType {
         SOURCE_SIGNAL,
         SOURCE_CHILD,
         SOURCE_DEFER,
         SOURCE_SIGNAL,
         SOURCE_CHILD,
         SOURCE_DEFER,
-        SOURCE_QUIT
+        SOURCE_QUIT,
+        SOURCE_WATCHDOG
 } EventSourceType;
 
 struct sd_event_source {
 } EventSourceType;
 
 struct sd_event_source {
@@ -105,6 +107,7 @@ struct sd_event {
         int signal_fd;
         int realtime_fd;
         int monotonic_fd;
         int signal_fd;
         int realtime_fd;
         int monotonic_fd;
+        int watchdog_fd;
 
         Prioq *pending;
         Prioq *prepare;
 
         Prioq *pending;
         Prioq *prepare;
@@ -139,9 +142,12 @@ struct sd_event {
 
         bool quit_requested:1;
         bool need_process_child:1;
 
         bool quit_requested:1;
         bool need_process_child:1;
+        bool watchdog:1;
 
         pid_t tid;
         sd_event **default_event_ptr;
 
         pid_t tid;
         sd_event **default_event_ptr;
+
+        usec_t watchdog_last, watchdog_period;
 };
 
 static int pending_prioq_compare(const void *a, const void *b) {
 };
 
 static int pending_prioq_compare(const void *a, const void *b) {
@@ -323,6 +329,9 @@ static void event_free(sd_event *e) {
         if (e->monotonic_fd >= 0)
                 close_nointr_nofail(e->monotonic_fd);
 
         if (e->monotonic_fd >= 0)
                 close_nointr_nofail(e->monotonic_fd);
 
+        if (e->watchdog_fd >= 0)
+                close_nointr_nofail(e->watchdog_fd);
+
         prioq_free(e->pending);
         prioq_free(e->prepare);
         prioq_free(e->monotonic_earliest);
         prioq_free(e->pending);
         prioq_free(e->prepare);
         prioq_free(e->monotonic_earliest);
@@ -348,7 +357,7 @@ _public_ int sd_event_new(sd_event** ret) {
                 return -ENOMEM;
 
         e->n_ref = 1;
                 return -ENOMEM;
 
         e->n_ref = 1;
-        e->signal_fd = e->realtime_fd = e->monotonic_fd = e->epoll_fd = -1;
+        e->signal_fd = e->realtime_fd = e->monotonic_fd = e->watchdog_fd = e->epoll_fd = -1;
         e->realtime_next = e->monotonic_next = (usec_t) -1;
         e->original_pid = getpid();
 
         e->realtime_next = e->monotonic_next = (usec_t) -1;
         e->original_pid = getpid();
 
@@ -384,7 +393,9 @@ _public_ sd_event* sd_event_ref(sd_event *e) {
 }
 
 _public_ sd_event* sd_event_unref(sd_event *e) {
 }
 
 _public_ sd_event* sd_event_unref(sd_event *e) {
-        assert_return(e, NULL);
+
+        if (!e)
+                return NULL;
 
         assert(e->n_ref >= 1);
         e->n_ref--;
 
         assert(e->n_ref >= 1);
         e->n_ref--;
@@ -543,6 +554,14 @@ static int source_set_pending(sd_event_source *s, bool b) {
         } else
                 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
 
         } else
                 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
 
+        if (s->type == SOURCE_REALTIME) {
+                prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
+                prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
+        } else if (s->type == SOURCE_MONOTONIC) {
+                prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
+                prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
+        }
+
         return 0;
 }
 
         return 0;
 }
 
@@ -576,7 +595,7 @@ _public_ int sd_event_add_io(
 
         assert_return(e, -EINVAL);
         assert_return(fd >= 0, -EINVAL);
 
         assert_return(e, -EINVAL);
         assert_return(fd >= 0, -EINVAL);
-        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP)), -EINVAL);
+        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
         assert_return(callback, -EINVAL);
         assert_return(ret, -EINVAL);
         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
         assert_return(callback, -EINVAL);
         assert_return(ret, -EINVAL);
         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
@@ -632,15 +651,16 @@ static int event_setup_timer_fd(
         }
 
         /* When we sleep for longer, we try to realign the wakeup to
         }
 
         /* When we sleep for longer, we try to realign the wakeup to
-           the same time wihtin each second, so that events all across
-           the system can be coalesced into a single CPU
-           wakeup. However, let's take some system-specific randomness
-           for this value, so that in a network of systems with synced
-           clocks timer events are distributed a bit. Here, we
-           calculate a perturbation usec offset from the boot ID. */
+           the same time wihtin each minute/second/250ms, so that
+           events all across the system can be coalesced into a single
+           CPU wakeup. However, let's take some system-specific
+           randomness for this value, so that in a network of systems
+           with synced clocks timer events are distributed a
+           bit. Here, we calculate a perturbation usec offset from the
+           boot ID. */
 
         if (sd_id128_get_boot(&bootid) >= 0)
 
         if (sd_id128_get_boot(&bootid) >= 0)
-                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_SEC;
+                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
 
         *timer_fd = fd;
         return 0;
 
         *timer_fd = fd;
         return 0;
@@ -964,7 +984,9 @@ _public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
 }
 
 _public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {
 }
 
 _public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {
-        assert_return(s, NULL);
+
+        if (!s)
+                return NULL;
 
         assert(s->n_ref >= 1);
         s->n_ref--;
 
         assert(s->n_ref >= 1);
         s->n_ref--;
@@ -975,7 +997,7 @@ _public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {
         return NULL;
 }
 
         return NULL;
 }
 
-_public_ sd_event *sd_event_get(sd_event_source *s) {
+_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
         assert_return(s, NULL);
 
         return s->event;
         assert_return(s, NULL);
 
         return s->event;
@@ -1013,7 +1035,7 @@ _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events)
 
         assert_return(s, -EINVAL);
         assert_return(s->type == SOURCE_IO, -EDOM);
 
         assert_return(s, -EINVAL);
         assert_return(s->type == SOURCE_IO, -EDOM);
-        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP)), -EINVAL);
+        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
         assert_return(!event_pid_changed(s->event), -ECHILD);
 
         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
         assert_return(!event_pid_changed(s->event), -ECHILD);
 
@@ -1027,6 +1049,7 @@ _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events)
         }
 
         s->io.events = events;
         }
 
         s->io.events = events;
+        source_set_pending(s, false);
 
         return 0;
 }
 
         return 0;
 }
@@ -1237,10 +1260,8 @@ _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
         assert_return(!event_pid_changed(s->event), -ECHILD);
 
         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
         assert_return(!event_pid_changed(s->event), -ECHILD);
 
-        if (s->time.next == usec)
-                return 0;
-
         s->time.next = usec;
         s->time.next = usec;
+
         source_set_pending(s, false);
 
         if (s->type == SOURCE_REALTIME) {
         source_set_pending(s, false);
 
         if (s->type == SOURCE_REALTIME) {
@@ -1274,11 +1295,10 @@ _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec
         if (usec == 0)
                 usec = DEFAULT_ACCURACY_USEC;
 
         if (usec == 0)
                 usec = DEFAULT_ACCURACY_USEC;
 
-        if (s->time.accuracy == usec)
-                return 0;
-
         s->time.accuracy = usec;
 
         s->time.accuracy = usec;
 
+        source_set_pending(s, false);
+
         if (s->type == SOURCE_REALTIME)
                 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
         else
         if (s->type == SOURCE_REALTIME)
                 prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
         else
@@ -1357,13 +1377,24 @@ static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
              dispatch as much as possible on the entire system.
 
           We implement this by waking up everywhere at the same time
              dispatch as much as possible on the entire system.
 
           We implement this by waking up everywhere at the same time
-          within any given second if we can, synchronised via the
+          within any given minute if we can, synchronised via the
           perturbation value determined from the boot ID. If we can't,
           perturbation value determined from the boot ID. If we can't,
-          then we try to find the same spot in every a 250ms
+          then we try to find the same spot in every 1s and then 250ms
           step. Otherwise, we pick the last possible time to wake up.
         */
 
           step. Otherwise, we pick the last possible time to wake up.
         */
 
-        c = (b / USEC_PER_SEC) * USEC_PER_SEC + e->perturb;
+        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
+        if (c >= b) {
+                if (_unlikely_(c < USEC_PER_MINUTE))
+                        return b;
+
+                c -= USEC_PER_MINUTE;
+        }
+
+        if (c >= a)
+                return c;
+
+        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
         if (c >= b) {
                 if (_unlikely_(c < USEC_PER_SEC))
                         return b;
         if (c >= b) {
                 if (_unlikely_(c < USEC_PER_SEC))
                         return b;
@@ -1400,12 +1431,15 @@ static int event_arm_timer(
         usec_t t;
         int r;
 
         usec_t t;
         int r;
 
-        assert_se(e);
-        assert_se(next);
+        assert(e);
+        assert(next);
 
         a = prioq_peek(earliest);
         if (!a || a->enabled == SD_EVENT_OFF) {
 
 
         a = prioq_peek(earliest);
         if (!a || a->enabled == SD_EVENT_OFF) {
 
+                if (timer_fd < 0)
+                        return 0;
+
                 if (*next == (usec_t) -1)
                         return 0;
 
                 if (*next == (usec_t) -1)
                         return 0;
 
@@ -1437,7 +1471,7 @@ static int event_arm_timer(
 
         r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL);
         if (r < 0)
 
         r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL);
         if (r < 0)
-                return r;
+                return -errno;
 
         *next = t;
         return 0;
 
         *next = t;
         return 0;
@@ -1459,7 +1493,6 @@ static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
 
         assert(e);
         assert(fd >= 0);
 
         assert(e);
         assert(fd >= 0);
-        assert(next);
 
         assert_return(events == EPOLLIN, -EIO);
 
 
         assert_return(events == EPOLLIN, -EIO);
 
@@ -1474,7 +1507,8 @@ static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
         if (ss != sizeof(x))
                 return -EIO;
 
         if (ss != sizeof(x))
                 return -EIO;
 
-        *next = (usec_t) -1;
+        if (next)
+                *next = (usec_t) -1;
 
         return 0;
 }
 
         return 0;
 }
@@ -1530,6 +1564,10 @@ static int process_child(sd_event *e) {
            don't care about. Since this is O(n) this means that if you
            have a lot of processes you probably want to handle SIGCHLD
            yourself.
            don't care about. Since this is O(n) this means that if you
            have a lot of processes you probably want to handle SIGCHLD
            yourself.
+
+           We do not reap the children here (by using WNOWAIT), this
+           is only done after the event source is dispatched so that
+           the callback still sees the process as a zombie.
         */
 
         HASHMAP_FOREACH(s, e->child_sources, i) {
         */
 
         HASHMAP_FOREACH(s, e->child_sources, i) {
@@ -1542,11 +1580,27 @@ static int process_child(sd_event *e) {
                         continue;
 
                 zero(s->child.siginfo);
                         continue;
 
                 zero(s->child.siginfo);
-                r = waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|s->child.options);
+                r = waitid(P_PID, s->child.pid, &s->child.siginfo,
+                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
                 if (r < 0)
                         return -errno;
 
                 if (s->child.siginfo.si_pid != 0) {
                 if (r < 0)
                         return -errno;
 
                 if (s->child.siginfo.si_pid != 0) {
+                        bool zombie =
+                                s->child.siginfo.si_code == CLD_EXITED ||
+                                s->child.siginfo.si_code == CLD_KILLED ||
+                                s->child.siginfo.si_code == CLD_DUMPED;
+
+                        if (!zombie && (s->child.options & WEXITED)) {
+                                /* If the child isn't dead then let's
+                                 * immediately remove the state change
+                                 * from the queue, since there's no
+                                 * benefit in leaving it queued */
+
+                                assert(s->child.options & (WSTOPPED|WCONTINUED));
+                                waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
+                        }
+
                         r = source_set_pending(s, true);
                         if (r < 0)
                                 return r;
                         r = source_set_pending(s, true);
                         if (r < 0)
                                 return r;
@@ -1600,7 +1654,6 @@ static int process_signal(sd_event *e, uint32_t events) {
                         return r;
         }
 
                         return r;
         }
 
-
         return 0;
 }
 
         return 0;
 }
 
@@ -1642,9 +1695,21 @@ static int source_dispatch(sd_event_source *s) {
                 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                 break;
 
                 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                 break;
 
-        case SOURCE_CHILD:
+        case SOURCE_CHILD: {
+                bool zombie;
+
+                zombie = s->child.siginfo.si_code == CLD_EXITED ||
+                         s->child.siginfo.si_code == CLD_KILLED ||
+                         s->child.siginfo.si_code == CLD_DUMPED;
+
                 r = s->child.callback(s, &s->child.siginfo, s->userdata);
                 r = s->child.callback(s, &s->child.siginfo, s->userdata);
+
+                /* Now, reap the PID for good. */
+                if (zombie)
+                        waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
+
                 break;
                 break;
+        }
 
         case SOURCE_DEFER:
                 r = s->defer.callback(s, s->userdata);
 
         case SOURCE_DEFER:
                 r = s->defer.callback(s, s->userdata);
@@ -1726,6 +1791,43 @@ static sd_event_source* event_next_pending(sd_event *e) {
         return p;
 }
 
         return p;
 }
 
+static int arm_watchdog(sd_event *e) {
+        struct itimerspec its = {};
+        usec_t t;
+        int r;
+
+        assert(e);
+        assert(e->watchdog_fd >= 0);
+
+        t = sleep_between(e,
+                          e->watchdog_last + (e->watchdog_period / 2),
+                          e->watchdog_last + (e->watchdog_period * 3 / 4));
+
+        timespec_store(&its.it_value, t);
+
+        r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
+        if (r < 0)
+                return -errno;
+
+        return 0;
+}
+
+static int process_watchdog(sd_event *e) {
+        assert(e);
+
+        if (!e->watchdog)
+                return 0;
+
+        /* Don't notify watchdog too often */
+        if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
+                return 0;
+
+        sd_notify(false, "WATCHDOG=1");
+        e->watchdog_last = e->timestamp.monotonic;
+
+        return arm_watchdog(e);
+}
+
 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
         struct epoll_event ev_queue[EPOLL_QUEUE_MAX];
         sd_event_source *p;
 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
         struct epoll_event ev_queue[EPOLL_QUEUE_MAX];
         sd_event_source *p;
@@ -1747,18 +1849,16 @@ _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
         if (r < 0)
                 goto finish;
 
         if (r < 0)
                 goto finish;
 
-        if (event_next_pending(e) || e->need_process_child)
-                timeout = 0;
+        r = event_arm_timer(e, e->monotonic_fd, e->monotonic_earliest, e->monotonic_latest, &e->monotonic_next);
+        if (r < 0)
+                goto finish;
 
 
-        if (timeout > 0) {
-                r = event_arm_timer(e, e->monotonic_fd, e->monotonic_earliest, e->monotonic_latest, &e->monotonic_next);
-                if (r < 0)
-                        goto finish;
+        r = event_arm_timer(e, e->realtime_fd, e->realtime_earliest, e->realtime_latest, &e->realtime_next);
+        if (r < 0)
+                goto finish;
 
 
-                r = event_arm_timer(e, e->realtime_fd, e->realtime_earliest, e->realtime_latest, &e->realtime_next);
-                if (r < 0)
-                        goto finish;
-        }
+        if (event_next_pending(e) || e->need_process_child)
+                timeout = 0;
 
         m = epoll_wait(e->epoll_fd, ev_queue, EPOLL_QUEUE_MAX,
                        timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
 
         m = epoll_wait(e->epoll_fd, ev_queue, EPOLL_QUEUE_MAX,
                        timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
@@ -1777,6 +1877,8 @@ _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
                         r = flush_timer(e, e->realtime_fd, ev_queue[i].events, &e->realtime_next);
                 else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_SIGNAL))
                         r = process_signal(e, ev_queue[i].events);
                         r = flush_timer(e, e->realtime_fd, ev_queue[i].events, &e->realtime_next);
                 else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_SIGNAL))
                         r = process_signal(e, ev_queue[i].events);
+                else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
+                        r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
                 else
                         r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
 
                 else
                         r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
 
@@ -1784,6 +1886,10 @@ _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
                         goto finish;
         }
 
                         goto finish;
         }
 
+        r = process_watchdog(e);
+        if (r < 0)
+                goto finish;
+
         r = process_timer(e, e->timestamp.monotonic, e->monotonic_earliest, e->monotonic_latest);
         if (r < 0)
                 goto finish;
         r = process_timer(e, e->timestamp.monotonic, e->monotonic_earliest, e->monotonic_latest);
         if (r < 0)
                 goto finish;
@@ -1916,3 +2022,63 @@ _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
 
         return -ENXIO;
 }
 
         return -ENXIO;
 }
+
+_public_ int sd_event_set_watchdog(sd_event *e, int b) {
+        int r;
+
+        assert_return(e, -EINVAL);
+
+        if (e->watchdog == !!b)
+                return e->watchdog;
+
+        if (b) {
+                struct epoll_event ev = {};
+                const char *env;
+
+                env = getenv("WATCHDOG_USEC");
+                if (!env)
+                        return false;
+
+                r = safe_atou64(env, &e->watchdog_period);
+                if (r < 0)
+                        return r;
+                if (e->watchdog_period <= 0)
+                        return -EIO;
+
+                /* Issue first ping immediately */
+                sd_notify(false, "WATCHDOG=1");
+                e->watchdog_last = now(CLOCK_MONOTONIC);
+
+                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
+                if (e->watchdog_fd < 0)
+                        return -errno;
+
+                r = arm_watchdog(e);
+                if (r < 0)
+                        goto fail;
+
+                ev.events = EPOLLIN;
+                ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG);
+
+                r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
+                if (r < 0) {
+                        r = -errno;
+                        goto fail;
+                }
+
+        } else {
+                if (e->watchdog_fd >= 0) {
+                        epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
+                        close_nointr_nofail(e->watchdog_fd);
+                        e->watchdog_fd = -1;
+                }
+        }
+
+        e->watchdog = !!b;
+        return e->watchdog;
+
+fail:
+        close_nointr_nofail(e->watchdog_fd);
+        e->watchdog_fd = -1;
+        return r;
+}