X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~ianmdlvl/git?a=blobdiff_plain;f=src%2Flibsystemd-bus%2Fsd-event.c;h=9fceb7b13edd794b99655e0d644244047678d71b;hb=cde93897cdefdd7c7f66c400a61e42ceee5f6a46;hp=ec18af43d51f385d90ccef2bcaca8cf58bb48c02;hpb=2576a19ed25dff1adc7fd2c0b874c74946fb35b4;p=elogind.git diff --git a/src/libsystemd-bus/sd-event.c b/src/libsystemd-bus/sd-event.c index ec18af43d..9fceb7b13 100644 --- a/src/libsystemd-bus/sd-event.c +++ b/src/libsystemd-bus/sd-event.c @@ -24,6 +24,7 @@ #include #include "sd-id128.h" +#include "sd-daemon.h" #include "macro.h" #include "prioq.h" #include "hashmap.h" @@ -43,7 +44,8 @@ typedef enum EventSourceType { SOURCE_SIGNAL, SOURCE_CHILD, SOURCE_DEFER, - SOURCE_QUIT + SOURCE_QUIT, + SOURCE_WATCHDOG } EventSourceType; struct sd_event_source { @@ -105,6 +107,7 @@ struct sd_event { int signal_fd; int realtime_fd; int monotonic_fd; + int watchdog_fd; Prioq *pending; Prioq *prepare; @@ -139,9 +142,12 @@ struct sd_event { bool quit_requested:1; bool need_process_child:1; + bool watchdog:1; pid_t tid; sd_event **default_event_ptr; + + usec_t watchdog_last, watchdog_period; }; static int pending_prioq_compare(const void *a, const void *b) { @@ -323,6 +329,9 @@ static void event_free(sd_event *e) { if (e->monotonic_fd >= 0) close_nointr_nofail(e->monotonic_fd); + if (e->watchdog_fd >= 0) + close_nointr_nofail(e->watchdog_fd); + prioq_free(e->pending); prioq_free(e->prepare); prioq_free(e->monotonic_earliest); @@ -348,7 +357,7 @@ _public_ int sd_event_new(sd_event** ret) { return -ENOMEM; e->n_ref = 1; - e->signal_fd = e->realtime_fd = e->monotonic_fd = e->epoll_fd = -1; + e->signal_fd = e->realtime_fd = e->monotonic_fd = e->watchdog_fd = e->epoll_fd = -1; e->realtime_next = e->monotonic_next = (usec_t) -1; e->original_pid = getpid(); @@ -384,7 +393,9 @@ _public_ sd_event* sd_event_ref(sd_event *e) { } _public_ sd_event* sd_event_unref(sd_event *e) { - assert_return(e, NULL); + + if (!e) + return NULL; assert(e->n_ref >= 1); e->n_ref--; @@ -584,7 +595,7 @@ _public_ int sd_event_add_io( assert_return(e, -EINVAL); assert_return(fd >= 0, -EINVAL); - assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP)), -EINVAL); + assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL); assert_return(callback, -EINVAL); assert_return(ret, -EINVAL); assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); @@ -640,15 +651,16 @@ static int event_setup_timer_fd( } /* When we sleep for longer, we try to realign the wakeup to - the same time wihtin each second, so that events all across - the system can be coalesced into a single CPU - wakeup. However, let's take some system-specific randomness - for this value, so that in a network of systems with synced - clocks timer events are distributed a bit. Here, we - calculate a perturbation usec offset from the boot ID. */ + the same time wihtin each minute/second/250ms, so that + events all across the system can be coalesced into a single + CPU wakeup. However, let's take some system-specific + randomness for this value, so that in a network of systems + with synced clocks timer events are distributed a + bit. Here, we calculate a perturbation usec offset from the + boot ID. */ if (sd_id128_get_boot(&bootid) >= 0) - e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_SEC; + e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE; *timer_fd = fd; return 0; @@ -972,7 +984,9 @@ _public_ sd_event_source* sd_event_source_ref(sd_event_source *s) { } _public_ sd_event_source* sd_event_source_unref(sd_event_source *s) { - assert_return(s, NULL); + + if (!s) + return NULL; assert(s->n_ref >= 1); s->n_ref--; @@ -983,7 +997,7 @@ _public_ sd_event_source* sd_event_source_unref(sd_event_source *s) { return NULL; } -_public_ sd_event *sd_event_get(sd_event_source *s) { +_public_ sd_event *sd_event_source_get_event(sd_event_source *s) { assert_return(s, NULL); return s->event; @@ -1021,7 +1035,7 @@ _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) assert_return(s, -EINVAL); assert_return(s->type == SOURCE_IO, -EDOM); - assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP)), -EINVAL); + assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL); assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE); assert_return(!event_pid_changed(s->event), -ECHILD); @@ -1363,13 +1377,24 @@ static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) { dispatch as much as possible on the entire system. We implement this by waking up everywhere at the same time - within any given second if we can, synchronised via the + within any given minute if we can, synchronised via the perturbation value determined from the boot ID. If we can't, - then we try to find the same spot in every a 250ms + then we try to find the same spot in every 1s and then 250ms step. Otherwise, we pick the last possible time to wake up. */ - c = (b / USEC_PER_SEC) * USEC_PER_SEC + e->perturb; + c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb; + if (c >= b) { + if (_unlikely_(c < USEC_PER_MINUTE)) + return b; + + c -= USEC_PER_MINUTE; + } + + if (c >= a) + return c; + + c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC); if (c >= b) { if (_unlikely_(c < USEC_PER_SEC)) return b; @@ -1406,12 +1431,15 @@ static int event_arm_timer( usec_t t; int r; - assert_se(e); - assert_se(next); + assert(e); + assert(next); a = prioq_peek(earliest); if (!a || a->enabled == SD_EVENT_OFF) { + if (timer_fd < 0) + return 0; + if (*next == (usec_t) -1) return 0; @@ -1443,7 +1471,7 @@ static int event_arm_timer( r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL); if (r < 0) - return r; + return -errno; *next = t; return 0; @@ -1465,7 +1493,6 @@ static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) { assert(e); assert(fd >= 0); - assert(next); assert_return(events == EPOLLIN, -EIO); @@ -1480,7 +1507,8 @@ static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) { if (ss != sizeof(x)) return -EIO; - *next = (usec_t) -1; + if (next) + *next = (usec_t) -1; return 0; } @@ -1536,6 +1564,10 @@ static int process_child(sd_event *e) { don't care about. Since this is O(n) this means that if you have a lot of processes you probably want to handle SIGCHLD yourself. + + We do not reap the children here (by using WNOWAIT), this + is only done after the event source is dispatched so that + the callback still sees the process as a zombie. */ HASHMAP_FOREACH(s, e->child_sources, i) { @@ -1548,11 +1580,27 @@ static int process_child(sd_event *e) { continue; zero(s->child.siginfo); - r = waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|s->child.options); + r = waitid(P_PID, s->child.pid, &s->child.siginfo, + WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options); if (r < 0) return -errno; if (s->child.siginfo.si_pid != 0) { + bool zombie = + s->child.siginfo.si_code == CLD_EXITED || + s->child.siginfo.si_code == CLD_KILLED || + s->child.siginfo.si_code == CLD_DUMPED; + + if (!zombie && (s->child.options & WEXITED)) { + /* If the child isn't dead then let's + * immediately remove the state change + * from the queue, since there's no + * benefit in leaving it queued */ + + assert(s->child.options & (WSTOPPED|WCONTINUED)); + waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED))); + } + r = source_set_pending(s, true); if (r < 0) return r; @@ -1606,7 +1654,6 @@ static int process_signal(sd_event *e, uint32_t events) { return r; } - return 0; } @@ -1648,9 +1695,21 @@ static int source_dispatch(sd_event_source *s) { r = s->signal.callback(s, &s->signal.siginfo, s->userdata); break; - case SOURCE_CHILD: + case SOURCE_CHILD: { + bool zombie; + + zombie = s->child.siginfo.si_code == CLD_EXITED || + s->child.siginfo.si_code == CLD_KILLED || + s->child.siginfo.si_code == CLD_DUMPED; + r = s->child.callback(s, &s->child.siginfo, s->userdata); + + /* Now, reap the PID for good. */ + if (zombie) + waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED); + break; + } case SOURCE_DEFER: r = s->defer.callback(s, s->userdata); @@ -1732,6 +1791,43 @@ static sd_event_source* event_next_pending(sd_event *e) { return p; } +static int arm_watchdog(sd_event *e) { + struct itimerspec its = {}; + usec_t t; + int r; + + assert(e); + assert(e->watchdog_fd >= 0); + + t = sleep_between(e, + e->watchdog_last + (e->watchdog_period / 2), + e->watchdog_last + (e->watchdog_period * 3 / 4)); + + timespec_store(&its.it_value, t); + + r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL); + if (r < 0) + return -errno; + + return 0; +} + +static int process_watchdog(sd_event *e) { + assert(e); + + if (!e->watchdog) + return 0; + + /* Don't notify watchdog too often */ + if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic) + return 0; + + sd_notify(false, "WATCHDOG=1"); + e->watchdog_last = e->timestamp.monotonic; + + return arm_watchdog(e); +} + _public_ int sd_event_run(sd_event *e, uint64_t timeout) { struct epoll_event ev_queue[EPOLL_QUEUE_MAX]; sd_event_source *p; @@ -1753,18 +1849,16 @@ _public_ int sd_event_run(sd_event *e, uint64_t timeout) { if (r < 0) goto finish; - if (event_next_pending(e) || e->need_process_child) - timeout = 0; + r = event_arm_timer(e, e->monotonic_fd, e->monotonic_earliest, e->monotonic_latest, &e->monotonic_next); + if (r < 0) + goto finish; - if (timeout > 0) { - r = event_arm_timer(e, e->monotonic_fd, e->monotonic_earliest, e->monotonic_latest, &e->monotonic_next); - if (r < 0) - goto finish; + r = event_arm_timer(e, e->realtime_fd, e->realtime_earliest, e->realtime_latest, &e->realtime_next); + if (r < 0) + goto finish; - r = event_arm_timer(e, e->realtime_fd, e->realtime_earliest, e->realtime_latest, &e->realtime_next); - if (r < 0) - goto finish; - } + if (event_next_pending(e) || e->need_process_child) + timeout = 0; m = epoll_wait(e->epoll_fd, ev_queue, EPOLL_QUEUE_MAX, timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC)); @@ -1783,6 +1877,8 @@ _public_ int sd_event_run(sd_event *e, uint64_t timeout) { r = flush_timer(e, e->realtime_fd, ev_queue[i].events, &e->realtime_next); else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_SIGNAL)) r = process_signal(e, ev_queue[i].events); + else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG)) + r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL); else r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events); @@ -1790,6 +1886,10 @@ _public_ int sd_event_run(sd_event *e, uint64_t timeout) { goto finish; } + r = process_watchdog(e); + if (r < 0) + goto finish; + r = process_timer(e, e->timestamp.monotonic, e->monotonic_earliest, e->monotonic_latest); if (r < 0) goto finish; @@ -1922,3 +2022,63 @@ _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) { return -ENXIO; } + +_public_ int sd_event_set_watchdog(sd_event *e, int b) { + int r; + + assert_return(e, -EINVAL); + + if (e->watchdog == !!b) + return e->watchdog; + + if (b) { + struct epoll_event ev = {}; + const char *env; + + env = getenv("WATCHDOG_USEC"); + if (!env) + return false; + + r = safe_atou64(env, &e->watchdog_period); + if (r < 0) + return r; + if (e->watchdog_period <= 0) + return -EIO; + + /* Issue first ping immediately */ + sd_notify(false, "WATCHDOG=1"); + e->watchdog_last = now(CLOCK_MONOTONIC); + + e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC); + if (e->watchdog_fd < 0) + return -errno; + + r = arm_watchdog(e); + if (r < 0) + goto fail; + + ev.events = EPOLLIN; + ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG); + + r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev); + if (r < 0) { + r = -errno; + goto fail; + } + + } else { + if (e->watchdog_fd >= 0) { + epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL); + close_nointr_nofail(e->watchdog_fd); + e->watchdog_fd = -1; + } + } + + e->watchdog = !!b; + return e->watchdog; + +fail: + close_nointr_nofail(e->watchdog_fd); + e->watchdog_fd = -1; + return r; +}