chiark / gitweb /
service: handle services with racy daemonization gracefully
authorMichal Schmidt <mschmidt@redhat.com>
Sat, 3 Dec 2011 01:13:30 +0000 (02:13 +0100)
committerMichal Schmidt <mschmidt@redhat.com>
Sat, 3 Dec 2011 20:50:27 +0000 (21:50 +0100)
There are a lot of forking daemons that do not exactly follow the
initialization steps as described in daemon(7). It is common that they
do not bother waiting in the parent process for the child to write the
PID file before exiting. The daemons' developers often do not perceive
this as a bug and they're unwilling to change.

Currently systemd warns about the missing PID file and falls back to
guessing the main PID. Being not quite deterministic, the guess can be
wrong with bad consequences. If the guessing is disabled, determinism is
achieved at the cost of losing the ability of noticing when the main
process of the service dies.

As long as it does not negatively affect properly written services,
systemd should strive for compatibility even with services with racy
daemonization. It is possible to provide determinism _and_ main process
supervision to them.

If the PID file is not there, rather than guessing and considering the
service running immediately after getting the SIGCHLD from the ExecStart
(or ExecStartPost) process, we can keep the service in the activating
state for a bit longer. We can use inotify to wait for the PID file to
appear. Only when it finally does appear and we read a valid PID from
it, we'll move the service to the running state. If the PID file never
appears, the usual timeout kicks in and the service fails.

src/service.c
src/service.h

index 6fc2484f1321be2cc2d274bb932ca36d12f644a3..07137d265d3fab1b05c5ebac5b9601da8fc4c0f7 100644 (file)
@@ -1302,8 +1302,8 @@ static int service_load_pid_file(Service *s, bool may_warn) {
 
         if ((r = read_one_line_file(s->pid_file, &k)) < 0) {
                 if (may_warn)
-                        log_warning("Failed to read PID file %s after %s. The service might be broken.",
-                                    s->pid_file, service_state_to_string(s->state));
+                        log_info("PID file %s not readable (yet?) after %s.",
+                                 s->pid_file, service_state_to_string(s->state));
                 return r;
         }
 
@@ -1315,8 +1315,8 @@ static int service_load_pid_file(Service *s, bool may_warn) {
 
         if (kill(pid, 0) < 0 && errno != EPERM) {
                 if (may_warn)
-                        log_warning("PID %lu read from file %s does not exist. Your service or init script might be broken.",
-                                    (unsigned long) pid, s->pid_file);
+                        log_info("PID %lu read from file %s does not exist.",
+                                 (unsigned long) pid, s->pid_file);
                 return -ESRCH;
         }
 
@@ -1328,7 +1328,8 @@ static int service_load_pid_file(Service *s, bool may_warn) {
                           (unsigned long) s->main_pid, (unsigned long) pid);
                 service_unwatch_main_pid(s);
                 s->main_pid_known = false;
-        }
+        } else
+                log_debug("Main PID loaded: %lu", (unsigned long) pid);
 
         if ((r = service_set_main_pid(s, pid)) < 0)
                 return r;
@@ -1359,6 +1360,7 @@ static int service_search_main_pid(Service *s) {
         if ((pid = cgroup_bonding_search_main_pid_list(s->meta.cgroup_bondings)) <= 0)
                 return -ENOENT;
 
+        log_debug("Main PID guessed: %lu", (unsigned long) pid);
         if ((r = service_set_main_pid(s, pid)) < 0)
                 return r;
 
@@ -1451,6 +1453,17 @@ static int service_notify_sockets_dead(Service *s) {
         return 0;
 }
 
+static void service_unwatch_pid_file(Service *s) {
+        if (!s->pid_file_pathspec)
+                return;
+
+        log_debug("Stopping watch for %s's PID file %s", s->meta.id, s->pid_file_pathspec->path);
+        pathspec_unwatch(s->pid_file_pathspec, UNIT(s));
+        pathspec_done(s->pid_file_pathspec);
+        free(s->pid_file_pathspec);
+        s->pid_file_pathspec = NULL;
+}
+
 static void service_set_state(Service *s, ServiceState state) {
         ServiceState old_state;
         assert(s);
@@ -1458,6 +1471,8 @@ static void service_set_state(Service *s, ServiceState state) {
         old_state = s->state;
         s->state = state;
 
+        service_unwatch_pid_file(s);
+
         if (state != SERVICE_START_PRE &&
             state != SERVICE_START &&
             state != SERVICE_START_POST &&
@@ -2602,6 +2617,95 @@ static bool service_check_snapshot(Unit *u) {
         return !s->got_socket_fd;
 }
 
+static int service_retry_pid_file(Service *s) {
+        int r;
+
+        assert(s->pid_file);
+        assert(s->state == SERVICE_START || s->state == SERVICE_START_POST);
+
+        r = service_load_pid_file(s, false);
+        if (r < 0)
+                return r;
+
+        service_unwatch_pid_file(s);
+
+        service_enter_running(s, true);
+        return 0;
+}
+
+static int service_watch_pid_file(Service *s) {
+        int r;
+
+        log_debug("Setting watch for %s's PID file %s", s->meta.id, s->pid_file_pathspec->path);
+        r = pathspec_watch(s->pid_file_pathspec, UNIT(s));
+        if (r < 0)
+                goto fail;
+
+        /* the pidfile might have appeared just before we set the watch */
+        service_retry_pid_file(s);
+
+        return 0;
+fail:
+        log_error("Failed to set a watch for %s's PID file %s: %s",
+                  s->meta.id, s->pid_file_pathspec->path, strerror(-r));
+        service_unwatch_pid_file(s);
+        return r;
+}
+
+static int service_demand_pid_file(Service *s) {
+        PathSpec *ps;
+
+        assert(s->pid_file);
+        assert(!s->pid_file_pathspec);
+
+        ps = new0(PathSpec, 1);
+        if (!ps)
+                return -ENOMEM;
+
+        ps->path = strdup(s->pid_file);
+        if (!ps->path) {
+                free(ps);
+                return -ENOMEM;
+        }
+
+        path_kill_slashes(ps->path);
+
+        /* PATH_CHANGED would not be enough. There are daemons (sendmail) that
+         * keep their PID file open all the time. */
+        ps->type = PATH_MODIFIED;
+        ps->inotify_fd = -1;
+
+        s->pid_file_pathspec = ps;
+
+        return service_watch_pid_file(s);
+}
+
+static void service_fd_event(Unit *u, int fd, uint32_t events, Watch *w) {
+        Service *s = SERVICE(u);
+
+        assert(s);
+        assert(fd >= 0);
+        assert(s->state == SERVICE_START || s->state == SERVICE_START_POST);
+        assert(s->pid_file_pathspec);
+        assert(pathspec_owns_inotify_fd(s->pid_file_pathspec, fd));
+
+        log_debug("inotify event for %s", u->meta.id);
+
+        if (pathspec_fd_event(s->pid_file_pathspec, events) < 0)
+                goto fail;
+
+        if (service_retry_pid_file(s) == 0)
+                return;
+
+        if (service_watch_pid_file(s) < 0)
+                goto fail;
+
+        return;
+fail:
+        service_unwatch_pid_file(s);
+        service_enter_signal(s, SERVICE_STOP_SIGTERM, false);
+}
+
 static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
         Service *s = SERVICE(u);
         bool success;
@@ -2707,7 +2811,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
                                 success = true;
                 }
 
-               log_full(success ? LOG_DEBUG : LOG_NOTICE,
+                log_full(success ? LOG_DEBUG : LOG_NOTICE,
                          "%s: control process exited, code=%s status=%i", u->meta.id, sigchld_code_to_string(code), status);
                 s->failure = s->failure || !success;
 
@@ -2742,27 +2846,41 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
                         case SERVICE_START:
                                 assert(s->type == SERVICE_FORKING);
 
-                                /* Let's try to load the pid
-                                 * file here if we can. We
-                                 * ignore the return value,
-                                 * since the PID file might
-                                 * actually be created by a
-                                 * START_POST script */
-
-                                if (success) {
-                                        service_load_pid_file(s, !s->exec_command[SERVICE_EXEC_START_POST]);
-                                        service_search_main_pid(s);
+                                if (!success) {
+                                        service_enter_signal(s, SERVICE_FINAL_SIGTERM, false);
+                                        break;
+                                }
 
-                                        service_enter_start_post(s);
+                                if (s->pid_file) {
+                                        /* Let's try to load the pid file here if we can.
+                                         * The PID file might actually be created by a START_POST
+                                         * script. In that case don't worry if the loading fails. */
+                                        bool has_start_post = !!s->exec_command[SERVICE_EXEC_START_POST];
+                                        int r = service_load_pid_file(s, !has_start_post);
+                                        if (!has_start_post && r < 0) {
+                                                r = service_demand_pid_file(s);
+                                                if (r < 0 || !cgroup_good(s))
+                                                        service_enter_signal(s, SERVICE_FINAL_SIGTERM, false);
+                                                break;
+                                        }
                                 } else
-                                        service_enter_signal(s, SERVICE_FINAL_SIGTERM, false);
+                                        service_search_main_pid(s);
 
+                                service_enter_start_post(s);
                                 break;
 
                         case SERVICE_START_POST:
                                 if (success) {
-                                        service_load_pid_file(s, true);
-                                        service_search_main_pid(s);
+                                        if (s->pid_file) {
+                                                int r = service_load_pid_file(s, true);
+                                                if (r < 0) {
+                                                        r = service_demand_pid_file(s);
+                                                        if (r < 0 || !cgroup_good(s))
+                                                                service_enter_stop(s, false);
+                                                        break;
+                                                }
+                                        } else
+                                                service_search_main_pid(s);
                                 }
 
                                 s->reload_failure = !success;
@@ -2907,6 +3025,20 @@ static void service_cgroup_notify_event(Unit *u) {
                  * except when we don't know pid which to expect the
                  * SIGCHLD for. */
 
+        case SERVICE_START:
+        case SERVICE_START_POST:
+                /* If we were hoping for the daemon to write its PID file,
+                 * we can give up now. */
+                if (s->pid_file_pathspec) {
+                        log_warning("%s never wrote its PID file. Failing.", s->meta.id);
+                        service_unwatch_pid_file(s);
+                        if (s->state == SERVICE_START)
+                                service_enter_signal(s, SERVICE_FINAL_SIGTERM, false);
+                        else
+                                service_enter_stop(s, false);
+                }
+                break;
+
         case SERVICE_RUNNING:
                 service_enter_running(s, true);
                 break;
@@ -3216,7 +3348,7 @@ static int service_enumerate(Manager *m) {
         r = 0;
 
 #ifdef TARGET_SUSE
-       sysv_facility_in_insserv_conf (m);
+        sysv_facility_in_insserv_conf (m);
 #endif
 
 finish:
@@ -3511,6 +3643,7 @@ const UnitVTable service_vtable = {
 
         .sigchld_event = service_sigchld_event,
         .timer_event = service_timer_event,
+        .fd_event = service_fd_event,
 
         .reset_failed = service_reset_failed,
 
index e28f74b8988c221a9c5eb690424ad635dd2fc2e3..15d58cc8ca984a76c32a209df67868a399f717a9 100644 (file)
@@ -86,6 +86,8 @@ typedef enum NotifyAccess {
         _NOTIFY_ACCESS_INVALID = -1
 } NotifyAccess;
 
+typedef struct PathSpec PathSpec;
+
 struct Service {
         Meta meta;
 
@@ -157,6 +159,7 @@ struct Service {
         Set *configured_sockets;
 
         Watch timer_watch;
+        PathSpec *pid_file_pathspec;
 
         NotifyAccess notify_access;
 };