chiark / gitweb /
core: send sigabrt on watchdog timeout to get the stacktrace
authorUmut Tezduyar Lindskog <umut.tezduyar@axis.com>
Tue, 28 Oct 2014 15:35:40 +0000 (16:35 +0100)
committerLennart Poettering <lennart@poettering.net>
Tue, 28 Oct 2014 16:37:39 +0000 (17:37 +0100)
if sigabrt doesn't do the job, follow regular shutdown
routine, sigterm > sigkill.

TODO
man/systemd.service.xml
src/core/busname.c
src/core/mount.c
src/core/scope.c
src/core/service.c
src/core/service.h
src/core/socket.c
src/core/swap.c
src/core/unit.c
src/core/unit.h

diff --git a/TODO b/TODO
index 10b2258..b07d664 100644 (file)
--- a/TODO
+++ b/TODO
@@ -48,8 +48,6 @@ Features:
 
 * consider showing the unit names during boot up in the status output, not just the unit descriptions
 
-* send SIGABRT when a service watchdog is triggered, by default, so that we acquire a backtrace of the hang.
-
 * dhcp: do we allow configuring dhcp routes on interfaces that are not the one we got the dhcp info from?
 
 * maybe allow timer units with an empty Units= setting, so that they
index 115d169..e563b19 100644 (file)
                                 (i.e. the "keep-alive ping"). If the time
                                 between two such calls is larger than
                                 the configured time, then the service
-                                is placed in a failed state. By
-                                setting <varname>Restart=</varname> to
+                                is placed in a failed state and it will
+                                be terminated with <varname>SIGABRT</varname>.
+                                By setting <varname>Restart=</varname> to
                                 <option>on-failure</option> or
                                 <option>always</option>, the service
                                 will be automatically restarted. The
index 22d2a6d..68cb6ca 100644 (file)
@@ -446,7 +446,7 @@ static void busname_enter_signal(BusName *n, BusNameState state, BusNameResult f
 
         r = unit_kill_context(UNIT(n),
                               &kill_context,
-                              state != BUSNAME_SIGTERM,
+                              state != BUSNAME_SIGTERM ? KILL_KILL : KILL_TERMINATE,
                               -1,
                               n->control_pid,
                               false);
index e284357..01243c3 100644 (file)
@@ -775,7 +775,8 @@ static void mount_enter_signal(Mount *m, MountState state, MountResult f) {
         r = unit_kill_context(
                         UNIT(m),
                         &m->kill_context,
-                        state != MOUNT_MOUNTING_SIGTERM && state != MOUNT_UNMOUNTING_SIGTERM && state != MOUNT_REMOUNTING_SIGTERM,
+                        (state != MOUNT_MOUNTING_SIGTERM && state != MOUNT_UNMOUNTING_SIGTERM && state != MOUNT_REMOUNTING_SIGTERM) ?
+                        KILL_KILL : KILL_TERMINATE,
                         -1,
                         m->control_pid,
                         false);
index e8f9e8d..0f7c1f9 100644 (file)
@@ -243,7 +243,7 @@ static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) {
                 r = unit_kill_context(
                                 UNIT(s),
                                 &s->kill_context,
-                                state != SCOPE_STOP_SIGTERM,
+                                state != SCOPE_STOP_SIGTERM ? KILL_KILL : KILL_TERMINATE,
                                 -1, -1, false);
                 if (r < 0)
                         goto fail;
index d160c4e..2b16778 100644 (file)
@@ -56,6 +56,7 @@ static const UnitActiveState state_translation_table[_SERVICE_STATE_MAX] = {
         [SERVICE_EXITED] = UNIT_ACTIVE,
         [SERVICE_RELOAD] = UNIT_RELOADING,
         [SERVICE_STOP] = UNIT_DEACTIVATING,
+        [SERVICE_STOP_SIGABRT] = UNIT_DEACTIVATING,
         [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
         [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
         [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
@@ -76,6 +77,7 @@ static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] =
         [SERVICE_EXITED] = UNIT_ACTIVE,
         [SERVICE_RELOAD] = UNIT_RELOADING,
         [SERVICE_STOP] = UNIT_DEACTIVATING,
+        [SERVICE_STOP_SIGABRT] = UNIT_DEACTIVATING,
         [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
         [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
         [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
@@ -663,7 +665,7 @@ static void service_set_state(Service *s, ServiceState state) {
                     SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
                     SERVICE_RELOAD,
                     SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
-                    SERVICE_STOP_POST,
+                    SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
                     SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
                     SERVICE_AUTO_RESTART))
                 s->timer_event_source = sd_event_source_unref(s->timer_event_source);
@@ -672,7 +674,7 @@ static void service_set_state(Service *s, ServiceState state) {
                     SERVICE_START, SERVICE_START_POST,
                     SERVICE_RUNNING, SERVICE_RELOAD,
                     SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
-                    SERVICE_STOP_POST,
+                    SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
                     SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
                 service_unwatch_main_pid(s);
                 s->main_command = NULL;
@@ -682,7 +684,7 @@ static void service_set_state(Service *s, ServiceState state) {
                     SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
                     SERVICE_RELOAD,
                     SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
-                    SERVICE_STOP_POST,
+                    SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
                     SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
                 service_unwatch_control_pid(s);
                 s->control_command = NULL;
@@ -696,7 +698,7 @@ static void service_set_state(Service *s, ServiceState state) {
                     SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
                     SERVICE_RUNNING, SERVICE_RELOAD,
                     SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
-                    SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL) &&
+                    SERVICE_STOP_SIGABRT, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL) &&
             !(state == SERVICE_DEAD && UNIT(s)->job)) {
                 service_close_socket_fd(s);
                 service_connection_unref(s);
@@ -750,7 +752,7 @@ static int service_coldplug(Unit *u) {
                            SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
                            SERVICE_RELOAD,
                            SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
-                           SERVICE_STOP_POST,
+                           SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
                            SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
 
                         usec_t k;
@@ -779,7 +781,7 @@ static int service_coldplug(Unit *u) {
                             SERVICE_START, SERVICE_START_POST,
                             SERVICE_RUNNING, SERVICE_RELOAD,
                             SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
-                            SERVICE_STOP_POST,
+                            SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
                             SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))) {
                         r = unit_watch_pid(UNIT(s), s->main_pid);
                         if (r < 0)
@@ -791,7 +793,7 @@ static int service_coldplug(Unit *u) {
                            SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
                            SERVICE_RELOAD,
                            SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
-                           SERVICE_STOP_POST,
+                           SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
                            SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
                         r = unit_watch_pid(UNIT(s), s->control_pid);
                         if (r < 0)
@@ -1181,7 +1183,8 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
         r = unit_kill_context(
                         UNIT(s),
                         &s->kill_context,
-                        state != SERVICE_STOP_SIGTERM && state != SERVICE_FINAL_SIGTERM,
+                        (state != SERVICE_STOP_SIGTERM && state != SERVICE_FINAL_SIGTERM && state != SERVICE_STOP_SIGABRT) ?
+                        KILL_KILL : (state == SERVICE_STOP_SIGABRT ? KILL_ABORT : KILL_TERMINATE),
                         s->main_pid,
                         s->control_pid,
                         s->main_pid_alien);
@@ -1197,7 +1200,7 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
                 }
 
                 service_set_state(s, state);
-        } else if (state == SERVICE_STOP_SIGTERM)
+        } else if (state == SERVICE_STOP_SIGTERM || state == SERVICE_STOP_SIGABRT)
                 service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_SUCCESS);
         else if (state == SERVICE_STOP_SIGKILL)
                 service_enter_stop_post(s, SERVICE_SUCCESS);
@@ -1211,7 +1214,8 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
 fail:
         log_warning_unit(UNIT(s)->id, "%s failed to kill processes: %s", UNIT(s)->id, strerror(-r));
 
-        if (state == SERVICE_STOP_SIGTERM || state == SERVICE_STOP_SIGKILL)
+        if (state == SERVICE_STOP_SIGTERM || state == SERVICE_STOP_SIGKILL ||
+            state == SERVICE_STOP_SIGABRT)
                 service_enter_stop_post(s, SERVICE_FAILURE_RESOURCES);
         else
                 service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true);
@@ -1637,6 +1641,7 @@ static int service_start(Unit *u) {
         /* We cannot fulfill this request right now, try again later
          * please! */
         if (s->state == SERVICE_STOP ||
+            s->state == SERVICE_STOP_SIGABRT ||
             s->state == SERVICE_STOP_SIGTERM ||
             s->state == SERVICE_STOP_SIGKILL ||
             s->state == SERVICE_STOP_POST ||
@@ -1695,6 +1700,7 @@ static int service_stop(Unit *u) {
 
         /* Already on it */
         if (s->state == SERVICE_STOP ||
+            s->state == SERVICE_STOP_SIGABRT ||
             s->state == SERVICE_STOP_SIGTERM ||
             s->state == SERVICE_STOP_SIGKILL ||
             s->state == SERVICE_STOP_POST ||
@@ -2126,6 +2132,7 @@ static void service_notify_cgroup_empty_event(Unit *u) {
                 service_enter_running(s, SERVICE_SUCCESS);
                 break;
 
+        case SERVICE_STOP_SIGABRT:
         case SERVICE_STOP_SIGTERM:
         case SERVICE_STOP_SIGKILL:
 
@@ -2252,6 +2259,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
                                 service_enter_running(s, f);
                                 break;
 
+                        case SERVICE_STOP_SIGABRT:
                         case SERVICE_STOP_SIGTERM:
                         case SERVICE_STOP_SIGKILL:
 
@@ -2392,6 +2400,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
                                 service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
                                 break;
 
+                        case SERVICE_STOP_SIGABRT:
                         case SERVICE_STOP_SIGTERM:
                         case SERVICE_STOP_SIGKILL:
                                 if (main_pid_good(s) <= 0)
@@ -2461,6 +2470,12 @@ static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *us
                 service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT);
                 break;
 
+        case SERVICE_STOP_SIGABRT:
+                log_warning_unit(UNIT(s)->id,
+                                 "%s stop-sigabrt timed out. Terminating.", UNIT(s)->id);
+                service_enter_signal(s, SERVICE_STOP_SIGTERM, s->result);
+                break;
+
         case SERVICE_STOP_SIGTERM:
                 if (s->kill_context.send_sigkill) {
                         log_warning_unit(UNIT(s)->id, "%s stop-sigterm timed out. Killing.", UNIT(s)->id);
@@ -2528,7 +2543,7 @@ static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void
         log_error_unit(UNIT(s)->id, "%s watchdog timeout (limit %s)!", UNIT(s)->id,
                        format_timespan(t, sizeof(t), s->watchdog_usec, 1));
 
-        service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_WATCHDOG);
+        service_enter_signal(s, SERVICE_STOP_SIGABRT, SERVICE_FAILURE_WATCHDOG);
 
         return 0;
 }
index 0db0c4d..54fbe46 100644 (file)
@@ -39,6 +39,7 @@ typedef enum ServiceState {
         SERVICE_EXITED,            /* Nothing is running anymore, but RemainAfterExit is true hence this is OK */
         SERVICE_RELOAD,
         SERVICE_STOP,              /* No STOP_PRE state, instead just register multiple STOP executables */
+        SERVICE_STOP_SIGABRT,      /* Watchdog timeout */
         SERVICE_STOP_SIGTERM,
         SERVICE_STOP_SIGKILL,
         SERVICE_STOP_POST,
index 9004cb4..6ba8338 100644 (file)
@@ -1578,7 +1578,8 @@ static void socket_enter_signal(Socket *s, SocketState state, SocketResult f) {
         r = unit_kill_context(
                         UNIT(s),
                         &s->kill_context,
-                        state != SOCKET_STOP_PRE_SIGTERM && state != SOCKET_FINAL_SIGTERM,
+                        (state != SOCKET_STOP_PRE_SIGTERM && state != SOCKET_FINAL_SIGTERM) ?
+                        KILL_KILL : KILL_TERMINATE,
                         -1,
                         s->control_pid,
                         false);
index 13e12ad..1add722 100644 (file)
@@ -687,7 +687,8 @@ static void swap_enter_signal(Swap *s, SwapState state, SwapResult f) {
         r = unit_kill_context(
                         UNIT(s),
                         &s->kill_context,
-                        state != SWAP_ACTIVATING_SIGTERM && state != SWAP_DEACTIVATING_SIGTERM,
+                        (state != SWAP_ACTIVATING_SIGTERM && state != SWAP_DEACTIVATING_SIGTERM) ?
+                        KILL_KILL : KILL_TERMINATE,
                         -1,
                         s->control_pid,
                         false);
index 489ea1e..84f210a 100644 (file)
@@ -3313,7 +3313,7 @@ int unit_make_transient(Unit *u) {
 int unit_kill_context(
                 Unit *u,
                 KillContext *c,
-                bool sigkill,
+                KillOperation k,
                 pid_t main_pid,
                 pid_t control_pid,
                 bool main_pid_alien) {
@@ -3326,7 +3326,19 @@ int unit_kill_context(
         if (c->kill_mode == KILL_NONE)
                 return 0;
 
-        sig = sigkill ? SIGKILL : c->kill_signal;
+        switch (k) {
+        case KILL_KILL:
+                sig = SIGKILL;
+                break;
+        case KILL_ABORT:
+                sig = SIGABRT;
+                break;
+        case KILL_TERMINATE:
+                sig = c->kill_signal;
+                break;
+        default:
+                assert_not_reached("KillOperation unknown");
+        }
 
         if (main_pid > 0) {
                 r = kill_and_sigcont(main_pid, sig);
@@ -3340,7 +3352,7 @@ int unit_kill_context(
                         if (!main_pid_alien)
                                 wait_for_exit = true;
 
-                        if (c->send_sighup && !sigkill)
+                        if (c->send_sighup && k != KILL_KILL)
                                 kill(main_pid, SIGHUP);
                 }
         }
@@ -3356,12 +3368,12 @@ int unit_kill_context(
                 } else {
                         wait_for_exit = true;
 
-                        if (c->send_sighup && !sigkill)
+                        if (c->send_sighup && k != KILL_KILL)
                                 kill(control_pid, SIGHUP);
                 }
         }
 
-        if ((c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && sigkill)) && u->cgroup_path) {
+        if ((c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && k == KILL_KILL)) && u->cgroup_path) {
                 _cleanup_set_free_ Set *pid_set = NULL;
 
                 /* Exclude the main/control pids from being killed via the cgroup */
@@ -3385,7 +3397,7 @@ int unit_kill_context(
 
                         /* wait_for_exit = true; */
 
-                        if (c->send_sighup && !sigkill) {
+                        if (c->send_sighup && k != KILL_KILL) {
                                 set_free(pid_set);
 
                                 pid_set = unit_pid_set(main_pid, control_pid);
index bbad546..081ab18 100644 (file)
@@ -54,6 +54,12 @@ enum UnitActiveState {
         _UNIT_ACTIVE_STATE_INVALID = -1
 };
 
+typedef enum KillOperation {
+        KILL_TERMINATE,
+        KILL_KILL,
+        KILL_ABORT,
+} KillOperation;
+
 static inline bool UNIT_IS_ACTIVE_OR_RELOADING(UnitActiveState t) {
         return t == UNIT_ACTIVE || t == UNIT_RELOADING;
 }
@@ -576,7 +582,7 @@ int unit_write_drop_in_private_format(Unit *u, UnitSetPropertiesMode mode, const
 
 int unit_remove_drop_in(Unit *u, UnitSetPropertiesMode mode, const char *name);
 
-int unit_kill_context(Unit *u, KillContext *c, bool sigkill, pid_t main_pid, pid_t control_pid, bool main_pid_alien);
+int unit_kill_context(Unit *u, KillContext *c, KillOperation k, pid_t main_pid, pid_t control_pid, bool main_pid_alien);
 
 int unit_make_transient(Unit *u);