chiark / gitweb /
service: introduce WatchdogSec and hook up the watchdog with the existing failure...
author Michael Olbrich <m.olbrich@pengutronix.de>
Wed, 8 Feb 2012 09:10:34 +0000 (10:10 +0100)
committer Lennart Poettering <lennart@poettering.net>
Wed, 8 Feb 2012 16:10:38 +0000 (17:10 +0100)
man/systemd.service.xml
src/dbus-service.c
src/load-fragment-gperf.gperf.m4
src/service.c
src/service.h

index 0baddd1..0b5edb8 100644 (file)
                         </varlistentry>
 
                         <varlistentry>
+                                <term><varname>WatchdogSec=</varname></term>
+                                <listitem><para>Configures the watchdog
+                                timeout for a service. This is activated
+                                when the start-up is completed. The service
+                                must call
+                                <citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>
+                                regularly with "WATCHDOG=1". If the time
+                                between two such calls is larger than
+                                the configured time then the service
+                                enters a failure state. By setting
+                                <varname>Restart=</varname>
+                                to <option>on-failure</option> or
+                                <option>always</option> the service can
+                                be restarted. Defaults to 0s, which
+                                disables this feature.</para></listitem>
+                        </varlistentry>
+
+                        <varlistentry>
                                 <term><varname>Restart=</varname></term>
                                 <listitem><para>Configures whether the
                                 main service process shall be
index 738dc7b..fedfc1d 100644 (file)
@@ -43,6 +43,7 @@
         "  <property name=\"NotifyAccess\" type=\"s\" access=\"read\"/>\n" \
         "  <property name=\"RestartUSec\" type=\"t\" access=\"read\"/>\n" \
         "  <property name=\"TimeoutUSec\" type=\"t\" access=\"read\"/>\n" \
+        "  <property name=\"WatchdogUSec\" type=\"t\" access=\"read\"/>\n" \
         "  <property name=\"WatchdogTimestamp\" type=\"t\" access=\"read\"/>\n" \
         "  <property name=\"WatchdogTimestampMonotonic\" type=\"t\" access=\"read\"/>\n" \
         BUS_EXEC_COMMAND_INTERFACE("ExecStartPre")                      \
@@ -119,6 +120,7 @@ static const BusProperty bus_service_properties[] = {
         { "NotifyAccess",           bus_service_append_notify_access, "s", offsetof(Service, notify_access)                },
         { "RestartUSec",            bus_property_append_usec,         "t", offsetof(Service, restart_usec)                 },
         { "TimeoutUSec",            bus_property_append_usec,         "t", offsetof(Service, timeout_usec)                 },
+        { "WatchdogUSec",           bus_property_append_usec,         "t", offsetof(Service, watchdog_usec)                },
         { "WatchdogTimestamp",      bus_property_append_usec,         "t", offsetof(Service, watchdog_timestamp.realtime)  },
         { "WatchdogTimestampMonotonic",bus_property_append_usec,      "t", offsetof(Service, watchdog_timestamp.monotonic) },
         BUS_EXEC_COMMAND_PROPERTY("ExecStartPre",  offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]),  true ),
index 14c0606..9191f90 100644 (file)
@@ -134,6 +134,7 @@ Service.ExecStop,                config_parse_exec,                  SERVICE_EXE
 Service.ExecStopPost,            config_parse_exec,                  SERVICE_EXEC_STOP_POST,        offsetof(Service, exec_command)
 Service.RestartSec,              config_parse_usec,                  0,                             offsetof(Service, restart_usec)
 Service.TimeoutSec,              config_parse_usec,                  0,                             offsetof(Service, timeout_usec)
+Service.WatchdogSec,             config_parse_usec,                  0,                             offsetof(Service, watchdog_usec)
 Service.Type,                    config_parse_service_type,          0,                             offsetof(Service, type)
 Service.Restart,                 config_parse_service_restart,       0,                             offsetof(Service, restart)
 Service.PermissionsStartOnly,    config_parse_bool,                  0,                             offsetof(Service, permissions_start_only)
index b6bbfab..1631595 100644 (file)
@@ -112,6 +112,9 @@ static void service_init(Unit *u) {
 
         s->timeout_usec = DEFAULT_TIMEOUT_USEC;
         s->restart_usec = DEFAULT_RESTART_USEC;
+
+        s->watchdog_watch.type = WATCH_INVALID;
+
         s->timer_watch.type = WATCH_INVALID;
 #ifdef HAVE_SYSV_COMPAT
         s->sysv_start_priority = -1;
@@ -208,14 +211,39 @@ static void service_connection_unref(Service *s) {
 static void service_stop_watchdog(Service *s) {
         assert(s);
 
+        unit_unwatch_timer(UNIT(s), &s->watchdog_watch);
         s->watchdog_timestamp.realtime = 0;
         s->watchdog_timestamp.monotonic = 0;
 }
 
+static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart);
+
+static void service_handle_watchdog(Service *s) {
+        usec_t offset;
+        int r;
+
+        assert(s);
+
+        if (s->watchdog_usec == 0)
+                return;
+
+        offset = now(CLOCK_MONOTONIC) - s->watchdog_timestamp.monotonic;
+        if (offset >= s->watchdog_usec) {
+                log_error("%s watchdog timeout!", UNIT(s)->id);
+                service_enter_dead(s, SERVICE_FAILURE_WATCHDOG, true);
+                return;
+        }
+
+        r = unit_watch_timer(UNIT(s), s->watchdog_usec - offset, &s->watchdog_watch);
+        if (r < 0)
+                log_warning("%s failed to install watchdog timer: %s", UNIT(s)->id, strerror(-r));
+}
+
 static void service_reset_watchdog(Service *s) {
         assert(s);
 
         dual_timestamp_get(&s->watchdog_timestamp);
+        service_handle_watchdog(s);
 }
 
 static void service_done(Unit *u) {
@@ -259,6 +287,8 @@ static void service_done(Unit *u) {
 
         unit_ref_unset(&s->accept_socket);
 
+        service_stop_watchdog(s);
+
         unit_unwatch_timer(u, &s->timer_watch);
 }
 
@@ -1568,9 +1598,12 @@ static int service_coldplug(Unit *u) {
                                 if ((r = unit_watch_pid(UNIT(s), s->control_pid)) < 0)
                                         return r;
 
+                if (s->deserialized_state == SERVICE_START_POST ||
+                    s->deserialized_state == SERVICE_RUNNING)
+                        service_handle_watchdog(s);
+
                 service_set_state(s, s->deserialized_state);
         }
-
         return 0;
 }
 
@@ -2002,6 +2035,9 @@ static void service_enter_start_post(Service *s) {
 
         service_unwatch_control_pid(s);
 
+        if (s->watchdog_usec > 0)
+                service_reset_watchdog(s);
+
         if ((s->control_command = s->exec_command[SERVICE_EXEC_START_POST])) {
                 s->control_command_id = SERVICE_EXEC_START_POST;
 
@@ -2922,6 +2958,11 @@ static void service_timer_event(Unit *u, uint64_t elapsed, Watch* w) {
         assert(s);
         assert(elapsed == 1);
 
+        if (w == &s->watchdog_watch) {
+                service_handle_watchdog(s);
+                return;
+        }
+
         assert(w == &s->timer_watch);
 
         switch (s->state) {
@@ -3611,7 +3652,8 @@ static const char* const service_result_table[_SERVICE_RESULT_MAX] = {
         [SERVICE_FAILURE_TIMEOUT] = "timeout",
         [SERVICE_FAILURE_EXIT_CODE] = "exit-code",
         [SERVICE_FAILURE_SIGNAL] = "signal",
-        [SERVICE_FAILURE_CORE_DUMP] = "core-dump"
+        [SERVICE_FAILURE_CORE_DUMP] = "core-dump",
+        [SERVICE_FAILURE_WATCHDOG] = "watchdog"
 };
 
 DEFINE_STRING_TABLE_LOOKUP(service_result, ServiceResult);
index b1e8b90..02726ef 100644 (file)
@@ -95,6 +95,7 @@ typedef enum ServiceResult {
         SERVICE_FAILURE_EXIT_CODE,
         SERVICE_FAILURE_SIGNAL,
         SERVICE_FAILURE_CORE_DUMP,
+        SERVICE_FAILURE_WATCHDOG,
         _SERVICE_RESULT_MAX,
         _SERVICE_RESULT_INVALID = -1
 } ServiceResult;
@@ -112,6 +113,8 @@ struct Service {
         usec_t timeout_usec;
 
         dual_timestamp watchdog_timestamp;
+        usec_t watchdog_usec;
+        Watch watchdog_watch;
 
         ExecCommand* exec_command[_SERVICE_EXEC_COMMAND_MAX];
         ExecContext exec_context;