chiark / gitweb /
core: Implement timeout based umount/remount limit
author Kyle Walker <kwalker@redhat.com>
Wed, 13 Dec 2017 17:49:26 +0000 (12:49 -0500)
committer Sven Eden <yamakuzure@gmx.net>
Wed, 30 May 2018 05:49:36 +0000 (07:49 +0200)
Remount, and subsequent umount, attempts can hang for inaccessible network
based mount points. This can leave a system in a hard hang state that
requires a hard reset in order to recover. This change moves the remount,
and umount attempts into separate child processes. The remount and umount
operations will block for up to 90 seconds (DEFAULT_TIMEOUT_USEC). Should
those waits fail, the parent will issue a SIGKILL to the child and continue
with the shutdown efforts.

In addition, instead of only reporting some additional errors on the final
attempt, failures are reported as they occur.

src/basic/process-util.c
src/basic/process-util.h

index 611fd2339bdefd29a3443f65013f1706847b0c4c..2494ba6afca1f25e186c7e86a68635ef6bf5cbe3 100644 (file)
@@ -26,7 +26,6 @@
 #include <signal.h>
 #include <stdbool.h>
 #include <stdio.h>
-#include <stdio_ext.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/mman.h>
@@ -131,8 +130,6 @@ int get_process_cmdline(pid_t pid, size_t max_length, bool comm_fallback, char *
                 return -errno;
         }
 
-        (void) __fsetlocking(f, FSETLOCKING_BYCALLER);
-
         if (max_length == 1) {
 
                 /* If there's only room for one byte, return the empty string */
@@ -411,8 +408,6 @@ int is_kernel_thread(pid_t pid) {
                 return -errno;
         }
 
-        (void) __fsetlocking(f, FSETLOCKING_BYCALLER);
-
         count = fread(&c, 1, 1, f);
         eof = feof(f);
         fclose(f);
@@ -497,8 +492,6 @@ static int get_process_id(pid_t pid, const char *field, uid_t *uid) {
                 return -errno;
         }
 
-        (void) __fsetlocking(f, FSETLOCKING_BYCALLER);
-
         FOREACH_LINE(line, f, return -errno) {
                 char *l;
 
@@ -577,8 +570,6 @@ int get_process_environ(pid_t pid, char **env) {
                 return -errno;
         }
 
-        (void) __fsetlocking(f, FSETLOCKING_BYCALLER);
-
         while ((c = fgetc(f)) != EOF) {
                 if (!GREEDY_REALLOC(outcome, allocated, sz + 5))
                         return -ENOMEM;
@@ -714,6 +705,67 @@ int wait_for_terminate_and_warn(const char *name, pid_t pid, bool check_exit_cod
         return -EPROTO;
 }
 
+/*
+ * Wait up to 'timeout' for child 'pid' to exit: sigtimedwait() on SIGCHLD,
+ * then a non-blocking waitid(). Return values:
+ *   0          : the child exited with status zero.
+ *   -ETIMEDOUT : sigtimedwait() timed out (EAGAIN) with the child not reaped.
+ *   -EPROTO    : the child exited non-zero or did not exit normally; also
+ *                returned when the deadline check at the loop top fires.
+ *   other < 0  : unexpected sigtimedwait() failure, passed through as -errno.
+ * NOTE(review): presumably the caller must have SIGCHLD blocked for
+ * sigtimedwait() to see it -- confirm against callers.
+ */
+int wait_for_terminate_with_timeout(pid_t pid, usec_t timeout) {
+        sigset_t mask;
+        int r;
+        usec_t until;
+
+        assert_se(sigemptyset(&mask) == 0);
+        assert_se(sigaddset(&mask, SIGCHLD) == 0);
+
+        /* Drop into a sigtimedwait-based timeout, waiting for the
+         * pid to exit. */
+        until = now(CLOCK_MONOTONIC) + timeout;
+        for (;;) {
+                usec_t n;
+                siginfo_t status = {};
+                struct timespec ts;
+
+                n = now(CLOCK_MONOTONIC);
+                if (n >= until)
+                        break;
+
+                r = sigtimedwait(&mask, NULL, timespec_store(&ts, until - n)) < 0 ? -errno : 0;
+                /* Whatever woke us, poll (WNOHANG) whether the child has exited. */
+                if (waitid(P_PID, pid, &status, WEXITED|WNOHANG) == 0) {
+                        if (status.si_pid == pid) {
+                                /* This is the correct child. */
+                                if (status.si_code == CLD_EXITED)
+                                        return (status.si_status == 0) ? 0 : -EPROTO;
+                                else
+                                        return -EPROTO;
+                        }
+                }
+                /* Child not reaped; act on the sigtimedwait() result */
+                if (r < 0) {
+                        switch (r) {
+                        case -EAGAIN:
+                                /* Timed out, child is likely hung. */
+                                return -ETIMEDOUT;
+                        case -EINTR:
+                                /* Received a different signal and should retry */
+                                continue;
+                        default:
+                                /* Return any unexpected errors */
+                                return r;
+                        }
+                }
+        }
+
+        return -EPROTO;
+}
+
 #if 0 /// UNNEEDED by elogind
 void sigkill_wait(pid_t pid) {
         assert(pid > 1);
@@ -766,8 +818,6 @@ int getenv_for_pid(pid_t pid, const char *field, char **_value) {
                 return -errno;
         }
 
-        (void) __fsetlocking(f, FSETLOCKING_BYCALLER);
-
         l = strlen(field);
         r = 0;
 
index 5f6954d1c97a009d20df2e9cccdcf54580b7fd92..a84a96862dc6ef7e5a6fa38fc1d8973ac4c67ce0 100644 (file)
@@ -33,6 +33,7 @@
 #include "format-util.h"
 //#include "ioprio.h"
 #include "macro.h"
+//#include "time-util.h"
 
 #define procfs_file_alloca(pid, field)                                  \
         ({                                                              \
@@ -41,7 +42,7 @@
                 if (_pid_ == 0) {                                       \
                         _r_ = ("/proc/self/" field);                    \
                 } else {                                                \
-                        _r_ = alloca(STRLEN("/proc/") + DECIMAL_STR_MAX(pid_t) + 1 + sizeof(field)); \
+                        _r_ = alloca(strlen("/proc/") + DECIMAL_STR_MAX(pid_t) + 1 + sizeof(field)); \
                         sprintf((char*) _r_, "/proc/"PID_FMT"/" field, _pid_);                       \
                 }                                                       \
                 _r_;                                                    \
@@ -63,6 +64,7 @@ int get_process_ppid(pid_t pid, pid_t *ppid);
 
 int wait_for_terminate(pid_t pid, siginfo_t *status);
 int wait_for_terminate_and_warn(const char *name, pid_t pid, bool check_exit_code);
+int wait_for_terminate_with_timeout(pid_t pid, usec_t timeout);
 #if 0 /// UNNEEDED by elogind
 
 void sigkill_wait(pid_t pid);