From a911bb9ab27ac0eb3bbf4e8b4109e5da9b88eee3 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 6 Feb 2014 17:17:51 +0100 Subject: [PATCH] core: watch SIGCHLD more closely to track processes of units with no reliable cgroup empty notifier When a process dies that we can associate with a specific unit, start watching all other processes of that unit, so that we can associate those processes with the unit too. Also, for service units start doing this as soon as we get the first SIGCHLD for either control or main process, so that we can follow the processes of the service from one to the other, as long as process that remain are processes of the ones we watched that died and got reassigned to us as parent. Similar, for scope units start doing this as soon as the scope controller abandons the unit, and thus management entirely reverts to systemd. To abandon a unit introduce a new Abandon() scope unit method call. --- src/core/dbus-scope.c | 17 ++++-- src/core/manager.c | 2 +- src/core/scope.c | 98 ++++++++++++++++++++---------- src/core/scope.h | 5 +- src/core/service.c | 134 ++++++++++++++++++++++++------------------ src/core/unit.c | 124 ++++++++++++++++++++++++++++++++++---- src/core/unit.h | 9 +++ 7 files changed, 281 insertions(+), 108 deletions(-) diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c index d5a204851..c46c972c2 100644 --- a/src/core/dbus-scope.c +++ b/src/core/dbus-scope.c @@ -28,6 +28,16 @@ #include "bus-util.h" #include "bus-internal.h" +static int bus_scope_abandon(sd_bus *bus, sd_bus_message *message, void *userdata, sd_bus_error *error) { + Scope *s = userdata; + + assert(bus); + assert(message); + assert(s); + + return scope_abandon(s); +} + static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, scope_result, ScopeResult); const sd_bus_vtable bus_scope_vtable[] = { @@ -36,6 +46,7 @@ const sd_bus_vtable bus_scope_vtable[] = { SD_BUS_PROPERTY("TimeoutStopUSec", "t", bus_property_get_usec, offsetof(Scope, timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Scope, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), SD_BUS_SIGNAL("RequestStop", NULL, 0), + SD_BUS_METHOD("Abandon", NULL, NULL, bus_scope_abandon, 0), SD_BUS_VTABLE_END }; @@ -56,10 +67,6 @@ static int bus_scope_set_transient_property( unsigned n = 0; uint32_t pid; - r = set_ensure_allocated(&s->pids, trivial_hash_func, trivial_compare_func); - if (r < 0) - return r; - r = sd_bus_message_enter_container(message, 'a', "u"); if (r < 0) return r; @@ -70,7 +77,7 @@ static int bus_scope_set_transient_property( return -EINVAL; if (mode != UNIT_CHECK) { - r = set_put(s->pids, LONG_TO_PTR(pid)); + r = unit_watch_pid(UNIT(s), pid); if (r < 0 && r != -EEXIST) return r; } diff --git a/src/core/manager.c b/src/core/manager.c index 45f5f70b2..c0bcc79f0 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -1459,7 +1459,7 @@ static int manager_dispatch_sigchld(Manager *m) { log_debug_unit(u->id, "Child %lu belongs to %s", (long unsigned) si.si_pid, u->id); - hashmap_remove(m->watch_pids, LONG_TO_PTR(si.si_pid)); + unit_unwatch_pid(u, si.si_pid); UNIT_VTABLE(u)->sigchld_event(u, si.si_pid, si.si_code, si.si_status); } diff --git a/src/core/scope.c b/src/core/scope.c index 0c1d17e68..940e40dda 100644 --- a/src/core/scope.c +++ b/src/core/scope.c @@ -35,6 +35,7 @@ static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = { [SCOPE_DEAD] = UNIT_INACTIVE, [SCOPE_RUNNING] = UNIT_ACTIVE, + [SCOPE_ABANDONED] = UNIT_ACTIVE, [SCOPE_STOP_SIGTERM] = UNIT_DEACTIVATING, [SCOPE_STOP_SIGKILL] = UNIT_DEACTIVATING, [SCOPE_FAILED] = UNIT_FAILED @@ -66,9 +67,6 @@ static void scope_done(Unit *u) { free(s->controller); - set_free(s->pids); - s->pids = NULL; - s->timer_event_source = sd_event_source_unref(s->timer_event_source); } @@ -100,15 +98,14 @@ static void scope_set_state(Scope *s, ScopeState state) { old_state = s->state; s->state = state; - if (state != SCOPE_STOP_SIGTERM && - state != SCOPE_STOP_SIGKILL) + if (!IN_SET(state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL)) s->timer_event_source = sd_event_source_unref(s->timer_event_source); + if (IN_SET(state, SCOPE_DEAD, SCOPE_FAILED)) + unit_unwatch_all_pids(UNIT(s)); + if (state != old_state) - log_debug("%s changed %s -> %s", - UNIT(s)->id, - scope_state_to_string(old_state), - scope_state_to_string(state)); + log_debug("%s changed %s -> %s", UNIT(s)->id, scope_state_to_string(old_state), scope_state_to_string(state)); unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], true); } @@ -135,7 +132,7 @@ static int scope_verify(Scope *s) { if (UNIT(s)->load_state != UNIT_LOADED) return 0; - if (set_size(s->pids) <= 0 && UNIT(s)->manager->n_reloading <= 0) { + if (set_isempty(UNIT(s)->pids) && UNIT(s)->manager->n_reloading <= 0) { log_error_unit(UNIT(s)->id, "Scope %s has no PIDs. Refusing.", UNIT(s)->id); return -EINVAL; } @@ -181,12 +178,15 @@ static int scope_coldplug(Unit *u) { if (s->deserialized_state != s->state) { - if (s->deserialized_state == SCOPE_STOP_SIGKILL || s->deserialized_state == SCOPE_STOP_SIGTERM) { + if (IN_SET(s->deserialized_state, SCOPE_STOP_SIGKILL, SCOPE_STOP_SIGTERM)) { r = scope_arm_timer(s); if (r < 0) return r; } + if (!IN_SET(s->deserialized_state, SCOPE_DEAD, SCOPE_FAILED)) + unit_watch_all_pids(UNIT(s)); + scope_set_state(s, s->deserialized_state); } @@ -227,6 +227,8 @@ static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) { if (f != SCOPE_SUCCESS) s->result = f; + unit_watch_all_pids(UNIT(s)); + /* If we have a controller set let's ask the controller nicely * to terminate the scope, instead of us going directly into * SIGTERM beserk mode */ @@ -288,13 +290,10 @@ static int scope_start(Unit *u) { return r; } - r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, s->pids); + r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, UNIT(s)->pids); if (r < 0) return r; - set_free(s->pids); - s->pids = NULL; - s->result = SCOPE_SUCCESS; scope_set_state(s, SCOPE_RUNNING); @@ -305,13 +304,13 @@ static int scope_stop(Unit *u) { Scope *s = SCOPE(u); assert(s); - assert(s->state == SCOPE_RUNNING); if (s->state == SCOPE_STOP_SIGTERM || s->state == SCOPE_STOP_SIGKILL) return 0; - assert(s->state == SCOPE_RUNNING); + assert(s->state == SCOPE_RUNNING || + s->state == SCOPE_ABANDONED); scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_SUCCESS); return 0; @@ -389,8 +388,8 @@ static bool scope_check_gc(Unit *u) { /* Never clean up scopes that still have a process around, * even if the scope is formally dead. */ - if (UNIT(s)->cgroup_path) { - r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, UNIT(s)->cgroup_path, true); + if (u->cgroup_path) { + r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true); if (r <= 0) return true; } @@ -398,6 +397,32 @@ static bool scope_check_gc(Unit *u) { return false; } +static void scope_notify_cgroup_empty_event(Unit *u) { + Scope *s = SCOPE(u); + assert(u); + + log_debug_unit(u->id, "%s: cgroup is empty", u->id); + + if (IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL)) + scope_enter_dead(s, SCOPE_SUCCESS); +} + +static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) { + + /* If we get a SIGCHLD event for one of the processes we were + interested in, then we look for others to watch, under the + assumption that we'll sooner or later get a SIGCHLD for + them, as the original process we watched was probably the + parent of them, and they are hence now our children. */ + + unit_tidy_watch_pids(u, 0, 0); + unit_watch_all_pids(u); + + /* If the PID set is empty now, then let's finish this off */ + if (set_isempty(u->pids)) + scope_notify_cgroup_empty_event(u); +} + static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { Scope *s = SCOPE(userdata); @@ -429,24 +454,30 @@ static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *user return 0; } -static void scope_notify_cgroup_empty_event(Unit *u) { - Scope *s = SCOPE(u); - assert(u); +int scope_abandon(Scope *s) { + assert(s); - log_debug_unit(u->id, "%s: cgroup is empty", u->id); + if (!IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED)) + return -ESTALE; - switch (s->state) { + free(s->controller); + s->controller = NULL; - case SCOPE_RUNNING: - case SCOPE_STOP_SIGTERM: - case SCOPE_STOP_SIGKILL: - scope_enter_dead(s, SCOPE_SUCCESS); + /* The client is no longer watching the remaining processes, + * so let's step in here, under the assumption that the + * remaining processes will be sooner or later reassigned to + * us as parent. */ - break; + unit_tidy_watch_pids(UNIT(s), 0, 0); + unit_watch_all_pids(UNIT(s)); - default: - ; - } + /* If the PID set is empty now, then let's finish this off */ + if (set_isempty(UNIT(s)->pids)) + scope_notify_cgroup_empty_event(UNIT(s)); + else + scope_set_state(s, SCOPE_ABANDONED); + + return 0; } _pure_ static UnitActiveState scope_active_state(Unit *u) { @@ -464,6 +495,7 @@ _pure_ static const char *scope_sub_state_to_string(Unit *u) { static const char* const scope_state_table[_SCOPE_STATE_MAX] = { [SCOPE_DEAD] = "dead", [SCOPE_RUNNING] = "running", + [SCOPE_ABANDONED] = "abandoned", [SCOPE_STOP_SIGTERM] = "stop-sigterm", [SCOPE_STOP_SIGKILL] = "stop-sigkill", [SCOPE_FAILED] = "failed", @@ -516,6 +548,8 @@ const UnitVTable scope_vtable = { .check_gc = scope_check_gc, + .sigchld_event = scope_sigchld_event, + .reset_failed = scope_reset_failed, .notify_cgroup_empty = scope_notify_cgroup_empty_event, diff --git a/src/core/scope.h b/src/core/scope.h index 014b50c76..6c5912642 100644 --- a/src/core/scope.h +++ b/src/core/scope.h @@ -29,6 +29,7 @@ typedef struct Scope Scope; typedef enum ScopeState { SCOPE_DEAD, SCOPE_RUNNING, + SCOPE_ABANDONED, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL, SCOPE_FAILED, @@ -57,13 +58,13 @@ struct Scope { char *controller; - Set *pids; - sd_event_source *timer_event_source; }; extern const UnitVTable scope_vtable; +int scope_abandon(Scope *s); + const char* scope_state_to_string(ScopeState i) _const_; ScopeState scope_state_from_string(const char *s) _pure_; diff --git a/src/core/service.c b/src/core/service.c index 0542eae02..8c47e24bf 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -1485,6 +1485,9 @@ static void service_set_state(Service *s, ServiceState state) { s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID; } + if (IN_SET(state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART)) + unit_unwatch_all_pids(UNIT(s)); + if (!IN_SET(state, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD, @@ -1587,6 +1590,9 @@ static int service_coldplug(Unit *u) { return r; } + if (!IN_SET(s->deserialized_state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART)) + unit_watch_all_pids(UNIT(s)); + if (IN_SET(s->deserialized_state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD)) service_start_watchdog(s); @@ -1895,6 +1901,7 @@ static void service_enter_stop_post(Service *s, ServiceResult f) { s->result = f; service_unwatch_control_pid(s); + unit_watch_all_pids(UNIT(s)); s->control_command = s->exec_command[SERVICE_EXEC_STOP_POST]; if (s->control_command) { @@ -1934,6 +1941,8 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f if (f != SERVICE_SUCCESS) s->result = f; + unit_watch_all_pids(UNIT(s)); + r = unit_kill_context( UNIT(s), &s->kill_context, @@ -1983,6 +1992,7 @@ static void service_enter_stop(Service *s, ServiceResult f) { s->result = f; service_unwatch_control_pid(s); + unit_watch_all_pids(UNIT(s)); s->control_command = s->exec_command[SERVICE_EXEC_STOP]; if (s->control_command) { @@ -2865,6 +2875,62 @@ fail: return 0; } +static void service_notify_cgroup_empty_event(Unit *u) { + Service *s = SERVICE(u); + + assert(u); + + log_debug_unit(u->id, "%s: cgroup is empty", u->id); + + switch (s->state) { + + /* Waiting for SIGCHLD is usually more interesting, + * because it includes return codes/signals. Which is + * why we ignore the cgroup events for most cases, + * except when we don't know pid which to expect the + * SIGCHLD for. */ + + case SERVICE_START: + case SERVICE_START_POST: + /* If we were hoping for the daemon to write its PID file, + * we can give up now. */ + if (s->pid_file_pathspec) { + log_warning_unit(u->id, + "%s never wrote its PID file. Failing.", UNIT(s)->id); + service_unwatch_pid_file(s); + if (s->state == SERVICE_START) + service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_RESOURCES); + else + service_enter_stop(s, SERVICE_FAILURE_RESOURCES); + } + break; + + case SERVICE_RUNNING: + /* service_enter_running() will figure out what to do */ + service_enter_running(s, SERVICE_SUCCESS); + break; + + case SERVICE_STOP_SIGTERM: + case SERVICE_STOP_SIGKILL: + + if (main_pid_good(s) <= 0 && !control_pid_good(s)) + service_enter_stop_post(s, SERVICE_SUCCESS); + + break; + + case SERVICE_STOP_POST: + case SERVICE_FINAL_SIGTERM: + case SERVICE_FINAL_SIGKILL: + if (main_pid_good(s) <= 0 && !control_pid_good(s)) + service_enter_dead(s, SERVICE_SUCCESS, true); + + break; + + default: + ; + } +} + static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { Service *s = SERVICE(u); ServiceResult f; @@ -3142,6 +3208,18 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { /* Notify clients about changed exit status */ unit_add_to_dbus_queue(u); + + /* We got one SIGCHLD for the service, let's watch all + * processes that are now running of the service, and watch + * that. Among the PIDs we then watch will be children + * reassigned to us, which hopefully allows us to identify + * when all children are gone */ + unit_tidy_watch_pids(u, s->main_pid, s->control_pid); + unit_watch_all_pids(u); + + /* If the PID set is empty now, then let's finish this off */ + if (set_isempty(u->pids)) + service_notify_cgroup_empty_event(u); } static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { @@ -3254,62 +3332,6 @@ static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void return 0; } -static void service_notify_cgroup_empty_event(Unit *u) { - Service *s = SERVICE(u); - - assert(u); - - log_debug_unit(u->id, "%s: cgroup is empty", u->id); - - switch (s->state) { - - /* Waiting for SIGCHLD is usually more interesting, - * because it includes return codes/signals. Which is - * why we ignore the cgroup events for most cases, - * except when we don't know pid which to expect the - * SIGCHLD for. */ - - case SERVICE_START: - case SERVICE_START_POST: - /* If we were hoping for the daemon to write its PID file, - * we can give up now. */ - if (s->pid_file_pathspec) { - log_warning_unit(u->id, - "%s never wrote its PID file. Failing.", UNIT(s)->id); - service_unwatch_pid_file(s); - if (s->state == SERVICE_START) - service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_RESOURCES); - else - service_enter_stop(s, SERVICE_FAILURE_RESOURCES); - } - break; - - case SERVICE_RUNNING: - /* service_enter_running() will figure out what to do */ - service_enter_running(s, SERVICE_SUCCESS); - break; - - case SERVICE_STOP_SIGTERM: - case SERVICE_STOP_SIGKILL: - - if (main_pid_good(s) <= 0 && !control_pid_good(s)) - service_enter_stop_post(s, SERVICE_SUCCESS); - - break; - - case SERVICE_STOP_POST: - case SERVICE_FINAL_SIGTERM: - case SERVICE_FINAL_SIGKILL: - if (main_pid_good(s) <= 0 && !control_pid_good(s)) - service_enter_dead(s, SERVICE_SUCCESS, true); - - break; - - default: - ; - } -} - static void service_notify_message(Unit *u, pid_t pid, char **tags) { Service *s = SERVICE(u); const char *e; diff --git a/src/core/unit.c b/src/core/unit.c index 345521a71..07eedcd63 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -482,6 +482,8 @@ void unit_free(Unit *u) { set_free_free(u->names); + unit_unwatch_all_pids(u); + condition_free_list(u->conditions); unit_ref_unset(&u->slice); @@ -1697,13 +1699,25 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su } int unit_watch_pid(Unit *u, pid_t pid) { + int q, r; + assert(u); assert(pid >= 1); + r = set_ensure_allocated(&u->pids, trivial_hash_func, trivial_compare_func); + if (r < 0) + return r; + /* Watch a specific PID. We only support one unit watching * each PID for now. */ - return hashmap_put(u->manager->watch_pids, LONG_TO_PTR(pid), u); + r = set_put(u->pids, LONG_TO_PTR(pid)); + + q = hashmap_put(u->manager->watch_pids, LONG_TO_PTR(pid), u); + if (q < 0) + return q; + + return r; } void unit_unwatch_pid(Unit *u, pid_t pid) { @@ -1711,6 +1725,102 @@ void unit_unwatch_pid(Unit *u, pid_t pid) { assert(pid >= 1); hashmap_remove_value(u->manager->watch_pids, LONG_TO_PTR(pid), u); + set_remove(u->pids, LONG_TO_PTR(pid)); +} + +static int watch_pids_in_path(Unit *u, const char *path) { + _cleanup_closedir_ DIR *d = NULL; + _cleanup_fclose_ FILE *f = NULL; + int ret = 0, r; + + assert(u); + assert(path); + + /* Adds all PIDs from a specific cgroup path to the set of PIDs we watch. */ + + r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f); + if (r >= 0) { + pid_t pid; + + while ((r = cg_read_pid(f, &pid)) > 0) { + r = unit_watch_pid(u, pid); + if (r < 0 && ret >= 0) + ret = r; + } + if (r < 0 && ret >= 0) + ret = r; + + } else if (ret >= 0) + ret = r; + + r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d); + if (r >= 0) { + char *fn; + + while ((r = cg_read_subgroup(d, &fn)) > 0) { + _cleanup_free_ char *p = NULL; + + p = strjoin(path, "/", fn, NULL); + free(fn); + + if (!p) + return -ENOMEM; + + r = watch_pids_in_path(u, p); + if (r < 0 && ret >= 0) + ret = r; + } + if (r < 0 && ret >= 0) + ret = r; + + } else if (ret >= 0) + ret = r; + + return ret; +} + + +int unit_watch_all_pids(Unit *u) { + assert(u); + + if (!u->cgroup_path) + return -ENOENT; + + /* Adds all PIDs from our cgroup to the set of PIDs we watch */ + + return watch_pids_in_path(u, u->cgroup_path); +} + +void unit_unwatch_all_pids(Unit *u) { + Iterator i; + void *e; + + assert(u); + + SET_FOREACH(e, u->pids, i) + hashmap_remove_value(u->manager->watch_pids, e, u); + + set_free(u->pids); + u->pids = NULL; +} + +void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2) { + Iterator i; + void *e; + + assert(u); + + /* Cleans dead PIDs from our list */ + + SET_FOREACH(e, u->pids, i) { + pid_t pid = PTR_TO_LONG(e); + + if (pid == except1 || pid == except2) + continue; + + if (kill(pid, 0) < 0 && errno == ESRCH) + set_remove(u->pids, e); + } } bool unit_job_is_applicable(Unit *u, JobType j) { @@ -2979,17 +3089,7 @@ int unit_kill_context( log_warning_unit(u->id, "Failed to kill control group: %s", strerror(-r)); } else if (r > 0) { - /* FIXME: Now, we don't actually wait for any - * of the processes that are neither control - * nor main process. We should wait for them - * of course, but that's hard since the cgroup - * notification logic is so unreliable. It is - * not available at all in containers, and on - * the host it gets confused by - * subgroups. Hence, for now, let's not wait - * for these processes -- but when the kernel - * gets fixed we really should correct - * that. */ + wait_for_exit = true; if (c->send_sighup && !sigkill) { set_free(pid_set); diff --git a/src/core/unit.h b/src/core/unit.h index c686aecce..c104a8a9d 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -203,6 +203,11 @@ struct Unit { /* CGroup realize members queue */ LIST_FIELDS(Unit, cgroup_queue); + /* PIDs we keep an eye on. Note that a unit might have many + * more, but these are the ones we care enough about to + * process SIGCHLD for */ + Set *pids; + /* Used during GC sweeps */ unsigned gc_marker; @@ -538,6 +543,10 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su int unit_watch_pid(Unit *u, pid_t pid); void unit_unwatch_pid(Unit *u, pid_t pid); +int unit_watch_all_pids(Unit *u); +void unit_unwatch_all_pids(Unit *u); + +void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2); int unit_watch_bus_name(Unit *u, const char *name); void unit_unwatch_bus_name(Unit *u, const char *name); -- 2.30.2