From: Kay Sievers Date: Wed, 18 Jan 2012 04:06:18 +0000 (+0100) Subject: udevd: kill hanging event processes after 30 seconds X-Git-Tag: 178~9 X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~ianmdlvl/git?p=elogind.git;a=commitdiff_plain;h=e64fae5573e566ce4fd9b23c68ac8f3096603314 udevd: kill hanging event processes after 30 seconds Some broken kernel drivers load firmware synchronously in the module init path and block modprobe until the firmware request is fulfilled. The modprobe-generated firmware request is a direct child device of the device which caused modprobe to run. Child device events are blocked until the parent device is handled. This deadlocks until the kernel firmware loading timeout of 60 seconds is reached. The hanging modprobe event should now time out and allow the firmware event to run before the 60-second kernel timeout. --- diff --git a/src/udev-event.c b/src/udev-event.c index 9bdc5186d..f0b9548f3 100644 --- a/src/udev-event.c +++ b/src/udev-event.c @@ -49,7 +49,7 @@ struct udev_event *udev_event_new(struct udev_device *dev) udev_list_init(udev, &event->run_list, false); event->fd_signal = -1; event->birth_usec = now_usec(); - event->timeout_usec = 60 * 1000 * 1000; + event->timeout_usec = 30 * 1000 * 1000; dbg(event->udev, "allocated event %p\n", event); return event; } diff --git a/src/udevd.c b/src/udevd.c index 11ab19a31..77a1e7909 100644 --- a/src/udevd.c +++ b/src/udevd.c @@ -133,6 +133,7 @@ struct worker { struct udev_monitor *monitor; enum worker_state state; struct event *event; + unsigned long long event_start_usec; }; /* passed from worker to main process */ @@ -372,6 +373,7 @@ out: close(fd_inotify); close(worker_watch[WRITE_END]); udev_rules_unref(rules); + udev_builtin_exit(udev); udev_monitor_unref(worker_monitor); udev_unref(udev); udev_log_close(); @@ -389,6 +391,7 @@ out: worker->monitor = worker_monitor; worker->pid = pid; worker->state = WORKER_RUNNING; + worker->event_start_usec = now_usec(); worker->event = event; 
event->state = EVENT_RUNNING; udev_list_node_append(&worker->node, &worker_list); @@ -419,6 +422,7 @@ static void event_run(struct event *event) worker_ref(worker); worker->event = event; worker->state = WORKER_RUNNING; + worker->event_start_usec = now_usec(); event->state = EVENT_RUNNING; return; } @@ -610,9 +614,11 @@ static void worker_returned(int fd_worker) continue; /* worker returned */ - worker->event->exitcode = msg.exitcode; - event_queue_delete(worker->event, true); - worker->event = NULL; + if (worker->event) { + worker->event->exitcode = msg.exitcode; + event_queue_delete(worker->event, true); + worker->event = NULL; + } if (worker->state != WORKER_KILLED) worker->state = WORKER_IDLE; worker_unref(worker); @@ -796,7 +802,7 @@ static void handle_signal(struct udev *udev, int signo) } if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { - if (worker->event != NULL) { + if (worker->event) { err(udev, "worker [%u] failed while handling '%s'\n", pid, worker->event->devpath); worker->event->exitcode = -32; @@ -1574,25 +1580,57 @@ int main(int argc, char *argv[]) break; /* timeout at exit for workers to finish */ - timeout = 60 * 1000; - } else if (udev_list_node_is_empty(&event_list) && children > 2) { - /* set timeout to kill idle workers */ - timeout = 3 * 1000; - } else { + timeout = 30 * 1000; + } else if (udev_list_node_is_empty(&event_list) && children <= 2) { + /* we are idle */ timeout = -1; + } else { + /* kill idle or hanging workers */ + timeout = 3 * 1000; } fdcount = epoll_wait(fd_ep, ev, ARRAY_SIZE(ev), timeout); if (fdcount < 0) continue; if (fdcount == 0) { + struct udev_list_node *loop; + + /* timeout */ if (udev_exit) { - info(udev, "timeout, giving up waiting for workers to finish\n"); + err(udev, "timeout, giving up waiting for workers to finish\n"); break; } - /* timeout - kill idle workers */ - worker_kill(udev, 2); + /* kill idle workers */ + if (udev_list_node_is_empty(&event_list)) { + info(udev, "cleanup idle workers\n"); + 
worker_kill(udev, 2); + } + + /* check for hanging events */ + udev_list_node_foreach(loop, &worker_list) { + struct worker *worker = node_to_worker(loop); + + if (worker->state != WORKER_RUNNING) + continue; + + if ((now_usec() - worker->event_start_usec) > 30 * 1000 * 1000) { + err(udev, "worker [%u] timeout, kill it\n", worker->pid, + worker->event ? worker->event->devpath : ""); + kill(worker->pid, SIGKILL); + worker->state = WORKER_KILLED; + /* drop reference taken for state 'running' */ + worker_unref(worker); + if (worker->event) { + err(udev, "seq %llu '%s' killed\n", + udev_device_get_seqnum(worker->event->dev), worker->event->devpath); + worker->event->exitcode = -64; + event_queue_delete(worker->event, true); + worker->event = NULL; + } + } + } + } is_worker = is_signal = is_inotify = is_netlink = is_ctrl = false;