chiark / gitweb /
sd-event: minor fixups to delays profiling changes
[elogind.git] / src / libelogind / sd-event / sd-event.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2013 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/epoll.h>
23 #include <sys/timerfd.h>
24 #include <sys/wait.h>
25
26 #include "sd-daemon.h"
27 #include "sd-event.h"
28 #include "sd-id128.h"
29
30 #include "alloc-util.h"
31 #include "fd-util.h"
32 #include "hashmap.h"
33 #include "list.h"
34 #include "macro.h"
35 #include "missing.h"
36 #include "prioq.h"
37 #include "process-util.h"
38 #include "set.h"
39 #include "signal-util.h"
40 #include "string-table.h"
41 #include "string-util.h"
42 #include "time-util.h"
43 #include "util.h"
44
45 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
46
/* Discriminator for the per-type union inside sd_event_source, and the
 * index into event_source_type_table below. */
typedef enum EventSourceType {
        SOURCE_IO,
        SOURCE_TIME_REALTIME,
        SOURCE_TIME_BOOTTIME,
        SOURCE_TIME_MONOTONIC,
        SOURCE_TIME_REALTIME_ALARM,
        SOURCE_TIME_BOOTTIME_ALARM,
        SOURCE_SIGNAL,
        SOURCE_CHILD,
        SOURCE_DEFER,
        SOURCE_POST,
        SOURCE_EXIT,
        SOURCE_WATCHDOG,
        _SOURCE_EVENT_SOURCE_TYPE_MAX,
        _SOURCE_EVENT_SOURCE_TYPE_INVALID = -1
} EventSourceType;
63
64 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
65         [SOURCE_IO] = "io",
66         [SOURCE_TIME_REALTIME] = "realtime",
67         [SOURCE_TIME_BOOTTIME] = "bootime",
68         [SOURCE_TIME_MONOTONIC] = "monotonic",
69         [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
70         [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
71         [SOURCE_SIGNAL] = "signal",
72         [SOURCE_CHILD] = "child",
73         [SOURCE_DEFER] = "defer",
74         [SOURCE_POST] = "post",
75         [SOURCE_EXIT] = "exit",
76         [SOURCE_WATCHDOG] = "watchdog",
77 };
78
79 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
80
/* All objects we use in epoll events start with this value, so that
 * we know how to dispatch it */
typedef enum WakeupType {
        WAKEUP_NONE,
        WAKEUP_EVENT_SOURCE, /* epoll data points at an sd_event_source (IO sources) */
        WAKEUP_CLOCK_DATA,   /* epoll data points at a struct clock_data (timerfds) */
        WAKEUP_SIGNAL_DATA,  /* epoll data points at a struct signal_data (signalfds) */
        _WAKEUP_TYPE_MAX,
        _WAKEUP_TYPE_INVALID = -1,
} WakeupType;

/* True for the five SOURCE_TIME_* members of EventSourceType */
#define EVENT_SOURCE_IS_TIME(t) IN_SET((t), SOURCE_TIME_REALTIME, SOURCE_TIME_BOOTTIME, SOURCE_TIME_MONOTONIC, SOURCE_TIME_REALTIME_ALARM, SOURCE_TIME_BOOTTIME_ALARM)
93
/* One registered event source: an fd, timer, signal, child, defer,
 * post or exit callback attached to an event loop. */
struct sd_event_source {
        WakeupType wakeup; /* WAKEUP_EVENT_SOURCE for IO sources placed in epoll */

        unsigned n_ref;

        sd_event *event;  /* the loop we are attached to; NULL once disconnected */
        void *userdata;
        sd_event_handler_t prepare; /* optional callback run before polling */

        char *description; /* free-form name, used in log messages */

        EventSourceType type:5;
        int enabled:3;      /* SD_EVENT_OFF, SD_EVENT_ON or SD_EVENT_ONESHOT */
        bool pending:1;     /* queued in event->pending, waiting to be dispatched */
        bool dispatching:1;
        bool floating:1;    /* floating sources are owned by the loop; non-floating ones
                             * pin the loop via sd_event_ref() (see source_new()) */

        int64_t priority;   /* lower values dispatch first */
        unsigned pending_index;     /* slot in event->pending, PRIOQ_IDX_NULL if none */
        unsigned prepare_index;     /* slot in event->prepare, PRIOQ_IDX_NULL if none */
        unsigned pending_iteration; /* loop iteration in which we became pending */
        unsigned prepare_iteration; /* loop iteration in which we were last prepared */

        LIST_FIELDS(sd_event_source, sources); /* linkage in event->sources */

        /* Per-type state; the valid member is selected by 'type'. */
        union {
                struct {
                        sd_event_io_handler_t callback;
                        int fd;
                        uint32_t events;   /* epoll event mask we registered for */
                        uint32_t revents;  /* presumably the events seen on last wakeup — set outside this chunk */
                        bool registered:1; /* currently added to the epoll fd */
                } io;
                struct {
                        sd_event_time_handler_t callback;
                        usec_t next, accuracy; /* earliest dispatch time, and allowed slack past it */
                        unsigned earliest_index; /* slot in clock_data->earliest */
                        unsigned latest_index;   /* slot in clock_data->latest */
                } time;
                struct {
                        sd_event_signal_handler_t callback;
                        struct signalfd_siginfo siginfo;
                        int sig; /* signal number being watched */
                } signal;
                struct {
                        sd_event_child_handler_t callback;
                        siginfo_t siginfo;
                        pid_t pid; /* key into event->child_sources */
                        int options; /* presumably waitid() options — set outside this chunk */
                } child;
                struct {
                        sd_event_handler_t callback;
                } defer;
                struct {
                        sd_event_handler_t callback;
                } post;
                struct {
                        sd_event_handler_t callback;
                        unsigned prioq_index; /* slot in event->exit */
                } exit;
        };
};
156
/* Per-clock bookkeeping: one timerfd plus two scheduling queues. */
struct clock_data {
        WakeupType wakeup; /* always WAKEUP_CLOCK_DATA (asserted in free_clock_data()) */
        int fd;            /* timerfd for this clock, or -1 until created lazily */

        /* For all clocks we maintain two priority queues each, one
         * ordered for the earliest times the events may be
         * dispatched, and one ordered by the latest times they must
         * have been dispatched. The range between the top entries in
         * the two prioqs is the time window we can freely schedule
         * wakeups in */

        Prioq *earliest;
        Prioq *latest;
        usec_t next; /* presumably the wakeup currently programmed into the timerfd;
                      * USEC_INFINITY when unset (see sd_event_new()) */

        bool needs_rearm:1; /* set whenever the queues change, so the timerfd gets reprogrammed */
};
174
struct signal_data {
        WakeupType wakeup; /* always WAKEUP_SIGNAL_DATA */

        /* For each priority we maintain one signal fd, so that we
         * only have to dequeue a single event per priority at a
         * time. */

        int fd;           /* the signalfd, or -1 until created */
        int64_t priority; /* key in event->signal_data; also hashmap key storage */
        sigset_t sigset;  /* signals currently routed through this fd */
        sd_event_source *current; /* signal source currently being dispatched from this fd,
                                   * if any; cleared in source_set_pending() */
};
187
/* The event loop object proper. */
struct sd_event {
        unsigned n_ref;

        int epoll_fd;    /* main epoll instance; -1 until sd_event_new() creates it */
        int watchdog_fd; /* timerfd driving watchdog notifications, or -1 */

        Prioq *pending; /* pending sources, ordered by pending_prioq_compare() */
        Prioq *prepare; /* sources with a prepare callback, ordered by prepare_prioq_compare() */

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb; /* boot-id-derived offset to spread coalesced wakeups across
                         * machines; USEC_INFINITY until initialize_perturb() runs */

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources; /* indexed by PID (see PID_TO_PTR use in source_disconnect()) */
        unsigned n_enabled_child_sources;

        Set *post_sources;

        Prioq *exit; /* exit sources, ordered by exit_prioq_compare() */

        pid_t original_pid; /* PID at creation, used by event_pid_changed() to detect fork() */

        unsigned iteration; /* monotonically increasing loop iteration counter */
        dual_timestamp timestamp;
        usec_t timestamp_boottime;
        int state; /* SD_EVENT_* state machine value (e.g. SD_EVENT_FINISHED) */

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1; /* enabled via $SD_EVENT_PROFILE_DELAYS (see sd_event_new()) */

        int exit_code;

        pid_t tid; /* presumably the owning thread when used as a default loop — set outside this chunk */
        sd_event **default_event_ptr; /* per-thread default-loop variable; cleared in event_free() */

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources; /* number of entries in 'sources' */

        LIST_HEAD(sd_event_source, sources); /* all sources attached to this loop */

        usec_t last_run, last_log; /* state for the delay-profiling feature */
        unsigned delays[sizeof(usec_t) * 8]; /* log2 histogram of iteration delays (one bucket per bit) */
};
244
245 static void source_disconnect(sd_event_source *s);
246
247 static int pending_prioq_compare(const void *a, const void *b) {
248         const sd_event_source *x = a, *y = b;
249
250         assert(x->pending);
251         assert(y->pending);
252
253         /* Enabled ones first */
254         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
255                 return -1;
256         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
257                 return 1;
258
259         /* Lower priority values first */
260         if (x->priority < y->priority)
261                 return -1;
262         if (x->priority > y->priority)
263                 return 1;
264
265         /* Older entries first */
266         if (x->pending_iteration < y->pending_iteration)
267                 return -1;
268         if (x->pending_iteration > y->pending_iteration)
269                 return 1;
270
271         return 0;
272 }
273
274 static int prepare_prioq_compare(const void *a, const void *b) {
275         const sd_event_source *x = a, *y = b;
276
277         assert(x->prepare);
278         assert(y->prepare);
279
280         /* Enabled ones first */
281         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
282                 return -1;
283         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
284                 return 1;
285
286         /* Move most recently prepared ones last, so that we can stop
287          * preparing as soon as we hit one that has already been
288          * prepared in the current iteration */
289         if (x->prepare_iteration < y->prepare_iteration)
290                 return -1;
291         if (x->prepare_iteration > y->prepare_iteration)
292                 return 1;
293
294         /* Lower priority values first */
295         if (x->priority < y->priority)
296                 return -1;
297         if (x->priority > y->priority)
298                 return 1;
299
300         return 0;
301 }
302
303 static int earliest_time_prioq_compare(const void *a, const void *b) {
304         const sd_event_source *x = a, *y = b;
305
306         assert(EVENT_SOURCE_IS_TIME(x->type));
307         assert(x->type == y->type);
308
309         /* Enabled ones first */
310         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
311                 return -1;
312         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
313                 return 1;
314
315         /* Move the pending ones to the end */
316         if (!x->pending && y->pending)
317                 return -1;
318         if (x->pending && !y->pending)
319                 return 1;
320
321         /* Order by time */
322         if (x->time.next < y->time.next)
323                 return -1;
324         if (x->time.next > y->time.next)
325                 return 1;
326
327         return 0;
328 }
329
330 static int latest_time_prioq_compare(const void *a, const void *b) {
331         const sd_event_source *x = a, *y = b;
332
333         assert(EVENT_SOURCE_IS_TIME(x->type));
334         assert(x->type == y->type);
335
336         /* Enabled ones first */
337         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
338                 return -1;
339         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
340                 return 1;
341
342         /* Move the pending ones to the end */
343         if (!x->pending && y->pending)
344                 return -1;
345         if (x->pending && !y->pending)
346                 return 1;
347
348         /* Order by time */
349         if (x->time.next + x->time.accuracy < y->time.next + y->time.accuracy)
350                 return -1;
351         if (x->time.next + x->time.accuracy > y->time.next + y->time.accuracy)
352                 return 1;
353
354         return 0;
355 }
356
357 static int exit_prioq_compare(const void *a, const void *b) {
358         const sd_event_source *x = a, *y = b;
359
360         assert(x->type == SOURCE_EXIT);
361         assert(y->type == SOURCE_EXIT);
362
363         /* Enabled ones first */
364         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
365                 return -1;
366         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
367                 return 1;
368
369         /* Lower priority values first */
370         if (x->priority < y->priority)
371                 return -1;
372         if (x->priority > y->priority)
373                 return 1;
374
375         return 0;
376 }
377
378 static void free_clock_data(struct clock_data *d) {
379         assert(d);
380         assert(d->wakeup == WAKEUP_CLOCK_DATA);
381
382         safe_close(d->fd);
383         prioq_free(d->earliest);
384         prioq_free(d->latest);
385 }
386
/* Tear down an event loop once its reference count hits zero. */
static void event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        /* Drop all remaining sources. Only floating ones can still be
         * around here: non-floating sources hold a reference to the loop
         * and would have kept it alive. */
        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        /* If this was a thread's default loop, invalidate the cached pointer. */
        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);
        free(e);
}
423
/* Public API: allocate a new event loop. All fds start at -1 and all
 * clock wakeup times at USEC_INFINITY; the epoll fd and the pending
 * queue are created right away, everything else lazily. Returns 0 on
 * success, a negative errno-style error on failure. */
_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new0(sd_event, 1);
        if (!e)
                return -ENOMEM;

        e->n_ref = 1;
        e->watchdog_fd = e->epoll_fd = e->realtime.fd = e->boottime.fd = e->monotonic.fd = e->realtime_alarm.fd = e->boottime_alarm.fd = -1;
        e->realtime.next = e->boottime.next = e->monotonic.next = e->realtime_alarm.next = e->boottime_alarm.next = USEC_INFINITY;
        e->realtime.wakeup = e->boottime.wakeup = e->monotonic.wakeup = e->realtime_alarm.wakeup = e->boottime_alarm.wakeup = WAKEUP_CLOCK_DATA;
        e->original_pid = getpid(); /* for fork() detection, see event_pid_changed() */
        e->perturb = USEC_INFINITY; /* computed lazily in initialize_perturb() */

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        /* Opt-in latency profiling, toggled via environment variable. */
        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 ... 2^63 us will be logged every 5s.");
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}
463
464 _public_ sd_event* sd_event_ref(sd_event *e) {
465         assert_return(e, NULL);
466
467         assert(e->n_ref >= 1);
468         e->n_ref++;
469
470         return e;
471 }
472
473 _public_ sd_event* sd_event_unref(sd_event *e) {
474
475         if (!e)
476                 return NULL;
477
478         assert(e->n_ref >= 1);
479         e->n_ref--;
480
481         if (e->n_ref <= 0)
482                 event_free(e);
483
484         return NULL;
485 }
486
487 static bool event_pid_changed(sd_event *e) {
488         assert(e);
489
490         /* We don't support people creating an event loop and keeping
491          * it around over a fork(). Let's complain. */
492
493         return e->original_pid != getpid();
494 }
495
496 static void source_io_unregister(sd_event_source *s) {
497         int r;
498
499         assert(s);
500         assert(s->type == SOURCE_IO);
501
502         if (event_pid_changed(s->event))
503                 return;
504
505         if (!s->io.registered)
506                 return;
507
508         r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
509         if (r < 0)
510                 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m",
511                                 strna(s->description), event_source_type_to_string(s->type));
512
513         s->io.registered = false;
514 }
515
516 static int source_io_register(
517                 sd_event_source *s,
518                 int enabled,
519                 uint32_t events) {
520
521         struct epoll_event ev = {};
522         int r;
523
524         assert(s);
525         assert(s->type == SOURCE_IO);
526         assert(enabled != SD_EVENT_OFF);
527
528         ev.events = events;
529         ev.data.ptr = s;
530
531         if (enabled == SD_EVENT_ONESHOT)
532                 ev.events |= EPOLLONESHOT;
533
534         if (s->io.registered)
535                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
536         else
537                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
538         if (r < 0)
539                 return -errno;
540
541         s->io.registered = true;
542
543         return 0;
544 }
545
#if 0 /// UNNEEDED by elogind
/* Inverse of clock_to_event_source_type() below: map one of the
 * SOURCE_TIME_* types to the kernel clock id that drives it, or
 * (clockid_t) -1 for any non-time type. */
static clockid_t event_source_type_to_clock(EventSourceType t) {

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return CLOCK_REALTIME;

        case SOURCE_TIME_BOOTTIME:
                return CLOCK_BOOTTIME;

        case SOURCE_TIME_MONOTONIC:
                return CLOCK_MONOTONIC;

        case SOURCE_TIME_REALTIME_ALARM:
                return CLOCK_REALTIME_ALARM;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return CLOCK_BOOTTIME_ALARM;

        default:
                return (clockid_t) -1;
        }
}
#endif // 0
571
572 static EventSourceType clock_to_event_source_type(clockid_t clock) {
573
574         switch (clock) {
575
576         case CLOCK_REALTIME:
577                 return SOURCE_TIME_REALTIME;
578
579         case CLOCK_BOOTTIME:
580                 return SOURCE_TIME_BOOTTIME;
581
582         case CLOCK_MONOTONIC:
583                 return SOURCE_TIME_MONOTONIC;
584
585         case CLOCK_REALTIME_ALARM:
586                 return SOURCE_TIME_REALTIME_ALARM;
587
588         case CLOCK_BOOTTIME_ALARM:
589                 return SOURCE_TIME_BOOTTIME_ALARM;
590
591         default:
592                 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
593         }
594 }
595
596 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
597         assert(e);
598
599         switch (t) {
600
601         case SOURCE_TIME_REALTIME:
602                 return &e->realtime;
603
604         case SOURCE_TIME_BOOTTIME:
605                 return &e->boottime;
606
607         case SOURCE_TIME_MONOTONIC:
608                 return &e->monotonic;
609
610         case SOURCE_TIME_REALTIME_ALARM:
611                 return &e->realtime_alarm;
612
613         case SOURCE_TIME_BOOTTIME_ALARM:
614                 return &e->boottime_alarm;
615
616         default:
617                 return NULL;
618         }
619 }
620
621 static int event_make_signal_data(
622                 sd_event *e,
623                 int sig,
624                 struct signal_data **ret) {
625
626         struct epoll_event ev = {};
627         struct signal_data *d;
628         bool added = false;
629         sigset_t ss_copy;
630         int64_t priority;
631         int r;
632
633         assert(e);
634
635         if (event_pid_changed(e))
636                 return -ECHILD;
637
638         if (e->signal_sources && e->signal_sources[sig])
639                 priority = e->signal_sources[sig]->priority;
640         else
641                 priority = 0;
642
643         d = hashmap_get(e->signal_data, &priority);
644         if (d) {
645                 if (sigismember(&d->sigset, sig) > 0) {
646                         if (ret)
647                                 *ret = d;
648                 return 0;
649                 }
650         } else {
651                 r = hashmap_ensure_allocated(&e->signal_data, &uint64_hash_ops);
652                 if (r < 0)
653                         return r;
654
655                 d = new0(struct signal_data, 1);
656                 if (!d)
657                         return -ENOMEM;
658
659                 d->wakeup = WAKEUP_SIGNAL_DATA;
660                 d->fd  = -1;
661                 d->priority = priority;
662
663                 r = hashmap_put(e->signal_data, &d->priority, d);
664         if (r < 0)
665                         return r;
666
667                 added = true;
668         }
669
670         ss_copy = d->sigset;
671         assert_se(sigaddset(&ss_copy, sig) >= 0);
672
673         r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
674         if (r < 0) {
675                 r = -errno;
676                 goto fail;
677         }
678
679         d->sigset = ss_copy;
680
681         if (d->fd >= 0) {
682                 if (ret)
683                         *ret = d;
684                 return 0;
685         }
686
687         d->fd = r;
688
689         ev.events = EPOLLIN;
690         ev.data.ptr = d;
691
692         r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev);
693         if (r < 0) {
694                 r = -errno;
695                 goto fail;
696         }
697
698         if (ret)
699                 *ret = d;
700
701         return 0;
702
703 fail:
704         if (added) {
705                 d->fd = safe_close(d->fd);
706                 hashmap_remove(e->signal_data, &d->priority);
707                 free(d);
708         }
709
710         return r;
711 }
712
713 static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
714         assert(e);
715         assert(d);
716
717         /* Turns off the specified signal in the signal data
718          * object. If the signal mask of the object becomes empty that
719          * way removes it. */
720
721         if (sigismember(&d->sigset, sig) == 0)
722                 return;
723
724         assert_se(sigdelset(&d->sigset, sig) >= 0);
725
726         if (sigisemptyset(&d->sigset)) {
727
728                 /* If all the mask is all-zero we can get rid of the structure */
729                 hashmap_remove(e->signal_data, &d->priority);
730                 assert(!d->current);
731                 safe_close(d->fd);
732                 free(d);
733                 return;
734         }
735
736         assert(d->fd >= 0);
737
738         if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
739                 log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
740 }
741
742 static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
743         struct signal_data *d;
744         static const int64_t zero_priority = 0;
745
746         assert(e);
747
748         /* Rechecks if the specified signal is still something we are
749          * interested in. If not, we'll unmask it, and possibly drop
750          * the signalfd for it. */
751
752         if (sig == SIGCHLD &&
753             e->n_enabled_child_sources > 0)
754                 return;
755
756         if (e->signal_sources &&
757             e->signal_sources[sig] &&
758             e->signal_sources[sig]->enabled != SD_EVENT_OFF)
759                 return;
760
761         /*
762          * The specified signal might be enabled in three different queues:
763          *
764          * 1) the one that belongs to the priority passed (if it is non-NULL)
765          * 2) the one that belongs to the priority of the event source of the signal (if there is one)
766          * 3) the 0 priority (to cover the SIGCHLD case)
767          *
768          * Hence, let's remove it from all three here.
769          */
770
771         if (priority) {
772                 d = hashmap_get(e->signal_data, priority);
773                 if (d)
774                         event_unmask_signal_data(e, d, sig);
775         }
776
777         if (e->signal_sources && e->signal_sources[sig]) {
778                 d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
779                 if (d)
780                         event_unmask_signal_data(e, d, sig);
781         }
782
783         d = hashmap_get(e->signal_data, &zero_priority);
784         if (d)
785                 event_unmask_signal_data(e, d, sig);
786 }
787
/* Detach a source from its event loop: remove it from all queues and
 * kernel-facing registrations. Does not free the source itself. Safe
 * to call on an already-disconnected source. */
static void source_disconnect(sd_event_source *s) {
        sd_event *event;

        assert(s);

        if (!s->event)
                return; /* already disconnected */

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM: {
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                /* Drop from both scheduling queues and force the
                 * timerfd to be reprogrammed. */
                prioq_remove(d->earliest, s, &s->time.earliest_index);
                prioq_remove(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
                break;
        }

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        /* Maybe unmask the signal / drop its signalfd. */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                }

                break;

        case SOURCE_CHILD:
                if (s->child.pid > 0) {
                        if (s->enabled != SD_EVENT_OFF) {
                                assert(s->event->n_enabled_child_sources > 0);
                                s->event->n_enabled_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                        /* SIGCHLD may no longer be needed. */
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                }

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        default:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        event = s->event;

        s->type = _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        s->event = NULL;
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Non-floating sources held a reference on the loop; drop it
         * last, as this may free the loop itself. */
        if (!s->floating)
                sd_event_unref(event);
}
878
879 static void source_free(sd_event_source *s) {
880         assert(s);
881
882         source_disconnect(s);
883         free(s->description);
884         free(s);
885 }
886
/* Mark a source as pending (ready to dispatch) or not, keeping the
 * pending prioq and the per-clock queues in sync. Returns 0, or a
 * negative errno-style error on allocation failure. */
static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT); /* exit sources use their own queue */

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                /* Record the iteration, so that older entries win ties
                 * in pending_prioq_compare(). */
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false; /* roll back on OOM */
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type)) {
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                /* Pending state affects the sort order of the time
                 * queues (pending entries sort last), so reshuffle and
                 * schedule a timerfd reprogram. */
                prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                prioq_reshuffle(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
        }

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                /* If this source was the one currently being dispatched
                 * from its signalfd, forget that. */
                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        return 0;
}
930
931 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
932         sd_event_source *s;
933
934         assert(e);
935
936         s = new0(sd_event_source, 1);
937         if (!s)
938                 return NULL;
939
940         s->n_ref = 1;
941         s->event = e;
942         s->floating = floating;
943         s->type = type;
944         s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;
945
946         if (!floating)
947                 sd_event_ref(e);
948
949         LIST_PREPEND(sources, e->sources, s);
950         e->n_sources ++;
951
952         return s;
953 }
954
/* Public API: add an IO source watching 'fd' for 'events', enabled
 * immediately. If ret is NULL the source is created floating (owned by
 * the loop). Returns 0, or a negative errno-style error. */
_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        /* Register with epoll right away; on failure undo everything. */
        r = source_io_register(s, s->enabled, events);
        if (r < 0) {
                source_free(s);
                return r;
        }

        if (ret)
                *ret = s;

        return 0;
}
995
/* Lazily compute e->perturb from the boot ID. */
static void initialize_perturb(sd_event *e) {
        sd_id128_t bootid = {};

        /* When we sleep for longer, we try to realign the wakeup to
           the same time within each minute/second/250ms, so that
           events all across the system can be coalesced into a single
           CPU wakeup. However, let's take some system-specific
           randomness for this value, so that in a network of systems
           with synced clocks timer events are distributed a
           bit. Here, we calculate a perturbation usec offset from the
           boot ID. */

        if (_likely_(e->perturb != USEC_INFINITY))
                return; /* already initialized */

        /* If the boot ID cannot be read, perturb stays at USEC_INFINITY. */
        if (sd_id128_get_boot(&bootid) >= 0)
                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
}
1014
1015 static int event_setup_timer_fd(
1016                 sd_event *e,
1017                 struct clock_data *d,
1018                 clockid_t clock) {
1019
1020         struct epoll_event ev = {};
1021         int r, fd;
1022
1023         assert(e);
1024         assert(d);
1025
1026         if (_likely_(d->fd >= 0))
1027                 return 0;
1028
1029         fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1030         if (fd < 0)
1031                 return -errno;
1032
1033         ev.events = EPOLLIN;
1034         ev.data.ptr = d;
1035
1036         r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
1037         if (r < 0) {
1038                 safe_close(fd);
1039                 return -errno;
1040         }
1041
1042         d->fd = fd;
1043         return 0;
1044 }
1045
1046 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1047         assert(s);
1048
1049         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1050 }
1051
_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        /* Adds a timer event source for the specified clock, firing at
         * absolute time 'usec' with the given accuracy window (0 selects
         * DEFAULT_ACCURACY_USEC). If 'callback' is NULL the event loop is
         * exited when the timer fires, with userdata as exit code. Returns
         * 0 on success, negative errno-style error otherwise. The new source
         * is created as SD_EVENT_ONESHOT. */

        EventSourceType type;
        sd_event_source *s;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(usec != (uint64_t) -1, -EINVAL);
        assert_return(accuracy != (uint64_t) -1, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = time_exit_callback;

        /* Reject clocks we have no event source type for */
        type = clock_to_event_source_type(clock);
        assert_return(type >= 0, -EOPNOTSUPP);

        d = event_get_clock_data(e, type);
        assert(d);

        /* Two priority queues per clock: one ordered by earliest possible
         * dispatch time, one by latest (next + accuracy), used for wakeup
         * coalescing. */
        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        /* Lazily create the timerfd for this clock */
        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        /* If the caller doesn't want a reference back, create the source as
         * "floating" (owned by the event loop) */
        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        d->needs_rearm = true;

        r = prioq_put(d->earliest, s, &s->time.earliest_index);
        if (r < 0)
                goto fail;

        r = prioq_put(d->latest, s, &s->time.latest_index);
        if (r < 0)
                goto fail;

        if (ret)
                *ret = s;

        return 0;

fail:
        /* source_free() also removes us from any prioq we made it into */
        source_free(s);
        return r;
}
1125
1126 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1127         assert(s);
1128
1129         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1130 }
1131
1132 _public_ int sd_event_add_signal(
1133                 sd_event *e,
1134                 sd_event_source **ret,
1135                 int sig,
1136                 sd_event_signal_handler_t callback,
1137                 void *userdata) {
1138
1139         sd_event_source *s;
1140         struct signal_data *d;
1141         sigset_t ss;
1142         int r;
1143
1144         assert_return(e, -EINVAL);
1145         assert_return(sig > 0, -EINVAL);
1146         assert_return(sig < _NSIG, -EINVAL);
1147         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1148         assert_return(!event_pid_changed(e), -ECHILD);
1149
1150         if (!callback)
1151                 callback = signal_exit_callback;
1152
1153         r = pthread_sigmask(SIG_SETMASK, NULL, &ss);
1154         if (r != 0)
1155                 return -r;
1156
1157         if (!sigismember(&ss, sig))
1158                 return -EBUSY;
1159
1160         if (!e->signal_sources) {
1161                 e->signal_sources = new0(sd_event_source*, _NSIG);
1162                 if (!e->signal_sources)
1163                         return -ENOMEM;
1164         } else if (e->signal_sources[sig])
1165                 return -EBUSY;
1166
1167         s = source_new(e, !ret, SOURCE_SIGNAL);
1168         if (!s)
1169                 return -ENOMEM;
1170
1171         s->signal.sig = sig;
1172         s->signal.callback = callback;
1173         s->userdata = userdata;
1174         s->enabled = SD_EVENT_ON;
1175
1176         e->signal_sources[sig] = s;
1177
1178         r = event_make_signal_data(e, sig, &d);
1179                 if (r < 0) {
1180                         source_free(s);
1181                         return r;
1182                 }
1183
1184         /* Use the signal name as description for the event source by default */
1185         (void) sd_event_source_set_description(s, signal_to_string(sig));
1186
1187         if (ret)
1188                 *ret = s;
1189
1190         return 0;
1191 }
1192
1193 #if 0 /// UNNEEDED by elogind
1194 _public_ int sd_event_add_child(
1195                 sd_event *e,
1196                 sd_event_source **ret,
1197                 pid_t pid,
1198                 int options,
1199                 sd_event_child_handler_t callback,
1200                 void *userdata) {
1201
1202         sd_event_source *s;
1203         int r;
1204
1205         assert_return(e, -EINVAL);
1206         assert_return(pid > 1, -EINVAL);
1207         assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1208         assert_return(options != 0, -EINVAL);
1209         assert_return(callback, -EINVAL);
1210         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1211         assert_return(!event_pid_changed(e), -ECHILD);
1212
1213         r = hashmap_ensure_allocated(&e->child_sources, NULL);
1214         if (r < 0)
1215                 return r;
1216
1217         if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1218                 return -EBUSY;
1219
1220         s = source_new(e, !ret, SOURCE_CHILD);
1221         if (!s)
1222                 return -ENOMEM;
1223
1224         s->child.pid = pid;
1225         s->child.options = options;
1226         s->child.callback = callback;
1227         s->userdata = userdata;
1228         s->enabled = SD_EVENT_ONESHOT;
1229
1230         r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1231         if (r < 0) {
1232                 source_free(s);
1233                 return r;
1234         }
1235
1236         e->n_enabled_child_sources ++;
1237
1238         r = event_make_signal_data(e, SIGCHLD, NULL);
1239                 if (r < 0) {
1240                 e->n_enabled_child_sources--;
1241                         source_free(s);
1242                         return r;
1243                 }
1244
1245         e->need_process_child = true;
1246
1247         if (ret)
1248                 *ret = s;
1249
1250         return 0;
1251 }
1252
1253 _public_ int sd_event_add_defer(
1254                 sd_event *e,
1255                 sd_event_source **ret,
1256                 sd_event_handler_t callback,
1257                 void *userdata) {
1258
1259         sd_event_source *s;
1260         int r;
1261
1262         assert_return(e, -EINVAL);
1263         assert_return(callback, -EINVAL);
1264         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1265         assert_return(!event_pid_changed(e), -ECHILD);
1266
1267         s = source_new(e, !ret, SOURCE_DEFER);
1268         if (!s)
1269                 return -ENOMEM;
1270
1271         s->defer.callback = callback;
1272         s->userdata = userdata;
1273         s->enabled = SD_EVENT_ONESHOT;
1274
1275         r = source_set_pending(s, true);
1276         if (r < 0) {
1277                 source_free(s);
1278                 return r;
1279         }
1280
1281         if (ret)
1282                 *ret = s;
1283
1284         return 0;
1285 }
1286 #endif // 0
1287
1288 _public_ int sd_event_add_post(
1289                 sd_event *e,
1290                 sd_event_source **ret,
1291                 sd_event_handler_t callback,
1292                 void *userdata) {
1293
1294         sd_event_source *s;
1295         int r;
1296
1297         assert_return(e, -EINVAL);
1298         assert_return(callback, -EINVAL);
1299         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1300         assert_return(!event_pid_changed(e), -ECHILD);
1301
1302         r = set_ensure_allocated(&e->post_sources, NULL);
1303         if (r < 0)
1304                 return r;
1305
1306         s = source_new(e, !ret, SOURCE_POST);
1307         if (!s)
1308                 return -ENOMEM;
1309
1310         s->post.callback = callback;
1311         s->userdata = userdata;
1312         s->enabled = SD_EVENT_ON;
1313
1314         r = set_put(e->post_sources, s);
1315         if (r < 0) {
1316                 source_free(s);
1317                 return r;
1318         }
1319
1320         if (ret)
1321                 *ret = s;
1322
1323         return 0;
1324 }
1325
1326 _public_ int sd_event_add_exit(
1327                 sd_event *e,
1328                 sd_event_source **ret,
1329                 sd_event_handler_t callback,
1330                 void *userdata) {
1331
1332         sd_event_source *s;
1333         int r;
1334
1335         assert_return(e, -EINVAL);
1336         assert_return(callback, -EINVAL);
1337         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1338         assert_return(!event_pid_changed(e), -ECHILD);
1339
1340         r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1341         if (r < 0)
1342                 return r;
1343
1344         s = source_new(e, !ret, SOURCE_EXIT);
1345         if (!s)
1346                 return -ENOMEM;
1347
1348         s->exit.callback = callback;
1349         s->userdata = userdata;
1350         s->exit.prioq_index = PRIOQ_IDX_NULL;
1351         s->enabled = SD_EVENT_ONESHOT;
1352
1353         r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1354         if (r < 0) {
1355                 source_free(s);
1356                 return r;
1357         }
1358
1359         if (ret)
1360                 *ret = s;
1361
1362         return 0;
1363 }
1364
1365 #if 0 /// UNNEEDED by elogind
/* Takes an additional reference on the event source. Returns the source
 * itself, or NULL if NULL was passed. */
_public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
        assert_return(s, NULL);

        assert(s->n_ref >= 1);
        s->n_ref++;

        return s;
}
1374 #endif // 0
1375
/* Drops a reference on the event source, destroying it when the last
 * reference is gone. Always returns NULL, so callers may write
 * "s = sd_event_source_unref(s);". NULL input is a no-op. */
_public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {

        if (!s)
                return NULL;

        assert(s->n_ref >= 1);
        s->n_ref--;

        if (s->n_ref <= 0) {
                /* Here's a special hack: when we are called from a
                 * dispatch handler we won't free the event source
                 * immediately, but we will detach the fd from the
                 * epoll. This way it is safe for the caller to unref
                 * the event source and immediately close the fd, but
                 * we still retain a valid event source object after
                 * the callback. */

                if (s->dispatching) {
                        if (s->type == SOURCE_IO)
                                source_io_unregister(s);

                        /* The actual free happens after dispatching finishes */
                        source_disconnect(s);
                } else
                        source_free(s);
        }

        return NULL;
}
1404
/* Sets (or, with NULL, clears) the free-form description string of the event
 * source, used for debugging/logging. Returns 0 on success, negative
 * errno-style error otherwise. */
_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
        assert_return(s, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return free_and_strdup(&s->description, description);
}
1411
1412 #if 0 /// UNNEEDED by elogind
/* Returns the description string of the event source in *description.
 * Returns -ENXIO if none was set. The string remains owned by the source. */
_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
        assert_return(s, -EINVAL);
        assert_return(description, -EINVAL);
        assert_return(s->description, -ENXIO);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *description = s->description;
        return 0;
}
1422 #endif // 0
1423
/* Returns the event loop object this source is attached to (without taking a
 * reference), or NULL if NULL was passed. */
_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
        assert_return(s, NULL);

        return s->event;
}
1429
1430 #if 0 /// UNNEEDED by elogind
/* Returns whether the event source is currently pending, i.e. has been
 * triggered but not dispatched yet. Not defined for exit sources (-EDOM). */
_public_ int sd_event_source_get_pending(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->pending;
}
1439
/* Returns the file descriptor an I/O event source watches. Only valid for
 * SOURCE_IO sources (-EDOM otherwise). */
_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->io.fd;
}
1447 #endif // 0
1448
/* Replaces the file descriptor an I/O event source watches. If the source is
 * enabled, the new fd is registered with epoll before the old one is dropped;
 * on registration failure the old fd is restored and an error returned. The
 * caller retains ownership of both fds. */
_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(fd >= 0, -EBADF);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->io.fd == fd)
                return 0;

        if (s->enabled == SD_EVENT_OFF) {
                /* Not registered with epoll; just remember the new fd */
                s->io.fd = fd;
                s->io.registered = false;
        } else {
                int saved_fd;

                saved_fd = s->io.fd;
                assert(s->io.registered);

                s->io.fd = fd;
                s->io.registered = false;

                r = source_io_register(s, s->enabled, s->io.events);
                if (r < 0) {
                        /* Roll back to the previous fd, which is still
                         * registered with epoll */
                        s->io.fd = saved_fd;
                        s->io.registered = true;
                        return r;
                }

                /* Best-effort removal of the old fd from epoll; it may
                 * already have been closed by the caller */
                epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
        }

        return 0;
}
1484
1485 #if 0 /// UNNEEDED by elogind
/* Returns the epoll event mask an I/O event source watches for in *events. */
_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
        assert_return(s, -EINVAL);
        assert_return(events, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *events = s->io.events;
        return 0;
}
1495 #endif // 0
1496
/* Changes the epoll event mask an I/O event source watches for. If the
 * source is enabled the epoll registration is updated immediately; any
 * pending state is cleared since it may refer to the old mask. */
_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* edge-triggered updates are never skipped, so we can reset edges */
        if (s->io.events == events && !(events & EPOLLET))
                return 0;

        if (s->enabled != SD_EVENT_OFF) {
                r = source_io_register(s, s->enabled, events);
                if (r < 0)
                        return r;
        }

        s->io.events = events;
        source_set_pending(s, false);

        return 0;
}
1521
1522 #if 0 /// UNNEEDED by elogind
/* Returns the epoll events that actually triggered for a pending I/O event
 * source in *revents. Only valid while the source is pending (-ENODATA
 * otherwise). */
_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
        assert_return(s, -EINVAL);
        assert_return(revents, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(s->pending, -ENODATA);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *revents = s->io.revents;
        return 0;
}
1533
/* Returns the UNIX signal number a signal event source watches. Only valid
 * for SOURCE_SIGNAL sources (-EDOM otherwise). */
_public_ int sd_event_source_get_signal(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->signal.sig;
}
1541
1542 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
1543         assert_return(s, -EINVAL);
1544         assert_return(!event_pid_changed(s->event), -ECHILD);
1545
1546         return s->priority;
1547 }
1548 #endif // 0
1549
1550 _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
1551         int r;
1552
1553         assert_return(s, -EINVAL);
1554         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1555         assert_return(!event_pid_changed(s->event), -ECHILD);
1556
1557         if (s->priority == priority)
1558                 return 0;
1559
1560         if (s->type == SOURCE_SIGNAL && s->enabled != SD_EVENT_OFF) {
1561                 struct signal_data *old, *d;
1562
1563                 /* Move us from the signalfd belonging to the old
1564                  * priority to the signalfd of the new priority */
1565
1566                 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
1567
1568                 s->priority = priority;
1569
1570                 r = event_make_signal_data(s->event, s->signal.sig, &d);
1571                 if (r < 0) {
1572                         s->priority = old->priority;
1573                         return r;
1574                 }
1575
1576                 event_unmask_signal_data(s->event, old, s->signal.sig);
1577         } else
1578         s->priority = priority;
1579
1580         if (s->pending)
1581                 prioq_reshuffle(s->event->pending, s, &s->pending_index);
1582
1583         if (s->prepare)
1584                 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
1585
1586         if (s->type == SOURCE_EXIT)
1587                 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1588
1589         return 0;
1590 }
1591
1592 #if 0 /// UNNEEDED by elogind
/* Returns the enablement state (SD_EVENT_OFF, SD_EVENT_ON or
 * SD_EVENT_ONESHOT) of the event source in *m. */
_public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
        assert_return(s, -EINVAL);
        assert_return(m, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *m = s->enabled;
        return 0;
}
1601 #endif // 0
1602
1603 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
1604         int r;
1605
1606         assert_return(s, -EINVAL);
1607         assert_return(m == SD_EVENT_OFF || m == SD_EVENT_ON || m == SD_EVENT_ONESHOT, -EINVAL);
1608         assert_return(!event_pid_changed(s->event), -ECHILD);
1609
1610         /* If we are dead anyway, we are fine with turning off
1611          * sources, but everything else needs to fail. */
1612         if (s->event->state == SD_EVENT_FINISHED)
1613                 return m == SD_EVENT_OFF ? 0 : -ESTALE;
1614
1615         if (s->enabled == m)
1616                 return 0;
1617
1618         if (m == SD_EVENT_OFF) {
1619
1620                 switch (s->type) {
1621
1622                 case SOURCE_IO:
1623                         source_io_unregister(s);
1624                         s->enabled = m;
1625                         break;
1626
1627                 case SOURCE_TIME_REALTIME:
1628                 case SOURCE_TIME_BOOTTIME:
1629                 case SOURCE_TIME_MONOTONIC:
1630                 case SOURCE_TIME_REALTIME_ALARM:
1631                 case SOURCE_TIME_BOOTTIME_ALARM: {
1632                         struct clock_data *d;
1633
1634                         s->enabled = m;
1635                         d = event_get_clock_data(s->event, s->type);
1636                         assert(d);
1637
1638                         prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
1639                         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1640                         d->needs_rearm = true;
1641                         break;
1642                 }
1643
1644                 case SOURCE_SIGNAL:
1645                         s->enabled = m;
1646
1647                         event_gc_signal_data(s->event, &s->priority, s->signal.sig);
1648                         break;
1649
1650                 case SOURCE_CHILD:
1651                         s->enabled = m;
1652
1653                         assert(s->event->n_enabled_child_sources > 0);
1654                         s->event->n_enabled_child_sources--;
1655
1656                         event_gc_signal_data(s->event, &s->priority, SIGCHLD);
1657                         break;
1658
1659                 case SOURCE_EXIT:
1660                         s->enabled = m;
1661                         prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1662                         break;
1663
1664                 case SOURCE_DEFER:
1665                 case SOURCE_POST:
1666                         s->enabled = m;
1667                         break;
1668
1669                 default:
1670                         assert_not_reached("Wut? I shouldn't exist.");
1671                 }
1672
1673         } else {
1674                 switch (s->type) {
1675
1676                 case SOURCE_IO:
1677                         r = source_io_register(s, m, s->io.events);
1678                         if (r < 0)
1679                                 return r;
1680
1681                         s->enabled = m;
1682                         break;
1683
1684                 case SOURCE_TIME_REALTIME:
1685                 case SOURCE_TIME_BOOTTIME:
1686                 case SOURCE_TIME_MONOTONIC:
1687                 case SOURCE_TIME_REALTIME_ALARM:
1688                 case SOURCE_TIME_BOOTTIME_ALARM: {
1689                         struct clock_data *d;
1690
1691                         s->enabled = m;
1692                         d = event_get_clock_data(s->event, s->type);
1693                         assert(d);
1694
1695                         prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
1696                         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1697                         d->needs_rearm = true;
1698                         break;
1699                 }
1700
1701                 case SOURCE_SIGNAL:
1702
1703                         s->enabled = m;
1704
1705                         r = event_make_signal_data(s->event, s->signal.sig, NULL);
1706                                 if (r < 0) {
1707                                         s->enabled = SD_EVENT_OFF;
1708                                 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
1709                                         return r;
1710                                 }
1711
1712                         break;
1713
1714                 case SOURCE_CHILD:
1715
1716                         if (s->enabled == SD_EVENT_OFF)
1717                                 s->event->n_enabled_child_sources++;
1718
1719                         s->enabled = m;
1720
1721                         r = event_make_signal_data(s->event, SIGCHLD, NULL);
1722                                         if (r < 0) {
1723                                                 s->enabled = SD_EVENT_OFF;
1724                                 s->event->n_enabled_child_sources--;
1725                                 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
1726                                                 return r;
1727                                         }
1728
1729                         break;
1730
1731                 case SOURCE_EXIT:
1732                         s->enabled = m;
1733                         prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1734                         break;
1735
1736                 case SOURCE_DEFER:
1737                 case SOURCE_POST:
1738                         s->enabled = m;
1739                         break;
1740
1741                 default:
1742                         assert_not_reached("Wut? I shouldn't exist.");
1743                 }
1744         }
1745
1746         if (s->pending)
1747                 prioq_reshuffle(s->event->pending, s, &s->pending_index);
1748
1749         if (s->prepare)
1750                 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
1751
1752         return 0;
1753 }
1754
/* Returns the absolute time (in usec, on the source's clock) a timer event
 * source is scheduled for in *usec. */
_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.next;
        return 0;
}
1764
/* Reschedules a timer event source to fire at the specified absolute time
 * (in usec, on the source's clock). Clears any pending state, re-sorts the
 * source in both clock priority queues and marks the timerfd for rearming. */
_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
        struct clock_data *d;

        assert_return(s, -EINVAL);
        assert_return(usec != (uint64_t) -1, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        s->time.next = usec;

        /* A pending trigger may refer to the old deadline */
        source_set_pending(s, false);

        d = event_get_clock_data(s->event, s->type);
        assert(d);

        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
        prioq_reshuffle(d->latest, s, &s->time.latest_index);
        d->needs_rearm = true;

        return 0;
}
1787
1788 #if 0 /// UNNEEDED by elogind
/* Returns the accuracy window (in usec) of a timer event source in *usec. */
_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.accuracy;
        return 0;
}
1798
/* Changes the accuracy window (in usec) of a timer event source; 0 selects
 * DEFAULT_ACCURACY_USEC. Only the "latest" queue is re-sorted, since the
 * accuracy only affects the latest admissible wakeup time. */
_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
        struct clock_data *d;

        assert_return(s, -EINVAL);
        assert_return(usec != (uint64_t) -1, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (usec == 0)
                usec = DEFAULT_ACCURACY_USEC;

        s->time.accuracy = usec;

        /* A pending trigger may refer to the old deadline */
        source_set_pending(s, false);

        d = event_get_clock_data(s->event, s->type);
        assert(d);

        prioq_reshuffle(d->latest, s, &s->time.latest_index);
        d->needs_rearm = true;

        return 0;
}
1823
/* Returns the clock ID (CLOCK_REALTIME, CLOCK_MONOTONIC, ...) a timer event
 * source is based on in *clock. */
_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
        assert_return(s, -EINVAL);
        assert_return(clock, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *clock = event_source_type_to_clock(s->type);
        return 0;
}
1833
/* Returns the PID a child event source watches in *pid. Only valid for
 * SOURCE_CHILD sources (-EDOM otherwise). */
_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
        assert_return(s, -EINVAL);
        assert_return(pid, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *pid = s->child.pid;
        return 0;
}
1843 #endif // 0
1844
/* Sets (or, with NULL, removes) a "prepare" callback that is invoked right
 * before the event loop goes to sleep, allowing the source to be
 * reconfigured. Not supported on exit sources (-EDOM). */
_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->prepare == callback)
                return 0;

        /* Already in the prepare queue; just swap the callback */
        if (callback && s->prepare) {
                s->prepare = callback;
                return 0;
        }

        r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
        if (r < 0)
                return r;

        s->prepare = callback;

        if (callback) {
                r = prioq_put(s->event->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;
        } else
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        return 0;
}
1876
1877 #if 0 /// UNNEEDED by elogind
/* Returns the userdata pointer of the event source, or NULL if NULL was
 * passed. */
_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
        assert_return(s, NULL);

        return s->userdata;
}
1883
1884 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
1885         void *ret;
1886
1887         assert_return(s, NULL);
1888
1889         ret = s->userdata;
1890         s->userdata = userdata;
1891
1892         return ret;
1893 }
1894 #endif // 0
1895
/* Picks a wakeup time in the window [a, b], preferring times that align with
 * the system-wide perturbed coalescing grid (minute, then 10s, 1s, 250ms
 * boundaries, each shifted by the boot-ID-derived perturbation). Falls back
 * to the latest admissible time b if no grid point fits. Note usec_t is
 * unsigned, so "a <= 0" means a == 0. */
static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
        usec_t c;
        assert(e);
        assert(a <= b);

        if (a <= 0)
                return 0;

        if (b <= a + 1)
                return a;

        initialize_perturb(e);

        /*
          Find a good time to wake up again between times a and b. We
          have two goals here:

          a) We want to wake up as seldom as possible, hence prefer
             later times over earlier times.

          b) But if we have to wake up, then let's make sure to
             dispatch as much as possible on the entire system.

          We implement this by waking up everywhere at the same time
          within any given minute if we can, synchronised via the
          perturbation value determined from the boot ID. If we can't,
          then we try to find the same spot in every 10s, then 1s and
          then 250ms step. Otherwise, we pick the last possible time
          to wake up.
        */

        /* NOTE(review): this first step uses e->perturb directly (not reduced
         * modulo the granularity, unlike the steps below) — fine while
         * initialize_perturb() always sets a value < USEC_PER_MINUTE, but it
         * would overflow if perturb were ever left at USEC_INFINITY. */
        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MINUTE))
                        return b;

                c -= USEC_PER_MINUTE;
        }

        if (c >= a)
                return c;

        /* Same grid point within every 10 seconds */
        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC*10))
                        return b;

                c -= USEC_PER_SEC*10;
        }

        if (c >= a)
                return c;

        /* Same grid point within every second */
        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC))
                        return b;

                c -= USEC_PER_SEC;
        }

        if (c >= a)
                return c;

        /* Same grid point within every 250ms */
        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MSEC*250))
                        return b;

                c -= USEC_PER_MSEC*250;
        }

        if (c >= a)
                return c;

        /* No grid point fits; wake up as late as admissible */
        return b;
}
1973
1974 static int event_arm_timer(
1975                 sd_event *e,
1976                 struct clock_data *d) {
1977
1978         struct itimerspec its = {};
1979         sd_event_source *a, *b;
1980         usec_t t;
1981         int r;
1982
1983         assert(e);
1984         assert(d);
1985
1986         if (!d->needs_rearm)
1987                 return 0;
1988         else
1989                 d->needs_rearm = false;
1990
1991         a = prioq_peek(d->earliest);
1992         if (!a || a->enabled == SD_EVENT_OFF) {
1993
1994                 if (d->fd < 0)
1995                         return 0;
1996
1997                 if (d->next == USEC_INFINITY)
1998                         return 0;
1999
2000                 /* disarm */
2001                 r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2002                 if (r < 0)
2003                         return r;
2004
2005                 d->next = USEC_INFINITY;
2006                 return 0;
2007         }
2008
2009         b = prioq_peek(d->latest);
2010         assert_se(b && b->enabled != SD_EVENT_OFF);
2011
2012         t = sleep_between(e, a->time.next, b->time.next + b->time.accuracy);
2013         if (d->next == t)
2014                 return 0;
2015
2016         assert_se(d->fd >= 0);
2017
2018         if (t == 0) {
2019                 /* We don' want to disarm here, just mean some time looooong ago. */
2020                 its.it_value.tv_sec = 0;
2021                 its.it_value.tv_nsec = 1;
2022         } else
2023                 timespec_store(&its.it_value, t);
2024
2025         r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2026         if (r < 0)
2027                 return -errno;
2028
2029         d->next = t;
2030         return 0;
2031 }
2032
2033 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
2034         assert(e);
2035         assert(s);
2036         assert(s->type == SOURCE_IO);
2037
2038         /* If the event source was already pending, we just OR in the
2039          * new revents, otherwise we reset the value. The ORing is
2040          * necessary to handle EPOLLONESHOT events properly where
2041          * readability might happen independently of writability, and
2042          * we need to keep track of both */
2043
2044         if (s->pending)
2045                 s->io.revents |= revents;
2046         else
2047                 s->io.revents = revents;
2048
2049         return source_set_pending(s, true);
2050 }
2051
2052 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
2053         uint64_t x;
2054         ssize_t ss;
2055
2056         assert(e);
2057         assert(fd >= 0);
2058
2059         assert_return(events == EPOLLIN, -EIO);
2060
2061         ss = read(fd, &x, sizeof(x));
2062         if (ss < 0) {
2063                 if (errno == EAGAIN || errno == EINTR)
2064                         return 0;
2065
2066                 return -errno;
2067         }
2068
2069         if (_unlikely_(ss != sizeof(x)))
2070                 return -EIO;
2071
2072         if (next)
2073                 *next = USEC_INFINITY;
2074
2075         return 0;
2076 }
2077
2078 static int process_timer(
2079                 sd_event *e,
2080                 usec_t n,
2081                 struct clock_data *d) {
2082
2083         sd_event_source *s;
2084         int r;
2085
2086         assert(e);
2087         assert(d);
2088
2089         for (;;) {
2090                 s = prioq_peek(d->earliest);
2091                 if (!s ||
2092                     s->time.next > n ||
2093                     s->enabled == SD_EVENT_OFF ||
2094                     s->pending)
2095                         break;
2096
2097                 r = source_set_pending(s, true);
2098                 if (r < 0)
2099                         return r;
2100
2101                 prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
2102                 prioq_reshuffle(d->latest, s, &s->time.latest_index);
2103                 d->needs_rearm = true;
2104         }
2105
2106         return 0;
2107 }
2108
/* Polls all watched child processes for state changes via waitid()
 * (without reaping them) and marks the matching event sources pending.
 * Returns 0 on success, negative errno on failure. */
static int process_child(sd_event *e) {
        sd_event_source *s;
        Iterator i;
        int r;

        assert(e);

        /* We are about to recheck every child, so the flag can be cleared. */
        e->need_process_child = false;

        /*
           So, this is ugly. We iteratively invoke waitid() with P_PID
           + WNOHANG for each PID we wait for, instead of using
           P_ALL. This is because we only want to get child
           information of very specific child processes, and not all
           of them. We might not have processed the SIGCHLD even of a
           previous invocation and we don't want to maintain a
           unbounded *per-child* event queue, hence we really don't
           want anything flushed out of the kernel's queue that we
           don't care about. Since this is O(n) this means that if you
           have a lot of processes you probably want to handle SIGCHLD
           yourself.

           We do not reap the children here (by using WNOWAIT), this
           is only done after the event source is dispatched so that
           the callback still sees the process as a zombie.
        */

        HASHMAP_FOREACH(s, e->child_sources, i) {
                assert(s->type == SOURCE_CHILD);

                /* Already pending or disabled? Then leave the kernel queue alone. */
                if (s->pending)
                        continue;

                if (s->enabled == SD_EVENT_OFF)
                        continue;

                zero(s->child.siginfo);
                /* WNOWAIT is added only when the caller asked for WEXITED, so
                 * that a dead child stays a zombie until after dispatch. */
                r = waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
                if (r < 0)
                        return -errno;

                /* si_pid is non-zero only if there actually was a state change. */
                if (s->child.siginfo.si_pid != 0) {
                        bool zombie =
                                s->child.siginfo.si_code == CLD_EXITED ||
                                s->child.siginfo.si_code == CLD_KILLED ||
                                s->child.siginfo.si_code == CLD_DUMPED;

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's
                                 * immediately remove the state change
                                 * from the queue, since there's no
                                 * benefit in leaving it queued */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
                        if (r < 0)
                                return r;
                }
        }

        return 0;
}
2175
/* Dequeues at most one signal from the signalfd of the given priority
 * bucket and marks the matching signal event source pending. Returns
 * 1 if a source was marked pending, 0 if the queue was drained without
 * finding one, negative errno on failure. */
static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
        bool read_one = false;
        int r;

        assert(e);
        assert_return(events == EPOLLIN, -EIO);

        /* If there's a signal queued on this priority and SIGCHLD is
           on this priority too, then make sure to recheck the
           children we watch. This is because we only ever dequeue
           the first signal per priority, and if we dequeue one, and
           SIGCHLD might be enqueued later we wouldn't know, but we
           might have higher priority children we care about hence we
           need to check that explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

        /* If there's already an event source pending for this
         * priority we don't read another */
        if (d->current)
                return 0;

        for (;;) {
                struct signalfd_siginfo si;
                ssize_t n;
                sd_event_source *s = NULL;

                n = read(d->fd, &si, sizeof(si));
                if (n < 0) {
                        /* Queue drained; report whether we dequeued anything. */
                        if (errno == EAGAIN || errno == EINTR)
                                return read_one;

                        return -errno;
                }

                /* signalfd reads are always whole siginfo structures. */
                if (_unlikely_(n != sizeof(si)))
                        return -EIO;

                assert(si.ssi_signo < _NSIG);

                read_one = true;

                /* Drop signals nobody subscribed to, or whose source is
                 * already pending, and keep draining. */
                if (e->signal_sources)
                        s = e->signal_sources[si.ssi_signo];
                if (!s)
                        continue;
                if (s->pending)
                        continue;

                /* Stash the siginfo for the callback and block further
                 * dequeuing on this priority until it was dispatched. */
                s->signal.siginfo = si;
                d->current = s;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                return 1;
        }
}
2236
/* Runs the callback of one pending (or exit) event source and performs
 * the state bookkeeping around it. Returns 1 on success, negative errno
 * on bookkeeping failure; a failing *callback* merely disables the
 * source. */
static int source_dispatch(sd_event_source *s) {
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        /* Defer and exit sources stay queued until disabled; all other
         * types are unqueued before their callback runs. */
        if (s->type != SOURCE_DEFER && s->type != SOURCE_EXIT) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        if (s->type != SOURCE_POST) {
                sd_event_source *z;
                Iterator i;

                /* If we execute a non-post source, let's mark all
                 * post sources as pending */

                SET_FOREACH(z, s->event->post_sources, i) {
                        if (z->enabled == SD_EVENT_OFF)
                                continue;

                        r = source_set_pending(z, true);
                        if (r < 0)
                                return r;
                }
        }

        /* One-shot sources are disabled before the callback runs, so
         * the callback may re-enable them for another dispatch. */
        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        s->dispatching = true;

        /* Invoke the type-specific callback. */
        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                /* Remember whether the child was dead before the callback,
                 * since the callback may reap or otherwise alter siginfo. */
                zombie = s->child.siginfo.si_code == CLD_EXITED ||
                         s->child.siginfo.si_code == CLD_KILLED ||
                         s->child.siginfo.si_code == CLD_DUMPED;

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie)
                        waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_POST:
                r = s->post.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        s->dispatching = false;

        /* Callback errors disable the source but do not fail the loop. */
        if (r < 0)
                log_debug_errno(r, "Event source %s (type %s) returned error, disabling: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        /* The callback might have dropped the last reference. */
        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                sd_event_source_set_enabled(s, SD_EVENT_OFF);

        return 1;
}
2339
2340 static int event_prepare(sd_event *e) {
2341         int r;
2342
2343         assert(e);
2344
2345         for (;;) {
2346                 sd_event_source *s;
2347
2348                 s = prioq_peek(e->prepare);
2349                 if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
2350                         break;
2351
2352                 s->prepare_iteration = e->iteration;
2353                 r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
2354                 if (r < 0)
2355                         return r;
2356
2357                 assert(s->prepare);
2358
2359                 s->dispatching = true;
2360                 r = s->prepare(s, s->userdata);
2361                 s->dispatching = false;
2362
2363                 if (r < 0)
2364                         log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, disabling: %m",
2365                                         strna(s->description), event_source_type_to_string(s->type));
2366
2367                 if (s->n_ref == 0)
2368                         source_free(s);
2369                 else if (r < 0)
2370                         sd_event_source_set_enabled(s, SD_EVENT_OFF);
2371         }
2372
2373         return 0;
2374 }
2375
2376 static int dispatch_exit(sd_event *e) {
2377         sd_event_source *p;
2378         int r;
2379
2380         assert(e);
2381
2382         p = prioq_peek(e->exit);
2383         if (!p || p->enabled == SD_EVENT_OFF) {
2384                 e->state = SD_EVENT_FINISHED;
2385                 return 0;
2386         }
2387
2388         sd_event_ref(e);
2389         e->iteration++;
2390         e->state = SD_EVENT_EXITING;
2391
2392         r = source_dispatch(p);
2393
2394         e->state = SD_EVENT_INITIAL;
2395         sd_event_unref(e);
2396
2397         return r;
2398 }
2399
2400 static sd_event_source* event_next_pending(sd_event *e) {
2401         sd_event_source *p;
2402
2403         assert(e);
2404
2405         p = prioq_peek(e->pending);
2406         if (!p)
2407                 return NULL;
2408
2409         if (p->enabled == SD_EVENT_OFF)
2410                 return NULL;
2411
2412         return p;
2413 }
2414
2415 static int arm_watchdog(sd_event *e) {
2416         struct itimerspec its = {};
2417         usec_t t;
2418         int r;
2419
2420         assert(e);
2421         assert(e->watchdog_fd >= 0);
2422
2423         t = sleep_between(e,
2424                           e->watchdog_last + (e->watchdog_period / 2),
2425                           e->watchdog_last + (e->watchdog_period * 3 / 4));
2426
2427         timespec_store(&its.it_value, t);
2428
2429         /* Make sure we never set the watchdog to 0, which tells the
2430          * kernel to disable it. */
2431         if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
2432                 its.it_value.tv_nsec = 1;
2433
2434         r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
2435         if (r < 0)
2436                 return -errno;
2437
2438         return 0;
2439 }
2440
2441 static int process_watchdog(sd_event *e) {
2442         assert(e);
2443
2444         if (!e->watchdog)
2445                 return 0;
2446
2447         /* Don't notify watchdog too often */
2448         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
2449                 return 0;
2450
2451         sd_notify(false, "WATCHDOG=1");
2452         e->watchdog_last = e->timestamp.monotonic;
2453
2454         return arm_watchdog(e);
2455 }
2456
2457 _public_ int sd_event_prepare(sd_event *e) {
2458         int r;
2459
2460         assert_return(e, -EINVAL);
2461         assert_return(!event_pid_changed(e), -ECHILD);
2462         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2463         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2464
2465         if (e->exit_requested)
2466                 goto pending;
2467
2468         e->iteration++;
2469
2470         e->state = SD_EVENT_PREPARING;
2471         r = event_prepare(e);
2472         e->state = SD_EVENT_INITIAL;
2473         if (r < 0)
2474                 return r;
2475
2476         r = event_arm_timer(e, &e->realtime);
2477         if (r < 0)
2478                 return r;
2479
2480         r = event_arm_timer(e, &e->boottime);
2481         if (r < 0)
2482                 return r;
2483
2484         r = event_arm_timer(e, &e->monotonic);
2485         if (r < 0)
2486                 return r;
2487
2488         r = event_arm_timer(e, &e->realtime_alarm);
2489         if (r < 0)
2490                 return r;
2491
2492         r = event_arm_timer(e, &e->boottime_alarm);
2493         if (r < 0)
2494                 return r;
2495
2496         if (event_next_pending(e) || e->need_process_child)
2497                 goto pending;
2498
2499         e->state = SD_EVENT_ARMED;
2500
2501         return 0;
2502
2503 pending:
2504         e->state = SD_EVENT_ARMED;
2505         r = sd_event_wait(e, 0);
2506         if (r == 0)
2507                 e->state = SD_EVENT_ARMED;
2508
2509         return r;
2510 }
2511
/* Waits (up to the given µs timeout, or forever for (uint64_t) -1) for
 * I/O, timer, signal and watchdog wake-ups and turns them into pending
 * event sources. Returns 1 and enters SD_EVENT_PENDING if something is
 * dispatchable, 0 (back to SD_EVENT_INITIAL) if not, negative errno on
 * failure. */
_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
        struct epoll_event *ev_queue;
        unsigned ev_queue_max;
        int r, m, i;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        /* A requested exit makes us dispatchable immediately. */
        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        /* Room for one epoll event per source.
         * NOTE(review): newa() allocates on the stack — presumably
         * n_sources is assumed to stay small; verify for large setups. */
        ev_queue_max = MAX(e->n_sources, 1u);
        ev_queue = newa(struct epoll_event, ev_queue_max);

        /* timeout is in µs, epoll_wait() takes ms; round up. */
        m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max,
                       timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
        if (m < 0) {
                if (errno == EINTR) {
                        e->state = SD_EVENT_PENDING;
                        return 1;
                }

                r = -errno;
                goto finish;
        }

        /* Cache the wake-up timestamps for this iteration. */
        dual_timestamp_get(&e->timestamp);
        e->timestamp_boottime = now(CLOCK_BOOTTIME);

        /* Route every fd-level wake-up to its handler, keyed by the
         * pointer stored in the epoll user data. */
        for (i = 0; i < m; i++) {

                if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
                else {
                        WakeupType *t = ev_queue[i].data.ptr;

                        switch (*t) {

                        case WAKEUP_EVENT_SOURCE:
                        r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = ev_queue[i].data.ptr;
                                r = flush_timer(e, d->fd, ev_queue[i].events, &d->next);
                                break;
                        }

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        default:
                                assert_not_reached("Invalid wake-up pointer");
                        }
                }
                if (r < 0)
                        goto finish;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        /* Turn elapsed timers of all five clocks into pending sources. */
        r = process_timer(e, e->timestamp.realtime, &e->realtime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp_boottime, &e->boottime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp_boottime, &e->boottime_alarm);
        if (r < 0)
                goto finish;

        if (e->need_process_child) {
                r = process_child(e);
                if (r < 0)
                        goto finish;
        }

        /* Anything pending now? Then we are dispatchable. */
        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;

                return 1;
        }

        r = 0;

finish:
        e->state = SD_EVENT_INITIAL;

        return r;
}
2619
2620 _public_ int sd_event_dispatch(sd_event *e) {
2621         sd_event_source *p;
2622         int r;
2623
2624         assert_return(e, -EINVAL);
2625         assert_return(!event_pid_changed(e), -ECHILD);
2626         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2627         assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
2628
2629         if (e->exit_requested)
2630                 return dispatch_exit(e);
2631
2632         p = event_next_pending(e);
2633         if (p) {
2634                 sd_event_ref(e);
2635
2636                 e->state = SD_EVENT_RUNNING;
2637                 r = source_dispatch(p);
2638                 e->state = SD_EVENT_INITIAL;
2639
2640                 sd_event_unref(e);
2641
2642                 return r;
2643         }
2644
2645         e->state = SD_EVENT_INITIAL;
2646
2647         return 1;
2648 }
2649
2650 static void event_log_delays(sd_event *e) {
2651         char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1];
2652         unsigned i;
2653         int o;
2654
2655         for (i = o = 0; i < ELEMENTSOF(e->delays); i++) {
2656                 o += snprintf(&b[o], sizeof(b) - o, "%u ", e->delays[i]);
2657                 e->delays[i] = 0;
2658         }
2659         log_debug("Event loop iterations: %.*s", o, b);
2660 }
2661
2662 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
2663         int r;
2664
2665         assert_return(e, -EINVAL);
2666         assert_return(!event_pid_changed(e), -ECHILD);
2667         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2668         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2669
2670         if (e->profile_delays && e->last_run) {
2671                 usec_t this_run;
2672                 unsigned l;
2673
2674                 this_run = now(CLOCK_MONOTONIC);
2675
2676                 l = u64log2(this_run - e->last_run);
2677                 assert(l < sizeof(e->delays));
2678                 e->delays[l]++;
2679
2680                 if (this_run - e->last_log >= 5*USEC_PER_SEC) {
2681                         event_log_delays(e);
2682                         e->last_log = this_run;
2683                 }
2684         }
2685
2686         r = sd_event_prepare(e);
2687         if (r == 0)
2688                 /* There was nothing? Then wait... */
2689                 r = sd_event_wait(e, timeout);
2690
2691         if (e->profile_delays)
2692                 e->last_run = now(CLOCK_MONOTONIC);
2693
2694         if (r > 0) {
2695                 /* There's something now, then let's dispatch it */
2696                 r = sd_event_dispatch(e);
2697                 if (r < 0)
2698                         return r;
2699
2700                 return 1;
2701         }
2702
2703         return r;
2704 }
2705
2706 #if 0 /// UNNEEDED by elogind
2707 _public_ int sd_event_loop(sd_event *e) {
2708         int r;
2709
2710         assert_return(e, -EINVAL);
2711         assert_return(!event_pid_changed(e), -ECHILD);
2712         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2713
2714         sd_event_ref(e);
2715
2716         while (e->state != SD_EVENT_FINISHED) {
2717                 r = sd_event_run(e, (uint64_t) -1);
2718                 if (r < 0)
2719                         goto finish;
2720         }
2721
2722         r = e->exit_code;
2723
2724 finish:
2725         sd_event_unref(e);
2726         return r;
2727 }
2728
/* Returns the epoll fd backing the event loop, suitable for embedding
 * this loop into an outer poll loop. */
_public_ int sd_event_get_fd(sd_event *e) {

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->epoll_fd;
}
2736 #endif // 0
2737
/* Returns the loop's current state (one of the SD_EVENT_* state
 * constants stored in e->state). */
_public_ int sd_event_get_state(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->state;
}
2744
2745 #if 0 /// UNNEEDED by elogind
/* Retrieves the exit code passed to sd_event_exit(). Returns -ENODATA
 * if no exit was requested yet. */
_public_ int sd_event_get_exit_code(sd_event *e, int *code) {
        assert_return(e, -EINVAL);
        assert_return(code, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!e->exit_requested)
                return -ENODATA;

        *code = e->exit_code;
        return 0;
}
2757 #endif // 0
2758
/* Requests the loop to exit: subsequent iterations dispatch the exit
 * sources and then finish with the given code. */
_public_ int sd_event_exit(sd_event *e, int code) {
        assert_return(e, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        e->exit_requested = true;
        e->exit_code = code;

        return 0;
}
2769
2770 #if 0 /// UNNEEDED by elogind
2771 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
2772         assert_return(e, -EINVAL);
2773         assert_return(usec, -EINVAL);
2774         assert_return(!event_pid_changed(e), -ECHILD);
2775         assert_return(IN_SET(clock,
2776                              CLOCK_REALTIME,
2777                              CLOCK_REALTIME_ALARM,
2778                              CLOCK_MONOTONIC,
2779                              CLOCK_BOOTTIME,
2780                              CLOCK_BOOTTIME_ALARM), -EOPNOTSUPP);
2781
2782         if (!dual_timestamp_is_set(&e->timestamp)) {
2783                 /* Implicitly fall back to now() if we never ran
2784                  * before and thus have no cached time. */
2785                 *usec = now(clock);
2786                 return 1;
2787         }
2788
2789         switch (clock) {
2790
2791         case CLOCK_REALTIME:
2792         case CLOCK_REALTIME_ALARM:
2793                 *usec = e->timestamp.realtime;
2794                 break;
2795
2796         case CLOCK_MONOTONIC:
2797                 *usec = e->timestamp.monotonic;
2798                 break;
2799
2800         default:
2801                 *usec = e->timestamp_boottime;
2802                 break;
2803         }
2804
2805         return 0;
2806 }
2807 #endif // 0
2808
2809 _public_ int sd_event_default(sd_event **ret) {
2810
2811         static thread_local sd_event *default_event = NULL;
2812         sd_event *e = NULL;
2813         int r;
2814
2815         if (!ret)
2816                 return !!default_event;
2817
2818         if (default_event) {
2819                 *ret = sd_event_ref(default_event);
2820                 return 0;
2821         }
2822
2823         r = sd_event_new(&e);
2824         if (r < 0)
2825                 return r;
2826
2827         e->default_event_ptr = &default_event;
2828         e->tid = gettid();
2829         default_event = e;
2830
2831         *ret = e;
2832         return 1;
2833 }
2834
2835 #if 0 /// UNNEEDED by elogind
2836 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
2837         assert_return(e, -EINVAL);
2838         assert_return(tid, -EINVAL);
2839         assert_return(!event_pid_changed(e), -ECHILD);
2840
2841         if (e->tid != 0) {
2842                 *tid = e->tid;
2843                 return 0;
2844         }
2845
2846         return -ENXIO;
2847 }
2848 #endif // 0
2849
/* Enables or disables automatic watchdog pings (sd_notify("WATCHDOG=1"))
 * from the event loop, driven by a timerfd hooked into the epoll.
 * Returns the resulting watchdog state, 0 if watchdog support was not
 * requested of us (per sd_watchdog_enabled()), or negative errno. */
_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Already in the requested state? Then nothing to do. */
        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                struct epoll_event ev = {};

                /* Determine the watchdog period; bail out (r <= 0) if the
                 * service manager didn't ask for watchdog notifications. */
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                /* Hook the timerfd into the epoll, tagged with the special
                 * SOURCE_WATCHDOG marker pointer. */
                ev.events = EPOLLIN;
                ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG);

                r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
                if (r < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                /* Tear the watchdog timer down again. */
                if (e->watchdog_fd >= 0) {
                        epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}
2901
2902 #if 0 /// UNNEEDED by elogind
/* Returns whether automatic watchdog pinging is currently enabled. */
_public_ int sd_event_get_watchdog(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->watchdog;
}
2909 #endif // 0