chiark / gitweb /
tree-wide: fall back to now(CLOCK_MONOTONIC) if CLOCK_BOOTTIME unsupported (#3037)
[elogind.git] / src / libelogind / sd-event / sd-event.c
1 /***
2   This file is part of systemd.
3
4   Copyright 2013 Lennart Poettering
5
6   systemd is free software; you can redistribute it and/or modify it
7   under the terms of the GNU Lesser General Public License as published by
8   the Free Software Foundation; either version 2.1 of the License, or
9   (at your option) any later version.
10
11   systemd is distributed in the hope that it will be useful, but
12   WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public License
17   along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <sys/epoll.h>
21 #include <sys/timerfd.h>
22 #include <sys/wait.h>
23
24 #include "sd-daemon.h"
25 #include "sd-event.h"
26 #include "sd-id128.h"
27
28 #include "alloc-util.h"
29 #include "fd-util.h"
30 #include "hashmap.h"
31 #include "list.h"
32 #include "macro.h"
33 #include "missing.h"
34 #include "prioq.h"
35 #include "process-util.h"
36 #include "set.h"
37 #include "signal-util.h"
38 #include "string-table.h"
39 #include "string-util.h"
40 #include "time-util.h"
41 #include "util.h"
42
43 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
44
/* Discriminator for the per-type union inside sd_event_source. The five
 * SOURCE_TIME_* values map 1:1 to kernel clocks (see
 * clock_to_event_source_type() below). */
typedef enum EventSourceType {
        SOURCE_IO,                   /* fd readiness, registered in epoll */
        SOURCE_TIME_REALTIME,        /* CLOCK_REALTIME */
        SOURCE_TIME_BOOTTIME,        /* CLOCK_BOOTTIME */
        SOURCE_TIME_MONOTONIC,       /* CLOCK_MONOTONIC */
        SOURCE_TIME_REALTIME_ALARM,  /* CLOCK_REALTIME_ALARM */
        SOURCE_TIME_BOOTTIME_ALARM,  /* CLOCK_BOOTTIME_ALARM */
        SOURCE_SIGNAL,               /* signalfd-based signal delivery */
        SOURCE_CHILD,                /* child process state change (SIGCHLD driven) */
        SOURCE_DEFER,
        SOURCE_POST,
        SOURCE_EXIT,
        SOURCE_WATCHDOG,             /* loop-internal, backed by watchdog_fd */
        _SOURCE_EVENT_SOURCE_TYPE_MAX,
        _SOURCE_EVENT_SOURCE_TYPE_INVALID = -1
} EventSourceType;
61
62 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
63         [SOURCE_IO] = "io",
64         [SOURCE_TIME_REALTIME] = "realtime",
65         [SOURCE_TIME_BOOTTIME] = "bootime",
66         [SOURCE_TIME_MONOTONIC] = "monotonic",
67         [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
68         [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
69         [SOURCE_SIGNAL] = "signal",
70         [SOURCE_CHILD] = "child",
71         [SOURCE_DEFER] = "defer",
72         [SOURCE_POST] = "post",
73         [SOURCE_EXIT] = "exit",
74         [SOURCE_WATCHDOG] = "watchdog",
75 };
76
77 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
78
/* All objects we use in epoll events start with this value, so that
 * we know how to dispatch it */
typedef enum WakeupType {
        WAKEUP_NONE,
        WAKEUP_EVENT_SOURCE, /* epoll data.ptr points at an sd_event_source */
        WAKEUP_CLOCK_DATA,   /* epoll data.ptr points at a struct clock_data */
        WAKEUP_SIGNAL_DATA,  /* epoll data.ptr points at a struct signal_data */
        _WAKEUP_TYPE_MAX,
        _WAKEUP_TYPE_INVALID = -1,
} WakeupType;

/* True for the five timer source types that are backed by a kernel clock. */
#define EVENT_SOURCE_IS_TIME(t) IN_SET((t), SOURCE_TIME_REALTIME, SOURCE_TIME_BOOTTIME, SOURCE_TIME_MONOTONIC, SOURCE_TIME_REALTIME_ALARM, SOURCE_TIME_BOOTTIME_ALARM)
91
struct sd_event_source {
        WakeupType wakeup;              /* must be first: epoll data.ptr is dispatched on this tag */

        unsigned n_ref;                 /* reference count */

        sd_event *event;                /* owning loop; NULL once disconnected */
        void *userdata;
        sd_event_handler_t prepare;     /* optional pre-poll callback; also used as "in prepare prioq" flag */

        char *description;              /* free-form name for logging, owned */

        EventSourceType type:5;         /* selects the union member below */
        int enabled:3;                  /* SD_EVENT_OFF/ON/ONESHOT */
        bool pending:1;                 /* queued in event->pending prioq */
        bool dispatching:1;             /* currently inside its callback */
        bool floating:1;                /* source holds a ref on the loop, not vice versa */

        int64_t priority;               /* lower value = dispatched earlier */
        unsigned pending_index;         /* index in event->pending prioq */
        unsigned prepare_index;         /* index in event->prepare prioq */
        unsigned pending_iteration;     /* loop iteration when it became pending */
        unsigned prepare_iteration;     /* loop iteration when last prepared */

        LIST_FIELDS(sd_event_source, sources); /* membership in event->sources list */

        /* Per-type payload, selected by 'type' above */
        union {
                struct {
                        sd_event_io_handler_t callback;
                        int fd;
                        uint32_t events;        /* requested EPOLL* mask */
                        uint32_t revents;       /* events seen in the last poll */
                        bool registered:1;      /* currently added to epoll */
                } io;
                struct {
                        sd_event_time_handler_t callback;
                        usec_t next, accuracy;  /* earliest dispatch time and allowed slack */
                        unsigned earliest_index; /* index in clock_data->earliest */
                        unsigned latest_index;   /* index in clock_data->latest */
                } time;
                struct {
                        sd_event_signal_handler_t callback;
                        struct signalfd_siginfo siginfo; /* last-read signal info */
                        int sig;
                } signal;
                struct {
                        sd_event_child_handler_t callback;
                        siginfo_t siginfo;
                        pid_t pid;
                        int options;    /* waitid() options */
                } child;
                struct {
                        sd_event_handler_t callback;
                } defer;
                struct {
                        sd_event_handler_t callback;
                } post;
                struct {
                        sd_event_handler_t callback;
                        unsigned prioq_index;   /* index in event->exit prioq */
                } exit;
        };
};
154
/* Per-clock state embedded in struct sd_event (one instance per supported
 * kernel clock). */
struct clock_data {
        WakeupType wakeup;      /* always WAKEUP_CLOCK_DATA; epoll dispatch tag */
        int fd;                 /* timerfd for this clock, -1 until first use */

        /* For all clocks we maintain two priority queues each, one
         * ordered for the earliest times the events may be
         * dispatched, and one ordered by the latest times they must
         * have been dispatched. The range between the top entries in
         * the two prioqs is the time window we can freely schedule
         * wakeups in */

        Prioq *earliest;
        Prioq *latest;
        usec_t next;            /* time currently armed on the timerfd; USEC_INFINITY when unset */

        bool needs_rearm:1;     /* queues changed; timerfd must be re-armed */
};
172
struct signal_data {
        WakeupType wakeup;      /* always WAKEUP_SIGNAL_DATA; epoll dispatch tag */

        /* For each priority we maintain one signal fd, so that we
         * only have to dequeue a single event per priority at a
         * time. */

        int fd;                 /* signalfd carrying the signals in 'sigset' */
        int64_t priority;       /* hashmap key (sd_event.signal_data is keyed by this) */
        sigset_t sigset;        /* signals currently routed through this fd */
        sd_event_source *current; /* source whose signal was read but not yet dispatched */
};
185
struct sd_event {
        unsigned n_ref;                 /* reference count */

        int epoll_fd;                   /* central epoll instance */
        int watchdog_fd;                /* timerfd used for sd_notify() watchdog pings, or -1 */

        Prioq *pending;                 /* sources with unhandled events, by dispatch order */
        Prioq *prepare;                 /* sources with a prepare() callback */

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;                 /* boot-ID-derived wakeup offset; USEC_INFINITY until initialized */

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;          /* pid -> SOURCE_CHILD source */
        unsigned n_enabled_child_sources; /* how many of those are not SD_EVENT_OFF */

        Set *post_sources;

        Prioq *exit;                     /* SOURCE_EXIT sources, by priority */

        pid_t original_pid;              /* pid at creation, to detect use across fork() */

        unsigned iteration;              /* monotonically increasing loop counter */
        dual_timestamp timestamp;        /* realtime+monotonic stamp of current wakeup */
        usec_t timestamp_boottime;       /* boottime stamp of current wakeup */
        int state;                       /* SD_EVENT_* loop state */

        bool exit_requested:1;
        bool need_process_child:1;       /* SIGCHLD seen; reap children next iteration */
        bool watchdog:1;                 /* watchdog logic enabled */
        bool profile_delays:1;           /* SD_EVENT_PROFILE_DELAYS histogram logging */

        int exit_code;                   /* value passed to sd_event_exit() */

        pid_t tid;                       /* thread the loop is attached to, if any */
        sd_event **default_event_ptr;    /* back-pointer to clear the per-thread default on free */

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;              /* sources connected to this loop */

        LIST_HEAD(sd_event_source, sources);

        usec_t last_run, last_log;       /* used by the delay-profiling code */
        unsigned delays[sizeof(usec_t) * 8]; /* log2 histogram of iteration latencies */
};
242
243 static void source_disconnect(sd_event_source *s);
244
245 static int pending_prioq_compare(const void *a, const void *b) {
246         const sd_event_source *x = a, *y = b;
247
248         assert(x->pending);
249         assert(y->pending);
250
251         /* Enabled ones first */
252         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
253                 return -1;
254         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
255                 return 1;
256
257         /* Lower priority values first */
258         if (x->priority < y->priority)
259                 return -1;
260         if (x->priority > y->priority)
261                 return 1;
262
263         /* Older entries first */
264         if (x->pending_iteration < y->pending_iteration)
265                 return -1;
266         if (x->pending_iteration > y->pending_iteration)
267                 return 1;
268
269         return 0;
270 }
271
272 static int prepare_prioq_compare(const void *a, const void *b) {
273         const sd_event_source *x = a, *y = b;
274
275         assert(x->prepare);
276         assert(y->prepare);
277
278         /* Enabled ones first */
279         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
280                 return -1;
281         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
282                 return 1;
283
284         /* Move most recently prepared ones last, so that we can stop
285          * preparing as soon as we hit one that has already been
286          * prepared in the current iteration */
287         if (x->prepare_iteration < y->prepare_iteration)
288                 return -1;
289         if (x->prepare_iteration > y->prepare_iteration)
290                 return 1;
291
292         /* Lower priority values first */
293         if (x->priority < y->priority)
294                 return -1;
295         if (x->priority > y->priority)
296                 return 1;
297
298         return 0;
299 }
300
301 static int earliest_time_prioq_compare(const void *a, const void *b) {
302         const sd_event_source *x = a, *y = b;
303
304         assert(EVENT_SOURCE_IS_TIME(x->type));
305         assert(x->type == y->type);
306
307         /* Enabled ones first */
308         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
309                 return -1;
310         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
311                 return 1;
312
313         /* Move the pending ones to the end */
314         if (!x->pending && y->pending)
315                 return -1;
316         if (x->pending && !y->pending)
317                 return 1;
318
319         /* Order by time */
320         if (x->time.next < y->time.next)
321                 return -1;
322         if (x->time.next > y->time.next)
323                 return 1;
324
325         return 0;
326 }
327
/* Latest time by which a timer source should have fired: its requested
 * dispatch time plus its accuracy slack. */
static usec_t time_event_source_latest(const sd_event_source *s) {
        return usec_add(s->time.next, s->time.accuracy);
}
331
332 static int latest_time_prioq_compare(const void *a, const void *b) {
333         const sd_event_source *x = a, *y = b;
334
335         assert(EVENT_SOURCE_IS_TIME(x->type));
336         assert(x->type == y->type);
337
338         /* Enabled ones first */
339         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
340                 return -1;
341         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
342                 return 1;
343
344         /* Move the pending ones to the end */
345         if (!x->pending && y->pending)
346                 return -1;
347         if (x->pending && !y->pending)
348                 return 1;
349
350         /* Order by time */
351         if (time_event_source_latest(x) < time_event_source_latest(y))
352                 return -1;
353         if (time_event_source_latest(x) > time_event_source_latest(y))
354                 return 1;
355
356         return 0;
357 }
358
359 static int exit_prioq_compare(const void *a, const void *b) {
360         const sd_event_source *x = a, *y = b;
361
362         assert(x->type == SOURCE_EXIT);
363         assert(y->type == SOURCE_EXIT);
364
365         /* Enabled ones first */
366         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
367                 return -1;
368         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
369                 return 1;
370
371         /* Lower priority values first */
372         if (x->priority < y->priority)
373                 return -1;
374         if (x->priority > y->priority)
375                 return 1;
376
377         return 0;
378 }
379
/* Releases the timerfd and the two scheduling prioqs of one clock. The
 * clock_data struct itself is embedded in sd_event and not freed here. */
static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}
388
/* Tears down the whole loop object. Reached from sd_event_unref() when the
 * refcount hits zero and from sd_event_new()'s failure path. Only floating
 * sources may still be connected here: non-floating ones hold a reference
 * on the loop, so their existence would have kept the refcount above zero. */
static void event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        /* Detach and drop the remaining (floating) sources */
        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        /* If we were installed as the per-thread default loop, clear that slot */
        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);
        free(e);
}
425
/* Allocates a new event loop object with a fresh epoll instance.
 * Returns 0 on success and stores the loop (refcount 1) in *ret;
 * returns -ENOMEM or -errno from epoll_create1() on failure. */
_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new0(sd_event, 1);
        if (!e)
                return -ENOMEM;

        e->n_ref = 1;
        /* Mark all fds as "not open" and all clocks as "nothing armed yet" */
        e->watchdog_fd = e->epoll_fd = e->realtime.fd = e->boottime.fd = e->monotonic.fd = e->realtime_alarm.fd = e->boottime_alarm.fd = -1;
        e->realtime.next = e->boottime.next = e->monotonic.next = e->realtime_alarm.next = e->boottime_alarm.next = USEC_INFINITY;
        e->realtime.wakeup = e->boottime.wakeup = e->monotonic.wakeup = e->realtime_alarm.wakeup = e->boottime_alarm.wakeup = WAKEUP_CLOCK_DATA;
        e->original_pid = getpid();
        e->perturb = USEC_INFINITY; /* computed lazily in initialize_perturb() */

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 ... 2^63 us will be logged every 5s.");
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}
465
466 _public_ sd_event* sd_event_ref(sd_event *e) {
467
468         if (!e)
469                 return NULL;
470
471         assert(e->n_ref >= 1);
472         e->n_ref++;
473
474         return e;
475 }
476
477 _public_ sd_event* sd_event_unref(sd_event *e) {
478
479         if (!e)
480                 return NULL;
481
482         assert(e->n_ref >= 1);
483         e->n_ref--;
484
485         if (e->n_ref <= 0)
486                 event_free(e);
487
488         return NULL;
489 }
490
/* Returns true if the calling process is not the one that created the
 * loop, i.e. we are on the child side of a fork(). */
static bool event_pid_changed(sd_event *e) {
        assert(e);

        /* We don't support people creating an event loop and keeping
         * it around over a fork(). Let's complain. */

        return e->original_pid != getpid();
}
499
500 static void source_io_unregister(sd_event_source *s) {
501         int r;
502
503         assert(s);
504         assert(s->type == SOURCE_IO);
505
506         if (event_pid_changed(s->event))
507                 return;
508
509         if (!s->io.registered)
510                 return;
511
512         r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
513         if (r < 0)
514                 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m",
515                                 strna(s->description), event_source_type_to_string(s->type));
516
517         s->io.registered = false;
518 }
519
520 static int source_io_register(
521                 sd_event_source *s,
522                 int enabled,
523                 uint32_t events) {
524
525         struct epoll_event ev = {};
526         int r;
527
528         assert(s);
529         assert(s->type == SOURCE_IO);
530         assert(enabled != SD_EVENT_OFF);
531
532         ev.events = events;
533         ev.data.ptr = s;
534
535         if (enabled == SD_EVENT_ONESHOT)
536                 ev.events |= EPOLLONESHOT;
537
538         if (s->io.registered)
539                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
540         else
541                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
542         if (r < 0)
543                 return -errno;
544
545         s->io.registered = true;
546
547         return 0;
548 }
549
#if 0 /// UNNEEDED by elogind
/* Inverse of clock_to_event_source_type(): maps a timer event source type
 * back to the kernel clock it is driven by, or (clockid_t) -1 for
 * non-timer types. Compiled out in elogind. */
static clockid_t event_source_type_to_clock(EventSourceType t) {

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return CLOCK_REALTIME;

        case SOURCE_TIME_BOOTTIME:
                return CLOCK_BOOTTIME;

        case SOURCE_TIME_MONOTONIC:
                return CLOCK_MONOTONIC;

        case SOURCE_TIME_REALTIME_ALARM:
                return CLOCK_REALTIME_ALARM;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return CLOCK_BOOTTIME_ALARM;

        default:
                return (clockid_t) -1;
        }
}
#endif // 0
575
576 static EventSourceType clock_to_event_source_type(clockid_t clock) {
577
578         switch (clock) {
579
580         case CLOCK_REALTIME:
581                 return SOURCE_TIME_REALTIME;
582
583         case CLOCK_BOOTTIME:
584                 return SOURCE_TIME_BOOTTIME;
585
586         case CLOCK_MONOTONIC:
587                 return SOURCE_TIME_MONOTONIC;
588
589         case CLOCK_REALTIME_ALARM:
590                 return SOURCE_TIME_REALTIME_ALARM;
591
592         case CLOCK_BOOTTIME_ALARM:
593                 return SOURCE_TIME_BOOTTIME_ALARM;
594
595         default:
596                 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
597         }
598 }
599
600 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
601         assert(e);
602
603         switch (t) {
604
605         case SOURCE_TIME_REALTIME:
606                 return &e->realtime;
607
608         case SOURCE_TIME_BOOTTIME:
609                 return &e->boottime;
610
611         case SOURCE_TIME_MONOTONIC:
612                 return &e->monotonic;
613
614         case SOURCE_TIME_REALTIME_ALARM:
615                 return &e->realtime_alarm;
616
617         case SOURCE_TIME_BOOTTIME_ALARM:
618                 return &e->boottime_alarm;
619
620         default:
621                 return NULL;
622         }
623 }
624
/* Ensures there is a signal_data object (and signalfd) covering signal
 * 'sig' at the priority of its event source (or priority 0 if there is
 * none, e.g. for SIGCHLD handling). Creates the per-priority object and
 * registers its fd in epoll on first use. On success stores the object in
 * *ret (if non-NULL) and returns 0; returns a negative errno on failure,
 * undoing a freshly created object. */
static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct epoll_event ev = {};
        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_pid_changed(e))
                return -ECHILD;

        /* Signals are grouped by the priority of their source; signals
         * without a source (SIGCHLD bookkeeping) live at priority 0. */
        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = 0;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                /* Already covered by this priority's signalfd? Then we are done. */
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                return 0;
                }
        } else {
                /* First signal at this priority: create the signal_data object */
                r = hashmap_ensure_allocated(&e->signal_data, &uint64_hash_ops);
                if (r < 0)
                        return r;

                d = new0(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                d->wakeup = WAKEUP_SIGNAL_DATA;
                d->fd  = -1;
                d->priority = priority;

                r = hashmap_put(e->signal_data, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        /* Extend the signal mask on a copy first, so that d->sigset is only
         * updated once the kernel accepted the new mask */
        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        /* Creates a new signalfd if d->fd is -1, otherwise updates the mask */
        r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        /* If the fd already existed it is already in epoll — done. */
        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        d->fd = r;

        /* Freshly created fd: hook it into the epoll set */
        ev.events = EPOLLIN;
        ev.data.ptr = d;

        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        /* Only roll back objects we created in this call */
        if (added) {
                d->fd = safe_close(d->fd);
                hashmap_remove(e->signal_data, &d->priority);
                free(d);
        }

        return r;
}
718
/* Removes signal 'sig' from a signal_data object's mask, updating the
 * kernel-side signalfd accordingly. */
static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty that
         * way removes it. */

        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {

                /* If all the mask is all-zero we can get rid of the structure */
                hashmap_remove(e->signal_data, &d->priority);
                assert(!d->current);
                safe_close(d->fd);
                free(d);
                return;
        }

        assert(d->fd >= 0);

        /* Mask still non-empty: narrow the existing signalfd in place */
        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}
747
/* Drops signal 'sig' from every signal_data object that might still carry
 * it, unless some source (or enabled child source, for SIGCHLD) still
 * needs it. 'priority' optionally names an additional priority queue the
 * signal may have been registered under. */
static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are
         * interested in. If not, we'll unmask it, and possibly drop
         * the signalfd for it. */

        if (sig == SIGCHLD &&
            e->n_enabled_child_sources > 0)
                return;

        if (e->signal_sources &&
            e->signal_sources[sig] &&
            e->signal_sources[sig]->enabled != SD_EVENT_OFF)
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}
793
/* Detaches a source from its event loop: deregisters its kernel-side
 * resources per type, pulls it out of all queues, and drops the loop
 * reference it held (unless floating). Safe to call on an already
 * disconnected source. Does NOT free the source itself. */
static void source_disconnect(sd_event_source *s) {
        sd_event *event;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM: {
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_remove(d->earliest, s, &s->time.earliest_index);
                prioq_remove(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true; /* timerfd may now be armed too early/late */
                break;
        }

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        /* Possibly drop the signal from the per-priority signalfds */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                }

                break;

        case SOURCE_CHILD:
                if (s->child.pid > 0) {
                        if (s->enabled != SD_EVENT_OFF) {
                                assert(s->event->n_enabled_child_sources > 0);
                                s->event->n_enabled_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                        /* SIGCHLD handling may no longer be needed */
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                }

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        default:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        /* Common queues all source types may sit in */
        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        event = s->event;

        s->type = _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        s->event = NULL;
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Non-floating sources hold a ref on the loop; give it back last,
         * since this may free the loop */
        if (!s->floating)
                sd_event_unref(event);
}
884
/* Disconnects the source from its loop and releases its memory. Used on
 * construction failure paths (e.g. sd_event_add_io()) and when the last
 * reference is dropped. */
static void source_free(sd_event_source *s) {
        assert(s);

        source_disconnect(s);
        free(s->description);
        free(s);
}
892
/* Marks a source as having (b=true) or no longer having (b=false) an
 * undispatched event, keeping the 'pending' prioq and the per-clock
 * queues in sync. No-op if the flag already has the requested value.
 * Returns 0 or -ENOMEM from the prioq insertion. Not used for
 * SOURCE_EXIT, which is dispatched through its own queue. */
static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT);

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false; /* roll back flag on OOM */
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        /* Pending-ness is part of the timer queues' ordering, so reshuffle */
        if (EVENT_SOURCE_IS_TIME(s->type)) {
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                prioq_reshuffle(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
        }

        /* When a signal source stops being pending, it is no longer the
         * "current" source of its signal_data object */
        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        return 0;
}
936
937 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
938         sd_event_source *s;
939
940         assert(e);
941
942         s = new0(sd_event_source, 1);
943         if (!s)
944                 return NULL;
945
946         s->n_ref = 1;
947         s->event = e;
948         s->floating = floating;
949         s->type = type;
950         s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;
951
952         if (!floating)
953                 sd_event_ref(e);
954
955         LIST_PREPEND(sources, e->sources, s);
956         e->n_sources ++;
957
958         return s;
959 }
960
961 _public_ int sd_event_add_io(
962                 sd_event *e,
963                 sd_event_source **ret,
964                 int fd,
965                 uint32_t events,
966                 sd_event_io_handler_t callback,
967                 void *userdata) {
968
969         sd_event_source *s;
970         int r;
971
972         assert_return(e, -EINVAL);
973         assert_return(fd >= 0, -EBADF);
974         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
975         assert_return(callback, -EINVAL);
976         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
977         assert_return(!event_pid_changed(e), -ECHILD);
978
979         s = source_new(e, !ret, SOURCE_IO);
980         if (!s)
981                 return -ENOMEM;
982
983         s->wakeup = WAKEUP_EVENT_SOURCE;
984         s->io.fd = fd;
985         s->io.events = events;
986         s->io.callback = callback;
987         s->userdata = userdata;
988         s->enabled = SD_EVENT_ON;
989
990         r = source_io_register(s, s->enabled, events);
991         if (r < 0) {
992                 source_free(s);
993                 return r;
994         }
995
996         if (ret)
997                 *ret = s;
998
999         return 0;
1000 }
1001
/* Lazily computes a per-machine wakeup perturbation offset, derived
 * from the boot ID, used by sleep_between() to spread coalesced timer
 * wakeups across machines. */
static void initialize_perturb(sd_event *e) {
        sd_id128_t bootid = {};

        /* When we sleep for longer, we try to realign the wakeup to
           the same time within each minute/second/250ms, so that
           events all across the system can be coalesced into a single
           CPU wakeup. However, let's take some system-specific
           randomness for this value, so that in a network of systems
           with synced clocks timer events are distributed a
           bit. Here, we calculate a perturbation usec offset from the
           boot ID. */

        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        /* If the boot ID can't be read, e->perturb stays at
         * USEC_INFINITY and we simply retry on the next call. */
        if (sd_id128_get_boot(&bootid) >= 0)
                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
}
1020
1021 static int event_setup_timer_fd(
1022                 sd_event *e,
1023                 struct clock_data *d,
1024                 clockid_t clock) {
1025
1026         struct epoll_event ev = {};
1027         int r, fd;
1028
1029         assert(e);
1030         assert(d);
1031
1032         if (_likely_(d->fd >= 0))
1033                 return 0;
1034
1035         fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1036         if (fd < 0)
1037                 return -errno;
1038
1039         ev.events = EPOLLIN;
1040         ev.data.ptr = d;
1041
1042         r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
1043         if (r < 0) {
1044                 safe_close(fd);
1045                 return -errno;
1046         }
1047
1048         d->fd = fd;
1049         return 0;
1050 }
1051
1052 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1053         assert(s);
1054
1055         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1056 }
1057
/* Adds a timer event source elapsing at the absolute time "usec" on
 * the given clock, with the given accuracy window (0 selects
 * DEFAULT_ACCURACY_USEC). If no callback is supplied the loop exits
 * when the timer fires. The source is created in SD_EVENT_ONESHOT
 * mode. Returns 0 on success, negative errno on failure. */
_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        sd_event_source *s;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(accuracy != (uint64_t) -1, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = time_exit_callback;

        /* Unsupported clocks map to a negative type here. */
        type = clock_to_event_source_type(clock);
        assert_return(type >= 0, -EOPNOTSUPP);

        d = event_get_clock_data(e, type);
        assert(d);

        /* Make sure the per-clock priority queues exist before we
         * allocate the source, so the fail path stays simple. */
        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        /* Lazily create the timerfd for this clock. */
        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* The timerfd must be re-armed now that the queues change. */
        d->needs_rearm = true;

        r = prioq_put(d->earliest, s, &s->time.earliest_index);
        if (r < 0)
                goto fail;

        r = prioq_put(d->latest, s, &s->time.latest_index);
        if (r < 0)
                goto fail;

        if (ret)
                *ret = s;

        return 0;

fail:
        /* source_free() also removes the source from any prioq it was
         * already inserted into. */
        source_free(s);
        return r;
}
1130
1131 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1132         assert(s);
1133
1134         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1135 }
1136
1137 _public_ int sd_event_add_signal(
1138                 sd_event *e,
1139                 sd_event_source **ret,
1140                 int sig,
1141                 sd_event_signal_handler_t callback,
1142                 void *userdata) {
1143
1144         sd_event_source *s;
1145         struct signal_data *d;
1146         sigset_t ss;
1147         int r;
1148
1149         assert_return(e, -EINVAL);
1150         assert_return(SIGNAL_VALID(sig), -EINVAL);
1151         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1152         assert_return(!event_pid_changed(e), -ECHILD);
1153
1154         if (!callback)
1155                 callback = signal_exit_callback;
1156
1157         r = pthread_sigmask(SIG_SETMASK, NULL, &ss);
1158         if (r != 0)
1159                 return -r;
1160
1161         if (!sigismember(&ss, sig))
1162                 return -EBUSY;
1163
1164         if (!e->signal_sources) {
1165                 e->signal_sources = new0(sd_event_source*, _NSIG);
1166                 if (!e->signal_sources)
1167                         return -ENOMEM;
1168         } else if (e->signal_sources[sig])
1169                 return -EBUSY;
1170
1171         s = source_new(e, !ret, SOURCE_SIGNAL);
1172         if (!s)
1173                 return -ENOMEM;
1174
1175         s->signal.sig = sig;
1176         s->signal.callback = callback;
1177         s->userdata = userdata;
1178         s->enabled = SD_EVENT_ON;
1179
1180         e->signal_sources[sig] = s;
1181
1182         r = event_make_signal_data(e, sig, &d);
1183                 if (r < 0) {
1184                         source_free(s);
1185                         return r;
1186                 }
1187
1188         /* Use the signal name as description for the event source by default */
1189         (void) sd_event_source_set_description(s, signal_to_string(sig));
1190
1191         if (ret)
1192                 *ret = s;
1193
1194         return 0;
1195 }
1196
1197 #if 0 /// UNNEEDED by elogind
1198 _public_ int sd_event_add_child(
1199                 sd_event *e,
1200                 sd_event_source **ret,
1201                 pid_t pid,
1202                 int options,
1203                 sd_event_child_handler_t callback,
1204                 void *userdata) {
1205
1206         sd_event_source *s;
1207         int r;
1208
1209         assert_return(e, -EINVAL);
1210         assert_return(pid > 1, -EINVAL);
1211         assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1212         assert_return(options != 0, -EINVAL);
1213         assert_return(callback, -EINVAL);
1214         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1215         assert_return(!event_pid_changed(e), -ECHILD);
1216
1217         r = hashmap_ensure_allocated(&e->child_sources, NULL);
1218         if (r < 0)
1219                 return r;
1220
1221         if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1222                 return -EBUSY;
1223
1224         s = source_new(e, !ret, SOURCE_CHILD);
1225         if (!s)
1226                 return -ENOMEM;
1227
1228         s->child.pid = pid;
1229         s->child.options = options;
1230         s->child.callback = callback;
1231         s->userdata = userdata;
1232         s->enabled = SD_EVENT_ONESHOT;
1233
1234         r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1235         if (r < 0) {
1236                 source_free(s);
1237                 return r;
1238         }
1239
1240         e->n_enabled_child_sources ++;
1241
1242         r = event_make_signal_data(e, SIGCHLD, NULL);
1243                 if (r < 0) {
1244                 e->n_enabled_child_sources--;
1245                         source_free(s);
1246                         return r;
1247                 }
1248
1249         e->need_process_child = true;
1250
1251         if (ret)
1252                 *ret = s;
1253
1254         return 0;
1255 }
1256
1257 _public_ int sd_event_add_defer(
1258                 sd_event *e,
1259                 sd_event_source **ret,
1260                 sd_event_handler_t callback,
1261                 void *userdata) {
1262
1263         sd_event_source *s;
1264         int r;
1265
1266         assert_return(e, -EINVAL);
1267         assert_return(callback, -EINVAL);
1268         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1269         assert_return(!event_pid_changed(e), -ECHILD);
1270
1271         s = source_new(e, !ret, SOURCE_DEFER);
1272         if (!s)
1273                 return -ENOMEM;
1274
1275         s->defer.callback = callback;
1276         s->userdata = userdata;
1277         s->enabled = SD_EVENT_ONESHOT;
1278
1279         r = source_set_pending(s, true);
1280         if (r < 0) {
1281                 source_free(s);
1282                 return r;
1283         }
1284
1285         if (ret)
1286                 *ret = s;
1287
1288         return 0;
1289 }
1290 #endif // 0
1291
1292 _public_ int sd_event_add_post(
1293                 sd_event *e,
1294                 sd_event_source **ret,
1295                 sd_event_handler_t callback,
1296                 void *userdata) {
1297
1298         sd_event_source *s;
1299         int r;
1300
1301         assert_return(e, -EINVAL);
1302         assert_return(callback, -EINVAL);
1303         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1304         assert_return(!event_pid_changed(e), -ECHILD);
1305
1306         r = set_ensure_allocated(&e->post_sources, NULL);
1307         if (r < 0)
1308                 return r;
1309
1310         s = source_new(e, !ret, SOURCE_POST);
1311         if (!s)
1312                 return -ENOMEM;
1313
1314         s->post.callback = callback;
1315         s->userdata = userdata;
1316         s->enabled = SD_EVENT_ON;
1317
1318         r = set_put(e->post_sources, s);
1319         if (r < 0) {
1320                 source_free(s);
1321                 return r;
1322         }
1323
1324         if (ret)
1325                 *ret = s;
1326
1327         return 0;
1328 }
1329
1330 _public_ int sd_event_add_exit(
1331                 sd_event *e,
1332                 sd_event_source **ret,
1333                 sd_event_handler_t callback,
1334                 void *userdata) {
1335
1336         sd_event_source *s;
1337         int r;
1338
1339         assert_return(e, -EINVAL);
1340         assert_return(callback, -EINVAL);
1341         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1342         assert_return(!event_pid_changed(e), -ECHILD);
1343
1344         r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1345         if (r < 0)
1346                 return r;
1347
1348         s = source_new(e, !ret, SOURCE_EXIT);
1349         if (!s)
1350                 return -ENOMEM;
1351
1352         s->exit.callback = callback;
1353         s->userdata = userdata;
1354         s->exit.prioq_index = PRIOQ_IDX_NULL;
1355         s->enabled = SD_EVENT_ONESHOT;
1356
1357         r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1358         if (r < 0) {
1359                 source_free(s);
1360                 return r;
1361         }
1362
1363         if (ret)
1364                 *ret = s;
1365
1366         return 0;
1367 }
1368
1369 #if 0 /// UNNEEDED by elogind
1370 _public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
1371
1372         if (!s)
1373                 return NULL;
1374
1375         assert(s->n_ref >= 1);
1376         s->n_ref++;
1377
1378         return s;
1379 }
1380 #endif // 0
1381
/* Drops a reference; when the last one is dropped the source is freed
 * — unless it is currently being dispatched, see the hack below.
 * Always returns NULL so callers can write s = sd_event_source_unref(s). */
_public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {

        if (!s)
                return NULL;

        assert(s->n_ref >= 1);
        s->n_ref--;

        if (s->n_ref <= 0) {
                /* Here's a special hack: when we are called from a
                 * dispatch handler we won't free the event source
                 * immediately, but we will detach the fd from the
                 * epoll. This way it is safe for the caller to unref
                 * the event source and immediately close the fd, but
                 * we still retain a valid event source object after
                 * the callback. */

                if (s->dispatching) {
                        if (s->type == SOURCE_IO)
                                source_io_unregister(s);

                        source_disconnect(s);
                } else
                        source_free(s);
        }

        return NULL;
}
1410
/* Sets (or, with NULL, clears) the human-readable description of the
 * source. free_and_strdup() replaces the previous string. */
_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
        assert_return(s, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return free_and_strdup(&s->description, description);
}
1417
1418 #if 0 /// UNNEEDED by elogind
/* Returns the source's description, or -ENXIO if none was ever set.
 * The returned string stays owned by the source. */
_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
        assert_return(s, -EINVAL);
        assert_return(description, -EINVAL);
        assert_return(s->description, -ENXIO);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *description = s->description;
        return 0;
}
1428 #endif // 0
1429
/* Returns the event loop the source is attached to (no new reference
 * is taken), or NULL if s is NULL. */
_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
        assert_return(s, NULL);

        return s->event;
}
1435
1436 #if 0 /// UNNEEDED by elogind
/* Returns whether the source is currently queued for dispatching.
 * Exit sources have no pending state, hence -EDOM. */
_public_ int sd_event_source_get_pending(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->pending;
}
1445
/* Returns the fd an I/O source watches; -EDOM for any other source
 * type. */
_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->io.fd;
}
1453 #endif // 0
1454
1455 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
1456         int r;
1457
1458         assert_return(s, -EINVAL);
1459         assert_return(fd >= 0, -EBADF);
1460         assert_return(s->type == SOURCE_IO, -EDOM);
1461         assert_return(!event_pid_changed(s->event), -ECHILD);
1462
1463         if (s->io.fd == fd)
1464                 return 0;
1465
1466         if (s->enabled == SD_EVENT_OFF) {
1467                 s->io.fd = fd;
1468                 s->io.registered = false;
1469         } else {
1470                 int saved_fd;
1471
1472                 saved_fd = s->io.fd;
1473                 assert(s->io.registered);
1474
1475                 s->io.fd = fd;
1476                 s->io.registered = false;
1477
1478                 r = source_io_register(s, s->enabled, s->io.events);
1479                 if (r < 0) {
1480                         s->io.fd = saved_fd;
1481                         s->io.registered = true;
1482                         return r;
1483                 }
1484
1485                 epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
1486         }
1487
1488         return 0;
1489 }
1490
1491 #if 0 /// UNNEEDED by elogind
/* Returns the epoll event mask an I/O source is configured for. */
_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
        assert_return(s, -EINVAL);
        assert_return(events, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *events = s->io.events;
        return 0;
}
1501 #endif // 0
1502
1503 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
1504         int r;
1505
1506         assert_return(s, -EINVAL);
1507         assert_return(s->type == SOURCE_IO, -EDOM);
1508         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1509         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1510         assert_return(!event_pid_changed(s->event), -ECHILD);
1511
1512         /* edge-triggered updates are never skipped, so we can reset edges */
1513         if (s->io.events == events && !(events & EPOLLET))
1514                 return 0;
1515
1516         if (s->enabled != SD_EVENT_OFF) {
1517                 r = source_io_register(s, s->enabled, events);
1518                 if (r < 0)
1519                         return r;
1520         }
1521
1522         s->io.events = events;
1523         source_set_pending(s, false);
1524
1525         return 0;
1526 }
1527
1528 #if 0 /// UNNEEDED by elogind
/* Returns the events that actually triggered for a pending I/O source;
 * -ENODATA when the source isn't pending. */
_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
        assert_return(s, -EINVAL);
        assert_return(revents, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(s->pending, -ENODATA);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *revents = s->io.revents;
        return 0;
}
1539
/* Returns the signal number a signal source watches; -EDOM for any
 * other source type. */
_public_ int sd_event_source_get_signal(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->signal.sig;
}
1547
1548 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
1549         assert_return(s, -EINVAL);
1550         assert_return(!event_pid_changed(s->event), -ECHILD);
1551
1552         return s->priority;
1553 }
1554 #endif // 0
1555
1556 _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
1557         int r;
1558
1559         assert_return(s, -EINVAL);
1560         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1561         assert_return(!event_pid_changed(s->event), -ECHILD);
1562
1563         if (s->priority == priority)
1564                 return 0;
1565
1566         if (s->type == SOURCE_SIGNAL && s->enabled != SD_EVENT_OFF) {
1567                 struct signal_data *old, *d;
1568
1569                 /* Move us from the signalfd belonging to the old
1570                  * priority to the signalfd of the new priority */
1571
1572                 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
1573
1574                 s->priority = priority;
1575
1576                 r = event_make_signal_data(s->event, s->signal.sig, &d);
1577                 if (r < 0) {
1578                         s->priority = old->priority;
1579                         return r;
1580                 }
1581
1582                 event_unmask_signal_data(s->event, old, s->signal.sig);
1583         } else
1584         s->priority = priority;
1585
1586         if (s->pending)
1587                 prioq_reshuffle(s->event->pending, s, &s->pending_index);
1588
1589         if (s->prepare)
1590                 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
1591
1592         if (s->type == SOURCE_EXIT)
1593                 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1594
1595         return 0;
1596 }
1597
1598 #if 0 /// UNNEEDED by elogind
/* Returns the source's enable state (SD_EVENT_OFF, SD_EVENT_ON or
 * SD_EVENT_ONESHOT) via the out-parameter. */
_public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
        assert_return(s, -EINVAL);
        assert_return(m, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *m = s->enabled;
        return 0;
}
1607 #endif // 0
1608
1609 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
1610         int r;
1611
1612         assert_return(s, -EINVAL);
1613         assert_return(m == SD_EVENT_OFF || m == SD_EVENT_ON || m == SD_EVENT_ONESHOT, -EINVAL);
1614         assert_return(!event_pid_changed(s->event), -ECHILD);
1615
1616         /* If we are dead anyway, we are fine with turning off
1617          * sources, but everything else needs to fail. */
1618         if (s->event->state == SD_EVENT_FINISHED)
1619                 return m == SD_EVENT_OFF ? 0 : -ESTALE;
1620
1621         if (s->enabled == m)
1622                 return 0;
1623
1624         if (m == SD_EVENT_OFF) {
1625
1626                 switch (s->type) {
1627
1628                 case SOURCE_IO:
1629                         source_io_unregister(s);
1630                         s->enabled = m;
1631                         break;
1632
1633                 case SOURCE_TIME_REALTIME:
1634                 case SOURCE_TIME_BOOTTIME:
1635                 case SOURCE_TIME_MONOTONIC:
1636                 case SOURCE_TIME_REALTIME_ALARM:
1637                 case SOURCE_TIME_BOOTTIME_ALARM: {
1638                         struct clock_data *d;
1639
1640                         s->enabled = m;
1641                         d = event_get_clock_data(s->event, s->type);
1642                         assert(d);
1643
1644                         prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
1645                         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1646                         d->needs_rearm = true;
1647                         break;
1648                 }
1649
1650                 case SOURCE_SIGNAL:
1651                         s->enabled = m;
1652
1653                         event_gc_signal_data(s->event, &s->priority, s->signal.sig);
1654                         break;
1655
1656                 case SOURCE_CHILD:
1657                         s->enabled = m;
1658
1659                         assert(s->event->n_enabled_child_sources > 0);
1660                         s->event->n_enabled_child_sources--;
1661
1662                         event_gc_signal_data(s->event, &s->priority, SIGCHLD);
1663                         break;
1664
1665                 case SOURCE_EXIT:
1666                         s->enabled = m;
1667                         prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1668                         break;
1669
1670                 case SOURCE_DEFER:
1671                 case SOURCE_POST:
1672                         s->enabled = m;
1673                         break;
1674
1675                 default:
1676                         assert_not_reached("Wut? I shouldn't exist.");
1677                 }
1678
1679         } else {
1680                 switch (s->type) {
1681
1682                 case SOURCE_IO:
1683                         r = source_io_register(s, m, s->io.events);
1684                         if (r < 0)
1685                                 return r;
1686
1687                         s->enabled = m;
1688                         break;
1689
1690                 case SOURCE_TIME_REALTIME:
1691                 case SOURCE_TIME_BOOTTIME:
1692                 case SOURCE_TIME_MONOTONIC:
1693                 case SOURCE_TIME_REALTIME_ALARM:
1694                 case SOURCE_TIME_BOOTTIME_ALARM: {
1695                         struct clock_data *d;
1696
1697                         s->enabled = m;
1698                         d = event_get_clock_data(s->event, s->type);
1699                         assert(d);
1700
1701                         prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
1702                         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1703                         d->needs_rearm = true;
1704                         break;
1705                 }
1706
1707                 case SOURCE_SIGNAL:
1708
1709                         s->enabled = m;
1710
1711                         r = event_make_signal_data(s->event, s->signal.sig, NULL);
1712                                 if (r < 0) {
1713                                         s->enabled = SD_EVENT_OFF;
1714                                 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
1715                                         return r;
1716                                 }
1717
1718                         break;
1719
1720                 case SOURCE_CHILD:
1721
1722                         if (s->enabled == SD_EVENT_OFF)
1723                                 s->event->n_enabled_child_sources++;
1724
1725                         s->enabled = m;
1726
1727                         r = event_make_signal_data(s->event, SIGCHLD, NULL);
1728                                         if (r < 0) {
1729                                                 s->enabled = SD_EVENT_OFF;
1730                                 s->event->n_enabled_child_sources--;
1731                                 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
1732                                                 return r;
1733                                         }
1734
1735                         break;
1736
1737                 case SOURCE_EXIT:
1738                         s->enabled = m;
1739                         prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1740                         break;
1741
1742                 case SOURCE_DEFER:
1743                 case SOURCE_POST:
1744                         s->enabled = m;
1745                         break;
1746
1747                 default:
1748                         assert_not_reached("Wut? I shouldn't exist.");
1749                 }
1750         }
1751
1752         if (s->pending)
1753                 prioq_reshuffle(s->event->pending, s, &s->pending_index);
1754
1755         if (s->prepare)
1756                 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
1757
1758         return 0;
1759 }
1760
/* Returns the absolute time at which a timer source next elapses. */
_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.next;
        return 0;
}
1770
1771 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
1772         struct clock_data *d;
1773
1774         assert_return(s, -EINVAL);
1775         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
1776         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1777         assert_return(!event_pid_changed(s->event), -ECHILD);
1778
1779         s->time.next = usec;
1780
1781         source_set_pending(s, false);
1782
1783         d = event_get_clock_data(s->event, s->type);
1784         assert(d);
1785
1786         prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
1787         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1788         d->needs_rearm = true;
1789
1790         return 0;
1791 }
1792
1793 #if 0 /// UNNEEDED by elogind
/* Returns the accuracy window of a timer source. */
_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.accuracy;
        return 0;
}
1803
1804 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
1805         struct clock_data *d;
1806
1807         assert_return(s, -EINVAL);
1808         assert_return(usec != (uint64_t) -1, -EINVAL);
1809         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
1810         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1811         assert_return(!event_pid_changed(s->event), -ECHILD);
1812
1813         if (usec == 0)
1814                 usec = DEFAULT_ACCURACY_USEC;
1815
1816         s->time.accuracy = usec;
1817
1818         source_set_pending(s, false);
1819
1820         d = event_get_clock_data(s->event, s->type);
1821         assert(d);
1822
1823         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1824         d->needs_rearm = true;
1825
1826         return 0;
1827 }
1828
/* Returns the clock id a timer source is based on, mapped back from
 * its internal source type. */
_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
        assert_return(s, -EINVAL);
        assert_return(clock, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *clock = event_source_type_to_clock(s->type);
        return 0;
}
1838
/* Returns the PID a child source watches; -EDOM for any other source
 * type. */
_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
        assert_return(s, -EINVAL);
        assert_return(pid, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *pid = s->child.pid;
        return 0;
}
1848 #endif // 0
1849
/* Installs (or, with NULL, removes) a prepare callback that is invoked
 * right before the loop goes to sleep. Sources with a prepare callback
 * are tracked in the event's prepare prioq. */
_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->prepare == callback)
                return 0;

        /* Swapping one non-NULL callback for another leaves the prioq
         * membership unchanged. */
        if (callback && s->prepare) {
                s->prepare = callback;
                return 0;
        }

        r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
        if (r < 0)
                return r;

        s->prepare = callback;

        /* Membership in the prepare queue tracks whether a callback is
         * installed at all. */
        if (callback) {
                r = prioq_put(s->event->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;
        } else
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        return 0;
}
1881
1882 #if 0 /// UNNEEDED by elogind
/* Returns the opaque userdata pointer associated with the source. */
_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
        assert_return(s, NULL);

        return s->userdata;
}
1888
1889 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
1890         void *ret;
1891
1892         assert_return(s, NULL);
1893
1894         ret = s->userdata;
1895         s->userdata = userdata;
1896
1897         return ret;
1898 }
1899 #endif // 0
1900
/* Picks a wakeup time in [a, b]: the latest moment that lands on a
 * machine-specific "perturbed" boundary, trying minute, then 10s, 1s
 * and 250ms granularity, falling back to b. */
static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
        usec_t c;
        assert(e);
        assert(a <= b);

        /* NOTE(review): usec_t appears to be unsigned, making this
         * effectively a == 0 — confirm against time-util.h. */
        if (a <= 0)
                return 0;
        if (a >= USEC_INFINITY)
                return USEC_INFINITY;

        /* Window of a single usec — no room to maneuver. */
        if (b <= a + 1)
                return a;

        initialize_perturb(e);

        /*
          Find a good time to wake up again between times a and b. We
          have two goals here:

          a) We want to wake up as seldom as possible, hence prefer
             later times over earlier times.

          b) But if we have to wake up, then let's make sure to
             dispatch as much as possible on the entire system.

          We implement this by waking up everywhere at the same time
          within any given minute if we can, synchronised via the
          perturbation value determined from the boot ID. If we can't,
          then we try to find the same spot in every 10s, then 1s and
          then 250ms step. Otherwise, we pick the last possible time
          to wake up.
        */

        /* Candidate: the perturbed minute boundary at or below b. The
         * _unlikely_ branch guards the subtraction against underflow. */
        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MINUTE))
                        return b;

                c -= USEC_PER_MINUTE;
        }

        if (c >= a)
                return c;

        /* Same scheme at 10s granularity. */
        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC*10))
                        return b;

                c -= USEC_PER_SEC*10;
        }

        if (c >= a)
                return c;

        /* Same scheme at 1s granularity. */
        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC))
                        return b;

                c -= USEC_PER_SEC;
        }

        if (c >= a)
                return c;

        /* Same scheme at 250ms granularity. */
        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MSEC*250))
                        return b;

                c -= USEC_PER_MSEC*250;
        }

        if (c >= a)
                return c;

        /* No aligned spot fits — wake at the last admissible time. */
        return b;
}
1980
1981 static int event_arm_timer(
1982                 sd_event *e,
1983                 struct clock_data *d) {
1984
1985         struct itimerspec its = {};
1986         sd_event_source *a, *b;
1987         usec_t t;
1988         int r;
1989
1990         assert(e);
1991         assert(d);
1992
1993         if (!d->needs_rearm)
1994                 return 0;
1995         else
1996                 d->needs_rearm = false;
1997
1998         a = prioq_peek(d->earliest);
1999         if (!a || a->enabled == SD_EVENT_OFF || a->time.next == USEC_INFINITY) {
2000
2001                 if (d->fd < 0)
2002                         return 0;
2003
2004                 if (d->next == USEC_INFINITY)
2005                         return 0;
2006
2007                 /* disarm */
2008                 r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2009                 if (r < 0)
2010                         return r;
2011
2012                 d->next = USEC_INFINITY;
2013                 return 0;
2014         }
2015
2016         b = prioq_peek(d->latest);
2017         assert_se(b && b->enabled != SD_EVENT_OFF);
2018
2019         t = sleep_between(e, a->time.next, time_event_source_latest(b));
2020         if (d->next == t)
2021                 return 0;
2022
2023         assert_se(d->fd >= 0);
2024
2025         if (t == 0) {
2026                 /* We don' want to disarm here, just mean some time looooong ago. */
2027                 its.it_value.tv_sec = 0;
2028                 its.it_value.tv_nsec = 1;
2029         } else
2030                 timespec_store(&its.it_value, t);
2031
2032         r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2033         if (r < 0)
2034                 return -errno;
2035
2036         d->next = t;
2037         return 0;
2038 }
2039
2040 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
2041         assert(e);
2042         assert(s);
2043         assert(s->type == SOURCE_IO);
2044
2045         /* If the event source was already pending, we just OR in the
2046          * new revents, otherwise we reset the value. The ORing is
2047          * necessary to handle EPOLLONESHOT events properly where
2048          * readability might happen independently of writability, and
2049          * we need to keep track of both */
2050
2051         if (s->pending)
2052                 s->io.revents |= revents;
2053         else
2054                 s->io.revents = revents;
2055
2056         return source_set_pending(s, true);
2057 }
2058
2059 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
2060         uint64_t x;
2061         ssize_t ss;
2062
2063         assert(e);
2064         assert(fd >= 0);
2065
2066         assert_return(events == EPOLLIN, -EIO);
2067
2068         ss = read(fd, &x, sizeof(x));
2069         if (ss < 0) {
2070                 if (errno == EAGAIN || errno == EINTR)
2071                         return 0;
2072
2073                 return -errno;
2074         }
2075
2076         if (_unlikely_(ss != sizeof(x)))
2077                 return -EIO;
2078
2079         if (next)
2080                 *next = USEC_INFINITY;
2081
2082         return 0;
2083 }
2084
2085 static int process_timer(
2086                 sd_event *e,
2087                 usec_t n,
2088                 struct clock_data *d) {
2089
2090         sd_event_source *s;
2091         int r;
2092
2093         assert(e);
2094         assert(d);
2095
2096         for (;;) {
2097                 s = prioq_peek(d->earliest);
2098                 if (!s ||
2099                     s->time.next > n ||
2100                     s->enabled == SD_EVENT_OFF ||
2101                     s->pending)
2102                         break;
2103
2104                 r = source_set_pending(s, true);
2105                 if (r < 0)
2106                         return r;
2107
2108                 prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
2109                 prioq_reshuffle(d->latest, s, &s->time.latest_index);
2110                 d->needs_rearm = true;
2111         }
2112
2113         return 0;
2114 }
2115
/* Poll each watched child PID with waitid(WNOHANG) and mark its event
 * source pending if a state change is queued. Children are not reaped
 * here; see the comment below. Returns 0 or negative errno. */
static int process_child(sd_event *e) {
        sd_event_source *s;
        Iterator i;
        int r;

        assert(e);

        e->need_process_child = false;

        /*
           So, this is ugly. We iteratively invoke waitid() with P_PID
           + WNOHANG for each PID we wait for, instead of using
           P_ALL. This is because we only want to get child
           information of very specific child processes, and not all
           of them. We might not have processed the SIGCHLD even of a
           previous invocation and we don't want to maintain a
           unbounded *per-child* event queue, hence we really don't
           want anything flushed out of the kernel's queue that we
           don't care about. Since this is O(n) this means that if you
           have a lot of processes you probably want to handle SIGCHLD
           yourself.

           We do not reap the children here (by using WNOWAIT), this
           is only done after the event source is dispatched so that
           the callback still sees the process as a zombie.
        */

        HASHMAP_FOREACH(s, e->child_sources, i) {
                assert(s->type == SOURCE_CHILD);

                /* Already queued — the callback will see the state. */
                if (s->pending)
                        continue;

                if (s->enabled == SD_EVENT_OFF)
                        continue;

                zero(s->child.siginfo);
                /* WNOWAIT only when the caller asked for WEXITED, so a
                 * dead child stays a zombie until after dispatch. */
                r = waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
                if (r < 0)
                        return -errno;

                /* si_pid != 0 means a state change was actually reported. */
                if (s->child.siginfo.si_pid != 0) {
                        bool zombie =
                                s->child.siginfo.si_code == CLD_EXITED ||
                                s->child.siginfo.si_code == CLD_KILLED ||
                                s->child.siginfo.si_code == CLD_DUMPED;

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's
                                 * immediately remove the state change
                                 * from the queue, since there's no
                                 * benefit in leaving it queued */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
                        if (r < 0)
                                return r;
                }
        }

        return 0;
}
2182
/* Dequeue at most one signal from the signalfd of signal_data 'd' and
 * mark the matching signal source pending. Returns 1 if a source was
 * queued, 0 if nothing (relevant) was read, negative errno on error. */
static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
        bool read_one = false;
        int r;

        assert(e);
        assert_return(events == EPOLLIN, -EIO);

        /* If there's a signal queued on this priority and SIGCHLD is
           on this priority too, then make sure to recheck the
           children we watch. This is because we only ever dequeue
           the first signal per priority, and if we dequeue one, and
           SIGCHLD might be enqueued later we wouldn't know, but we
           might have higher priority children we care about hence we
           need to check that explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

        /* If there's already an event source pending for this
         * priority we don't read another */
        if (d->current)
                return 0;

        for (;;) {
                struct signalfd_siginfo si;
                ssize_t n;
                sd_event_source *s = NULL;

                n = read(d->fd, &si, sizeof(si));
                if (n < 0) {
                        /* Queue drained (or interrupted): report whether
                         * we consumed anything at all. */
                        if (errno == EAGAIN || errno == EINTR)
                                return read_one;

                        return -errno;
                }

                /* signalfd reads always deliver whole siginfo records. */
                if (_unlikely_(n != sizeof(si)))
                        return -EIO;

                assert(SIGNAL_VALID(si.ssi_signo));

                read_one = true;

                if (e->signal_sources)
                        s = e->signal_sources[si.ssi_signo];
                /* Signal received but nobody subscribed — drop it and
                 * keep draining. */
                if (!s)
                        continue;
                if (s->pending)
                        continue;

                /* Stash the siginfo for the callback and remember this
                 * source as the one currently blocking further reads on
                 * this priority. */
                s->signal.siginfo = si;
                d->current = s;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                return 1;
        }
}
2243
/* Run the user callback of a pending event source. Handles the
 * pre-dispatch bookkeeping (clearing the pending flag, queuing post
 * sources, ONESHOT auto-disable) and the post-dispatch consequences
 * (disabling on callback error, freeing if the last ref was dropped
 * during the callback). Returns 1 on success, negative errno. */
static int source_dispatch(sd_event_source *s) {
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        /* DEFER and EXIT sources stay pending across dispatches; all
         * others are un-queued before the callback runs. */
        if (s->type != SOURCE_DEFER && s->type != SOURCE_EXIT) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        if (s->type != SOURCE_POST) {
                sd_event_source *z;
                Iterator i;

                /* If we execute a non-post source, let's mark all
                 * post sources as pending */

                SET_FOREACH(z, s->event->post_sources, i) {
                        if (z->enabled == SD_EVENT_OFF)
                                continue;

                        r = source_set_pending(z, true);
                        if (r < 0)
                                return r;
                }
        }

        /* ONESHOT sources fire once, then turn themselves off. */
        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        /* Flag so that sd_event_source_unref() during the callback
         * defers freeing (see n_ref check below). */
        s->dispatching = true;

        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                /* Evaluate before the callback, which may alter siginfo. */
                zombie = s->child.siginfo.si_code == CLD_EXITED ||
                         s->child.siginfo.si_code == CLD_KILLED ||
                         s->child.siginfo.si_code == CLD_DUMPED;

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie)
                        waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_POST:
                r = s->post.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        s->dispatching = false;

        if (r < 0)
                log_debug_errno(r, "Event source %s (type %s) returned error, disabling: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        /* The callback may have dropped the last reference; only now
         * is it safe to actually free the source. */
        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                sd_event_source_set_enabled(s, SD_EVENT_OFF);

        return 1;
}
2346
2347 static int event_prepare(sd_event *e) {
2348         int r;
2349
2350         assert(e);
2351
2352         for (;;) {
2353                 sd_event_source *s;
2354
2355                 s = prioq_peek(e->prepare);
2356                 if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
2357                         break;
2358
2359                 s->prepare_iteration = e->iteration;
2360                 r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
2361                 if (r < 0)
2362                         return r;
2363
2364                 assert(s->prepare);
2365
2366                 s->dispatching = true;
2367                 r = s->prepare(s, s->userdata);
2368                 s->dispatching = false;
2369
2370                 if (r < 0)
2371                         log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, disabling: %m",
2372                                         strna(s->description), event_source_type_to_string(s->type));
2373
2374                 if (s->n_ref == 0)
2375                         source_free(s);
2376                 else if (r < 0)
2377                         sd_event_source_set_enabled(s, SD_EVENT_OFF);
2378         }
2379
2380         return 0;
2381 }
2382
2383 static int dispatch_exit(sd_event *e) {
2384         sd_event_source *p;
2385         int r;
2386
2387         assert(e);
2388
2389         p = prioq_peek(e->exit);
2390         if (!p || p->enabled == SD_EVENT_OFF) {
2391                 e->state = SD_EVENT_FINISHED;
2392                 return 0;
2393         }
2394
2395         sd_event_ref(e);
2396         e->iteration++;
2397         e->state = SD_EVENT_EXITING;
2398
2399         r = source_dispatch(p);
2400
2401         e->state = SD_EVENT_INITIAL;
2402         sd_event_unref(e);
2403
2404         return r;
2405 }
2406
2407 static sd_event_source* event_next_pending(sd_event *e) {
2408         sd_event_source *p;
2409
2410         assert(e);
2411
2412         p = prioq_peek(e->pending);
2413         if (!p)
2414                 return NULL;
2415
2416         if (p->enabled == SD_EVENT_OFF)
2417                 return NULL;
2418
2419         return p;
2420 }
2421
2422 static int arm_watchdog(sd_event *e) {
2423         struct itimerspec its = {};
2424         usec_t t;
2425         int r;
2426
2427         assert(e);
2428         assert(e->watchdog_fd >= 0);
2429
2430         t = sleep_between(e,
2431                           e->watchdog_last + (e->watchdog_period / 2),
2432                           e->watchdog_last + (e->watchdog_period * 3 / 4));
2433
2434         timespec_store(&its.it_value, t);
2435
2436         /* Make sure we never set the watchdog to 0, which tells the
2437          * kernel to disable it. */
2438         if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
2439                 its.it_value.tv_nsec = 1;
2440
2441         r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
2442         if (r < 0)
2443                 return -errno;
2444
2445         return 0;
2446 }
2447
2448 static int process_watchdog(sd_event *e) {
2449         assert(e);
2450
2451         if (!e->watchdog)
2452                 return 0;
2453
2454         /* Don't notify watchdog too often */
2455         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
2456                 return 0;
2457
2458         sd_notify(false, "WATCHDOG=1");
2459         e->watchdog_last = e->timestamp.monotonic;
2460
2461         return arm_watchdog(e);
2462 }
2463
/* First stage of an event-loop iteration: run prepare callbacks and
 * re-arm all clock timerfds. Leaves the loop in ARMED state (ready
 * for sd_event_wait()); returns > 0 immediately if events are already
 * pending, 0 otherwise, negative errno on failure. */
_public_ int sd_event_prepare(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        /* An exit request makes the iteration trivially "pending". */
        if (e->exit_requested)
                goto pending;

        e->iteration++;

        e->state = SD_EVENT_PREPARING;
        r = event_prepare(e);
        e->state = SD_EVENT_INITIAL;
        if (r < 0)
                return r;

        /* Re-arm the timerfd of every clock; prepare callbacks may
         * have changed timer deadlines. */
        r = event_arm_timer(e, &e->realtime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->monotonic);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime_alarm);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime_alarm);
        if (r < 0)
                return r;

        if (event_next_pending(e) || e->need_process_child)
                goto pending;

        e->state = SD_EVENT_ARMED;

        return 0;

pending:
        /* Something is pending already: poll with a zero timeout so
         * the kernel-side state (signals, timers) gets collected too. */
        e->state = SD_EVENT_ARMED;
        r = sd_event_wait(e, 0);
        if (r == 0)
                e->state = SD_EVENT_ARMED;

        return r;
}
2518
2519 _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
2520         struct epoll_event *ev_queue;
2521         unsigned ev_queue_max;
2522         int r, m, i;
2523
2524         assert_return(e, -EINVAL);
2525         assert_return(!event_pid_changed(e), -ECHILD);
2526         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2527         assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
2528
2529         if (e->exit_requested) {
2530                 e->state = SD_EVENT_PENDING;
2531                 return 1;
2532         }
2533
2534         ev_queue_max = MAX(e->n_sources, 1u);
2535         ev_queue = newa(struct epoll_event, ev_queue_max);
2536
2537         m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max,
2538                        timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
2539         if (m < 0) {
2540                 if (errno == EINTR) {
2541                         e->state = SD_EVENT_PENDING;
2542                         return 1;
2543                 }
2544
2545                 r = -errno;
2546                 goto finish;
2547         }
2548
2549         dual_timestamp_get(&e->timestamp);
2550         e->timestamp_boottime = now(clock_boottime_or_monotonic());
2551
2552         for (i = 0; i < m; i++) {
2553
2554                 if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
2555                         r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
2556                 else {
2557                         WakeupType *t = ev_queue[i].data.ptr;
2558
2559                         switch (*t) {
2560
2561                         case WAKEUP_EVENT_SOURCE:
2562                         r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
2563                                 break;
2564
2565                         case WAKEUP_CLOCK_DATA: {
2566                                 struct clock_data *d = ev_queue[i].data.ptr;
2567                                 r = flush_timer(e, d->fd, ev_queue[i].events, &d->next);
2568                                 break;
2569                         }
2570
2571                         case WAKEUP_SIGNAL_DATA:
2572                                 r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events);
2573                                 break;
2574
2575                         default:
2576                                 assert_not_reached("Invalid wake-up pointer");
2577                         }
2578                 }
2579                 if (r < 0)
2580                         goto finish;
2581         }
2582
2583         r = process_watchdog(e);
2584         if (r < 0)
2585                 goto finish;
2586
2587         r = process_timer(e, e->timestamp.realtime, &e->realtime);
2588         if (r < 0)
2589                 goto finish;
2590
2591         r = process_timer(e, e->timestamp_boottime, &e->boottime);
2592         if (r < 0)
2593                 goto finish;
2594
2595         r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
2596         if (r < 0)
2597                 goto finish;
2598
2599         r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
2600         if (r < 0)
2601                 goto finish;
2602
2603         r = process_timer(e, e->timestamp_boottime, &e->boottime_alarm);
2604         if (r < 0)
2605                 goto finish;
2606
2607         if (e->need_process_child) {
2608                 r = process_child(e);
2609                 if (r < 0)
2610                         goto finish;
2611         }
2612
2613         if (event_next_pending(e)) {
2614                 e->state = SD_EVENT_PENDING;
2615
2616                 return 1;
2617         }
2618
2619         r = 0;
2620
2621 finish:
2622         e->state = SD_EVENT_INITIAL;
2623
2624         return r;
2625 }
2626
2627 _public_ int sd_event_dispatch(sd_event *e) {
2628         sd_event_source *p;
2629         int r;
2630
2631         assert_return(e, -EINVAL);
2632         assert_return(!event_pid_changed(e), -ECHILD);
2633         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2634         assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
2635
2636         if (e->exit_requested)
2637                 return dispatch_exit(e);
2638
2639         p = event_next_pending(e);
2640         if (p) {
2641                 sd_event_ref(e);
2642
2643                 e->state = SD_EVENT_RUNNING;
2644                 r = source_dispatch(p);
2645                 e->state = SD_EVENT_INITIAL;
2646
2647                 sd_event_unref(e);
2648
2649                 return r;
2650         }
2651
2652         e->state = SD_EVENT_INITIAL;
2653
2654         return 1;
2655 }
2656
2657 static void event_log_delays(sd_event *e) {
2658         char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1];
2659         unsigned i;
2660         int o;
2661
2662         for (i = o = 0; i < ELEMENTSOF(e->delays); i++) {
2663                 o += snprintf(&b[o], sizeof(b) - o, "%u ", e->delays[i]);
2664                 e->delays[i] = 0;
2665         }
2666         log_debug("Event loop iterations: %.*s", o, b);
2667 }
2668
/* Run a single event-loop iteration: prepare, wait up to 'timeout'
 * µs, and dispatch one source if anything became pending. Returns 1
 * if a source was dispatched, 0 on timeout, negative errno on error. */
_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        /* Optional latency profiling: bucket the time since the last
         * iteration into a log2 histogram and flush it every 5s. */
        if (e->profile_delays && e->last_run) {
                usec_t this_run;
                unsigned l;

                this_run = now(CLOCK_MONOTONIC);

                l = u64log2(this_run - e->last_run);
                assert(l < sizeof(e->delays));
                e->delays[l]++;

                if (this_run - e->last_log >= 5*USEC_PER_SEC) {
                        event_log_delays(e);
                        e->last_log = this_run;
                }
        }

        r = sd_event_prepare(e);
        if (r == 0)
                /* There was nothing? Then wait... */
                r = sd_event_wait(e, timeout);

        if (e->profile_delays)
                e->last_run = now(CLOCK_MONOTONIC);

        if (r > 0) {
                /* There's something now, then let's dispatch it */
                r = sd_event_dispatch(e);
                if (r < 0)
                        return r;

                return 1;
        }

        return r;
}
2712
2713 #if 0 /// UNNEEDED by elogind
2714 _public_ int sd_event_loop(sd_event *e) {
2715         int r;
2716
2717         assert_return(e, -EINVAL);
2718         assert_return(!event_pid_changed(e), -ECHILD);
2719         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2720
2721         sd_event_ref(e);
2722
2723         while (e->state != SD_EVENT_FINISHED) {
2724                 r = sd_event_run(e, (uint64_t) -1);
2725                 if (r < 0)
2726                         goto finish;
2727         }
2728
2729         r = e->exit_code;
2730
2731 finish:
2732         sd_event_unref(e);
2733         return r;
2734 }
2735
/* Return the epoll fd backing the loop, so callers can embed it into
 * an outer poll loop. */
_public_ int sd_event_get_fd(sd_event *e) {

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->epoll_fd;
}
2743 #endif // 0
2744
/* Return the loop's current SD_EVENT_* state machine value. */
_public_ int sd_event_get_state(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->state;
}
2751
2752 #if 0 /// UNNEEDED by elogind
2753 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
2754         assert_return(e, -EINVAL);
2755         assert_return(code, -EINVAL);
2756         assert_return(!event_pid_changed(e), -ECHILD);
2757
2758         if (!e->exit_requested)
2759                 return -ENODATA;
2760
2761         *code = e->exit_code;
2762         return 0;
2763 }
2764 #endif // 0
2765
/* Request loop termination: the next iterations run the EXIT sources
 * and then finish the loop with the given code. */
_public_ int sd_event_exit(sd_event *e, int code) {
        assert_return(e, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        e->exit_requested = true;
        e->exit_code = code;

        return 0;
}
2776
2777 #if 0 /// UNNEEDED by elogind
/* Return the timestamp cached at the last epoll wakeup for the given
 * clock, so all sources dispatched in one iteration see a consistent
 * "now". Returns 0 when the cached value was used, 1 when it fell
 * back to a live clock read, negative errno on bad arguments. */
_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
        assert_return(e, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(IN_SET(clock,
                             CLOCK_REALTIME,
                             CLOCK_REALTIME_ALARM,
                             CLOCK_MONOTONIC,
                             CLOCK_BOOTTIME,
                             CLOCK_BOOTTIME_ALARM), -EOPNOTSUPP);

        if (!dual_timestamp_is_set(&e->timestamp)) {
                /* Implicitly fall back to now() if we never ran
                 * before and thus have no cached time. */
                *usec = now(clock);
                return 1;
        }

        switch (clock) {

        case CLOCK_REALTIME:
        case CLOCK_REALTIME_ALARM:
                *usec = e->timestamp.realtime;
                break;

        case CLOCK_MONOTONIC:
                *usec = e->timestamp.monotonic;
                break;

        /* Boottime timestamps are cached separately, as dual_timestamp
         * only covers realtime + monotonic. */
        case CLOCK_BOOTTIME:
        case CLOCK_BOOTTIME_ALARM:
                *usec = e->timestamp_boottime;
                break;

        default:
                assert_not_reached("Unknown clock?");
        }

        return 0;
}
2818 #endif // 0
2819
/* Return (a new reference to) the calling thread's default event
 * loop, allocating it on first use. With a NULL 'ret', merely reports
 * whether a default loop exists for this thread. Returns 1 when a new
 * loop was created, 0 when an existing one was referenced. */
_public_ int sd_event_default(sd_event **ret) {

        /* One default loop per thread. */
        static thread_local sd_event *default_event = NULL;
        sd_event *e = NULL;
        int r;

        if (!ret)
                return !!default_event;

        if (default_event) {
                *ret = sd_event_ref(default_event);
                return 0;
        }

        r = sd_event_new(&e);
        if (r < 0)
                return r;

        /* Let the loop clear the thread-local on destruction, and
         * remember which thread it belongs to. */
        e->default_event_ptr = &default_event;
        e->tid = gettid();
        default_event = e;

        *ret = e;
        return 1;
}
2845
2846 #if 0 /// UNNEEDED by elogind
2847 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
2848         assert_return(e, -EINVAL);
2849         assert_return(tid, -EINVAL);
2850         assert_return(!event_pid_changed(e), -ECHILD);
2851
2852         if (e->tid != 0) {
2853                 *tid = e->tid;
2854                 return 0;
2855         }
2856
2857         return -ENXIO;
2858 }
2859 #endif // 0
2860
/* Enable or disable sd_notify()-based watchdog pinging driven by the
 * event loop (period taken from $WATCHDOG_USEC). Returns the new
 * watchdog state (0/1), or negative errno; returns 0 without enabling
 * if the environment requests no watchdog. */
_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Already in the requested state. */
        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                struct epoll_event ev = {};

                /* Read $WATCHDOG_USEC without unsetting it; r == 0
                 * means the watchdog is not requested by the manager. */
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                /* The sentinel pointer distinguishes watchdog wakeups
                 * from regular WakeupType-tagged registrations. */
                ev.events = EPOLLIN;
                ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG);

                r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
                if (r < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                if (e->watchdog_fd >= 0) {
                        epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        /* Roll back the half-initialized timerfd. */
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}
2912
2913 #if 0 /// UNNEEDED by elogind
/* Report whether watchdog pinging is currently enabled (0/1). */
_public_ int sd_event_get_watchdog(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->watchdog;
}
2920 #endif // 0