chiark / gitweb /
Prep v229: Add missing fixes from upstream [3/6] src/libelogind
[elogind.git] / src / libelogind / sd-event / sd-event.c
1 /***
2   This file is part of systemd.
3
4   Copyright 2013 Lennart Poettering
5
6   systemd is free software; you can redistribute it and/or modify it
7   under the terms of the GNU Lesser General Public License as published by
8   the Free Software Foundation; either version 2.1 of the License, or
9   (at your option) any later version.
10
11   systemd is distributed in the hope that it will be useful, but
12   WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public License
17   along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <sys/epoll.h>
21 #include <sys/timerfd.h>
22 #include <sys/wait.h>
23
24 #include "sd-daemon.h"
25 #include "sd-event.h"
26 #include "sd-id128.h"
27
28 #include "alloc-util.h"
29 #include "fd-util.h"
30 #include "hashmap.h"
31 #include "list.h"
32 #include "macro.h"
33 #include "missing.h"
34 #include "prioq.h"
35 #include "process-util.h"
36 #include "set.h"
37 #include "signal-util.h"
38 #include "string-table.h"
39 #include "string-util.h"
40 #include "time-util.h"
41 #include "util.h"
42
43 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
44
/* Identifies what kind of event source an sd_event_source is; also
 * indexes the name table below. */
typedef enum EventSourceType {
        SOURCE_IO,                  /* fd-based I/O source, driven by epoll */
        SOURCE_TIME_REALTIME,       /* CLOCK_REALTIME timer */
        SOURCE_TIME_BOOTTIME,       /* CLOCK_BOOTTIME timer */
        SOURCE_TIME_MONOTONIC,      /* CLOCK_MONOTONIC timer */
        SOURCE_TIME_REALTIME_ALARM, /* CLOCK_REALTIME_ALARM timer */
        SOURCE_TIME_BOOTTIME_ALARM, /* CLOCK_BOOTTIME_ALARM timer */
        SOURCE_SIGNAL,              /* UNIX signal, delivered via signalfd */
        SOURCE_CHILD,               /* child process state change (SIGCHLD-based) */
        SOURCE_DEFER,               /* dispatched on each iteration while enabled */
        SOURCE_POST,                /* dispatched after other sources */
        SOURCE_EXIT,                /* dispatched when the loop exits */
        SOURCE_WATCHDOG,            /* internal watchdog-fd source */
        _SOURCE_EVENT_SOURCE_TYPE_MAX,
        _SOURCE_EVENT_SOURCE_TYPE_INVALID = -1
} EventSourceType;
61
62 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
63         [SOURCE_IO] = "io",
64         [SOURCE_TIME_REALTIME] = "realtime",
65         [SOURCE_TIME_BOOTTIME] = "bootime",
66         [SOURCE_TIME_MONOTONIC] = "monotonic",
67         [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
68         [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
69         [SOURCE_SIGNAL] = "signal",
70         [SOURCE_CHILD] = "child",
71         [SOURCE_DEFER] = "defer",
72         [SOURCE_POST] = "post",
73         [SOURCE_EXIT] = "exit",
74         [SOURCE_WATCHDOG] = "watchdog",
75 };
76
77 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
78
/* All objects we use in epoll events start with this value, so that
 * we know how to dispatch it */
typedef enum WakeupType {
        WAKEUP_NONE,
        WAKEUP_EVENT_SOURCE,    /* epoll data.ptr points at an sd_event_source */
        WAKEUP_CLOCK_DATA,      /* epoll data.ptr points at a struct clock_data */
        WAKEUP_SIGNAL_DATA,     /* epoll data.ptr points at a struct signal_data */
        _WAKEUP_TYPE_MAX,
        _WAKEUP_TYPE_INVALID = -1,
} WakeupType;

/* True for the five timer source types, which share most of their handling */
#define EVENT_SOURCE_IS_TIME(t) IN_SET((t), SOURCE_TIME_REALTIME, SOURCE_TIME_BOOTTIME, SOURCE_TIME_MONOTONIC, SOURCE_TIME_REALTIME_ALARM, SOURCE_TIME_BOOTTIME_ALARM)
91
/* One registered callback within an sd_event loop. The trailing union
 * carries per-type state; which member is valid is selected by 'type'. */
struct sd_event_source {
        WakeupType wakeup;      /* must stay first: dispatch tag for epoll data.ptr */

        unsigned n_ref;         /* reference counter */

        sd_event *event;        /* loop we are attached to; NULL once disconnected */
        void *userdata;
        sd_event_handler_t prepare;     /* optional pre-poll callback */

        char *description;      /* free-form name, used in log messages */

        EventSourceType type:5;
        int enabled:3;          /* SD_EVENT_OFF / ON / ONESHOT */
        bool pending:1;         /* queued in event->pending prioq */
        bool dispatching:1;     /* currently inside its callback */
        bool floating:1;        /* owned by the loop rather than by a caller ref */

        int64_t priority;       /* lower values dispatch earlier */
        unsigned pending_index;         /* slot in event->pending prioq */
        unsigned prepare_index;         /* slot in event->prepare prioq */
        unsigned pending_iteration;     /* loop iteration when marked pending */
        unsigned prepare_iteration;     /* loop iteration when last prepared */

        LIST_FIELDS(sd_event_source, sources);  /* linkage in event->sources */

        union {
                struct {
                        sd_event_io_handler_t callback;
                        int fd;
                        uint32_t events;        /* epoll event mask subscribed to */
                        uint32_t revents;       /* events reported by last poll */
                        bool registered:1;      /* fd currently added to epoll */
                } io;
                struct {
                        sd_event_time_handler_t callback;
                        usec_t next, accuracy;  /* earliest fire time + allowed slack */
                        unsigned earliest_index;        /* slot in clock_data->earliest */
                        unsigned latest_index;          /* slot in clock_data->latest */
                } time;
                struct {
                        sd_event_signal_handler_t callback;
                        struct signalfd_siginfo siginfo;        /* info from signalfd read */
                        int sig;
                } signal;
                struct {
                        sd_event_child_handler_t callback;
                        siginfo_t siginfo;
                        pid_t pid;
                        int options;    /* presumably waitid()-style flags — confirm in dispatch code */
                } child;
                struct {
                        sd_event_handler_t callback;
                } defer;
                struct {
                        sd_event_handler_t callback;
                } post;
                struct {
                        sd_event_handler_t callback;
                        unsigned prioq_index;   /* slot in event->exit prioq */
                } exit;
        };
};
154
/* Per-clock bookkeeping: one timerfd plus two priority queues. */
struct clock_data {
        WakeupType wakeup;      /* WAKEUP_CLOCK_DATA; dispatch tag for epoll */
        int fd;                 /* timerfd for this clock, or -1 if not created yet */

        /* For all clocks we maintain two priority queues each, one
         * ordered for the earliest times the events may be
         * dispatched, and one ordered by the latest times they must
         * have been dispatched. The range between the top entries in
         * the two prioqs is the time window we can freely schedule
         * wakeups in */

        Prioq *earliest;        /* sources ordered by earliest dispatch time */
        Prioq *latest;          /* sources ordered by latest dispatch deadline */
        usec_t next;            /* time the timerfd is currently armed for */

        bool needs_rearm:1;     /* queues changed; timerfd must be re-armed */
};
172
/* Per-priority signalfd bookkeeping. */
struct signal_data {
        WakeupType wakeup;      /* WAKEUP_SIGNAL_DATA; dispatch tag for epoll */

        /* For each priority we maintain one signal fd, so that we
         * only have to dequeue a single event per priority at a
         * time. */

        int fd;                 /* signalfd, or -1 before creation */
        int64_t priority;       /* also the hashmap key (by address) */
        sigset_t sigset;        /* signals currently watched on this fd */
        sd_event_source *current;       /* source whose signal is being dispatched */
};
185
/* The event loop object itself. */
struct sd_event {
        unsigned n_ref;         /* reference counter */

        int epoll_fd;           /* the central epoll instance */
        int watchdog_fd;        /* timerfd used for watchdog pings, or -1 */

        Prioq *pending;         /* sources waiting to be dispatched */
        Prioq *prepare;         /* sources with a prepare callback */

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;         /* per-machine wakeup-coalescing offset; lazily set */

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;         /* indexed by PID */
        unsigned n_enabled_child_sources;       /* keeps SIGCHLD watched while > 0 */

        Set *post_sources;

        Prioq *exit;            /* sources dispatched at loop exit */

        pid_t original_pid;     /* PID the loop was created in; fork detection */

        unsigned iteration;     /* monotonically increasing loop counter */
        dual_timestamp timestamp;       /* realtime/monotonic of current wakeup */
        usec_t timestamp_boottime;      /* boottime of current wakeup */
        int state;              /* SD_EVENT_* loop state */

        bool exit_requested:1;
        bool need_process_child:1;      /* SIGCHLD seen; waitid pass required */
        bool watchdog:1;
        bool profile_delays:1;  /* set via $SD_EVENT_PROFILE_DELAYS */

        int exit_code;          /* value passed to sd_event_exit() */

        pid_t tid;              /* thread owning this loop as its default, if any */
        sd_event **default_event_ptr;   /* back-pointer cleared on destruction */

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;     /* number of attached event sources */

        LIST_HEAD(sd_event_source, sources);

        usec_t last_run, last_log;      /* used by delay profiling */
        unsigned delays[sizeof(usec_t) * 8];    /* log2 histogram of iteration delays */
};
242
243 static void source_disconnect(sd_event_source *s);
244
245 static int pending_prioq_compare(const void *a, const void *b) {
246         const sd_event_source *x = a, *y = b;
247
248         assert(x->pending);
249         assert(y->pending);
250
251         /* Enabled ones first */
252         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
253                 return -1;
254         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
255                 return 1;
256
257         /* Lower priority values first */
258         if (x->priority < y->priority)
259                 return -1;
260         if (x->priority > y->priority)
261                 return 1;
262
263         /* Older entries first */
264         if (x->pending_iteration < y->pending_iteration)
265                 return -1;
266         if (x->pending_iteration > y->pending_iteration)
267                 return 1;
268
269         return 0;
270 }
271
272 static int prepare_prioq_compare(const void *a, const void *b) {
273         const sd_event_source *x = a, *y = b;
274
275         assert(x->prepare);
276         assert(y->prepare);
277
278         /* Enabled ones first */
279         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
280                 return -1;
281         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
282                 return 1;
283
284         /* Move most recently prepared ones last, so that we can stop
285          * preparing as soon as we hit one that has already been
286          * prepared in the current iteration */
287         if (x->prepare_iteration < y->prepare_iteration)
288                 return -1;
289         if (x->prepare_iteration > y->prepare_iteration)
290                 return 1;
291
292         /* Lower priority values first */
293         if (x->priority < y->priority)
294                 return -1;
295         if (x->priority > y->priority)
296                 return 1;
297
298         return 0;
299 }
300
301 static int earliest_time_prioq_compare(const void *a, const void *b) {
302         const sd_event_source *x = a, *y = b;
303
304         assert(EVENT_SOURCE_IS_TIME(x->type));
305         assert(x->type == y->type);
306
307         /* Enabled ones first */
308         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
309                 return -1;
310         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
311                 return 1;
312
313         /* Move the pending ones to the end */
314         if (!x->pending && y->pending)
315                 return -1;
316         if (x->pending && !y->pending)
317                 return 1;
318
319         /* Order by time */
320         if (x->time.next < y->time.next)
321                 return -1;
322         if (x->time.next > y->time.next)
323                 return 1;
324
325         return 0;
326 }
327
/* Latest time by which the timer source must have fired: its trigger
 * time plus the allowed accuracy slack (usec_add() saturates, so this
 * cannot overflow). */
static usec_t time_event_source_latest(const sd_event_source *s) {
        return usec_add(s->time.next, s->time.accuracy);
}
331
332 static int latest_time_prioq_compare(const void *a, const void *b) {
333         const sd_event_source *x = a, *y = b;
334
335         assert(EVENT_SOURCE_IS_TIME(x->type));
336         assert(x->type == y->type);
337
338         /* Enabled ones first */
339         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
340                 return -1;
341         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
342                 return 1;
343
344         /* Move the pending ones to the end */
345         if (!x->pending && y->pending)
346                 return -1;
347         if (x->pending && !y->pending)
348                 return 1;
349
350         /* Order by time */
351         if (time_event_source_latest(x) < time_event_source_latest(y))
352                 return -1;
353         if (time_event_source_latest(x) > time_event_source_latest(y))
354                 return 1;
355
356         return 0;
357 }
358
359 static int exit_prioq_compare(const void *a, const void *b) {
360         const sd_event_source *x = a, *y = b;
361
362         assert(x->type == SOURCE_EXIT);
363         assert(y->type == SOURCE_EXIT);
364
365         /* Enabled ones first */
366         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
367                 return -1;
368         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
369                 return 1;
370
371         /* Lower priority values first */
372         if (x->priority < y->priority)
373                 return -1;
374         if (x->priority > y->priority)
375                 return 1;
376
377         return 0;
378 }
379
/* Releases the resources owned by one per-clock bookkeeping structure:
 * its timerfd (if any) and both priority queues. The struct itself is
 * embedded in sd_event and is not freed here. */
static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}
388
/* Destroys an event loop object and everything it owns. Called from
 * sd_event_unref() when the last reference is dropped. */
static void event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        /* Only floating sources can still be attached here: non-floating
         * ones hold a reference on 'e' and would have kept it alive.
         * Disconnect and drop them. */
        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        /* Clear the cached back-pointer to us, if we were installed as
         * a default event loop */
        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);
        free(e);
}
425
426 _public_ int sd_event_new(sd_event** ret) {
427         sd_event *e;
428         int r;
429
430         assert_return(ret, -EINVAL);
431
432         e = new0(sd_event, 1);
433         if (!e)
434                 return -ENOMEM;
435
436         e->n_ref = 1;
437         e->watchdog_fd = e->epoll_fd = e->realtime.fd = e->boottime.fd = e->monotonic.fd = e->realtime_alarm.fd = e->boottime_alarm.fd = -1;
438         e->realtime.next = e->boottime.next = e->monotonic.next = e->realtime_alarm.next = e->boottime_alarm.next = USEC_INFINITY;
439         e->realtime.wakeup = e->boottime.wakeup = e->monotonic.wakeup = e->realtime_alarm.wakeup = e->boottime_alarm.wakeup = WAKEUP_CLOCK_DATA;
440         e->original_pid = getpid();
441         e->perturb = USEC_INFINITY;
442
443         r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
444         if (r < 0)
445                 goto fail;
446
447         e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
448         if (e->epoll_fd < 0) {
449                 r = -errno;
450                 goto fail;
451         }
452
453         if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
454                 log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 ... 2^63 us will be logged every 5s.");
455                 e->profile_delays = true;
456         }
457
458         *ret = e;
459         return 0;
460
461 fail:
462         event_free(e);
463         return r;
464 }
465
466 _public_ sd_event* sd_event_ref(sd_event *e) {
467
468         if (!e)
469                 return NULL;
470
471         assert(e->n_ref >= 1);
472         e->n_ref++;
473
474         return e;
475 }
476
477 _public_ sd_event* sd_event_unref(sd_event *e) {
478
479         if (!e)
480                 return NULL;
481
482         assert(e->n_ref >= 1);
483         e->n_ref--;
484
485         if (e->n_ref <= 0)
486                 event_free(e);
487
488         return NULL;
489 }
490
491 static bool event_pid_changed(sd_event *e) {
492         assert(e);
493
494         /* We don't support people creating an event loop and keeping
495          * it around over a fork(). Let's complain. */
496
497         return e->original_pid != getpid();
498 }
499
500 static void source_io_unregister(sd_event_source *s) {
501         int r;
502
503         assert(s);
504         assert(s->type == SOURCE_IO);
505
506         if (event_pid_changed(s->event))
507                 return;
508
509         if (!s->io.registered)
510                 return;
511
512         r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
513         if (r < 0)
514                 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m",
515                                 strna(s->description), event_source_type_to_string(s->type));
516
517         s->io.registered = false;
518 }
519
520 static int source_io_register(
521                 sd_event_source *s,
522                 int enabled,
523                 uint32_t events) {
524
525         struct epoll_event ev = {};
526         int r;
527
528         assert(s);
529         assert(s->type == SOURCE_IO);
530         assert(enabled != SD_EVENT_OFF);
531
532         ev.events = events;
533         ev.data.ptr = s;
534
535         if (enabled == SD_EVENT_ONESHOT)
536                 ev.events |= EPOLLONESHOT;
537
538         if (s->io.registered)
539                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
540         else
541                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
542         if (r < 0)
543                 return -errno;
544
545         s->io.registered = true;
546
547         return 0;
548 }
549
#if 0 /// UNNEEDED by elogind
/* Inverse of clock_to_event_source_type(): maps a timer source type
 * back to its clockid_t, or (clockid_t) -1 for non-timer types.
 * (Compiled out in elogind.) */
static clockid_t event_source_type_to_clock(EventSourceType t) {

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return CLOCK_REALTIME;

        case SOURCE_TIME_BOOTTIME:
                return CLOCK_BOOTTIME;

        case SOURCE_TIME_MONOTONIC:
                return CLOCK_MONOTONIC;

        case SOURCE_TIME_REALTIME_ALARM:
                return CLOCK_REALTIME_ALARM;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return CLOCK_BOOTTIME_ALARM;

        default:
                return (clockid_t) -1;
        }
}
#endif // 0
575
576 static EventSourceType clock_to_event_source_type(clockid_t clock) {
577
578         switch (clock) {
579
580         case CLOCK_REALTIME:
581                 return SOURCE_TIME_REALTIME;
582
583         case CLOCK_BOOTTIME:
584                 return SOURCE_TIME_BOOTTIME;
585
586         case CLOCK_MONOTONIC:
587                 return SOURCE_TIME_MONOTONIC;
588
589         case CLOCK_REALTIME_ALARM:
590                 return SOURCE_TIME_REALTIME_ALARM;
591
592         case CLOCK_BOOTTIME_ALARM:
593                 return SOURCE_TIME_BOOTTIME_ALARM;
594
595         default:
596                 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
597         }
598 }
599
600 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
601         assert(e);
602
603         switch (t) {
604
605         case SOURCE_TIME_REALTIME:
606                 return &e->realtime;
607
608         case SOURCE_TIME_BOOTTIME:
609                 return &e->boottime;
610
611         case SOURCE_TIME_MONOTONIC:
612                 return &e->monotonic;
613
614         case SOURCE_TIME_REALTIME_ALARM:
615                 return &e->realtime_alarm;
616
617         case SOURCE_TIME_BOOTTIME_ALARM:
618                 return &e->boottime_alarm;
619
620         default:
621                 return NULL;
622         }
623 }
624
625 static int event_make_signal_data(
626                 sd_event *e,
627                 int sig,
628                 struct signal_data **ret) {
629
630         struct epoll_event ev = {};
631         struct signal_data *d;
632         bool added = false;
633         sigset_t ss_copy;
634         int64_t priority;
635         int r;
636
637         assert(e);
638
639         if (event_pid_changed(e))
640                 return -ECHILD;
641
642         if (e->signal_sources && e->signal_sources[sig])
643                 priority = e->signal_sources[sig]->priority;
644         else
645                 priority = 0;
646
647         d = hashmap_get(e->signal_data, &priority);
648         if (d) {
649                 if (sigismember(&d->sigset, sig) > 0) {
650                         if (ret)
651                                 *ret = d;
652                 return 0;
653                 }
654         } else {
655                 r = hashmap_ensure_allocated(&e->signal_data, &uint64_hash_ops);
656                 if (r < 0)
657                         return r;
658
659                 d = new0(struct signal_data, 1);
660                 if (!d)
661                         return -ENOMEM;
662
663                 d->wakeup = WAKEUP_SIGNAL_DATA;
664                 d->fd  = -1;
665                 d->priority = priority;
666
667                 r = hashmap_put(e->signal_data, &d->priority, d);
668                 if (r < 0) {
669                         free(d);
670                         return r;
671                 }
672
673                 added = true;
674         }
675
676         ss_copy = d->sigset;
677         assert_se(sigaddset(&ss_copy, sig) >= 0);
678
679         r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
680         if (r < 0) {
681                 r = -errno;
682                 goto fail;
683         }
684
685         d->sigset = ss_copy;
686
687         if (d->fd >= 0) {
688                 if (ret)
689                         *ret = d;
690                 return 0;
691         }
692
693         d->fd = r;
694
695         ev.events = EPOLLIN;
696         ev.data.ptr = d;
697
698         r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev);
699         if (r < 0) {
700                 r = -errno;
701                 goto fail;
702         }
703
704         if (ret)
705                 *ret = d;
706
707         return 0;
708
709 fail:
710         if (added) {
711                 d->fd = safe_close(d->fd);
712                 hashmap_remove(e->signal_data, &d->priority);
713                 free(d);
714         }
715
716         return r;
717 }
718
static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty that
         * way removes it. */

        if (sigismember(&d->sigset, sig) == 0)
                return; /* not watched by this object, nothing to do */

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {

                /* If all the mask is all-zero we can get rid of the structure */
                hashmap_remove(e->signal_data, &d->priority);
                assert(!d->current);
                safe_close(d->fd);
                free(d);
                return;
        }

        assert(d->fd >= 0);

        /* Otherwise shrink the kernel-side signalfd mask in place */
        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}
747
static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are
         * interested in. If not, we'll unmask it, and possibly drop
         * the signalfd for it. */

        /* SIGCHLD must stay watched while enabled child sources exist */
        if (sig == SIGCHLD &&
            e->n_enabled_child_sources > 0)
                return;

        /* Still wanted by an enabled signal source? Keep it. */
        if (e->signal_sources &&
            e->signal_sources[sig] &&
            e->signal_sources[sig]->enabled != SD_EVENT_OFF)
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}
793
/* Detaches an event source from its event loop: unregisters it from
 * the kernel (epoll/signalfd), removes it from all queues and from the
 * loop's source list. The source object itself stays allocated and is
 * freed separately (see source_free()). Safe to call on a source that
 * is already disconnected. */
static void source_disconnect(sd_event_source *s) {
        sd_event *event;

        assert(s);

        if (!s->event)
                return; /* already disconnected */

        assert(s->event->n_sources > 0);

        /* First, undo the per-type kernel/queue registration */
        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM: {
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_remove(d->earliest, s, &s->time.earliest_index);
                prioq_remove(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
                break;
        }

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        /* Possibly drop the signal from its signalfd */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                }

                break;

        case SOURCE_CHILD:
                if (s->child.pid > 0) {
                        if (s->enabled != SD_EVENT_OFF) {
                                assert(s->event->n_enabled_child_sources > 0);
                                s->event->n_enabled_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                        /* SIGCHLD may no longer be needed */
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                }

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        default:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        /* Then drop it from the generic queues */
        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        /* Finally, unlink from the loop and release the reference a
         * non-floating source held on it */
        event = s->event;

        s->type = _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        s->event = NULL;
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        if (!s->floating)
                sd_event_unref(event);
}
884
/* Fully destroys an event source: detaches it from its loop and frees
 * all memory. Called once its reference count drops to zero. */
static void source_free(sd_event_source *s) {
        assert(s);

        source_disconnect(s);
        free(s->description);
        free(s);
}
892
/* Marks an event source as pending (queued for dispatch) or not, and
 * keeps the pending prioq plus the per-clock prioqs in sync. Returns
 * 0 on success (including no-op), negative errno-style code if the
 * pending queue could not be extended. */
static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT); /* exit sources use their own queue */

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        /* roll back the flag so our state stays consistent */
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type)) {
                struct clock_data *d;

                /* The time prioqs order partly by pending state, hence reshuffle */
                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                prioq_reshuffle(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
        }

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                /* No longer pending: stop being the "current" source of
                 * our priority's signalfd */
                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        return 0;
}
936
937 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
938         sd_event_source *s;
939
940         assert(e);
941
942         s = new0(sd_event_source, 1);
943         if (!s)
944                 return NULL;
945
946         s->n_ref = 1;
947         s->event = e;
948         s->floating = floating;
949         s->type = type;
950         s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;
951
952         if (!floating)
953                 sd_event_ref(e);
954
955         LIST_PREPEND(sources, e->sources, s);
956         e->n_sources ++;
957
958         return s;
959 }
960
961 _public_ int sd_event_add_io(
962                 sd_event *e,
963                 sd_event_source **ret,
964                 int fd,
965                 uint32_t events,
966                 sd_event_io_handler_t callback,
967                 void *userdata) {
968
969         sd_event_source *s;
970         int r;
971
972         assert_return(e, -EINVAL);
973         assert_return(fd >= 0, -EBADF);
974         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
975         assert_return(callback, -EINVAL);
976         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
977         assert_return(!event_pid_changed(e), -ECHILD);
978
979         s = source_new(e, !ret, SOURCE_IO);
980         if (!s)
981                 return -ENOMEM;
982
983         s->wakeup = WAKEUP_EVENT_SOURCE;
984         s->io.fd = fd;
985         s->io.events = events;
986         s->io.callback = callback;
987         s->userdata = userdata;
988         s->enabled = SD_EVENT_ON;
989
990         r = source_io_register(s, s->enabled, events);
991         if (r < 0) {
992                 source_free(s);
993                 return r;
994         }
995
996         if (ret)
997                 *ret = s;
998
999         return 0;
1000 }
1001
1002 static void initialize_perturb(sd_event *e) {
1003         sd_id128_t bootid = {};
1004
1005         /* When we sleep for longer, we try to realign the wakeup to
1006            the same time wihtin each minute/second/250ms, so that
1007            events all across the system can be coalesced into a single
1008            CPU wakeup. However, let's take some system-specific
1009            randomness for this value, so that in a network of systems
1010            with synced clocks timer events are distributed a
1011            bit. Here, we calculate a perturbation usec offset from the
1012            boot ID. */
1013
1014         if (_likely_(e->perturb != USEC_INFINITY))
1015                 return;
1016
1017         if (sd_id128_get_boot(&bootid) >= 0)
1018                 e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
1019 }
1020
1021 static int event_setup_timer_fd(
1022                 sd_event *e,
1023                 struct clock_data *d,
1024                 clockid_t clock) {
1025
1026         struct epoll_event ev = {};
1027         int r, fd;
1028
1029         assert(e);
1030         assert(d);
1031
1032         if (_likely_(d->fd >= 0))
1033                 return 0;
1034
1035         fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1036         if (fd < 0)
1037                 return -errno;
1038
1039         ev.events = EPOLLIN;
1040         ev.data.ptr = d;
1041
1042         r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
1043         if (r < 0) {
1044                 safe_close(fd);
1045                 return -errno;
1046         }
1047
1048         d->fd = fd;
1049         return 0;
1050 }
1051
1052 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1053         assert(s);
1054
1055         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1056 }
1057
1058 _public_ int sd_event_add_time(
1059                 sd_event *e,
1060                 sd_event_source **ret,
1061                 clockid_t clock,
1062                 uint64_t usec,
1063                 uint64_t accuracy,
1064                 sd_event_time_handler_t callback,
1065                 void *userdata) {
1066
1067         EventSourceType type;
1068         sd_event_source *s;
1069         struct clock_data *d;
1070         int r;
1071
1072         assert_return(e, -EINVAL);
1073         assert_return(accuracy != (uint64_t) -1, -EINVAL);
1074         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1075         assert_return(!event_pid_changed(e), -ECHILD);
1076
1077         if (!callback)
1078                 callback = time_exit_callback;
1079
1080         type = clock_to_event_source_type(clock);
1081         assert_return(type >= 0, -EOPNOTSUPP);
1082
1083         d = event_get_clock_data(e, type);
1084         assert(d);
1085
1086         r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
1087         if (r < 0)
1088                 return r;
1089
1090         r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
1091         if (r < 0)
1092                 return r;
1093
1094         if (d->fd < 0) {
1095                 r = event_setup_timer_fd(e, d, clock);
1096                 if (r < 0)
1097                         return r;
1098         }
1099
1100         s = source_new(e, !ret, type);
1101         if (!s)
1102                 return -ENOMEM;
1103
1104         s->time.next = usec;
1105         s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
1106         s->time.callback = callback;
1107         s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
1108         s->userdata = userdata;
1109         s->enabled = SD_EVENT_ONESHOT;
1110
1111         d->needs_rearm = true;
1112
1113         r = prioq_put(d->earliest, s, &s->time.earliest_index);
1114         if (r < 0)
1115                 goto fail;
1116
1117         r = prioq_put(d->latest, s, &s->time.latest_index);
1118         if (r < 0)
1119                 goto fail;
1120
1121         if (ret)
1122                 *ret = s;
1123
1124         return 0;
1125
1126 fail:
1127         source_free(s);
1128         return r;
1129 }
1130
1131 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1132         assert(s);
1133
1134         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1135 }
1136
1137 _public_ int sd_event_add_signal(
1138                 sd_event *e,
1139                 sd_event_source **ret,
1140                 int sig,
1141                 sd_event_signal_handler_t callback,
1142                 void *userdata) {
1143
1144         sd_event_source *s;
1145         struct signal_data *d;
1146         sigset_t ss;
1147         int r;
1148
1149         assert_return(e, -EINVAL);
1150         assert_return(sig > 0, -EINVAL);
1151         assert_return(sig < _NSIG, -EINVAL);
1152         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1153         assert_return(!event_pid_changed(e), -ECHILD);
1154
1155         if (!callback)
1156                 callback = signal_exit_callback;
1157
1158         r = pthread_sigmask(SIG_SETMASK, NULL, &ss);
1159         if (r != 0)
1160                 return -r;
1161
1162         if (!sigismember(&ss, sig))
1163                 return -EBUSY;
1164
1165         if (!e->signal_sources) {
1166                 e->signal_sources = new0(sd_event_source*, _NSIG);
1167                 if (!e->signal_sources)
1168                         return -ENOMEM;
1169         } else if (e->signal_sources[sig])
1170                 return -EBUSY;
1171
1172         s = source_new(e, !ret, SOURCE_SIGNAL);
1173         if (!s)
1174                 return -ENOMEM;
1175
1176         s->signal.sig = sig;
1177         s->signal.callback = callback;
1178         s->userdata = userdata;
1179         s->enabled = SD_EVENT_ON;
1180
1181         e->signal_sources[sig] = s;
1182
1183         r = event_make_signal_data(e, sig, &d);
1184                 if (r < 0) {
1185                         source_free(s);
1186                         return r;
1187                 }
1188
1189         /* Use the signal name as description for the event source by default */
1190         (void) sd_event_source_set_description(s, signal_to_string(sig));
1191
1192         if (ret)
1193                 *ret = s;
1194
1195         return 0;
1196 }
1197
1198 #if 0 /// UNNEEDED by elogind
1199 _public_ int sd_event_add_child(
1200                 sd_event *e,
1201                 sd_event_source **ret,
1202                 pid_t pid,
1203                 int options,
1204                 sd_event_child_handler_t callback,
1205                 void *userdata) {
1206
1207         sd_event_source *s;
1208         int r;
1209
1210         assert_return(e, -EINVAL);
1211         assert_return(pid > 1, -EINVAL);
1212         assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1213         assert_return(options != 0, -EINVAL);
1214         assert_return(callback, -EINVAL);
1215         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1216         assert_return(!event_pid_changed(e), -ECHILD);
1217
1218         r = hashmap_ensure_allocated(&e->child_sources, NULL);
1219         if (r < 0)
1220                 return r;
1221
1222         if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1223                 return -EBUSY;
1224
1225         s = source_new(e, !ret, SOURCE_CHILD);
1226         if (!s)
1227                 return -ENOMEM;
1228
1229         s->child.pid = pid;
1230         s->child.options = options;
1231         s->child.callback = callback;
1232         s->userdata = userdata;
1233         s->enabled = SD_EVENT_ONESHOT;
1234
1235         r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1236         if (r < 0) {
1237                 source_free(s);
1238                 return r;
1239         }
1240
1241         e->n_enabled_child_sources ++;
1242
1243         r = event_make_signal_data(e, SIGCHLD, NULL);
1244                 if (r < 0) {
1245                 e->n_enabled_child_sources--;
1246                         source_free(s);
1247                         return r;
1248                 }
1249
1250         e->need_process_child = true;
1251
1252         if (ret)
1253                 *ret = s;
1254
1255         return 0;
1256 }
1257
1258 _public_ int sd_event_add_defer(
1259                 sd_event *e,
1260                 sd_event_source **ret,
1261                 sd_event_handler_t callback,
1262                 void *userdata) {
1263
1264         sd_event_source *s;
1265         int r;
1266
1267         assert_return(e, -EINVAL);
1268         assert_return(callback, -EINVAL);
1269         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1270         assert_return(!event_pid_changed(e), -ECHILD);
1271
1272         s = source_new(e, !ret, SOURCE_DEFER);
1273         if (!s)
1274                 return -ENOMEM;
1275
1276         s->defer.callback = callback;
1277         s->userdata = userdata;
1278         s->enabled = SD_EVENT_ONESHOT;
1279
1280         r = source_set_pending(s, true);
1281         if (r < 0) {
1282                 source_free(s);
1283                 return r;
1284         }
1285
1286         if (ret)
1287                 *ret = s;
1288
1289         return 0;
1290 }
1291 #endif // 0
1292
1293 _public_ int sd_event_add_post(
1294                 sd_event *e,
1295                 sd_event_source **ret,
1296                 sd_event_handler_t callback,
1297                 void *userdata) {
1298
1299         sd_event_source *s;
1300         int r;
1301
1302         assert_return(e, -EINVAL);
1303         assert_return(callback, -EINVAL);
1304         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1305         assert_return(!event_pid_changed(e), -ECHILD);
1306
1307         r = set_ensure_allocated(&e->post_sources, NULL);
1308         if (r < 0)
1309                 return r;
1310
1311         s = source_new(e, !ret, SOURCE_POST);
1312         if (!s)
1313                 return -ENOMEM;
1314
1315         s->post.callback = callback;
1316         s->userdata = userdata;
1317         s->enabled = SD_EVENT_ON;
1318
1319         r = set_put(e->post_sources, s);
1320         if (r < 0) {
1321                 source_free(s);
1322                 return r;
1323         }
1324
1325         if (ret)
1326                 *ret = s;
1327
1328         return 0;
1329 }
1330
1331 _public_ int sd_event_add_exit(
1332                 sd_event *e,
1333                 sd_event_source **ret,
1334                 sd_event_handler_t callback,
1335                 void *userdata) {
1336
1337         sd_event_source *s;
1338         int r;
1339
1340         assert_return(e, -EINVAL);
1341         assert_return(callback, -EINVAL);
1342         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1343         assert_return(!event_pid_changed(e), -ECHILD);
1344
1345         r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1346         if (r < 0)
1347                 return r;
1348
1349         s = source_new(e, !ret, SOURCE_EXIT);
1350         if (!s)
1351                 return -ENOMEM;
1352
1353         s->exit.callback = callback;
1354         s->userdata = userdata;
1355         s->exit.prioq_index = PRIOQ_IDX_NULL;
1356         s->enabled = SD_EVENT_ONESHOT;
1357
1358         r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1359         if (r < 0) {
1360                 source_free(s);
1361                 return r;
1362         }
1363
1364         if (ret)
1365                 *ret = s;
1366
1367         return 0;
1368 }
1369
1370 #if 0 /// UNNEEDED by elogind
1371 _public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
1372
1373         if (!s)
1374                 return NULL;
1375
1376         assert(s->n_ref >= 1);
1377         s->n_ref++;
1378
1379         return s;
1380 }
1381 #endif // 0
1382
1383 _public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {
1384
1385         if (!s)
1386                 return NULL;
1387
1388         assert(s->n_ref >= 1);
1389         s->n_ref--;
1390
1391         if (s->n_ref <= 0) {
1392                 /* Here's a special hack: when we are called from a
1393                  * dispatch handler we won't free the event source
1394                  * immediately, but we will detach the fd from the
1395                  * epoll. This way it is safe for the caller to unref
1396                  * the event source and immediately close the fd, but
1397                  * we still retain a valid event source object after
1398                  * the callback. */
1399
1400                 if (s->dispatching) {
1401                         if (s->type == SOURCE_IO)
1402                                 source_io_unregister(s);
1403
1404                         source_disconnect(s);
1405                 } else
1406                         source_free(s);
1407         }
1408
1409         return NULL;
1410 }
1411
1412 _public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
1413         assert_return(s, -EINVAL);
1414         assert_return(!event_pid_changed(s->event), -ECHILD);
1415
1416         return free_and_strdup(&s->description, description);
1417 }
1418
1419 #if 0 /// UNNEEDED by elogind
1420 _public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
1421         assert_return(s, -EINVAL);
1422         assert_return(description, -EINVAL);
1423         assert_return(s->description, -ENXIO);
1424         assert_return(!event_pid_changed(s->event), -ECHILD);
1425
1426         *description = s->description;
1427         return 0;
1428 }
1429 #endif // 0
1430
1431 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
1432         assert_return(s, NULL);
1433
1434         return s->event;
1435 }
1436
1437 #if 0 /// UNNEEDED by elogind
1438 _public_ int sd_event_source_get_pending(sd_event_source *s) {
1439         assert_return(s, -EINVAL);
1440         assert_return(s->type != SOURCE_EXIT, -EDOM);
1441         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1442         assert_return(!event_pid_changed(s->event), -ECHILD);
1443
1444         return s->pending;
1445 }
1446
1447 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
1448         assert_return(s, -EINVAL);
1449         assert_return(s->type == SOURCE_IO, -EDOM);
1450         assert_return(!event_pid_changed(s->event), -ECHILD);
1451
1452         return s->io.fd;
1453 }
1454 #endif // 0
1455
1456 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
1457         int r;
1458
1459         assert_return(s, -EINVAL);
1460         assert_return(fd >= 0, -EBADF);
1461         assert_return(s->type == SOURCE_IO, -EDOM);
1462         assert_return(!event_pid_changed(s->event), -ECHILD);
1463
1464         if (s->io.fd == fd)
1465                 return 0;
1466
1467         if (s->enabled == SD_EVENT_OFF) {
1468                 s->io.fd = fd;
1469                 s->io.registered = false;
1470         } else {
1471                 int saved_fd;
1472
1473                 saved_fd = s->io.fd;
1474                 assert(s->io.registered);
1475
1476                 s->io.fd = fd;
1477                 s->io.registered = false;
1478
1479                 r = source_io_register(s, s->enabled, s->io.events);
1480                 if (r < 0) {
1481                         s->io.fd = saved_fd;
1482                         s->io.registered = true;
1483                         return r;
1484                 }
1485
1486                 epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
1487         }
1488
1489         return 0;
1490 }
1491
1492 #if 0 /// UNNEEDED by elogind
1493 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
1494         assert_return(s, -EINVAL);
1495         assert_return(events, -EINVAL);
1496         assert_return(s->type == SOURCE_IO, -EDOM);
1497         assert_return(!event_pid_changed(s->event), -ECHILD);
1498
1499         *events = s->io.events;
1500         return 0;
1501 }
1502 #endif // 0
1503
1504 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
1505         int r;
1506
1507         assert_return(s, -EINVAL);
1508         assert_return(s->type == SOURCE_IO, -EDOM);
1509         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1510         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1511         assert_return(!event_pid_changed(s->event), -ECHILD);
1512
1513         /* edge-triggered updates are never skipped, so we can reset edges */
1514         if (s->io.events == events && !(events & EPOLLET))
1515                 return 0;
1516
1517         if (s->enabled != SD_EVENT_OFF) {
1518                 r = source_io_register(s, s->enabled, events);
1519                 if (r < 0)
1520                         return r;
1521         }
1522
1523         s->io.events = events;
1524         source_set_pending(s, false);
1525
1526         return 0;
1527 }
1528
1529 #if 0 /// UNNEEDED by elogind
1530 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
1531         assert_return(s, -EINVAL);
1532         assert_return(revents, -EINVAL);
1533         assert_return(s->type == SOURCE_IO, -EDOM);
1534         assert_return(s->pending, -ENODATA);
1535         assert_return(!event_pid_changed(s->event), -ECHILD);
1536
1537         *revents = s->io.revents;
1538         return 0;
1539 }
1540
1541 _public_ int sd_event_source_get_signal(sd_event_source *s) {
1542         assert_return(s, -EINVAL);
1543         assert_return(s->type == SOURCE_SIGNAL, -EDOM);
1544         assert_return(!event_pid_changed(s->event), -ECHILD);
1545
1546         return s->signal.sig;
1547 }
1548
1549 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
1550         assert_return(s, -EINVAL);
1551         assert_return(!event_pid_changed(s->event), -ECHILD);
1552
1553         return s->priority;
1554 }
1555 #endif // 0
1556
1557 _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
1558         int r;
1559
1560         assert_return(s, -EINVAL);
1561         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1562         assert_return(!event_pid_changed(s->event), -ECHILD);
1563
1564         if (s->priority == priority)
1565                 return 0;
1566
1567         if (s->type == SOURCE_SIGNAL && s->enabled != SD_EVENT_OFF) {
1568                 struct signal_data *old, *d;
1569
1570                 /* Move us from the signalfd belonging to the old
1571                  * priority to the signalfd of the new priority */
1572
1573                 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
1574
1575                 s->priority = priority;
1576
1577                 r = event_make_signal_data(s->event, s->signal.sig, &d);
1578                 if (r < 0) {
1579                         s->priority = old->priority;
1580                         return r;
1581                 }
1582
1583                 event_unmask_signal_data(s->event, old, s->signal.sig);
1584         } else
1585         s->priority = priority;
1586
1587         if (s->pending)
1588                 prioq_reshuffle(s->event->pending, s, &s->pending_index);
1589
1590         if (s->prepare)
1591                 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
1592
1593         if (s->type == SOURCE_EXIT)
1594                 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1595
1596         return 0;
1597 }
1598
1599 #if 0 /// UNNEEDED by elogind
1600 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
1601         assert_return(s, -EINVAL);
1602         assert_return(m, -EINVAL);
1603         assert_return(!event_pid_changed(s->event), -ECHILD);
1604
1605         *m = s->enabled;
1606         return 0;
1607 }
1608 #endif // 0
1609
1610 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
1611         int r;
1612
1613         assert_return(s, -EINVAL);
1614         assert_return(m == SD_EVENT_OFF || m == SD_EVENT_ON || m == SD_EVENT_ONESHOT, -EINVAL);
1615         assert_return(!event_pid_changed(s->event), -ECHILD);
1616
1617         /* If we are dead anyway, we are fine with turning off
1618          * sources, but everything else needs to fail. */
1619         if (s->event->state == SD_EVENT_FINISHED)
1620                 return m == SD_EVENT_OFF ? 0 : -ESTALE;
1621
1622         if (s->enabled == m)
1623                 return 0;
1624
1625         if (m == SD_EVENT_OFF) {
1626
1627                 switch (s->type) {
1628
1629                 case SOURCE_IO:
1630                         source_io_unregister(s);
1631                         s->enabled = m;
1632                         break;
1633
1634                 case SOURCE_TIME_REALTIME:
1635                 case SOURCE_TIME_BOOTTIME:
1636                 case SOURCE_TIME_MONOTONIC:
1637                 case SOURCE_TIME_REALTIME_ALARM:
1638                 case SOURCE_TIME_BOOTTIME_ALARM: {
1639                         struct clock_data *d;
1640
1641                         s->enabled = m;
1642                         d = event_get_clock_data(s->event, s->type);
1643                         assert(d);
1644
1645                         prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
1646                         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1647                         d->needs_rearm = true;
1648                         break;
1649                 }
1650
1651                 case SOURCE_SIGNAL:
1652                         s->enabled = m;
1653
1654                         event_gc_signal_data(s->event, &s->priority, s->signal.sig);
1655                         break;
1656
1657                 case SOURCE_CHILD:
1658                         s->enabled = m;
1659
1660                         assert(s->event->n_enabled_child_sources > 0);
1661                         s->event->n_enabled_child_sources--;
1662
1663                         event_gc_signal_data(s->event, &s->priority, SIGCHLD);
1664                         break;
1665
1666                 case SOURCE_EXIT:
1667                         s->enabled = m;
1668                         prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1669                         break;
1670
1671                 case SOURCE_DEFER:
1672                 case SOURCE_POST:
1673                         s->enabled = m;
1674                         break;
1675
1676                 default:
1677                         assert_not_reached("Wut? I shouldn't exist.");
1678                 }
1679
1680         } else {
1681                 switch (s->type) {
1682
1683                 case SOURCE_IO:
1684                         r = source_io_register(s, m, s->io.events);
1685                         if (r < 0)
1686                                 return r;
1687
1688                         s->enabled = m;
1689                         break;
1690
1691                 case SOURCE_TIME_REALTIME:
1692                 case SOURCE_TIME_BOOTTIME:
1693                 case SOURCE_TIME_MONOTONIC:
1694                 case SOURCE_TIME_REALTIME_ALARM:
1695                 case SOURCE_TIME_BOOTTIME_ALARM: {
1696                         struct clock_data *d;
1697
1698                         s->enabled = m;
1699                         d = event_get_clock_data(s->event, s->type);
1700                         assert(d);
1701
1702                         prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
1703                         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1704                         d->needs_rearm = true;
1705                         break;
1706                 }
1707
1708                 case SOURCE_SIGNAL:
1709
1710                         s->enabled = m;
1711
1712                         r = event_make_signal_data(s->event, s->signal.sig, NULL);
1713                                 if (r < 0) {
1714                                         s->enabled = SD_EVENT_OFF;
1715                                 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
1716                                         return r;
1717                                 }
1718
1719                         break;
1720
1721                 case SOURCE_CHILD:
1722
1723                         if (s->enabled == SD_EVENT_OFF)
1724                                 s->event->n_enabled_child_sources++;
1725
1726                         s->enabled = m;
1727
1728                         r = event_make_signal_data(s->event, SIGCHLD, NULL);
1729                                         if (r < 0) {
1730                                                 s->enabled = SD_EVENT_OFF;
1731                                 s->event->n_enabled_child_sources--;
1732                                 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
1733                                                 return r;
1734                                         }
1735
1736                         break;
1737
1738                 case SOURCE_EXIT:
1739                         s->enabled = m;
1740                         prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1741                         break;
1742
1743                 case SOURCE_DEFER:
1744                 case SOURCE_POST:
1745                         s->enabled = m;
1746                         break;
1747
1748                 default:
1749                         assert_not_reached("Wut? I shouldn't exist.");
1750                 }
1751         }
1752
1753         if (s->pending)
1754                 prioq_reshuffle(s->event->pending, s, &s->pending_index);
1755
1756         if (s->prepare)
1757                 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
1758
1759         return 0;
1760 }
1761
1762 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
1763         assert_return(s, -EINVAL);
1764         assert_return(usec, -EINVAL);
1765         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
1766         assert_return(!event_pid_changed(s->event), -ECHILD);
1767
1768         *usec = s->time.next;
1769         return 0;
1770 }
1771
1772 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
1773         struct clock_data *d;
1774
1775         assert_return(s, -EINVAL);
1776         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
1777         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1778         assert_return(!event_pid_changed(s->event), -ECHILD);
1779
1780         s->time.next = usec;
1781
1782         source_set_pending(s, false);
1783
1784         d = event_get_clock_data(s->event, s->type);
1785         assert(d);
1786
1787         prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
1788         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1789         d->needs_rearm = true;
1790
1791         return 0;
1792 }
1793
1794 #if 0 /// UNNEEDED by elogind
1795 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
1796         assert_return(s, -EINVAL);
1797         assert_return(usec, -EINVAL);
1798         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
1799         assert_return(!event_pid_changed(s->event), -ECHILD);
1800
1801         *usec = s->time.accuracy;
1802         return 0;
1803 }
1804
1805 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
1806         struct clock_data *d;
1807
1808         assert_return(s, -EINVAL);
1809         assert_return(usec != (uint64_t) -1, -EINVAL);
1810         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
1811         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1812         assert_return(!event_pid_changed(s->event), -ECHILD);
1813
1814         if (usec == 0)
1815                 usec = DEFAULT_ACCURACY_USEC;
1816
1817         s->time.accuracy = usec;
1818
1819         source_set_pending(s, false);
1820
1821         d = event_get_clock_data(s->event, s->type);
1822         assert(d);
1823
1824         prioq_reshuffle(d->latest, s, &s->time.latest_index);
1825         d->needs_rearm = true;
1826
1827         return 0;
1828 }
1829
1830 _public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
1831         assert_return(s, -EINVAL);
1832         assert_return(clock, -EINVAL);
1833         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
1834         assert_return(!event_pid_changed(s->event), -ECHILD);
1835
1836         *clock = event_source_type_to_clock(s->type);
1837         return 0;
1838 }
1839
1840 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
1841         assert_return(s, -EINVAL);
1842         assert_return(pid, -EINVAL);
1843         assert_return(s->type == SOURCE_CHILD, -EDOM);
1844         assert_return(!event_pid_changed(s->event), -ECHILD);
1845
1846         *pid = s->child.pid;
1847         return 0;
1848 }
1849 #endif // 0
1850
1851 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
1852         int r;
1853
1854         assert_return(s, -EINVAL);
1855         assert_return(s->type != SOURCE_EXIT, -EDOM);
1856         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1857         assert_return(!event_pid_changed(s->event), -ECHILD);
1858
1859         if (s->prepare == callback)
1860                 return 0;
1861
1862         if (callback && s->prepare) {
1863                 s->prepare = callback;
1864                 return 0;
1865         }
1866
1867         r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
1868         if (r < 0)
1869                 return r;
1870
1871         s->prepare = callback;
1872
1873         if (callback) {
1874                 r = prioq_put(s->event->prepare, s, &s->prepare_index);
1875                 if (r < 0)
1876                         return r;
1877         } else
1878                 prioq_remove(s->event->prepare, s, &s->prepare_index);
1879
1880         return 0;
1881 }
1882
1883 #if 0 /// UNNEEDED by elogind
1884 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
1885         assert_return(s, NULL);
1886
1887         return s->userdata;
1888 }
1889
1890 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
1891         void *ret;
1892
1893         assert_return(s, NULL);
1894
1895         ret = s->userdata;
1896         s->userdata = userdata;
1897
1898         return ret;
1899 }
1900 #endif // 0
1901
/* Picks a wakeup time within the window [a, b] (absolute usec on the
 * relevant clock), preferring instants shared with other event loops
 * on the system; see the big comment below for the rationale. */
static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
        usec_t c;
        assert(e);
        assert(a <= b);

        /* Degenerate windows first: a == 0 (usec_t is unsigned) or infinity. */
        if (a <= 0)
                return 0;
        if (a >= USEC_INFINITY)
                return USEC_INFINITY;

        /* Window of width <= 1: no room to synchronise anything. */
        if (b <= a + 1)
                return a;

        initialize_perturb(e);

        /*
          Find a good time to wake up again between times a and b. We
          have two goals here:

          a) We want to wake up as seldom as possible, hence prefer
             later times over earlier times.

          b) But if we have to wake up, then let's make sure to
             dispatch as much as possible on the entire system.

          We implement this by waking up everywhere at the same time
          within any given minute if we can, synchronised via the
          perturbation value determined from the boot ID. If we can't,
          then we try to find the same spot in every 10s, then 1s and
          then 250ms step. Otherwise, we pick the last possible time
          to wake up.
        */

        /* Per-minute slot: latest minute boundary <= b, plus the perturbation. */
        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
        if (c >= b) {
                /* Step back one interval; bail out to b if that would underflow. */
                if (_unlikely_(c < USEC_PER_MINUTE))
                        return b;

                c -= USEC_PER_MINUTE;
        }

        if (c >= a)
                return c;

        /* Same idea at 10s granularity. */
        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC*10))
                        return b;

                c -= USEC_PER_SEC*10;
        }

        if (c >= a)
                return c;

        /* 1s granularity. */
        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC))
                        return b;

                c -= USEC_PER_SEC;
        }

        if (c >= a)
                return c;

        /* 250ms granularity, the finest we try. */
        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MSEC*250))
                        return b;

                c -= USEC_PER_MSEC*250;
        }

        if (c >= a)
                return c;

        /* No synchronised slot fits the window: wake at the latest allowed time. */
        return b;
}
1981
1982 static int event_arm_timer(
1983                 sd_event *e,
1984                 struct clock_data *d) {
1985
1986         struct itimerspec its = {};
1987         sd_event_source *a, *b;
1988         usec_t t;
1989         int r;
1990
1991         assert(e);
1992         assert(d);
1993
1994         if (!d->needs_rearm)
1995                 return 0;
1996         else
1997                 d->needs_rearm = false;
1998
1999         a = prioq_peek(d->earliest);
2000         if (!a || a->enabled == SD_EVENT_OFF || a->time.next == USEC_INFINITY) {
2001
2002                 if (d->fd < 0)
2003                         return 0;
2004
2005                 if (d->next == USEC_INFINITY)
2006                         return 0;
2007
2008                 /* disarm */
2009                 r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2010                 if (r < 0)
2011                         return r;
2012
2013                 d->next = USEC_INFINITY;
2014                 return 0;
2015         }
2016
2017         b = prioq_peek(d->latest);
2018         assert_se(b && b->enabled != SD_EVENT_OFF);
2019
2020         t = sleep_between(e, a->time.next, time_event_source_latest(b));
2021         if (d->next == t)
2022                 return 0;
2023
2024         assert_se(d->fd >= 0);
2025
2026         if (t == 0) {
2027                 /* We don' want to disarm here, just mean some time looooong ago. */
2028                 its.it_value.tv_sec = 0;
2029                 its.it_value.tv_nsec = 1;
2030         } else
2031                 timespec_store(&its.it_value, t);
2032
2033         r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2034         if (r < 0)
2035                 return -errno;
2036
2037         d->next = t;
2038         return 0;
2039 }
2040
2041 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
2042         assert(e);
2043         assert(s);
2044         assert(s->type == SOURCE_IO);
2045
2046         /* If the event source was already pending, we just OR in the
2047          * new revents, otherwise we reset the value. The ORing is
2048          * necessary to handle EPOLLONESHOT events properly where
2049          * readability might happen independently of writability, and
2050          * we need to keep track of both */
2051
2052         if (s->pending)
2053                 s->io.revents |= revents;
2054         else
2055                 s->io.revents = revents;
2056
2057         return source_set_pending(s, true);
2058 }
2059
2060 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
2061         uint64_t x;
2062         ssize_t ss;
2063
2064         assert(e);
2065         assert(fd >= 0);
2066
2067         assert_return(events == EPOLLIN, -EIO);
2068
2069         ss = read(fd, &x, sizeof(x));
2070         if (ss < 0) {
2071                 if (errno == EAGAIN || errno == EINTR)
2072                         return 0;
2073
2074                 return -errno;
2075         }
2076
2077         if (_unlikely_(ss != sizeof(x)))
2078                 return -EIO;
2079
2080         if (next)
2081                 *next = USEC_INFINITY;
2082
2083         return 0;
2084 }
2085
/* Marks every timer source on clock 'd' whose deadline is at or before
 * 'n' (the current time on that clock, in usec) as pending. */
static int process_timer(
                sd_event *e,
                usec_t n,
                struct clock_data *d) {

        sd_event_source *s;
        int r;

        assert(e);
        assert(d);

        for (;;) {
                /* The 'earliest' prioq presumably yields sources ordered by
                 * next elapse (hence the early break on s->time.next > n) —
                 * confirm against the prioq comparator if in doubt. Disabled
                 * and already-pending sources at the head also end traversal. */
                s = prioq_peek(d->earliest);
                if (!s ||
                    s->time.next > n ||
                    s->enabled == SD_EVENT_OFF ||
                    s->pending)
                        break;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                /* Becoming pending changes the source's sort position in both
                 * queues, and the timerfd has to be re-armed for what remains. */
                prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                prioq_reshuffle(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
        }

        return 0;
}
2116
/* Polls (without reaping) every watched child process and marks the
 * matching child event sources pending. Actual reaping happens later,
 * in source_dispatch(), after the user callback has seen the zombie. */
static int process_child(sd_event *e) {
        sd_event_source *s;
        Iterator i;
        int r;

        assert(e);

        e->need_process_child = false;

        /*
           So, this is ugly. We iteratively invoke waitid() with P_PID
           + WNOHANG for each PID we wait for, instead of using
           P_ALL. This is because we only want to get child
           information of very specific child processes, and not all
           of them. We might not have processed the SIGCHLD event of a
           previous invocation and we don't want to maintain an
           unbounded *per-child* event queue, hence we really don't
           want anything flushed out of the kernel's queue that we
           don't care about. Since this is O(n) this means that if you
           have a lot of processes you probably want to handle SIGCHLD
           yourself.

           We do not reap the children here (by using WNOWAIT), this
           is only done after the event source is dispatched so that
           the callback still sees the process as a zombie.
        */

        HASHMAP_FOREACH(s, e->child_sources, i) {
                assert(s->type == SOURCE_CHILD);

                if (s->pending)
                        continue;

                if (s->enabled == SD_EVENT_OFF)
                        continue;

                zero(s->child.siginfo);
                /* WNOWAIT is passed only when the caller asked for WEXITED,
                 * so that the exit status stays queued for the final reap. */
                r = waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
                if (r < 0)
                        return -errno;

                /* si_pid != 0 means a state change was actually reported. */
                if (s->child.siginfo.si_pid != 0) {
                        bool zombie =
                                s->child.siginfo.si_code == CLD_EXITED ||
                                s->child.siginfo.si_code == CLD_KILLED ||
                                s->child.siginfo.si_code == CLD_DUMPED;

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's
                                 * immediately remove the state change
                                 * from the queue, since there's no
                                 * benefit in leaving it queued */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
                        if (r < 0)
                                return r;
                }
        }

        return 0;
}
2183
/* Dequeues at most one signal from the signalfd of priority group 'd'
 * and marks the matching signal source pending. Returns 1 if a source
 * was marked, 0 if nothing relevant was dequeued, negative errno-style
 * code on failure. */
static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
        bool read_one = false;
        int r;

        assert(e);
        assert_return(events == EPOLLIN, -EIO);

        /* If there's a signal queued on this priority and SIGCHLD is
           on this priority too, then make sure to recheck the
           children we watch. This is because we only ever dequeue
           the first signal per priority, and if we dequeue one, and
           SIGCHLD might be enqueued later we wouldn't know, but we
           might have higher priority children we care about hence we
           need to check that explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

        /* If there's already an event source pending for this
         * priority we don't read another */
        if (d->current)
                return 0;

        for (;;) {
                struct signalfd_siginfo si;
                ssize_t n;
                sd_event_source *s = NULL;

                n = read(d->fd, &si, sizeof(si));
                if (n < 0) {
                        /* Queue drained (or read interrupted): report whether
                         * we dequeued anything this round. */
                        if (errno == EAGAIN || errno == EINTR)
                                return read_one;

                        return -errno;
                }

                /* signalfd always delivers whole records. */
                if (_unlikely_(n != sizeof(si)))
                        return -EIO;

                assert(si.ssi_signo < _NSIG);

                read_one = true;

                /* Nobody watches this signal? Then drop it and keep draining. */
                if (e->signal_sources)
                        s = e->signal_sources[si.ssi_signo];
                if (!s)
                        continue;
                if (s->pending)
                        continue;

                /* Stash the siginfo for the callback and mark this source as
                 * current, which blocks further reads (see check above) until
                 * it has been dispatched. */
                s->signal.siginfo = si;
                d->current = s;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                return 1;
        }
}
2244
/* Invokes the user callback of a single event source, with all the
 * per-dispatch bookkeeping around it. Returns 1 on success or a
 * negative errno-style code if the bookkeeping itself fails; a failing
 * user callback only gets the source disabled (see bottom). */
static int source_dispatch(sd_event_source *s) {
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        /* Defer and exit sources stay queued across dispatches; all other
         * types are unqueued before their callback runs. */
        if (s->type != SOURCE_DEFER && s->type != SOURCE_EXIT) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        if (s->type != SOURCE_POST) {
                sd_event_source *z;
                Iterator i;

                /* If we execute a non-post source, let's mark all
                 * post sources as pending */

                SET_FOREACH(z, s->event->post_sources, i) {
                        if (z->enabled == SD_EVENT_OFF)
                                continue;

                        r = source_set_pending(z, true);
                        if (r < 0)
                                return r;
                }
        }

        /* ONESHOT sources are disabled before the callback runs, so the
         * callback may re-enable them if it wants another shot. */
        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        /* NOTE(review): 'dispatching' presumably makes unref defer the
         * actual free until after the callback returns (see the n_ref == 0
         * check below) — confirm against sd_event_source_unref(). */
        s->dispatching = true;

        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                /* Capture the zombie state before the callback, which may
                 * clobber s->child.siginfo indirectly. */
                zombie = s->child.siginfo.si_code == CLD_EXITED ||
                         s->child.siginfo.si_code == CLD_KILLED ||
                         s->child.siginfo.si_code == CLD_DUMPED;

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie)
                        waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_POST:
                r = s->post.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        s->dispatching = false;

        if (r < 0)
                log_debug_errno(r, "Event source %s (type %s) returned error, disabling: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        /* If the callback dropped the last reference, free the source now;
         * otherwise a failing callback merely gets disabled. */
        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                sd_event_source_set_enabled(s, SD_EVENT_OFF);

        return 1;
}
2347
/* Runs the prepare() callback of every enabled source that has one,
 * at most once per loop iteration (tracked via prepare_iteration). */
static int event_prepare(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                /* NOTE(review): this relies on the 'prepare' prioq sorting
                 * enabled, not-yet-prepared sources to the front — confirm
                 * against the prioq comparator. */
                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
                        break;

                /* Mark as prepared for this iteration and re-sort before the
                 * callback runs, so we terminate even if it changes state. */
                s->prepare_iteration = e->iteration;
                r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;

                assert(s->prepare);

                s->dispatching = true;
                r = s->prepare(s, s->userdata);
                s->dispatching = false;

                if (r < 0)
                        log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, disabling: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

                /* The callback may have dropped the last reference. */
                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        sd_event_source_set_enabled(s, SD_EVENT_OFF);
        }

        return 0;
}
2383
2384 static int dispatch_exit(sd_event *e) {
2385         sd_event_source *p;
2386         int r;
2387
2388         assert(e);
2389
2390         p = prioq_peek(e->exit);
2391         if (!p || p->enabled == SD_EVENT_OFF) {
2392                 e->state = SD_EVENT_FINISHED;
2393                 return 0;
2394         }
2395
2396         sd_event_ref(e);
2397         e->iteration++;
2398         e->state = SD_EVENT_EXITING;
2399
2400         r = source_dispatch(p);
2401
2402         e->state = SD_EVENT_INITIAL;
2403         sd_event_unref(e);
2404
2405         return r;
2406 }
2407
2408 static sd_event_source* event_next_pending(sd_event *e) {
2409         sd_event_source *p;
2410
2411         assert(e);
2412
2413         p = prioq_peek(e->pending);
2414         if (!p)
2415                 return NULL;
2416
2417         if (p->enabled == SD_EVENT_OFF)
2418                 return NULL;
2419
2420         return p;
2421 }
2422
2423 static int arm_watchdog(sd_event *e) {
2424         struct itimerspec its = {};
2425         usec_t t;
2426         int r;
2427
2428         assert(e);
2429         assert(e->watchdog_fd >= 0);
2430
2431         t = sleep_between(e,
2432                           e->watchdog_last + (e->watchdog_period / 2),
2433                           e->watchdog_last + (e->watchdog_period * 3 / 4));
2434
2435         timespec_store(&its.it_value, t);
2436
2437         /* Make sure we never set the watchdog to 0, which tells the
2438          * kernel to disable it. */
2439         if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
2440                 its.it_value.tv_nsec = 1;
2441
2442         r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
2443         if (r < 0)
2444                 return -errno;
2445
2446         return 0;
2447 }
2448
2449 static int process_watchdog(sd_event *e) {
2450         assert(e);
2451
2452         if (!e->watchdog)
2453                 return 0;
2454
2455         /* Don't notify watchdog too often */
2456         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
2457                 return 0;
2458
2459         sd_notify(false, "WATCHDOG=1");
2460         e->watchdog_last = e->timestamp.monotonic;
2461
2462         return arm_watchdog(e);
2463 }
2464
/* First step of a loop iteration: runs prepare callbacks and (re)arms
 * the per-clock timerfds. Returns > 0 if events are dispatchable right
 * away, 0 if the caller should go on to sd_event_wait(), negative
 * errno-style code on failure. */
_public_ int sd_event_prepare(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        if (e->exit_requested)
                goto pending;

        e->iteration++;

        e->state = SD_EVENT_PREPARING;
        r = event_prepare(e);
        e->state = SD_EVENT_INITIAL;
        if (r < 0)
                return r;

        /* Re-arm the timerfd of every clock whose queue changed. */
        r = event_arm_timer(e, &e->realtime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->monotonic);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime_alarm);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime_alarm);
        if (r < 0)
                return r;

        if (event_next_pending(e) || e->need_process_child)
                goto pending;

        e->state = SD_EVENT_ARMED;

        return 0;

pending:
        /* Something is dispatchable already: run a zero-timeout wait so the
         * state machine still passes through ARMED on its way to PENDING. */
        e->state = SD_EVENT_ARMED;
        r = sd_event_wait(e, 0);
        if (r == 0)
                e->state = SD_EVENT_ARMED;

        return r;
}
2519
/* Waits up to 'timeout' usec ((uint64_t) -1 meaning forever) for events
 * on the armed loop, flushes the kernel-side queues (epoll, timerfds,
 * signalfds, children) and marks sources pending. Returns 1 (state
 * PENDING) if something is dispatchable, 0 (state INITIAL) on timeout,
 * negative errno-style code on failure. */
_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
        struct epoll_event *ev_queue;
        unsigned ev_queue_max;
        int r, m, i;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        /* One slot per source suffices; at least one even for an empty loop. */
        ev_queue_max = MAX(e->n_sources, 1u);
        ev_queue = newa(struct epoll_event, ev_queue_max);

        /* Convert the usec timeout to ms, rounding up so we never wake early. */
        m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max,
                       timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
        if (m < 0) {
                if (errno == EINTR) {
                        e->state = SD_EVENT_PENDING;
                        return 1;
                }

                r = -errno;
                goto finish;
        }

        /* Cache "now" on all clocks for this iteration (see sd_event_now()). */
        dual_timestamp_get(&e->timestamp);
        e->timestamp_boottime = now(CLOCK_BOOTTIME);

        /* Route each epoll event by the wakeup type stored in data.ptr. */
        for (i = 0; i < m; i++) {

                if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
                else {
                        WakeupType *t = ev_queue[i].data.ptr;

                        switch (*t) {

                        case WAKEUP_EVENT_SOURCE:
                        r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = ev_queue[i].data.ptr;
                                r = flush_timer(e, d->fd, ev_queue[i].events, &d->next);
                                break;
                        }

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        default:
                                assert_not_reached("Invalid wake-up pointer");
                        }
                }
                if (r < 0)
                        goto finish;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        /* Elapse timer sources on each clock against the cached timestamps. */
        r = process_timer(e, e->timestamp.realtime, &e->realtime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp_boottime, &e->boottime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp_boottime, &e->boottime_alarm);
        if (r < 0)
                goto finish;

        /* Set by process_signal() when SIGCHLD may have been dequeued. */
        if (e->need_process_child) {
                r = process_child(e);
                if (r < 0)
                        goto finish;
        }

        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;

                return 1;
        }

        r = 0;

finish:
        e->state = SD_EVENT_INITIAL;

        return r;
}
2627
2628 _public_ int sd_event_dispatch(sd_event *e) {
2629         sd_event_source *p;
2630         int r;
2631
2632         assert_return(e, -EINVAL);
2633         assert_return(!event_pid_changed(e), -ECHILD);
2634         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2635         assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
2636
2637         if (e->exit_requested)
2638                 return dispatch_exit(e);
2639
2640         p = event_next_pending(e);
2641         if (p) {
2642                 sd_event_ref(e);
2643
2644                 e->state = SD_EVENT_RUNNING;
2645                 r = source_dispatch(p);
2646                 e->state = SD_EVENT_INITIAL;
2647
2648                 sd_event_unref(e);
2649
2650                 return r;
2651         }
2652
2653         e->state = SD_EVENT_INITIAL;
2654
2655         return 1;
2656 }
2657
2658 static void event_log_delays(sd_event *e) {
2659         char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1];
2660         unsigned i;
2661         int o;
2662
2663         for (i = o = 0; i < ELEMENTSOF(e->delays); i++) {
2664                 o += snprintf(&b[o], sizeof(b) - o, "%u ", e->delays[i]);
2665                 e->delays[i] = 0;
2666         }
2667         log_debug("Event loop iterations: %.*s", o, b);
2668 }
2669
2670 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
2671         int r;
2672
2673         assert_return(e, -EINVAL);
2674         assert_return(!event_pid_changed(e), -ECHILD);
2675         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2676         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2677
2678         if (e->profile_delays && e->last_run) {
2679                 usec_t this_run;
2680                 unsigned l;
2681
2682                 this_run = now(CLOCK_MONOTONIC);
2683
2684                 l = u64log2(this_run - e->last_run);
2685                 assert(l < sizeof(e->delays));
2686                 e->delays[l]++;
2687
2688                 if (this_run - e->last_log >= 5*USEC_PER_SEC) {
2689                         event_log_delays(e);
2690                         e->last_log = this_run;
2691                 }
2692         }
2693
2694         r = sd_event_prepare(e);
2695         if (r == 0)
2696                 /* There was nothing? Then wait... */
2697                 r = sd_event_wait(e, timeout);
2698
2699         if (e->profile_delays)
2700                 e->last_run = now(CLOCK_MONOTONIC);
2701
2702         if (r > 0) {
2703                 /* There's something now, then let's dispatch it */
2704                 r = sd_event_dispatch(e);
2705                 if (r < 0)
2706                         return r;
2707
2708                 return 1;
2709         }
2710
2711         return r;
2712 }
2713
2714 #if 0 /// UNNEEDED by elogind
2715 _public_ int sd_event_loop(sd_event *e) {
2716         int r;
2717
2718         assert_return(e, -EINVAL);
2719         assert_return(!event_pid_changed(e), -ECHILD);
2720         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2721
2722         sd_event_ref(e);
2723
2724         while (e->state != SD_EVENT_FINISHED) {
2725                 r = sd_event_run(e, (uint64_t) -1);
2726                 if (r < 0)
2727                         goto finish;
2728         }
2729
2730         r = e->exit_code;
2731
2732 finish:
2733         sd_event_unref(e);
2734         return r;
2735 }
2736
/* Returns the epoll fd driving this loop, suitable for embedding the
 * loop into another poll mechanism. The fd stays owned by the loop. */
_public_ int sd_event_get_fd(sd_event *e) {

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->epoll_fd;
}
2744 #endif // 0
2745
/* Returns the loop's current state (SD_EVENT_INITIAL, ..._ARMED,
 * ..._PENDING, ..._RUNNING, ..._EXITING, ..._FINISHED). */
_public_ int sd_event_get_state(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->state;
}
2752
2753 #if 0 /// UNNEEDED by elogind
/* Retrieves the exit code set via sd_event_exit(); -ENODATA if no exit
 * was requested yet. */
_public_ int sd_event_get_exit_code(sd_event *e, int *code) {
        assert_return(e, -EINVAL);
        assert_return(code, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!e->exit_requested)
                return -ENODATA;

        *code = e->exit_code;
        return 0;
}
2765 #endif // 0
2766
/* Requests termination of the loop with the given exit code; exit
 * sources run and the loop finishes on subsequent iterations. */
_public_ int sd_event_exit(sd_event *e, int code) {
        assert_return(e, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        e->exit_requested = true;
        e->exit_code = code;

        return 0;
}
2777
2778 #if 0 /// UNNEEDED by elogind
2779 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
2780         assert_return(e, -EINVAL);
2781         assert_return(usec, -EINVAL);
2782         assert_return(!event_pid_changed(e), -ECHILD);
2783         assert_return(IN_SET(clock,
2784                              CLOCK_REALTIME,
2785                              CLOCK_REALTIME_ALARM,
2786                              CLOCK_MONOTONIC,
2787                              CLOCK_BOOTTIME,
2788                              CLOCK_BOOTTIME_ALARM), -EOPNOTSUPP);
2789
2790         if (!dual_timestamp_is_set(&e->timestamp)) {
2791                 /* Implicitly fall back to now() if we never ran
2792                  * before and thus have no cached time. */
2793                 *usec = now(clock);
2794                 return 1;
2795         }
2796
2797         switch (clock) {
2798
2799         case CLOCK_REALTIME:
2800         case CLOCK_REALTIME_ALARM:
2801                 *usec = e->timestamp.realtime;
2802                 break;
2803
2804         case CLOCK_MONOTONIC:
2805                 *usec = e->timestamp.monotonic;
2806                 break;
2807
2808         default:
2809                 *usec = e->timestamp_boottime;
2810                 break;
2811         }
2812
2813         return 0;
2814 }
2815 #endif // 0
2816
/* Returns (creating on first use) the calling thread's default event
 * loop. With ret == NULL this only reports whether one already exists
 * (1 or 0). Otherwise returns 1 when a new loop was created, 0 when an
 * existing one was re-referenced; the caller owns a reference. */
_public_ int sd_event_default(sd_event **ret) {

        static thread_local sd_event *default_event = NULL;
        sd_event *e = NULL;
        int r;

        if (!ret)
                return !!default_event;

        if (default_event) {
                *ret = sd_event_ref(default_event);
                return 0;
        }

        r = sd_event_new(&e);
        if (r < 0)
                return r;

        /* Record where the per-thread pointer lives — presumably so the
         * loop can clear it when freed (confirm in the free path) — and
         * the owning thread's tid for sd_event_get_tid(). */
        e->default_event_ptr = &default_event;
        e->tid = gettid();
        default_event = e;

        *ret = e;
        return 1;
}
2842
2843 #if 0 /// UNNEEDED by elogind
2844 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
2845         assert_return(e, -EINVAL);
2846         assert_return(tid, -EINVAL);
2847         assert_return(!event_pid_changed(e), -ECHILD);
2848
2849         if (e->tid != 0) {
2850                 *tid = e->tid;
2851                 return 0;
2852         }
2853
2854         return -ENXIO;
2855 }
2856 #endif // 0
2857
/* Enables (b != 0) or disables watchdog support: when enabled and the
 * service manager requested a watchdog (sd_watchdog_enabled()), the
 * loop periodically sends WATCHDOG=1 notifications driven by a
 * dedicated timerfd. Returns the new watchdog state (0/1) or a
 * negative errno-style code. */
_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Already in the requested state? */
        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                struct epoll_event ev = {};

                /* r == 0 means the manager requested no watchdog; report
                 * that without flipping our state. */
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                /* Tag the epoll registration so sd_event_wait() can tell
                 * watchdog wakeups apart from event sources. */
                ev.events = EPOLLIN;
                ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG);

                r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
                if (r < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                /* Tear down: unregister and close the timerfd if present. */
                if (e->watchdog_fd >= 0) {
                        epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}
2909
2910 #if 0 /// UNNEEDED by elogind
/* Returns whether watchdog support is currently enabled (see
 * sd_event_set_watchdog()). */
_public_ int sd_event_get_watchdog(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->watchdog;
}
2917 #endif // 0