chiark / gitweb /
Prep v230: Apply missing upstream fixes and updates (5/8) src/libelogind.
[elogind.git] / src / libelogind / sd-event / sd-event.c
1 /***
2   This file is part of systemd.
3
4   Copyright 2013 Lennart Poettering
5
6   systemd is free software; you can redistribute it and/or modify it
7   under the terms of the GNU Lesser General Public License as published by
8   the Free Software Foundation; either version 2.1 of the License, or
9   (at your option) any later version.
10
11   systemd is distributed in the hope that it will be useful, but
12   WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public License
17   along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <sys/epoll.h>
21 #include <sys/timerfd.h>
22 #include <sys/wait.h>
23
24 #include "sd-daemon.h"
25 #include "sd-event.h"
26 #include "sd-id128.h"
27
28 #include "alloc-util.h"
29 #include "fd-util.h"
30 #include "hashmap.h"
31 #include "list.h"
32 #include "macro.h"
33 #include "missing.h"
34 #include "prioq.h"
35 #include "process-util.h"
36 #include "set.h"
37 #include "signal-util.h"
38 #include "string-table.h"
39 #include "string-util.h"
40 #include "time-util.h"
41 #include "util.h"
42
43 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
44
/* The kinds of event sources an sd_event loop can manage. The value is
 * stored in sd_event_source.type (a 5-bit bitfield), so keep the count
 * below 32. */
typedef enum EventSourceType {
        SOURCE_IO,                  /* a file descriptor watched via epoll */
        SOURCE_TIME_REALTIME,       /* timer on CLOCK_REALTIME */
        SOURCE_TIME_BOOTTIME,       /* timer on CLOCK_BOOTTIME */
        SOURCE_TIME_MONOTONIC,      /* timer on CLOCK_MONOTONIC */
        SOURCE_TIME_REALTIME_ALARM, /* timer on CLOCK_REALTIME_ALARM */
        SOURCE_TIME_BOOTTIME_ALARM, /* timer on CLOCK_BOOTTIME_ALARM */
        SOURCE_SIGNAL,              /* a UNIX signal, via signalfd */
        SOURCE_CHILD,               /* a child process state change (SIGCHLD driven) */
        SOURCE_DEFER,               /* dispatched on every iteration while enabled */
        SOURCE_POST,                /* dispatched after other sources were processed */
        SOURCE_EXIT,                /* dispatched when the loop is exiting */
        SOURCE_WATCHDOG,            /* internal watchdog keep-alive source */
        _SOURCE_EVENT_SOURCE_TYPE_MAX,
        _SOURCE_EVENT_SOURCE_TYPE_INVALID = -1
} EventSourceType;
61
62 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
63         [SOURCE_IO] = "io",
64         [SOURCE_TIME_REALTIME] = "realtime",
65         [SOURCE_TIME_BOOTTIME] = "bootime",
66         [SOURCE_TIME_MONOTONIC] = "monotonic",
67         [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
68         [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
69         [SOURCE_SIGNAL] = "signal",
70         [SOURCE_CHILD] = "child",
71         [SOURCE_DEFER] = "defer",
72         [SOURCE_POST] = "post",
73         [SOURCE_EXIT] = "exit",
74         [SOURCE_WATCHDOG] = "watchdog",
75 };
76
77 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
78
/* All objects we use in epoll events start with this value, so that
 * we know how to dispatch it */
typedef enum WakeupType {
        WAKEUP_NONE,
        WAKEUP_EVENT_SOURCE, /* epoll data.ptr points at an sd_event_source */
        WAKEUP_CLOCK_DATA,   /* epoll data.ptr points at a struct clock_data */
        WAKEUP_SIGNAL_DATA,  /* epoll data.ptr points at a struct signal_data */
        _WAKEUP_TYPE_MAX,
        _WAKEUP_TYPE_INVALID = -1,
} WakeupType;

/* True for the five timer-based event source types */
#define EVENT_SOURCE_IS_TIME(t) IN_SET((t), SOURCE_TIME_REALTIME, SOURCE_TIME_BOOTTIME, SOURCE_TIME_MONOTONIC, SOURCE_TIME_REALTIME_ALARM, SOURCE_TIME_BOOTTIME_ALARM)
91
/* One event source registered with an sd_event loop. The union at the
 * end holds per-type state; which member is valid is determined by
 * 'type'. */
struct sd_event_source {
        WakeupType wakeup; /* first member: epoll data.ptr is dispatched on this */

        unsigned n_ref;

        sd_event *event;   /* the loop we are attached to; NULL once disconnected */
        void *userdata;
        sd_event_handler_t prepare; /* optional callback invoked before polling */

        char *description; /* free-form name for logging; may be NULL */

        EventSourceType type:5;
        int enabled:3;     /* SD_EVENT_ON / SD_EVENT_OFF / SD_EVENT_ONESHOT */
        bool pending:1;    /* queued in the loop's pending prioq */
        bool dispatching:1;
        bool floating:1;   /* does not pin the loop; the loop releases it in event_free() */

        int64_t priority;  /* lower values are dispatched first */
        unsigned pending_index;     /* prioq bookkeeping indices */
        unsigned prepare_index;
        unsigned pending_iteration; /* loop iteration the source became pending in */
        unsigned prepare_iteration; /* loop iteration the source was last prepared in */

        LIST_FIELDS(sd_event_source, sources);

        union {
                struct {
                        sd_event_io_handler_t callback;
                        int fd;
                        uint32_t events;   /* EPOLL* mask subscribed to */
                        uint32_t revents;  /* events reported by the last poll */
                        bool registered:1; /* currently added to the epoll fd */
                } io;
                struct {
                        sd_event_time_handler_t callback;
                        usec_t next, accuracy; /* trigger time and allowed dispatch slack */
                        unsigned earliest_index; /* indices into the clock's two prioqs */
                        unsigned latest_index;
                } time;
                struct {
                        sd_event_signal_handler_t callback;
                        struct signalfd_siginfo siginfo;
                        int sig;
                } signal;
                struct {
                        sd_event_child_handler_t callback;
                        siginfo_t siginfo;
                        pid_t pid;
                        int options;
                } child;
                struct {
                        sd_event_handler_t callback;
                } defer;
                struct {
                        sd_event_handler_t callback;
                } post;
                struct {
                        sd_event_handler_t callback;
                        unsigned prioq_index;
                } exit;
        };
};
154
/* Per-clock bookkeeping: one of these exists for each of the five
 * supported clocks in struct sd_event. */
struct clock_data {
        WakeupType wakeup; /* always WAKEUP_CLOCK_DATA, for epoll dispatch */
        int fd;            /* timerfd for this clock, or -1 until created */

        /* For all clocks we maintain two priority queues each, one
         * ordered for the earliest times the events may be
         * dispatched, and one ordered by the latest times they must
         * have been dispatched. The range between the top entries in
         * the two prioqs is the time window we can freely schedule
         * wakeups in */

        Prioq *earliest;
        Prioq *latest;
        usec_t next;       /* next planned wakeup; USEC_INFINITY when nothing scheduled */

        bool needs_rearm:1; /* the prioqs changed, the timerfd needs reprogramming */
};
172
struct signal_data {
        WakeupType wakeup; /* always WAKEUP_SIGNAL_DATA, for epoll dispatch */

        /* For each priority we maintain one signal fd, so that we
         * only have to dequeue a single event per priority at a
         * time. */

        int fd;            /* the signalfd for this priority, or -1 */
        int64_t priority;  /* key in sd_event.signal_data hashmap */
        sigset_t sigset;   /* signals currently routed through this fd */
        sd_event_source *current; /* source currently being dispatched from this fd */
};
185
/* The event loop object proper. */
struct sd_event {
        unsigned n_ref;

        int epoll_fd;    /* the epoll instance everything is multiplexed over */
        int watchdog_fd; /* timerfd backing the watchdog logic, or -1 */

        Prioq *pending;  /* sources with an event waiting to be dispatched */
        Prioq *prepare;  /* sources with a prepare callback set */

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;  /* boot-id-derived offset for spreading timer wakeups */

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;
        unsigned n_enabled_child_sources;

        Set *post_sources;

        Prioq *exit;

        pid_t original_pid; /* PID the loop was created in, to detect fork() */

        unsigned iteration; /* monotonically increasing loop iteration counter */
        dual_timestamp timestamp;
        usec_t timestamp_boottime;
        int state;

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1; /* set via $SD_EVENT_PROFILE_DELAYS, see sd_event_new() */

        int exit_code;

        pid_t tid;
        sd_event **default_event_ptr; /* location caching us as the default loop, if any */

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;

        LIST_HEAD(sd_event_source, sources);

        usec_t last_run, last_log;
        unsigned delays[sizeof(usec_t) * 8]; /* log2 histogram of loop iteration delays */
};
242
243 static void source_disconnect(sd_event_source *s);
244
245 static int pending_prioq_compare(const void *a, const void *b) {
246         const sd_event_source *x = a, *y = b;
247
248         assert(x->pending);
249         assert(y->pending);
250
251         /* Enabled ones first */
252         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
253                 return -1;
254         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
255                 return 1;
256
257         /* Lower priority values first */
258         if (x->priority < y->priority)
259                 return -1;
260         if (x->priority > y->priority)
261                 return 1;
262
263         /* Older entries first */
264         if (x->pending_iteration < y->pending_iteration)
265                 return -1;
266         if (x->pending_iteration > y->pending_iteration)
267                 return 1;
268
269         return 0;
270 }
271
272 static int prepare_prioq_compare(const void *a, const void *b) {
273         const sd_event_source *x = a, *y = b;
274
275         assert(x->prepare);
276         assert(y->prepare);
277
278         /* Enabled ones first */
279         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
280                 return -1;
281         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
282                 return 1;
283
284         /* Move most recently prepared ones last, so that we can stop
285          * preparing as soon as we hit one that has already been
286          * prepared in the current iteration */
287         if (x->prepare_iteration < y->prepare_iteration)
288                 return -1;
289         if (x->prepare_iteration > y->prepare_iteration)
290                 return 1;
291
292         /* Lower priority values first */
293         if (x->priority < y->priority)
294                 return -1;
295         if (x->priority > y->priority)
296                 return 1;
297
298         return 0;
299 }
300
301 static int earliest_time_prioq_compare(const void *a, const void *b) {
302         const sd_event_source *x = a, *y = b;
303
304         assert(EVENT_SOURCE_IS_TIME(x->type));
305         assert(x->type == y->type);
306
307         /* Enabled ones first */
308         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
309                 return -1;
310         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
311                 return 1;
312
313         /* Move the pending ones to the end */
314         if (!x->pending && y->pending)
315                 return -1;
316         if (x->pending && !y->pending)
317                 return 1;
318
319         /* Order by time */
320         if (x->time.next < y->time.next)
321                 return -1;
322         if (x->time.next > y->time.next)
323                 return 1;
324
325         return 0;
326 }
327
/* The latest time a timer source may still be dispatched: its trigger
 * time plus its accuracy slack (added via usec_add(), which presumably
 * saturates rather than overflows — see time-util.h). */
static usec_t time_event_source_latest(const sd_event_source *s) {
        return usec_add(s->time.next, s->time.accuracy);
}
331
332 static int latest_time_prioq_compare(const void *a, const void *b) {
333         const sd_event_source *x = a, *y = b;
334
335         assert(EVENT_SOURCE_IS_TIME(x->type));
336         assert(x->type == y->type);
337
338         /* Enabled ones first */
339         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
340                 return -1;
341         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
342                 return 1;
343
344         /* Move the pending ones to the end */
345         if (!x->pending && y->pending)
346                 return -1;
347         if (x->pending && !y->pending)
348                 return 1;
349
350         /* Order by time */
351         if (time_event_source_latest(x) < time_event_source_latest(y))
352                 return -1;
353         if (time_event_source_latest(x) > time_event_source_latest(y))
354                 return 1;
355
356         return 0;
357 }
358
359 static int exit_prioq_compare(const void *a, const void *b) {
360         const sd_event_source *x = a, *y = b;
361
362         assert(x->type == SOURCE_EXIT);
363         assert(y->type == SOURCE_EXIT);
364
365         /* Enabled ones first */
366         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
367                 return -1;
368         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
369                 return 1;
370
371         /* Lower priority values first */
372         if (x->priority < y->priority)
373                 return -1;
374         if (x->priority > y->priority)
375                 return 1;
376
377         return 0;
378 }
379
/* Releases the timerfd and both scheduling prioqs of one clock_data
 * slot. The structure itself is embedded in sd_event and not freed. */
static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}
388
/* Destroys the event loop object. Only floating sources may still be
 * attached at this point (non-floating ones hold a reference on us, so
 * we could not be freed while they exist); they are disconnected and
 * released first. */
static void event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        /* If we are cached as the default event loop, clear the cache */
        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);
        free(e);
}
425
/* Allocates a new event loop object with a single reference and stores
 * it in *ret. Returns 0 on success, a negative errno-style error
 * otherwise. */
_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new0(sd_event, 1);
        if (!e)
                return -ENOMEM;

        e->n_ref = 1;
        /* Mark all fds as "not opened yet", all clocks as "nothing
         * scheduled", and tag each clock_data for epoll dispatch */
        e->watchdog_fd = e->epoll_fd = e->realtime.fd = e->boottime.fd = e->monotonic.fd = e->realtime_alarm.fd = e->boottime_alarm.fd = -1;
        e->realtime.next = e->boottime.next = e->monotonic.next = e->realtime_alarm.next = e->boottime_alarm.next = USEC_INFINITY;
        e->realtime.wakeup = e->boottime.wakeup = e->monotonic.wakeup = e->realtime_alarm.wakeup = e->boottime_alarm.wakeup = WAKEUP_CLOCK_DATA;
        e->original_pid = getpid();
        e->perturb = USEC_INFINITY; /* computed lazily in initialize_perturb() */

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 ... 2^63 us will be logged every 5s.");
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}
465
/* Takes an additional reference on the event loop. NULL is accepted
 * and passed through, so callers may chain this unconditionally. */
_public_ sd_event* sd_event_ref(sd_event *e) {

        if (!e)
                return NULL;

        assert(e->n_ref >= 1);
        e->n_ref++;

        return e;
}
476
477 _public_ sd_event* sd_event_unref(sd_event *e) {
478
479         if (!e)
480                 return NULL;
481
482         assert(e->n_ref >= 1);
483         e->n_ref--;
484
485         if (e->n_ref <= 0)
486                 event_free(e);
487
488         return NULL;
489 }
490
/* Returns true if the calling process differs from the one the loop
 * was created in. */
static bool event_pid_changed(sd_event *e) {
        assert(e);

        /* We don't support people creating an event loop and keeping
         * it around over a fork(). Let's complain. */

        return e->original_pid != getpid();
}
499
/* Removes an IO source's fd from the epoll instance, if it was ever
 * added. Best-effort: a failing epoll_ctl() is only logged at debug
 * level. */
static void source_io_unregister(sd_event_source *s) {
        int r;

        assert(s);
        assert(s->type == SOURCE_IO);

        /* After a fork the epoll fd is not ours to modify */
        if (event_pid_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
        if (r < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}
519
/* Adds or updates an IO source's fd registration in the epoll
 * instance, subscribing to 'events'. SD_EVENT_ONESHOT sources get
 * EPOLLONESHOT so the kernel disarms them after one wakeup. Returns 0
 * on success, a negative errno-style error otherwise. */
static int source_io_register(
                sd_event_source *s,
                int enabled,
                uint32_t events) {

        struct epoll_event ev = {};
        int r;

        assert(s);
        assert(s->type == SOURCE_IO);
        assert(enabled != SD_EVENT_OFF);

        ev.events = events;
        ev.data.ptr = s; /* dispatched via s->wakeup == WAKEUP_EVENT_SOURCE */

        if (enabled == SD_EVENT_ONESHOT)
                ev.events |= EPOLLONESHOT;

        /* MOD if we already registered this fd earlier, ADD otherwise */
        if (s->io.registered)
                r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
        else
                r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
        if (r < 0)
                return -errno;

        s->io.registered = true;

        return 0;
}
549
#if 0 /// UNNEEDED by elogind
/* Inverse of clock_to_event_source_type(): maps a timer event source
 * type back to the clockid_t it uses, or (clockid_t) -1 for non-timer
 * types. */
static clockid_t event_source_type_to_clock(EventSourceType t) {

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return CLOCK_REALTIME;

        case SOURCE_TIME_BOOTTIME:
                return CLOCK_BOOTTIME;

        case SOURCE_TIME_MONOTONIC:
                return CLOCK_MONOTONIC;

        case SOURCE_TIME_REALTIME_ALARM:
                return CLOCK_REALTIME_ALARM;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return CLOCK_BOOTTIME_ALARM;

        default:
                return (clockid_t) -1;
        }
}
#endif // 0
575
576 static EventSourceType clock_to_event_source_type(clockid_t clock) {
577
578         switch (clock) {
579
580         case CLOCK_REALTIME:
581                 return SOURCE_TIME_REALTIME;
582
583         case CLOCK_BOOTTIME:
584                 return SOURCE_TIME_BOOTTIME;
585
586         case CLOCK_MONOTONIC:
587                 return SOURCE_TIME_MONOTONIC;
588
589         case CLOCK_REALTIME_ALARM:
590                 return SOURCE_TIME_REALTIME_ALARM;
591
592         case CLOCK_BOOTTIME_ALARM:
593                 return SOURCE_TIME_BOOTTIME_ALARM;
594
595         default:
596                 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
597         }
598 }
599
600 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
601         assert(e);
602
603         switch (t) {
604
605         case SOURCE_TIME_REALTIME:
606                 return &e->realtime;
607
608         case SOURCE_TIME_BOOTTIME:
609                 return &e->boottime;
610
611         case SOURCE_TIME_MONOTONIC:
612                 return &e->monotonic;
613
614         case SOURCE_TIME_REALTIME_ALARM:
615                 return &e->realtime_alarm;
616
617         case SOURCE_TIME_BOOTTIME_ALARM:
618                 return &e->boottime_alarm;
619
620         default:
621                 return NULL;
622         }
623 }
624
/* Ensures a signal_data object exists for the priority used by the
 * signal's event source (or priority 0 if there is none), with 'sig'
 * enabled in its signalfd mask and the signalfd registered in epoll.
 * On success *ret (if non-NULL) points to the object. Returns 0 on
 * success, a negative errno-style error otherwise. */
static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct epoll_event ev = {};
        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_pid_changed(e))
                return -ECHILD;

        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = 0;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                if (sigismember(&d->sigset, sig) > 0) {
                        /* Already fully set up for this signal */
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                r = hashmap_ensure_allocated(&e->signal_data, &uint64_hash_ops);
                if (r < 0)
                        return r;

                d = new0(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                d->wakeup = WAKEUP_SIGNAL_DATA;
                d->fd  = -1;
                d->priority = priority;

                r = hashmap_put(e->signal_data, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        /* Extend the mask on a copy first, so d->sigset stays accurate
         * if signalfd() fails below */
        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        /* If the fd already existed, signalfd() updated its mask in
         * place and it is already registered in epoll — we are done */
        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        d->fd = r;

        ev.events = EPOLLIN;
        ev.data.ptr = d;

        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev);
        if (r < 0)  {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        /* Only roll back the structure if we created it in this call */
        if (added) {
                d->fd = safe_close(d->fd);
                hashmap_remove(e->signal_data, &d->priority);
                free(d);
        }

        return r;
}
718
static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty that
         * way removes it. */

        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {

                /* If all the mask is all-zero we can get rid of the structure */
                hashmap_remove(e->signal_data, &d->priority);
                assert(!d->current);
                safe_close(d->fd);
                free(d);
                return;
        }

        assert(d->fd >= 0);

        /* Otherwise just narrow the existing signalfd's mask in place.
         * Failure is non-fatal: we would merely wake up spuriously. */
        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}
747
/* Garbage-collects signalfd state for 'sig' once no event source needs
 * it anymore. */
static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are
         * interested in. If not, we'll unmask it, and possibly drop
         * the signalfd for it. */

        /* SIGCHLD stays masked as long as any child source is enabled */
        if (sig == SIGCHLD &&
            e->n_enabled_child_sources > 0)
                return;

        /* Likewise if an enabled signal source still wants it */
        if (e->signal_sources &&
            e->signal_sources[sig] &&
            e->signal_sources[sig]->enabled != SD_EVENT_OFF)
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}
793
/* Detaches a source from its event loop: deregisters its per-type
 * kernel/bookkeeping state, removes it from the pending/prepare prioqs
 * and the source list, and drops the loop reference held by
 * non-floating sources. Safe to call on an already-disconnected
 * source. */
static void source_disconnect(sd_event_source *s) {
        sd_event *event;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM: {
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_remove(d->earliest, s, &s->time.earliest_index);
                prioq_remove(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
                break;
        }

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        /* Possibly unmask the signal / drop its signalfd */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                }

                break;

        case SOURCE_CHILD:
                if (s->child.pid > 0) {
                        if (s->enabled != SD_EVENT_OFF) {
                                assert(s->event->n_enabled_child_sources > 0);
                                s->event->n_enabled_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                        /* SIGCHLD may no longer be needed either */
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                }

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        default:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        event = s->event;

        s->type = _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        s->event = NULL;
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Non-floating sources pin the loop (see source_new()) */
        if (!s->floating)
                sd_event_unref(event);
}
884
/* Disconnects the source from its loop and frees it. Called once the
 * last reference is gone. */
static void source_free(sd_event_source *s) {
        assert(s);

        source_disconnect(s);
        free(s->description);
        free(s);
}
892
/* Marks a source as pending (an event is waiting to be dispatched) or
 * not, keeping the loop's pending prioq and the per-clock prioqs in
 * sync. Returns 0 on success (or no-op), a negative error otherwise. */
static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT); /* exit sources use their own prioq */

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type)) {
                /* Pending-ness participates in the time prioq ordering,
                 * so the entries must be reshuffled */
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                prioq_reshuffle(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
        }

        if (s->type == SOURCE_SIGNAL && !b) {
                /* A signal source that stops being pending also stops
                 * being the "current" one of its signalfd */
                struct signal_data *d;

                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        return 0;
}
936
/* Allocates a new event source of the given type and links it into the
 * loop. A "floating" source does not take a reference on the loop;
 * instead the loop releases it in event_free(). Returns NULL on OOM. */
static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
        sd_event_source *s;

        assert(e);

        s = new0(sd_event_source, 1);
        if (!s)
                return NULL;

        s->n_ref = 1;
        s->event = e;
        s->floating = floating;
        s->type = type;
        s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;

        if (!floating)
                sd_event_ref(e);

        LIST_PREPEND(sources, e->sources, s);
        e->n_sources++;

        return s;
}
960
/* Adds an IO event source watching 'fd' for the given EPOLL* event
 * mask and enables it immediately. If 'ret' is NULL the source is
 * created floating (owned by the loop). Returns 0 on success, a
 * negative errno-style error otherwise. */
_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0) {
                source_free(s);
                return r;
        }

        if (ret)
                *ret = s;

        return 0;
}
1001
/* Lazily computes the loop's wakeup perturbation offset from the boot
 * ID; no-op once set. */
static void initialize_perturb(sd_event *e) {
        sd_id128_t bootid = {};

        /* When we sleep for longer, we try to realign the wakeup to
           the same time within each minute/second/250ms, so that
           events all across the system can be coalesced into a single
           CPU wakeup. However, let's take some system-specific
           randomness for this value, so that in a network of systems
           with synced clocks timer events are distributed a
           bit. Here, we calculate a perturbation usec offset from the
           boot ID. */

        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        /* On failure e->perturb stays USEC_INFINITY and we will retry
         * on the next call */
        if (sd_id128_get_boot(&bootid) >= 0)
                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
}
1020
/* Lazily creates the timerfd backing the per-clock bookkeeping 'd' and adds
 * it to the loop's epoll instance. A no-op if the fd already exists.
 * Returns 0 on success, -errno on failure. */
static int event_setup_timer_fd(
                sd_event *e,
                struct clock_data *d,
                clockid_t clock) {

        struct epoll_event ev = {};
        int r, fd;

        assert(e);
        assert(d);

        if (_likely_(d->fd >= 0))
                return 0;

        fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
        if (fd < 0)
                return -errno;

        ev.events = EPOLLIN;
        /* The epoll payload points at the clock_data, not at an event source;
         * presumably the dispatcher tells the two apart via a wakeup tag (cf.
         * WAKEUP_EVENT_SOURCE in sd_event_add_io()) — confirm in dispatch code. */
        ev.data.ptr = d;

        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
        if (r < 0) {
                safe_close(fd);
                return -errno;
        }

        d->fd = fd;
        return 0;
}
1051
1052 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1053         assert(s);
1054
1055         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1056 }
1057
/* Adds a one-shot timer source on 'clock', firing at the absolute time
 * 'usec' with the given 'accuracy' (0 selects DEFAULT_ACCURACY_USEC;
 * (uint64_t) -1 is rejected). A NULL callback installs time_exit_callback,
 * which exits the loop with PTR_TO_INT(userdata). Returns 0 on success. */
_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        sd_event_source *s;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(accuracy != (uint64_t) -1, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* The boottime clocks are optional kernel features; refuse them when
         * unsupported. */
        if (IN_SET(clock, CLOCK_BOOTTIME, CLOCK_BOOTTIME_ALARM) &&
            !clock_boottime_supported())
                return -EOPNOTSUPP;

        if (!callback)
                callback = time_exit_callback;

        type = clock_to_event_source_type(clock);
        assert_return(type >= 0, -EOPNOTSUPP);

        d = event_get_clock_data(e, type);
        assert(d);

        /* Each clock keeps two priority queues: one ordered by earliest
         * trigger time, one by the latest permissible wakeup. */
        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        /* Create the backing timerfd on first use of this clock. */
        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        d->needs_rearm = true;

        r = prioq_put(d->earliest, s, &s->time.earliest_index);
        if (r < 0)
                goto fail;

        r = prioq_put(d->latest, s, &s->time.latest_index);
        if (r < 0)
                goto fail;

        if (ret)
                *ret = s;

        return 0;

fail:
        /* source_free() also removes any partial prioq membership. */
        source_free(s);
        return r;
}
1134
1135 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1136         assert(s);
1137
1138         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1139 }
1140
/* Adds a signal event source for 'sig'. The signal must already be blocked
 * in the calling thread (verified below via pthread_sigmask()), since
 * delivery happens through a signalfd. At most one source per signal and
 * loop is allowed (-EBUSY otherwise). A NULL callback installs
 * signal_exit_callback, which exits the loop with PTR_TO_INT(userdata). */
_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        struct signal_data *d;
        sigset_t ss;
        int r;

        assert_return(e, -EINVAL);
        assert_return(SIGNAL_VALID(sig), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = signal_exit_callback;

        /* Query (not modify) the current thread's signal mask. */
        r = pthread_sigmask(SIG_SETMASK, NULL, &ss);
        if (r != 0)
                return -r;

        /* Refuse signals the caller hasn't blocked. */
        if (!sigismember(&ss, sig))
                return -EBUSY;

        /* One slot per signal number, lazily allocated. */
        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                /* NOTE(review): s was already stored in e->signal_sources[sig];
                 * source_free() is expected to clear that slot again via its
                 * disconnect path — not visible in this chunk, confirm. */
                source_free(s);
                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;

        return 0;
}
1200
1201 #if 0 /// UNNEEDED by elogind
/* Adds a child-process watch for 'pid'. 'options' is a non-empty mask of
 * WEXITED/WSTOPPED/WCONTINUED. At most one source per PID (-EBUSY
 * otherwise). Child state changes are delivered via the SIGCHLD signalfd
 * machinery (event_make_signal_data() below). The source is one-shot. */
_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0) {
                source_free(s);
                return r;
        }

        /* Bump the counter before wiring up SIGCHLD, and roll it back on
         * failure — the two must stay consistent. */
        e->n_enabled_child_sources++;

        r = event_make_signal_data(e, SIGCHLD, NULL);
        if (r < 0) {
                e->n_enabled_child_sources--;
                source_free(s);
                return r;
        }

        e->need_process_child = true;

        if (ret)
                *ret = s;

        return 0;
}
1260
1261 _public_ int sd_event_add_defer(
1262                 sd_event *e,
1263                 sd_event_source **ret,
1264                 sd_event_handler_t callback,
1265                 void *userdata) {
1266
1267         sd_event_source *s;
1268         int r;
1269
1270         assert_return(e, -EINVAL);
1271         assert_return(callback, -EINVAL);
1272         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1273         assert_return(!event_pid_changed(e), -ECHILD);
1274
1275         s = source_new(e, !ret, SOURCE_DEFER);
1276         if (!s)
1277                 return -ENOMEM;
1278
1279         s->defer.callback = callback;
1280         s->userdata = userdata;
1281         s->enabled = SD_EVENT_ONESHOT;
1282
1283         r = source_set_pending(s, true);
1284         if (r < 0) {
1285                 source_free(s);
1286                 return r;
1287         }
1288
1289         if (ret)
1290                 *ret = s;
1291
1292         return 0;
1293 }
1294 #endif // 0
1295
1296 _public_ int sd_event_add_post(
1297                 sd_event *e,
1298                 sd_event_source **ret,
1299                 sd_event_handler_t callback,
1300                 void *userdata) {
1301
1302         sd_event_source *s;
1303         int r;
1304
1305         assert_return(e, -EINVAL);
1306         assert_return(callback, -EINVAL);
1307         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1308         assert_return(!event_pid_changed(e), -ECHILD);
1309
1310         r = set_ensure_allocated(&e->post_sources, NULL);
1311         if (r < 0)
1312                 return r;
1313
1314         s = source_new(e, !ret, SOURCE_POST);
1315         if (!s)
1316                 return -ENOMEM;
1317
1318         s->post.callback = callback;
1319         s->userdata = userdata;
1320         s->enabled = SD_EVENT_ON;
1321
1322         r = set_put(e->post_sources, s);
1323         if (r < 0) {
1324                 source_free(s);
1325                 return r;
1326         }
1327
1328         if (ret)
1329                 *ret = s;
1330
1331         return 0;
1332 }
1333
/* Adds a one-shot exit source, dispatched (in priority order, via the exit
 * priority queue) once the loop begins shutting down. Returns 0 on success. */
_public_ int sd_event_add_exit(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
        if (r < 0)
                return r;

        s = source_new(e, !ret, SOURCE_EXIT);
        if (!s)
                return -ENOMEM;

        s->exit.callback = callback;
        s->userdata = userdata;
        s->exit.prioq_index = PRIOQ_IDX_NULL;
        s->enabled = SD_EVENT_ONESHOT;

        r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
        if (r < 0) {
                source_free(s);
                return r;
        }

        if (ret)
                *ret = s;

        return 0;
}
1372
1373 #if 0 /// UNNEEDED by elogind
1374 _public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
1375
1376         if (!s)
1377                 return NULL;
1378
1379         assert(s->n_ref >= 1);
1380         s->n_ref++;
1381
1382         return s;
1383 }
1384 #endif // 0
1385
/* Drops one reference; NULL is a no-op. Always returns NULL so callers can
 * write 's = sd_event_source_unref(s);'. */
_public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {

        if (!s)
                return NULL;

        assert(s->n_ref >= 1);
        s->n_ref--;

        if (s->n_ref <= 0) {
                /* Here's a special hack: when we are called from a
                 * dispatch handler we won't free the event source
                 * immediately, but we will detach the fd from the
                 * epoll. This way it is safe for the caller to unref
                 * the event source and immediately close the fd, but
                 * we still retain a valid event source object after
                 * the callback. */

                if (s->dispatching) {
                        /* Only I/O sources hold an epoll registration that
                         * must be dropped before the caller closes the fd. */
                        if (s->type == SOURCE_IO)
                                source_io_unregister(s);

                        /* Detach from the loop now; the final free happens
                         * after dispatch finishes (handled elsewhere). */
                        source_disconnect(s);
                } else
                        source_free(s);
        }

        return NULL;
}
1414
1415 _public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
1416         assert_return(s, -EINVAL);
1417         assert_return(!event_pid_changed(s->event), -ECHILD);
1418
1419         return free_and_strdup(&s->description, description);
1420 }
1421
1422 #if 0 /// UNNEEDED by elogind
1423 _public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
1424         assert_return(s, -EINVAL);
1425         assert_return(description, -EINVAL);
1426         assert_return(s->description, -ENXIO);
1427         assert_return(!event_pid_changed(s->event), -ECHILD);
1428
1429         *description = s->description;
1430         return 0;
1431 }
1432 #endif // 0
1433
1434 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
1435         assert_return(s, NULL);
1436
1437         return s->event;
1438 }
1439
1440 #if 0 /// UNNEEDED by elogind
1441 _public_ int sd_event_source_get_pending(sd_event_source *s) {
1442         assert_return(s, -EINVAL);
1443         assert_return(s->type != SOURCE_EXIT, -EDOM);
1444         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1445         assert_return(!event_pid_changed(s->event), -ECHILD);
1446
1447         return s->pending;
1448 }
1449
1450 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
1451         assert_return(s, -EINVAL);
1452         assert_return(s->type == SOURCE_IO, -EDOM);
1453         assert_return(!event_pid_changed(s->event), -ECHILD);
1454
1455         return s->io.fd;
1456 }
1457 #endif // 0
1458
/* Replaces the fd an I/O source watches. If the source is enabled, the new
 * fd is registered with epoll before the old one is dropped, so a failed
 * registration leaves the old state intact. */
_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(fd >= 0, -EBADF);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->io.fd == fd)
                return 0;

        if (s->enabled == SD_EVENT_OFF) {
                /* Disabled sources aren't registered; just swap the fd. */
                s->io.fd = fd;
                s->io.registered = false;
        } else {
                int saved_fd;

                saved_fd = s->io.fd;
                assert(s->io.registered);

                s->io.fd = fd;
                s->io.registered = false;

                r = source_io_register(s, s->enabled, s->io.events);
                if (r < 0) {
                        /* Roll back to the previous, still-registered fd. */
                        s->io.fd = saved_fd;
                        s->io.registered = true;
                        return r;
                }

                /* Best effort: the old fd is being dropped anyway, so the
                 * return value is deliberately ignored. */
                epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
        }

        return 0;
}
1494
1495 #if 0 /// UNNEEDED by elogind
1496 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
1497         assert_return(s, -EINVAL);
1498         assert_return(events, -EINVAL);
1499         assert_return(s->type == SOURCE_IO, -EDOM);
1500         assert_return(!event_pid_changed(s->event), -ECHILD);
1501
1502         *events = s->io.events;
1503         return 0;
1504 }
1505 #endif // 0
1506
/* Changes the epoll event mask of an I/O source; re-registers with epoll if
 * the source is currently enabled. */
_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* edge-triggered updates are never skipped, so we can reset edges */
        if (s->io.events == events && !(events & EPOLLET))
                return 0;

        if (s->enabled != SD_EVENT_OFF) {
                r = source_io_register(s, s->enabled, events);
                if (r < 0)
                        return r;
        }

        s->io.events = events;
        source_set_pending(s, false); /* NOTE(review): return value ignored — confirm this cannot fail here */

        return 0;
}
1531
1532 #if 0 /// UNNEEDED by elogind
1533 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
1534         assert_return(s, -EINVAL);
1535         assert_return(revents, -EINVAL);
1536         assert_return(s->type == SOURCE_IO, -EDOM);
1537         assert_return(s->pending, -ENODATA);
1538         assert_return(!event_pid_changed(s->event), -ECHILD);
1539
1540         *revents = s->io.revents;
1541         return 0;
1542 }
1543
1544 _public_ int sd_event_source_get_signal(sd_event_source *s) {
1545         assert_return(s, -EINVAL);
1546         assert_return(s->type == SOURCE_SIGNAL, -EDOM);
1547         assert_return(!event_pid_changed(s->event), -ECHILD);
1548
1549         return s->signal.sig;
1550 }
1551
1552 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
1553         assert_return(s, -EINVAL);
1554         assert_return(!event_pid_changed(s->event), -ECHILD);
1555
1556         return s->priority;
1557 }
1558 #endif // 0
1559
/* Changes the dispatch priority of a source. Signal sources need special
 * handling because each priority has its own signalfd; afterwards any
 * priority queues containing the source are reshuffled to reflect the new
 * ordering. */
_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->priority == priority)
                return 0;

        if (s->type == SOURCE_SIGNAL && s->enabled != SD_EVENT_OFF) {
                struct signal_data *old, *d;

                /* Move us from the signalfd belonging to the old
                 * priority to the signalfd of the new priority */

                assert_se(old = hashmap_get(s->event->signal_data, &s->priority));

                s->priority = priority;

                r = event_make_signal_data(s->event, s->signal.sig, &d);
                if (r < 0) {
                        /* Roll back: keep the old priority/signalfd pairing. */
                        s->priority = old->priority;
                        return r;
                }

                /* Drop the signal from the old priority's signalfd mask. */
                event_unmask_signal_data(s->event, old, s->signal.sig);
        } else
                s->priority = priority;

        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);

        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        return 0;
}
1601
1602 #if 0 /// UNNEEDED by elogind
1603 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
1604         assert_return(s, -EINVAL);
1605         assert_return(m, -EINVAL);
1606         assert_return(!event_pid_changed(s->event), -ECHILD);
1607
1608         *m = s->enabled;
1609         return 0;
1610 }
1611 #endif // 0
1612
/* Enables or disables a source. 'm' is SD_EVENT_OFF, SD_EVENT_ON or
 * SD_EVENT_ONESHOT. Beyond flipping s->enabled, each source type needs
 * extra bookkeeping: (de)registering with epoll, reshuffling the per-clock
 * timer queues, rebuilding signalfd masks, or adjusting the enabled
 * child-source counter. */
_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(m == SD_EVENT_OFF || m == SD_EVENT_ON || m == SD_EVENT_ONESHOT, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* If we are dead anyway, we are fine with turning off
         * sources, but everything else needs to fail. */
        if (s->event->state == SD_EVENT_FINISHED)
                return m == SD_EVENT_OFF ? 0 : -ESTALE;

        if (s->enabled == m)
                return 0;

        if (m == SD_EVENT_OFF) {

                switch (s->type) {

                case SOURCE_IO:
                        /* Drop the epoll registration. */
                        source_io_unregister(s);
                        s->enabled = m;
                        break;

                case SOURCE_TIME_REALTIME:
                case SOURCE_TIME_BOOTTIME:
                case SOURCE_TIME_MONOTONIC:
                case SOURCE_TIME_REALTIME_ALARM:
                case SOURCE_TIME_BOOTTIME_ALARM: {
                        struct clock_data *d;

                        /* Disabled timers sort out of the way; reshuffle and
                         * ask for the timerfd to be rearmed. */
                        s->enabled = m;
                        d = event_get_clock_data(s->event, s->type);
                        assert(d);

                        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(d->latest, s, &s->time.latest_index);
                        d->needs_rearm = true;
                        break;
                }

                case SOURCE_SIGNAL:
                        s->enabled = m;

                        /* Possibly drops the signal from its signalfd mask. */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                        break;

                case SOURCE_CHILD:
                        s->enabled = m;

                        assert(s->event->n_enabled_child_sources > 0);
                        s->event->n_enabled_child_sources--;

                        /* Possibly releases the SIGCHLD signalfd wiring. */
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                        break;

                case SOURCE_EXIT:
                        s->enabled = m;
                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                        break;

                case SOURCE_DEFER:
                case SOURCE_POST:
                        /* No external registration to undo. */
                        s->enabled = m;
                        break;

                default:
                        assert_not_reached("Wut? I shouldn't exist.");
                }

        } else {
                switch (s->type) {

                case SOURCE_IO:
                        /* Register with epoll first so a failure leaves the
                         * source untouched. */
                        r = source_io_register(s, m, s->io.events);
                        if (r < 0)
                                return r;

                        s->enabled = m;
                        break;

                case SOURCE_TIME_REALTIME:
                case SOURCE_TIME_BOOTTIME:
                case SOURCE_TIME_MONOTONIC:
                case SOURCE_TIME_REALTIME_ALARM:
                case SOURCE_TIME_BOOTTIME_ALARM: {
                        struct clock_data *d;

                        s->enabled = m;
                        d = event_get_clock_data(s->event, s->type);
                        assert(d);

                        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(d->latest, s, &s->time.latest_index);
                        d->needs_rearm = true;
                        break;
                }

                case SOURCE_SIGNAL:

                        s->enabled = m;

                        /* Add the signal to the signalfd of our priority;
                         * undo the enable on failure. */
                        r = event_make_signal_data(s->event, s->signal.sig, NULL);
                        if (r < 0) {
                                s->enabled = SD_EVENT_OFF;
                                event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                                return r;
                        }

                        break;

                case SOURCE_CHILD:

                        /* Only count the OFF -> enabled transition; switching
                         * between ON and ONESHOT must not bump the counter. */
                        if (s->enabled == SD_EVENT_OFF)
                                s->event->n_enabled_child_sources++;

                        s->enabled = m;

                        r = event_make_signal_data(s->event, SIGCHLD, NULL);
                        if (r < 0) {
                                s->enabled = SD_EVENT_OFF;
                                s->event->n_enabled_child_sources--;
                                event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                                return r;
                        }

                        break;

                case SOURCE_EXIT:
                        s->enabled = m;
                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                        break;

                case SOURCE_DEFER:
                case SOURCE_POST:
                        s->enabled = m;
                        break;

                default:
                        assert_not_reached("Wut? I shouldn't exist.");
                }
        }

        /* The enable state participates in the ordering of these queues. */
        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);

        return 0;
}
1764
1765 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
1766         assert_return(s, -EINVAL);
1767         assert_return(usec, -EINVAL);
1768         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
1769         assert_return(!event_pid_changed(s->event), -ECHILD);
1770
1771         *usec = s->time.next;
1772         return 0;
1773 }
1774
/* Changes a timer source's trigger time, clears any pending state, and
 * reshuffles both per-clock queues so the timerfd gets rearmed. */
_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
        struct clock_data *d;

        assert_return(s, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        s->time.next = usec;

        source_set_pending(s, false);

        d = event_get_clock_data(s->event, s->type);
        assert(d);

        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
        prioq_reshuffle(d->latest, s, &s->time.latest_index);
        d->needs_rearm = true;

        return 0;
}
1796
1797 #if 0 /// UNNEEDED by elogind
1798 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
1799         assert_return(s, -EINVAL);
1800         assert_return(usec, -EINVAL);
1801         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
1802         assert_return(!event_pid_changed(s->event), -ECHILD);
1803
1804         *usec = s->time.accuracy;
1805         return 0;
1806 }
1807
/* Changes a timer source's accuracy (0 selects DEFAULT_ACCURACY_USEC).
 * Only the 'latest' queue is reshuffled — accuracy affects the latest
 * permissible wakeup, presumably next + accuracy per the comparator's name,
 * not the earliest trigger time. */
_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
        struct clock_data *d;

        assert_return(s, -EINVAL);
        assert_return(usec != (uint64_t) -1, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (usec == 0)
                usec = DEFAULT_ACCURACY_USEC;

        s->time.accuracy = usec;

        source_set_pending(s, false);

        d = event_get_clock_data(s->event, s->type);
        assert(d);

        prioq_reshuffle(d->latest, s, &s->time.latest_index);
        d->needs_rearm = true;

        return 0;
}
1832
1833 _public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
1834         assert_return(s, -EINVAL);
1835         assert_return(clock, -EINVAL);
1836         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
1837         assert_return(!event_pid_changed(s->event), -ECHILD);
1838
1839         *clock = event_source_type_to_clock(s->type);
1840         return 0;
1841 }
1842
1843 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
1844         assert_return(s, -EINVAL);
1845         assert_return(pid, -EINVAL);
1846         assert_return(s->type == SOURCE_CHILD, -EDOM);
1847         assert_return(!event_pid_changed(s->event), -ECHILD);
1848
1849         *pid = s->child.pid;
1850         return 0;
1851 }
1852 #endif // 0
1853
/* Installs (or, with NULL, removes) a prepare callback for the source.
 * Replacing one non-NULL callback with another just swaps the pointer;
 * transitions between NULL and non-NULL also add/remove the source in the
 * loop's prepare priority queue. */
_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->prepare == callback)
                return 0;

        /* non-NULL -> non-NULL: no queue membership change needed. */
        if (callback && s->prepare) {
                s->prepare = callback;
                return 0;
        }

        r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
        if (r < 0)
                return r;

        s->prepare = callback;

        if (callback) {
                r = prioq_put(s->event->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;
        } else
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        return 0;
}
1885
1886 #if 0 /// UNNEEDED by elogind
1887 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
1888         assert_return(s, NULL);
1889
1890         return s->userdata;
1891 }
1892
1893 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
1894         void *ret;
1895
1896         assert_return(s, NULL);
1897
1898         ret = s->userdata;
1899         s->userdata = userdata;
1900
1901         return ret;
1902 }
1903 #endif // 0
1904
/* Picks a wakeup time in [a, b]. Note usec_t is unsigned, so 'a <= 0' below
 * is effectively 'a == 0'. */
static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
        usec_t c;
        assert(e);
        assert(a <= b);

        if (a <= 0)
                return 0;
        if (a >= USEC_INFINITY)
                return USEC_INFINITY;

        /* Window of one usec or less: no room to coalesce anything. */
        if (b <= a + 1)
                return a;

        initialize_perturb(e);

        /*
          Find a good time to wake up again between times a and b. We
          have two goals here:

          a) We want to wake up as seldom as possible, hence prefer
             later times over earlier times.

          b) But if we have to wake up, then let's make sure to
             dispatch as much as possible on the entire system.

          We implement this by waking up everywhere at the same time
          within any given minute if we can, synchronised via the
          perturbation value determined from the boot ID. If we can't,
          then we try to find the same spot in every 10s, then 1s and
          then 250ms step. Otherwise, we pick the last possible time
          to wake up.
        */

        /* Candidate: the perturbed minute boundary at or below b; step one
         * minute back if it overshoots b (guarding underflow). */
        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MINUTE))
                        return b;

                c -= USEC_PER_MINUTE;
        }

        if (c >= a)
                return c;

        /* Same scheme at 10s granularity. */
        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC*10))
                        return b;

                c -= USEC_PER_SEC*10;
        }

        if (c >= a)
                return c;

        /* Same scheme at 1s granularity. */
        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC))
                        return b;

                c -= USEC_PER_SEC;
        }

        if (c >= a)
                return c;

        /* Same scheme at 250ms granularity. */
        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MSEC*250))
                        return b;

                c -= USEC_PER_MSEC*250;
        }

        if (c >= a)
                return c;

        /* No aligned spot fits the window; wake as late as allowed. */
        return b;
}
1984
1985 static int event_arm_timer(
1986                 sd_event *e,
1987                 struct clock_data *d) {
1988
1989         struct itimerspec its = {};
1990         sd_event_source *a, *b;
1991         usec_t t;
1992         int r;
1993
1994         assert(e);
1995         assert(d);
1996
1997         if (!d->needs_rearm)
1998                 return 0;
1999         else
2000                 d->needs_rearm = false;
2001
2002         a = prioq_peek(d->earliest);
2003         if (!a || a->enabled == SD_EVENT_OFF || a->time.next == USEC_INFINITY) {
2004
2005                 if (d->fd < 0)
2006                         return 0;
2007
2008                 if (d->next == USEC_INFINITY)
2009                         return 0;
2010
2011                 /* disarm */
2012                 r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2013                 if (r < 0)
2014                         return r;
2015
2016                 d->next = USEC_INFINITY;
2017                 return 0;
2018         }
2019
2020         b = prioq_peek(d->latest);
2021         assert_se(b && b->enabled != SD_EVENT_OFF);
2022
2023         t = sleep_between(e, a->time.next, time_event_source_latest(b));
2024         if (d->next == t)
2025                 return 0;
2026
2027         assert_se(d->fd >= 0);
2028
2029         if (t == 0) {
2030                 /* We don' want to disarm here, just mean some time looooong ago. */
2031                 its.it_value.tv_sec = 0;
2032                 its.it_value.tv_nsec = 1;
2033         } else
2034                 timespec_store(&its.it_value, t);
2035
2036         r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2037         if (r < 0)
2038                 return -errno;
2039
2040         d->next = t;
2041         return 0;
2042 }
2043
2044 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
2045         assert(e);
2046         assert(s);
2047         assert(s->type == SOURCE_IO);
2048
2049         /* If the event source was already pending, we just OR in the
2050          * new revents, otherwise we reset the value. The ORing is
2051          * necessary to handle EPOLLONESHOT events properly where
2052          * readability might happen independently of writability, and
2053          * we need to keep track of both */
2054
2055         if (s->pending)
2056                 s->io.revents |= revents;
2057         else
2058                 s->io.revents = revents;
2059
2060         return source_set_pending(s, true);
2061 }
2062
2063 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
2064         uint64_t x;
2065         ssize_t ss;
2066
2067         assert(e);
2068         assert(fd >= 0);
2069
2070         assert_return(events == EPOLLIN, -EIO);
2071
2072         ss = read(fd, &x, sizeof(x));
2073         if (ss < 0) {
2074                 if (errno == EAGAIN || errno == EINTR)
2075                         return 0;
2076
2077                 return -errno;
2078         }
2079
2080         if (_unlikely_(ss != sizeof(x)))
2081                 return -EIO;
2082
2083         if (next)
2084                 *next = USEC_INFINITY;
2085
2086         return 0;
2087 }
2088
2089 static int process_timer(
2090                 sd_event *e,
2091                 usec_t n,
2092                 struct clock_data *d) {
2093
2094         sd_event_source *s;
2095         int r;
2096
2097         assert(e);
2098         assert(d);
2099
2100         for (;;) {
2101                 s = prioq_peek(d->earliest);
2102                 if (!s ||
2103                     s->time.next > n ||
2104                     s->enabled == SD_EVENT_OFF ||
2105                     s->pending)
2106                         break;
2107
2108                 r = source_set_pending(s, true);
2109                 if (r < 0)
2110                         return r;
2111
2112                 prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
2113                 prioq_reshuffle(d->latest, s, &s->time.latest_index);
2114                 d->needs_rearm = true;
2115         }
2116
2117         return 0;
2118 }
2119
/* Poll each watched child individually via waitid(P_PID, WNOHANG) and mark
 * the matching event sources pending. Dead children are deliberately not
 * reaped here (WNOWAIT) so the callback still sees them as zombies.
 * Returns 0 on success, a negative errno-style error on failure. */
static int process_child(sd_event *e) {
        sd_event_source *s;
        Iterator i;
        int r;

        assert(e);

        e->need_process_child = false;

        /*
           So, this is ugly. We iteratively invoke waitid() with P_PID
           + WNOHANG for each PID we wait for, instead of using
           P_ALL. This is because we only want to get child
           information of very specific child processes, and not all
           of them. We might not have processed the SIGCHLD even of a
           previous invocation and we don't want to maintain a
           unbounded *per-child* event queue, hence we really don't
           want anything flushed out of the kernel's queue that we
           don't care about. Since this is O(n) this means that if you
           have a lot of processes you probably want to handle SIGCHLD
           yourself.

           We do not reap the children here (by using WNOWAIT), this
           is only done after the event source is dispatched so that
           the callback still sees the process as a zombie.
        */

        HASHMAP_FOREACH(s, e->child_sources, i) {
                assert(s->type == SOURCE_CHILD);

                /* Already queued for dispatch, nothing to do */
                if (s->pending)
                        continue;

                if (s->enabled == SD_EVENT_OFF)
                        continue;

                zero(s->child.siginfo);
                /* WNOWAIT only if the caller watches for WEXITED, so the
                 * zombie stays reapable until after dispatch */
                r = waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
                if (r < 0)
                        return -errno;

                /* si_pid stays 0 when the child had no state change */
                if (s->child.siginfo.si_pid != 0) {
                        bool zombie =
                                s->child.siginfo.si_code == CLD_EXITED ||
                                s->child.siginfo.si_code == CLD_KILLED ||
                                s->child.siginfo.si_code == CLD_DUMPED;

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's
                                 * immediately remove the state change
                                 * from the queue, since there's no
                                 * benefit in leaving it queued */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
                        if (r < 0)
                                return r;
                }
        }

        return 0;
}
2186
/* Dequeue at most one signal from the signalfd of this priority group and
 * mark the matching signal source pending.
 * Returns 1 if a source was queued, 0 if nothing (more) was readable, a
 * negative errno-style error on failure. */
static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
        bool read_one = false;
        int r;

        assert(e);
        assert_return(events == EPOLLIN, -EIO);

        /* If there's a signal queued on this priority and SIGCHLD is
           on this priority too, then make sure to recheck the
           children we watch. This is because we only ever dequeue
           the first signal per priority, and if we dequeue one, and
           SIGCHLD might be enqueued later we wouldn't know, but we
           might have higher priority children we care about hence we
           need to check that explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

        /* If there's already an event source pending for this
         * priority we don't read another */
        if (d->current)
                return 0;

        for (;;) {
                struct signalfd_siginfo si;
                ssize_t n;
                sd_event_source *s = NULL;

                n = read(d->fd, &si, sizeof(si));
                if (n < 0) {
                        /* EAGAIN: queue drained; EINTR: try again next time.
                         * Report whether we consumed anything this round. */
                        if (errno == EAGAIN || errno == EINTR)
                                return read_one;

                        return -errno;
                }

                /* signalfd reads always deliver full siginfo structures */
                if (_unlikely_(n != sizeof(si)))
                        return -EIO;

                assert(SIGNAL_VALID(si.ssi_signo));

                read_one = true;

                if (e->signal_sources)
                        s = e->signal_sources[si.ssi_signo];
                /* No subscriber for this signal: drop it and keep reading */
                if (!s)
                        continue;
                /* Already queued for dispatch: keep reading */
                if (s->pending)
                        continue;

                s->signal.siginfo = si;
                d->current = s;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                return 1;
        }
}
2247
/* Invoke the callback of a pending event source, handling the surrounding
 * bookkeeping: clearing the pending flag, queuing post sources, disabling
 * oneshot sources, and reaping dead children after their callback ran.
 * Returns 1 on success, a negative errno-style error on failure. */
static int source_dispatch(sd_event_source *s) {
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        /* Defer and exit sources keep their pending/queued state across
         * dispatches; every other type is unqueued before its callback runs */
        if (s->type != SOURCE_DEFER && s->type != SOURCE_EXIT) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        if (s->type != SOURCE_POST) {
                sd_event_source *z;
                Iterator i;

                /* If we execute a non-post source, let's mark all
                 * post sources as pending */

                SET_FOREACH(z, s->event->post_sources, i) {
                        if (z->enabled == SD_EVENT_OFF)
                                continue;

                        r = source_set_pending(z, true);
                        if (r < 0)
                                return r;
                }
        }

        /* Oneshot sources are disabled *before* the callback runs, so the
         * callback may re-enable them if it wants another dispatch */
        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        s->dispatching = true;

        /* Hand the type-specific payload to the user callback */
        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                zombie = s->child.siginfo.si_code == CLD_EXITED ||
                         s->child.siginfo.si_code == CLD_KILLED ||
                         s->child.siginfo.si_code == CLD_DUMPED;

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie)
                        waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_POST:
                r = s->post.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        s->dispatching = false;

        if (r < 0)
                log_debug_errno(r, "Event source %s (type %s) returned error, disabling: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        /* The callback may have dropped the last reference: free the source
         * now; otherwise disable it if the callback failed */
        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                sd_event_source_set_enabled(s, SD_EVENT_OFF);

        return 1;
}
2350
/* Run the prepare callbacks of all enabled sources that registered one, at
 * most once per event-loop iteration.
 * Returns 0 on success, a negative errno-style error on failure. */
static int event_prepare(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                /* Stop once the head of the prepare queue was already
                 * prepared this iteration, or is disabled */
                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
                        break;

                /* Mark as prepared and reshuffle *before* invoking the
                 * callback, so this loop terminates even if the callback
                 * changes the queue */
                s->prepare_iteration = e->iteration;
                r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;

                assert(s->prepare);

                s->dispatching = true;
                r = s->prepare(s, s->userdata);
                s->dispatching = false;

                if (r < 0)
                        log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, disabling: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

                /* The callback may have dropped the last reference */
                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        sd_event_source_set_enabled(s, SD_EVENT_OFF);
        }

        return 0;
}
2386
2387 static int dispatch_exit(sd_event *e) {
2388         sd_event_source *p;
2389         int r;
2390
2391         assert(e);
2392
2393         p = prioq_peek(e->exit);
2394         if (!p || p->enabled == SD_EVENT_OFF) {
2395                 e->state = SD_EVENT_FINISHED;
2396                 return 0;
2397         }
2398
2399         sd_event_ref(e);
2400         e->iteration++;
2401         e->state = SD_EVENT_EXITING;
2402
2403         r = source_dispatch(p);
2404
2405         e->state = SD_EVENT_INITIAL;
2406         sd_event_unref(e);
2407
2408         return r;
2409 }
2410
2411 static sd_event_source* event_next_pending(sd_event *e) {
2412         sd_event_source *p;
2413
2414         assert(e);
2415
2416         p = prioq_peek(e->pending);
2417         if (!p)
2418                 return NULL;
2419
2420         if (p->enabled == SD_EVENT_OFF)
2421                 return NULL;
2422
2423         return p;
2424 }
2425
2426 static int arm_watchdog(sd_event *e) {
2427         struct itimerspec its = {};
2428         usec_t t;
2429         int r;
2430
2431         assert(e);
2432         assert(e->watchdog_fd >= 0);
2433
2434         t = sleep_between(e,
2435                           e->watchdog_last + (e->watchdog_period / 2),
2436                           e->watchdog_last + (e->watchdog_period * 3 / 4));
2437
2438         timespec_store(&its.it_value, t);
2439
2440         /* Make sure we never set the watchdog to 0, which tells the
2441          * kernel to disable it. */
2442         if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
2443                 its.it_value.tv_nsec = 1;
2444
2445         r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
2446         if (r < 0)
2447                 return -errno;
2448
2449         return 0;
2450 }
2451
2452 static int process_watchdog(sd_event *e) {
2453         assert(e);
2454
2455         if (!e->watchdog)
2456                 return 0;
2457
2458         /* Don't notify watchdog too often */
2459         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
2460                 return 0;
2461
2462         sd_notify(false, "WATCHDOG=1");
2463         e->watchdog_last = e->timestamp.monotonic;
2464
2465         return arm_watchdog(e);
2466 }
2467
2468 _public_ int sd_event_prepare(sd_event *e) {
2469         int r;
2470
2471         assert_return(e, -EINVAL);
2472         assert_return(!event_pid_changed(e), -ECHILD);
2473         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2474         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2475
2476         if (e->exit_requested)
2477                 goto pending;
2478
2479         e->iteration++;
2480
2481         e->state = SD_EVENT_PREPARING;
2482         r = event_prepare(e);
2483         e->state = SD_EVENT_INITIAL;
2484         if (r < 0)
2485                 return r;
2486
2487         r = event_arm_timer(e, &e->realtime);
2488         if (r < 0)
2489                 return r;
2490
2491         r = event_arm_timer(e, &e->boottime);
2492         if (r < 0)
2493                 return r;
2494
2495         r = event_arm_timer(e, &e->monotonic);
2496         if (r < 0)
2497                 return r;
2498
2499         r = event_arm_timer(e, &e->realtime_alarm);
2500         if (r < 0)
2501                 return r;
2502
2503         r = event_arm_timer(e, &e->boottime_alarm);
2504         if (r < 0)
2505                 return r;
2506
2507         if (event_next_pending(e) || e->need_process_child)
2508                 goto pending;
2509
2510         e->state = SD_EVENT_ARMED;
2511
2512         return 0;
2513
2514 pending:
2515         e->state = SD_EVENT_ARMED;
2516         r = sd_event_wait(e, 0);
2517         if (r == 0)
2518                 e->state = SD_EVENT_ARMED;
2519
2520         return r;
2521 }
2522
/* Wait on the epoll fd for at most the given timeout (in µs, (uint64_t) -1
 * for no limit) and convert kernel wakeups into pending event sources.
 * Returns 1 if sources are pending for dispatch, 0 if the wait timed out,
 * a negative errno-style error on failure. */
_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
        struct epoll_event *ev_queue;
        unsigned ev_queue_max;
        int r, m, i;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        /* An exit request counts as pending work */
        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        /* Stack-allocate room for one epoll event per source */
        ev_queue_max = MAX(e->n_sources, 1u);
        ev_queue = newa(struct epoll_event, ev_queue_max);

        /* Round the µs timeout up to ms for epoll_wait() */
        m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max,
                       timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
        if (m < 0) {
                /* Treat an interrupted wait like pending work so callers
                 * re-enter the loop promptly */
                if (errno == EINTR) {
                        e->state = SD_EVENT_PENDING;
                        return 1;
                }

                r = -errno;
                goto finish;
        }

        /* Cache "now" once for this iteration on all clocks */
        dual_timestamp_get(&e->timestamp);
        if (clock_boottime_supported())
                e->timestamp_boottime = now(CLOCK_BOOTTIME);

        /* Route each wakeup by the tag stored in its epoll data pointer */
        for (i = 0; i < m; i++) {

                if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
                else {
                        WakeupType *t = ev_queue[i].data.ptr;

                        switch (*t) {

                        case WAKEUP_EVENT_SOURCE:
                                r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = ev_queue[i].data.ptr;
                                r = flush_timer(e, d->fd, ev_queue[i].events, &d->next);
                                break;
                        }

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        default:
                                assert_not_reached("Invalid wake-up pointer");
                        }
                }
                if (r < 0)
                        goto finish;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        /* Queue all timer sources whose deadline elapsed, per clock */
        r = process_timer(e, e->timestamp.realtime, &e->realtime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp_boottime, &e->boottime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp_boottime, &e->boottime_alarm);
        if (r < 0)
                goto finish;

        /* Recheck watched children if a SIGCHLD may have been swallowed */
        if (e->need_process_child) {
                r = process_child(e);
                if (r < 0)
                        goto finish;
        }

        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;

                return 1;
        }

        r = 0;

finish:
        e->state = SD_EVENT_INITIAL;

        return r;
}
2631
2632 _public_ int sd_event_dispatch(sd_event *e) {
2633         sd_event_source *p;
2634         int r;
2635
2636         assert_return(e, -EINVAL);
2637         assert_return(!event_pid_changed(e), -ECHILD);
2638         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2639         assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
2640
2641         if (e->exit_requested)
2642                 return dispatch_exit(e);
2643
2644         p = event_next_pending(e);
2645         if (p) {
2646                 sd_event_ref(e);
2647
2648                 e->state = SD_EVENT_RUNNING;
2649                 r = source_dispatch(p);
2650                 e->state = SD_EVENT_INITIAL;
2651
2652                 sd_event_unref(e);
2653
2654                 return r;
2655         }
2656
2657         e->state = SD_EVENT_INITIAL;
2658
2659         return 1;
2660 }
2661
2662 static void event_log_delays(sd_event *e) {
2663         char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1];
2664         unsigned i;
2665         int o;
2666
2667         for (i = o = 0; i < ELEMENTSOF(e->delays); i++) {
2668                 o += snprintf(&b[o], sizeof(b) - o, "%u ", e->delays[i]);
2669                 e->delays[i] = 0;
2670         }
2671         log_debug("Event loop iterations: %.*s", o, b);
2672 }
2673
2674 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
2675         int r;
2676
2677         assert_return(e, -EINVAL);
2678         assert_return(!event_pid_changed(e), -ECHILD);
2679         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2680         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2681
2682         if (e->profile_delays && e->last_run) {
2683                 usec_t this_run;
2684                 unsigned l;
2685
2686                 this_run = now(CLOCK_MONOTONIC);
2687
2688                 l = u64log2(this_run - e->last_run);
2689                 assert(l < sizeof(e->delays));
2690                 e->delays[l]++;
2691
2692                 if (this_run - e->last_log >= 5*USEC_PER_SEC) {
2693                         event_log_delays(e);
2694                         e->last_log = this_run;
2695                 }
2696         }
2697
2698         r = sd_event_prepare(e);
2699         if (r == 0)
2700                 /* There was nothing? Then wait... */
2701                 r = sd_event_wait(e, timeout);
2702
2703         if (e->profile_delays)
2704                 e->last_run = now(CLOCK_MONOTONIC);
2705
2706         if (r > 0) {
2707                 /* There's something now, then let's dispatch it */
2708                 r = sd_event_dispatch(e);
2709                 if (r < 0)
2710                         return r;
2711
2712                 return 1;
2713         }
2714
2715         return r;
2716 }
2717
2718 #if 0 /// UNNEEDED by elogind
2719 _public_ int sd_event_loop(sd_event *e) {
2720         int r;
2721
2722         assert_return(e, -EINVAL);
2723         assert_return(!event_pid_changed(e), -ECHILD);
2724         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2725
2726         sd_event_ref(e);
2727
2728         while (e->state != SD_EVENT_FINISHED) {
2729                 r = sd_event_run(e, (uint64_t) -1);
2730                 if (r < 0)
2731                         goto finish;
2732         }
2733
2734         r = e->exit_code;
2735
2736 finish:
2737         sd_event_unref(e);
2738         return r;
2739 }
2740
2741 _public_ int sd_event_get_fd(sd_event *e) {
2742
2743         assert_return(e, -EINVAL);
2744         assert_return(!event_pid_changed(e), -ECHILD);
2745
2746         return e->epoll_fd;
2747 }
2748 #endif // 0
2749
2750 _public_ int sd_event_get_state(sd_event *e) {
2751         assert_return(e, -EINVAL);
2752         assert_return(!event_pid_changed(e), -ECHILD);
2753
2754         return e->state;
2755 }
2756
2757 #if 0 /// UNNEEDED by elogind
2758 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
2759         assert_return(e, -EINVAL);
2760         assert_return(code, -EINVAL);
2761         assert_return(!event_pid_changed(e), -ECHILD);
2762
2763         if (!e->exit_requested)
2764                 return -ENODATA;
2765
2766         *code = e->exit_code;
2767         return 0;
2768 }
2769 #endif // 0
2770
2771 _public_ int sd_event_exit(sd_event *e, int code) {
2772         assert_return(e, -EINVAL);
2773         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2774         assert_return(!event_pid_changed(e), -ECHILD);
2775
2776         e->exit_requested = true;
2777         e->exit_code = code;
2778
2779         return 0;
2780 }
2781
2782 #if 0 /// UNNEEDED by elogind
2783 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
2784         assert_return(e, -EINVAL);
2785         assert_return(usec, -EINVAL);
2786         assert_return(!event_pid_changed(e), -ECHILD);
2787         assert_return(IN_SET(clock,
2788                              CLOCK_REALTIME,
2789                              CLOCK_REALTIME_ALARM,
2790                              CLOCK_MONOTONIC,
2791                              CLOCK_BOOTTIME,
2792                              CLOCK_BOOTTIME_ALARM), -EOPNOTSUPP);
2793
2794         if (IN_SET(clock, CLOCK_BOOTTIME, CLOCK_BOOTTIME_ALARM) && !clock_boottime_supported())
2795                 return -EOPNOTSUPP;
2796
2797         if (!dual_timestamp_is_set(&e->timestamp)) {
2798                 /* Implicitly fall back to now() if we never ran
2799                  * before and thus have no cached time. */
2800                 *usec = now(clock);
2801                 return 1;
2802         }
2803
2804         switch (clock) {
2805
2806         case CLOCK_REALTIME:
2807         case CLOCK_REALTIME_ALARM:
2808                 *usec = e->timestamp.realtime;
2809                 break;
2810
2811         case CLOCK_MONOTONIC:
2812                 *usec = e->timestamp.monotonic;
2813                 break;
2814
2815         case CLOCK_BOOTTIME:
2816         case CLOCK_BOOTTIME_ALARM:
2817                 *usec = e->timestamp_boottime;
2818                 break;
2819
2820         default:
2821                 assert_not_reached("Unknown clock?");
2822         }
2823
2824         return 0;
2825 }
2826 #endif // 0
2827
2828 _public_ int sd_event_default(sd_event **ret) {
2829
2830         static thread_local sd_event *default_event = NULL;
2831         sd_event *e = NULL;
2832         int r;
2833
2834         if (!ret)
2835                 return !!default_event;
2836
2837         if (default_event) {
2838                 *ret = sd_event_ref(default_event);
2839                 return 0;
2840         }
2841
2842         r = sd_event_new(&e);
2843         if (r < 0)
2844                 return r;
2845
2846         e->default_event_ptr = &default_event;
2847         e->tid = gettid();
2848         default_event = e;
2849
2850         *ret = e;
2851         return 1;
2852 }
2853
2854 #if 0 /// UNNEEDED by elogind
2855 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
2856         assert_return(e, -EINVAL);
2857         assert_return(tid, -EINVAL);
2858         assert_return(!event_pid_changed(e), -ECHILD);
2859
2860         if (e->tid != 0) {
2861                 *tid = e->tid;
2862                 return 0;
2863         }
2864
2865         return -ENXIO;
2866 }
2867 #endif // 0
2868
/* Enable or disable automatic watchdog keep-alive notifications, driven by
 * a CLOCK_MONOTONIC timerfd with the period reported by sd_watchdog_enabled().
 * Returns the resulting watchdog state (0/1), or a negative errno-style
 * error; 0 is also returned if the service manager requested no watchdog. */
_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Already in the requested state? */
        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                struct epoll_event ev = {};

                /* r == 0 here means no watchdog was requested of us */
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                /* Tag the fd so sd_event_wait() recognizes the wakeup */
                ev.events = EPOLLIN;
                ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG);

                r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
                if (r < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                /* Disable: unregister from epoll and close the timerfd */
                if (e->watchdog_fd >= 0) {
                        epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}
2920
2921 #if 0 /// UNNEEDED by elogind
2922 _public_ int sd_event_get_watchdog(sd_event *e) {
2923         assert_return(e, -EINVAL);
2924         assert_return(!event_pid_changed(e), -ECHILD);
2925
2926         return e->watchdog;
2927 }
2928 #endif // 0