chiark / gitweb /
sd-event: use symbolic name for normal priority
[elogind.git] / src / libelogind / sd-event / sd-event.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3   This file is part of systemd.
4
5   Copyright 2013 Lennart Poettering
6 ***/
7
8 #include <sys/epoll.h>
9 #include <sys/timerfd.h>
10 #include <sys/wait.h>
11
12 #include "sd-daemon.h"
13 #include "sd-event.h"
14 #include "sd-id128.h"
15
16 #include "alloc-util.h"
17 #include "fd-util.h"
18 #include "hashmap.h"
19 #include "list.h"
20 #include "macro.h"
21 #include "missing.h"
22 #include "prioq.h"
23 #include "process-util.h"
24 #include "set.h"
25 #include "signal-util.h"
26 #include "string-table.h"
27 #include "string-util.h"
28 #include "time-util.h"
29 #include "util.h"
30
31 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
32
/* The kind of an event source. Time sources get one value per supported
 * kernel clock; SOURCE_WATCHDOG is internal and never exposed to callers. */
typedef enum EventSourceType {
        SOURCE_IO,
        SOURCE_TIME_REALTIME,
        SOURCE_TIME_BOOTTIME,
        SOURCE_TIME_MONOTONIC,
        SOURCE_TIME_REALTIME_ALARM,
        SOURCE_TIME_BOOTTIME_ALARM,
        SOURCE_SIGNAL,
        SOURCE_CHILD,
        SOURCE_DEFER,
        SOURCE_POST,
        SOURCE_EXIT,
        SOURCE_WATCHDOG,
        _SOURCE_EVENT_SOURCE_TYPE_MAX,
        _SOURCE_EVENT_SOURCE_TYPE_INVALID = -1
} EventSourceType;
49
50 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
51         [SOURCE_IO] = "io",
52         [SOURCE_TIME_REALTIME] = "realtime",
53         [SOURCE_TIME_BOOTTIME] = "bootime",
54         [SOURCE_TIME_MONOTONIC] = "monotonic",
55         [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
56         [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
57         [SOURCE_SIGNAL] = "signal",
58         [SOURCE_CHILD] = "child",
59         [SOURCE_DEFER] = "defer",
60         [SOURCE_POST] = "post",
61         [SOURCE_EXIT] = "exit",
62         [SOURCE_WATCHDOG] = "watchdog",
63 };
64
65 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
66
67 /* All objects we use in epoll events start with this value, so that
68  * we know how to dispatch it */
/* Every object stored in an epoll data.ptr begins with one of these tags,
 * so the dispatcher can tell what kind of structure woke it up. */
typedef enum WakeupType {
        WAKEUP_NONE,
        WAKEUP_EVENT_SOURCE,  /* data.ptr is an sd_event_source */
        WAKEUP_CLOCK_DATA,    /* data.ptr is a struct clock_data */
        WAKEUP_SIGNAL_DATA,   /* data.ptr is a struct signal_data */
        _WAKEUP_TYPE_MAX,
        _WAKEUP_TYPE_INVALID = -1,
} WakeupType;
77
78 #define EVENT_SOURCE_IS_TIME(t) IN_SET((t), SOURCE_TIME_REALTIME, SOURCE_TIME_BOOTTIME, SOURCE_TIME_MONOTONIC, SOURCE_TIME_REALTIME_ALARM, SOURCE_TIME_BOOTTIME_ALARM)
79
/* One registered event source: a callback plus the bookkeeping the loop
 * needs to queue, prepare and dispatch it. */
struct sd_event_source {
        WakeupType wakeup;  /* must stay first: epoll data.ptr dispatch keys on this tag */

        unsigned n_ref;

        sd_event *event;    /* loop we are attached to; NULL after source_disconnect() */
        void *userdata;
        sd_event_handler_t prepare;  /* optional pre-poll callback, NULL if unset */

        char *description;  /* free-form name for log messages, may be NULL */

        EventSourceType type:5;
        int enabled:3;      /* SD_EVENT_OFF / ON / ONESHOT */
        bool pending:1;     /* currently queued in event->pending */
        bool dispatching:1; /* currently inside its callback */
        bool floating:1;    /* lifetime owned by the loop, not by a caller ref */

        int64_t priority;
        unsigned pending_index;      /* slot in event->pending, or PRIOQ_IDX_NULL */
        unsigned prepare_index;      /* slot in event->prepare, or PRIOQ_IDX_NULL */
        uint64_t pending_iteration;  /* loop iteration when it became pending */
        uint64_t prepare_iteration;  /* loop iteration when prepare() last ran */

        LIST_FIELDS(sd_event_source, sources);

        /* Per-type state; the valid member is selected by 'type'. */
        union {
                struct {
                        sd_event_io_handler_t callback;
                        int fd;
                        uint32_t events;    /* epoll event mask we requested */
                        uint32_t revents;   /* event mask last reported by epoll */
                        bool registered:1;  /* fd is currently in the epoll set */
                        bool owned:1;       /* close fd when the source is freed */
                } io;
                struct {
                        sd_event_time_handler_t callback;
                        usec_t next, accuracy;  /* earliest trigger time + allowed slack */
                        unsigned earliest_index;  /* slot in clock_data->earliest */
                        unsigned latest_index;    /* slot in clock_data->latest */
                } time;
                struct {
                        sd_event_signal_handler_t callback;
                        struct signalfd_siginfo siginfo;
                        int sig;
                } signal;
                struct {
                        sd_event_child_handler_t callback;
                        siginfo_t siginfo;
                        pid_t pid;
                        int options;  /* waitid()-style options — TODO confirm against dispatch code */
                } child;
                struct {
                        sd_event_handler_t callback;
                } defer;
                struct {
                        sd_event_handler_t callback;
                } post;
                struct {
                        sd_event_handler_t callback;
                        unsigned prioq_index;  /* slot in event->exit */
                } exit;
        };
};
143
/* Per-clock state: one timerfd plus two priority queues of time sources. */
struct clock_data {
        WakeupType wakeup;  /* always WAKEUP_CLOCK_DATA; must stay first */
        int fd;             /* timerfd for this clock, -1 until allocated */

        /* For all clocks we maintain two priority queues each, one
         * ordered for the earliest times the events may be
         * dispatched, and one ordered by the latest times they must
         * have been dispatched. The range between the top entries in
         * the two prioqs is the time window we can freely schedule
         * wakeups in */

        Prioq *earliest;
        Prioq *latest;
        usec_t next;        /* time the timerfd is currently armed for, USEC_INFINITY if unarmed */

        bool needs_rearm:1; /* set whenever the queues change; timerfd must be reprogrammed */
};
161
/* Per-priority signalfd state, stored in sd_event.signal_data keyed by priority. */
struct signal_data {
        WakeupType wakeup;  /* always WAKEUP_SIGNAL_DATA; must stay first */

        /* For each priority we maintain one signal fd, so that we
         * only have to dequeue a single event per priority at a
         * time. */

        int fd;             /* signalfd for this priority, -1 until allocated */
        int64_t priority;   /* hashmap key: address of this field is used as key */
        sigset_t sigset;    /* signals currently routed through this fd */
        sd_event_source *current;  /* source whose siginfo is being dispatched right now */
};
174
/* The event loop object itself. */
struct sd_event {
        unsigned n_ref;

        int epoll_fd;
        int watchdog_fd;    /* timerfd used for sd_watchdog keep-alives, -1 if unused */

        Prioq *pending;     /* sources ready to dispatch, ordered by pending_prioq_compare() */
        Prioq *prepare;     /* sources with a prepare() callback, ordered by prepare_prioq_compare() */

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;     /* boot-id derived wakeup offset; see initialize_perturb() */

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;           /* pid -> sd_event_source */
        unsigned n_enabled_child_sources; /* keeps SIGCHLD routed while > 0 */

        Set *post_sources;

        Prioq *exit;        /* SOURCE_EXIT sources, ordered by exit_prioq_compare() */

        pid_t original_pid; /* to detect (unsupported) use across fork() */

        uint64_t iteration;
        triple_timestamp timestamp;
        int state;

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1;  /* set via SD_EVENT_PROFILE_DELAYS env var */

        int exit_code;

        pid_t tid;
        sd_event **default_event_ptr;  /* points at the thread-local default slot, if we are it */

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;

        LIST_HEAD(sd_event_source, sources);

        usec_t last_run, last_log;
        unsigned delays[sizeof(usec_t) * 8];  /* log2 histogram of iteration latencies */
};
230
231 static thread_local sd_event *default_event = NULL;
232
233 static void source_disconnect(sd_event_source *s);
234
235 static sd_event *event_resolve(sd_event *e) {
236         return e == SD_EVENT_DEFAULT ? default_event : e;
237 }
238
239 static int pending_prioq_compare(const void *a, const void *b) {
240         const sd_event_source *x = a, *y = b;
241
242         assert(x->pending);
243         assert(y->pending);
244
245         /* Enabled ones first */
246         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
247                 return -1;
248         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
249                 return 1;
250
251         /* Lower priority values first */
252         if (x->priority < y->priority)
253                 return -1;
254         if (x->priority > y->priority)
255                 return 1;
256
257         /* Older entries first */
258         if (x->pending_iteration < y->pending_iteration)
259                 return -1;
260         if (x->pending_iteration > y->pending_iteration)
261                 return 1;
262
263         return 0;
264 }
265
266 static int prepare_prioq_compare(const void *a, const void *b) {
267         const sd_event_source *x = a, *y = b;
268
269         assert(x->prepare);
270         assert(y->prepare);
271
272         /* Enabled ones first */
273         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
274                 return -1;
275         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
276                 return 1;
277
278         /* Move most recently prepared ones last, so that we can stop
279          * preparing as soon as we hit one that has already been
280          * prepared in the current iteration */
281         if (x->prepare_iteration < y->prepare_iteration)
282                 return -1;
283         if (x->prepare_iteration > y->prepare_iteration)
284                 return 1;
285
286         /* Lower priority values first */
287         if (x->priority < y->priority)
288                 return -1;
289         if (x->priority > y->priority)
290                 return 1;
291
292         return 0;
293 }
294
295 static int earliest_time_prioq_compare(const void *a, const void *b) {
296         const sd_event_source *x = a, *y = b;
297
298         assert(EVENT_SOURCE_IS_TIME(x->type));
299         assert(x->type == y->type);
300
301         /* Enabled ones first */
302         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
303                 return -1;
304         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
305                 return 1;
306
307         /* Move the pending ones to the end */
308         if (!x->pending && y->pending)
309                 return -1;
310         if (x->pending && !y->pending)
311                 return 1;
312
313         /* Order by time */
314         if (x->time.next < y->time.next)
315                 return -1;
316         if (x->time.next > y->time.next)
317                 return 1;
318
319         return 0;
320 }
321
/* Latest time by which a time source must fire: its trigger time plus its
 * accuracy slack (saturating add via usec_add). */
static usec_t time_event_source_latest(const sd_event_source *s) {
        return usec_add(s->time.next, s->time.accuracy);
}
325
326 static int latest_time_prioq_compare(const void *a, const void *b) {
327         const sd_event_source *x = a, *y = b;
328
329         assert(EVENT_SOURCE_IS_TIME(x->type));
330         assert(x->type == y->type);
331
332         /* Enabled ones first */
333         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
334                 return -1;
335         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
336                 return 1;
337
338         /* Move the pending ones to the end */
339         if (!x->pending && y->pending)
340                 return -1;
341         if (x->pending && !y->pending)
342                 return 1;
343
344         /* Order by time */
345         if (time_event_source_latest(x) < time_event_source_latest(y))
346                 return -1;
347         if (time_event_source_latest(x) > time_event_source_latest(y))
348                 return 1;
349
350         return 0;
351 }
352
353 static int exit_prioq_compare(const void *a, const void *b) {
354         const sd_event_source *x = a, *y = b;
355
356         assert(x->type == SOURCE_EXIT);
357         assert(y->type == SOURCE_EXIT);
358
359         /* Enabled ones first */
360         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
361                 return -1;
362         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
363                 return 1;
364
365         /* Lower priority values first */
366         if (x->priority < y->priority)
367                 return -1;
368         if (x->priority > y->priority)
369                 return 1;
370
371         return 0;
372 }
373
/* Releases a clock_data's timerfd and both of its priority queues. The
 * struct itself is embedded in sd_event and not freed here. */
static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}
382
/* Destroys an event loop: detaches any remaining (necessarily floating)
 * sources, then releases all fds, queues, maps and the object itself. */
static void event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        /* Only floating sources can still be attached here: non-floating ones
         * hold a reference on the loop, so the refcount could not have
         * reached zero while any existed. */
        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        /* If we are the thread's default loop, clear the cached pointer. */
        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);
        free(e);
}
419
/* Allocates a new event loop with a fresh epoll fd. On success stores the
 * loop (with one reference) in *ret and returns 0; on failure returns a
 * negative errno-style code. */
_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new0(sd_event, 1);
        if (!e)
                return -ENOMEM;

        e->n_ref = 1;
        /* Mark all fds unallocated, all timers unarmed, all clocks tagged. */
        e->watchdog_fd = e->epoll_fd = e->realtime.fd = e->boottime.fd = e->monotonic.fd = e->realtime_alarm.fd = e->boottime_alarm.fd = -1;
        e->realtime.next = e->boottime.next = e->monotonic.next = e->realtime_alarm.next = e->boottime_alarm.next = USEC_INFINITY;
        e->realtime.wakeup = e->boottime.wakeup = e->monotonic.wakeup = e->realtime_alarm.wakeup = e->boottime_alarm.wakeup = WAKEUP_CLOCK_DATA;
        e->original_pid = getpid_cached(); /* for detecting use across fork() */
        e->perturb = USEC_INFINITY;        /* computed lazily by initialize_perturb() */

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        /* Keep our fds out of the stdio range */
        e->epoll_fd = fd_move_above_stdio(e->epoll_fd);

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 ... 2^63 us will be logged every 5s.");
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}
461
/* Takes an additional reference on the loop. NULL is tolerated and
 * returned unchanged. */
_public_ sd_event* sd_event_ref(sd_event *e) {

        if (!e)
                return NULL;

        assert(e->n_ref >= 1);
        e->n_ref++;

        return e;
}
472
/* Drops one reference; frees the loop when the count hits zero. NULL is
 * tolerated. Always returns NULL so callers can write e = sd_event_unref(e). */
_public_ sd_event* sd_event_unref(sd_event *e) {

        if (!e)
                return NULL;

        assert(e->n_ref >= 1);
        e->n_ref--;

        if (e->n_ref <= 0)
                event_free(e);

        return NULL;
}
486
/* Returns true if the loop was created in a different process, i.e. we got
 * here through fork(). */
static bool event_pid_changed(sd_event *e) {
        assert(e);

        /* We don't support people creating an event loop and keeping
         * it around over a fork(). Let's complain. */

        return e->original_pid != getpid_cached();
}
495
/* Removes an IO source's fd from the epoll set, if it is registered.
 * Failures are only logged: by this point we are usually tearing down.
 * After a fork() the epoll fd belongs to the parent, so we skip entirely. */
static void source_io_unregister(sd_event_source *s) {
        int r;

        assert(s);
        assert(s->type == SOURCE_IO);

        if (event_pid_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
        if (r < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}
515
/* Adds or updates an IO source's fd in the epoll set. ONESHOT enablement is
 * mapped to EPOLLONESHOT. Returns 0 on success, negative errno on failure. */
static int source_io_register(
                sd_event_source *s,
                int enabled,
                uint32_t events) {

        struct epoll_event ev;
        int r;

        assert(s);
        assert(s->type == SOURCE_IO);
        assert(enabled != SD_EVENT_OFF);

        ev = (struct epoll_event) {
                .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                .data.ptr = s,
        };

        /* MOD if the fd is already in the set, ADD otherwise */
        if (s->io.registered)
                r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
        else
                r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
        if (r < 0)
                return -errno;

        s->io.registered = true;

        return 0;
}
544
545 static clockid_t event_source_type_to_clock(EventSourceType t) {
546
547         switch (t) {
548
549         case SOURCE_TIME_REALTIME:
550                 return CLOCK_REALTIME;
551
552         case SOURCE_TIME_BOOTTIME:
553                 return CLOCK_BOOTTIME;
554
555         case SOURCE_TIME_MONOTONIC:
556                 return CLOCK_MONOTONIC;
557
558         case SOURCE_TIME_REALTIME_ALARM:
559                 return CLOCK_REALTIME_ALARM;
560
561         case SOURCE_TIME_BOOTTIME_ALARM:
562                 return CLOCK_BOOTTIME_ALARM;
563
564         default:
565                 return (clockid_t) -1;
566         }
567 }
568
569 static EventSourceType clock_to_event_source_type(clockid_t clock) {
570
571         switch (clock) {
572
573         case CLOCK_REALTIME:
574                 return SOURCE_TIME_REALTIME;
575
576         case CLOCK_BOOTTIME:
577                 return SOURCE_TIME_BOOTTIME;
578
579         case CLOCK_MONOTONIC:
580                 return SOURCE_TIME_MONOTONIC;
581
582         case CLOCK_REALTIME_ALARM:
583                 return SOURCE_TIME_REALTIME_ALARM;
584
585         case CLOCK_BOOTTIME_ALARM:
586                 return SOURCE_TIME_BOOTTIME_ALARM;
587
588         default:
589                 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
590         }
591 }
592
/* Returns the loop's clock_data slot for a given time source type, or NULL
 * for non-time types. */
static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
        assert(e);

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return &e->realtime;

        case SOURCE_TIME_BOOTTIME:
                return &e->boottime;

        case SOURCE_TIME_MONOTONIC:
                return &e->monotonic;

        case SOURCE_TIME_REALTIME_ALARM:
                return &e->realtime_alarm;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return &e->boottime_alarm;

        default:
                return NULL;
        }
}
617
/* Ensures the signalfd for the priority of signal 'sig' exists and has that
 * signal in its mask; allocates and registers the signal_data on first use.
 * On success optionally returns the signal_data via *ret. Returns 0 or a
 * negative errno-style code. */
static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct epoll_event ev;
        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_pid_changed(e))
                return -ECHILD;

        /* The signalfd is shared by all signals of the same priority; without
         * a registered source we fall back to the normal priority (this
         * covers the internal SIGCHLD handling). */
        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = SD_EVENT_PRIORITY_NORMAL;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                /* Already watching this signal at this priority? Done. */
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                r = hashmap_ensure_allocated(&e->signal_data, &uint64_hash_ops);
                if (r < 0)
                        return r;

                d = new0(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                d->wakeup = WAKEUP_SIGNAL_DATA;
                d->fd  = -1;
                d->priority = priority;

                /* The hashmap key is the priority field inside d itself. */
                r = hashmap_put(e->signal_data, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        /* Extend the mask on a copy first, so d->sigset stays accurate if
         * the signalfd() call fails. */
        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        /* If the fd already existed, signalfd() just updated its mask and
         * it is already registered in epoll — nothing more to do. */
        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        d->fd = fd_move_above_stdio(r);

        ev = (struct epoll_event) {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev);
        if (r < 0)  {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        /* Only unwind structures we created in this call. */
        if (added) {
                d->fd = safe_close(d->fd);
                hashmap_remove(e->signal_data, &d->priority);
                free(d);
        }

        return r;
}
713
/* Drops one signal from a signal_data's mask; frees the whole structure
 * (including its signalfd) when the mask becomes empty. */
static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty that
         * way removes it. */

        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {

                /* If all the mask is all-zero we can get rid of the structure */
                hashmap_remove(e->signal_data, &d->priority);
                safe_close(d->fd);
                free(d);
                return;
        }

        assert(d->fd >= 0);

        /* Shrink the signalfd's mask in place; best-effort. */
        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}
741
/* Garbage-collects the routing for 'sig': if no enabled source (and no
 * enabled child source, for SIGCHLD) still needs it, unmask it from every
 * signal_data that might carry it. */
static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are
         * interested in. If not, we'll unmask it, and possibly drop
         * the signalfd for it. */

        if (sig == SIGCHLD &&
            e->n_enabled_child_sources > 0)
                return;

        if (e->signal_sources &&
            e->signal_sources[sig] &&
            e->signal_sources[sig]->enabled != SD_EVENT_OFF)
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}
787
/* Detaches a source from its event loop: undoes the per-type registration
 * (epoll/prioq/hashmap/signal mask), removes it from the pending/prepare
 * queues and the global source list, and drops the loop reference it held
 * (unless floating). Safe to call on an already-disconnected source. */
static void source_disconnect(sd_event_source *s) {
        sd_event *event;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM: {
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_remove(d->earliest, s, &s->time.earliest_index);
                prioq_remove(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;  /* the timerfd may now be armed too early */
                break;
        }

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                }

                break;

        case SOURCE_CHILD:
                if (s->child.pid > 0) {
                        if (s->enabled != SD_EVENT_OFF) {
                                assert(s->event->n_enabled_child_sources > 0);
                                s->event->n_enabled_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                        /* May release the SIGCHLD signalfd if nobody needs it anymore */
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                }

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        default:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        event = s->event;

        s->type = _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        s->event = NULL;
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Non-floating sources pin the loop; drop that reference last, since
         * it may free the loop object. */
        if (!s->floating)
                sd_event_unref(event);
}
878
/* Disconnects and frees a source; closes the fd of an IO source that we
 * took ownership of. */
static void source_free(sd_event_source *s) {
        assert(s);

        source_disconnect(s);

        if (s->type == SOURCE_IO && s->io.owned)
                safe_close(s->io.fd);

        free(s->description);
        free(s);
}
890
/* Marks a source as pending (queued for dispatch) or not, keeping the
 * pending prioq, the per-clock queues and the current-signal pointer in
 * sync. Returns 0 on success or a negative errno-style code. */
static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT);  /* exit sources use their own prioq */

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        /* Pending state is part of the time prioq ordering, so reshuffle. */
        if (EVENT_SOURCE_IS_TIME(s->type)) {
                struct clock_data *d;

                d = event_get_clock_data(s->event, s->type);
                assert(d);

                prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                prioq_reshuffle(d->latest, s, &s->time.latest_index);
                d->needs_rearm = true;
        }

        /* A signal source leaving the pending state no longer owns the
         * "currently dispatched" slot of its signalfd. */
        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        return 0;
}
934
/* Allocates a bare source of the given type and links it into the loop.
 * Non-floating sources take a reference on the loop. Returns NULL on OOM. */
static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
        sd_event_source *s;

        assert(e);

        s = new0(sd_event_source, 1);
        if (!s)
                return NULL;

        s->n_ref = 1;
        s->event = e;
        s->floating = floating;
        s->type = type;
        s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;

        if (!floating)
                sd_event_ref(e);

        LIST_PREPEND(sources, e->sources, s);
        e->n_sources++;

        return s;
}
958
/* Adds an IO event source watching 'fd' for 'events'. The source starts
 * enabled (SD_EVENT_ON) and is registered with epoll immediately. If 'ret'
 * is NULL the source is created floating (owned by the loop). Returns 0 or
 * a negative errno-style code. */
_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0) {
                source_free(s);
                return r;
        }

        if (ret)
                *ret = s;

        return 0;
}
1000
/* Lazily computes e->perturb, a per-machine wakeup offset derived from the
 * boot ID. Left at USEC_INFINITY if the boot ID cannot be read. */
static void initialize_perturb(sd_event *e) {
        sd_id128_t bootid = {};

        /* When we sleep for longer, we try to realign the wakeup to
           the same time within each minute/second/250ms, so that
           events all across the system can be coalesced into a single
           CPU wakeup. However, let's take some system-specific
           randomness for this value, so that in a network of systems
           with synced clocks timer events are distributed a
           bit. Here, we calculate a perturbation usec offset from the
           boot ID. */

        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        if (sd_id128_get_boot(&bootid) >= 0)
                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
}
1019
/* Lazily creates the timerfd for a clock and registers it with epoll.
 * No-op if the fd already exists. Returns 0 or a negative errno code. */
static int event_setup_timer_fd(
                sd_event *e,
                struct clock_data *d,
                clockid_t clock) {

        struct epoll_event ev;
        int r, fd;

        assert(e);
        assert(d);

        if (_likely_(d->fd >= 0))
                return 0;

        fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
        if (fd < 0)
                return -errno;

        /* Keep fds out of the stdio range */
        fd = fd_move_above_stdio(fd);

        ev = (struct epoll_event) {
                .events = EPOLLIN,
                .data.ptr = d,  /* dispatched via the WAKEUP_CLOCK_DATA tag */
        };

        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
        if (r < 0) {
                safe_close(fd);
                return -errno;
        }

        d->fd = fd;
        return 0;
}
1054
/* Default handler installed by sd_event_add_time() when the caller passes a
 * NULL callback: requests loop exit, with the exit code encoded in userdata. */
static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}
1060
/* Adds a timer event source firing at absolute time 'usec' on the given clock,
 * with the given accuracy window (0 selects DEFAULT_ACCURACY_USEC). A NULL
 * callback means "exit the loop when the timer fires", with the exit code taken
 * from userdata. Timer sources default to one-shot mode. Returns 0 on success,
 * a negative errno-style error otherwise. */
_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        sd_event_source *s;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(accuracy != (uint64_t) -1, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
                return -EOPNOTSUPP;

        type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
        if (type < 0)
                return -EOPNOTSUPP;

        if (!callback)
                callback = time_exit_callback;

        d = event_get_clock_data(e, type);
        assert(d);

        /* Each clock maintains two prioqs: one ordered by the earliest dispatch
         * time, one by the latest (used together for wakeup coalescing). */
        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        /* Create the clock's timerfd lazily, on first timer source. */
        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        d->needs_rearm = true;

        r = prioq_put(d->earliest, s, &s->time.earliest_index);
        if (r < 0)
                goto fail;

        r = prioq_put(d->latest, s, &s->time.latest_index);
        if (r < 0)
                goto fail;

        if (ret)
                *ret = s;

        return 0;

fail:
        source_free(s);
        return r;
}
1138
/* Default handler installed by sd_event_add_signal() when the caller passes a
 * NULL callback: requests loop exit, with the exit code encoded in userdata. */
static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}
1144
/* Adds an event source for UNIX signal 'sig'. The signal must already be
 * blocked in the calling thread (we refuse with -EBUSY otherwise), and only one
 * source per signal and event loop is allowed. A NULL callback means "exit the
 * loop when the signal arrives", exit code taken from userdata. */
_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        struct signal_data *d;
        sigset_t ss;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(SIGNAL_VALID(sig), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = signal_exit_callback;

        /* Query (don't modify) the current mask; the caller is responsible for
         * having blocked the signal, otherwise delivery would bypass us. */
        r = pthread_sigmask(SIG_SETMASK, NULL, &ss);
        if (r != 0)
                return -r;

        if (!sigismember(&ss, sig))
                return -EBUSY;

        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                /* NOTE(review): relies on source_free() also clearing
                 * e->signal_sources[sig] — confirm in source_disconnect(). */
                source_free(s);
                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;

        return 0;
}
1205
/* Adds an event source watching child process 'pid'. 'options' takes
 * waitid()-style flags (WEXITED|WSTOPPED|WCONTINUED) and must be non-zero.
 * Only one source per child is allowed; child sources default to one-shot. */
_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pid > 1, -EINVAL); /* refuse PID 1 and invalid PIDs */
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0) {
                source_free(s);
                return r;
        }

        e->n_enabled_child_sources++;

        /* Child state changes are reported via SIGCHLD, hence hook that signal
         * into the loop's signal machinery. */
        r = event_make_signal_data(e, SIGCHLD, NULL);
        if (r < 0) {
                /* Roll back the counter bump before freeing the source. */
                e->n_enabled_child_sources--;
                source_free(s);
                return r;
        }

        e->need_process_child = true;

        if (ret)
                *ret = s;

        return 0;
}
1265
1266 _public_ int sd_event_add_defer(
1267                 sd_event *e,
1268                 sd_event_source **ret,
1269                 sd_event_handler_t callback,
1270                 void *userdata) {
1271
1272         sd_event_source *s;
1273         int r;
1274
1275         assert_return(e, -EINVAL);
1276         assert_return(e = event_resolve(e), -ENOPKG);
1277         assert_return(callback, -EINVAL);
1278         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1279         assert_return(!event_pid_changed(e), -ECHILD);
1280
1281         s = source_new(e, !ret, SOURCE_DEFER);
1282         if (!s)
1283                 return -ENOMEM;
1284
1285         s->defer.callback = callback;
1286         s->userdata = userdata;
1287         s->enabled = SD_EVENT_ONESHOT;
1288
1289         r = source_set_pending(s, true);
1290         if (r < 0) {
1291                 source_free(s);
1292                 return r;
1293         }
1294
1295         if (ret)
1296                 *ret = s;
1297
1298         return 0;
1299 }
1300
1301 _public_ int sd_event_add_post(
1302                 sd_event *e,
1303                 sd_event_source **ret,
1304                 sd_event_handler_t callback,
1305                 void *userdata) {
1306
1307         sd_event_source *s;
1308         int r;
1309
1310         assert_return(e, -EINVAL);
1311         assert_return(e = event_resolve(e), -ENOPKG);
1312         assert_return(callback, -EINVAL);
1313         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1314         assert_return(!event_pid_changed(e), -ECHILD);
1315
1316         r = set_ensure_allocated(&e->post_sources, NULL);
1317         if (r < 0)
1318                 return r;
1319
1320         s = source_new(e, !ret, SOURCE_POST);
1321         if (!s)
1322                 return -ENOMEM;
1323
1324         s->post.callback = callback;
1325         s->userdata = userdata;
1326         s->enabled = SD_EVENT_ON;
1327
1328         r = set_put(e->post_sources, s);
1329         if (r < 0) {
1330                 source_free(s);
1331                 return r;
1332         }
1333
1334         if (ret)
1335                 *ret = s;
1336
1337         return 0;
1338 }
1339
1340 _public_ int sd_event_add_exit(
1341                 sd_event *e,
1342                 sd_event_source **ret,
1343                 sd_event_handler_t callback,
1344                 void *userdata) {
1345
1346         sd_event_source *s;
1347         int r;
1348
1349         assert_return(e, -EINVAL);
1350         assert_return(e = event_resolve(e), -ENOPKG);
1351         assert_return(callback, -EINVAL);
1352         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1353         assert_return(!event_pid_changed(e), -ECHILD);
1354
1355         r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1356         if (r < 0)
1357                 return r;
1358
1359         s = source_new(e, !ret, SOURCE_EXIT);
1360         if (!s)
1361                 return -ENOMEM;
1362
1363         s->exit.callback = callback;
1364         s->userdata = userdata;
1365         s->exit.prioq_index = PRIOQ_IDX_NULL;
1366         s->enabled = SD_EVENT_ONESHOT;
1367
1368         r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1369         if (r < 0) {
1370                 source_free(s);
1371                 return r;
1372         }
1373
1374         if (ret)
1375                 *ret = s;
1376
1377         return 0;
1378 }
1379
/* Takes a reference on the event source. NULL is accepted and returned as-is. */
_public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {

        if (!s)
                return NULL;

        assert(s->n_ref >= 1);
        s->n_ref++;

        return s;
}
1390
/* Drops a reference on the event source, freeing it when the count hits zero.
 * NULL is accepted as a no-op. Always returns NULL, so callers can write
 * `s = sd_event_source_unref(s);`. */
_public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {

        if (!s)
                return NULL;

        assert(s->n_ref >= 1);
        s->n_ref--;

        if (s->n_ref <= 0) {
                /* Here's a special hack: when we are called from a
                 * dispatch handler we won't free the event source
                 * immediately, but we will detach the fd from the
                 * epoll. This way it is safe for the caller to unref
                 * the event source and immediately close the fd, but
                 * we still retain a valid event source object after
                 * the callback. */

                if (s->dispatching) {
                        if (s->type == SOURCE_IO)
                                source_io_unregister(s);

                        source_disconnect(s);
                } else
                        source_free(s);
        }

        return NULL;
}
1419
/* Sets (or, with NULL, clears) the human-readable description of the source,
 * replacing any previous one. Returns 0 on success, -ENOMEM on OOM. */
_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
        assert_return(s, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return free_and_strdup(&s->description, description);
}
1426
/* Returns the source's description via *description; -ENXIO if none is set.
 * The returned string remains owned by the event source. */
_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
        assert_return(s, -EINVAL);
        assert_return(description, -EINVAL);
        assert_return(s->description, -ENXIO);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *description = s->description;
        return 0;
}
1436
/* Returns the event loop this source is attached to (borrowed reference),
 * or NULL if s is NULL. */
_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
        assert_return(s, NULL);

        return s->event;
}
1442
/* Returns whether the source is currently pending dispatch (>0 pending,
 * 0 not pending, negative on error). Not defined for exit sources. */
_public_ int sd_event_source_get_pending(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->pending;
}
1451
/* Returns the file descriptor an IO event source watches. */
_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->io.fd;
}
1459
1460 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
1461         int r;
1462
1463         assert_return(s, -EINVAL);
1464         assert_return(fd >= 0, -EBADF);
1465         assert_return(s->type == SOURCE_IO, -EDOM);
1466         assert_return(!event_pid_changed(s->event), -ECHILD);
1467
1468         if (s->io.fd == fd)
1469                 return 0;
1470
1471         if (s->enabled == SD_EVENT_OFF) {
1472                 s->io.fd = fd;
1473                 s->io.registered = false;
1474         } else {
1475                 int saved_fd;
1476
1477                 saved_fd = s->io.fd;
1478                 assert(s->io.registered);
1479
1480                 s->io.fd = fd;
1481                 s->io.registered = false;
1482
1483                 r = source_io_register(s, s->enabled, s->io.events);
1484                 if (r < 0) {
1485                         s->io.fd = saved_fd;
1486                         s->io.registered = true;
1487                         return r;
1488                 }
1489
1490                 epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
1491         }
1492
1493         return 0;
1494 }
1495
/* Returns whether the IO source claims ownership of its fd (presumably meaning
 * the fd is closed when the source goes away — confirm in source_free()). */
_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);

        return s->io.owned;
}
1502
/* Sets whether the IO source owns its fd (see sd_event_source_get_io_fd_own()). */
_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);

        s->io.owned = own;
        return 0;
}
1510
/* Returns the epoll event mask (EPOLLIN etc.) the IO source is configured for. */
_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
        assert_return(s, -EINVAL);
        assert_return(events, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *events = s->io.events;
        return 0;
}
1520
/* Changes the epoll event mask of an IO source, re-registering it with epoll
 * if it is currently enabled. Returns 0 on success, negative on error. */
_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* edge-triggered updates are never skipped, so we can reset edges */
        if (s->io.events == events && !(events & EPOLLET))
                return 0;

        if (s->enabled != SD_EVENT_OFF) {
                r = source_io_register(s, s->enabled, events);
                if (r < 0)
                        return r;
        }

        s->io.events = events;
        /* Any previously collected revents may no longer match; drop pending state. */
        source_set_pending(s, false);

        return 0;
}
1545
/* Returns the epoll events that actually triggered; only valid while the
 * source is pending (-ENODATA otherwise). */
_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
        assert_return(s, -EINVAL);
        assert_return(revents, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(s->pending, -ENODATA);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *revents = s->io.revents;
        return 0;
}
1556
/* Returns the signal number a signal event source watches. */
_public_ int sd_event_source_get_signal(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->signal.sig;
}
1564
1565 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
1566         assert_return(s, -EINVAL);
1567         assert_return(!event_pid_changed(s->event), -ECHILD);
1568
1569         *priority = s->priority;
1570         return 0;
1571 }
1572
/* Changes the source's dispatch priority and repositions it in every prioq
 * that is ordered by priority. Enabled signal sources additionally migrate to
 * the signalfd of the new priority. Returns 0 on success, negative on error. */
_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->priority == priority)
                return 0;

        if (s->type == SOURCE_SIGNAL && s->enabled != SD_EVENT_OFF) {
                struct signal_data *old, *d;

                /* Move us from the signalfd belonging to the old
                 * priority to the signalfd of the new priority */

                assert_se(old = hashmap_get(s->event->signal_data, &s->priority));

                s->priority = priority;

                r = event_make_signal_data(s->event, s->signal.sig, &d);
                if (r < 0) {
                        /* Roll back to the old priority on failure. */
                        s->priority = old->priority;
                        return r;
                }

                event_unmask_signal_data(s->event, old, s->signal.sig);
        } else
                s->priority = priority;

        /* Priority is an ordering key in these prioqs, hence reposition. */
        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);

        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        return 0;
}
1614
/* Returns the source's enablement state (SD_EVENT_OFF/ON/ONESHOT) via *m. */
_public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
        assert_return(s, -EINVAL);
        assert_return(m, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *m = s->enabled;
        return 0;
}
1623
/* Switches the source's enablement state between SD_EVENT_OFF, SD_EVENT_ON and
 * SD_EVENT_ONESHOT, performing the per-type (de)registration that goes with
 * the transition, and repositioning the source in priority queues whose order
 * depends on the state. Returns 0 on success, negative on error. */
_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* If we are dead anyway, we are fine with turning off
         * sources, but everything else needs to fail. */
        if (s->event->state == SD_EVENT_FINISHED)
                return m == SD_EVENT_OFF ? 0 : -ESTALE;

        if (s->enabled == m)
                return 0;

        if (m == SD_EVENT_OFF) {

                switch (s->type) {

                case SOURCE_IO:
                        /* Drop the fd from epoll; the source object stays around. */
                        source_io_unregister(s);
                        s->enabled = m;
                        break;

                case SOURCE_TIME_REALTIME:
                case SOURCE_TIME_BOOTTIME:
                case SOURCE_TIME_MONOTONIC:
                case SOURCE_TIME_REALTIME_ALARM:
                case SOURCE_TIME_BOOTTIME_ALARM: {
                        struct clock_data *d;

                        /* Reposition in the clock's prioqs and request a timerfd rearm. */
                        s->enabled = m;
                        d = event_get_clock_data(s->event, s->type);
                        assert(d);

                        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(d->latest, s, &s->time.latest_index);
                        d->needs_rearm = true;
                        break;
                }

                case SOURCE_SIGNAL:
                        s->enabled = m;

                        /* Possibly release the signalfd slot for this signal/priority. */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                        break;

                case SOURCE_CHILD:
                        s->enabled = m;

                        assert(s->event->n_enabled_child_sources > 0);
                        s->event->n_enabled_child_sources--;

                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                        break;

                case SOURCE_EXIT:
                        s->enabled = m;
                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                        break;

                case SOURCE_DEFER:
                case SOURCE_POST:
                        /* No external registration to undo for these. */
                        s->enabled = m;
                        break;

                default:
                        assert_not_reached("Wut? I shouldn't exist.");
                }

        } else {
                switch (s->type) {

                case SOURCE_IO:
                        /* Register first so a failure leaves the state untouched. */
                        r = source_io_register(s, m, s->io.events);
                        if (r < 0)
                                return r;

                        s->enabled = m;
                        break;

                case SOURCE_TIME_REALTIME:
                case SOURCE_TIME_BOOTTIME:
                case SOURCE_TIME_MONOTONIC:
                case SOURCE_TIME_REALTIME_ALARM:
                case SOURCE_TIME_BOOTTIME_ALARM: {
                        struct clock_data *d;

                        s->enabled = m;
                        d = event_get_clock_data(s->event, s->type);
                        assert(d);

                        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
                        prioq_reshuffle(d->latest, s, &s->time.latest_index);
                        d->needs_rearm = true;
                        break;
                }

                case SOURCE_SIGNAL:

                        s->enabled = m;

                        r = event_make_signal_data(s->event, s->signal.sig, NULL);
                        if (r < 0) {
                                /* Roll back and GC whatever was set up. */
                                s->enabled = SD_EVENT_OFF;
                                event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                                return r;
                        }

                        break;

                case SOURCE_CHILD:

                        /* Bump the counter only on an off->on transition. */
                        if (s->enabled == SD_EVENT_OFF)
                                s->event->n_enabled_child_sources++;

                        s->enabled = m;

                        r = event_make_signal_data(s->event, SIGCHLD, NULL);
                        if (r < 0) {
                                s->enabled = SD_EVENT_OFF;
                                s->event->n_enabled_child_sources--;
                                event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                                return r;
                        }

                        break;

                case SOURCE_EXIT:
                        s->enabled = m;
                        prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                        break;

                case SOURCE_DEFER:
                case SOURCE_POST:
                        s->enabled = m;
                        break;

                default:
                        assert_not_reached("Wut? I shouldn't exist.");
                }
        }

        /* Enablement influences the ordering of these prioqs, hence reposition. */
        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);

        return 0;
}
1775
/* Returns the absolute trigger time of a timer source via *usec. */
_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.next;
        return 0;
}
1785
/* Changes the absolute trigger time of a timer source and repositions it in
 * both of its clock's prioqs, requesting a timerfd rearm. */
_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
        struct clock_data *d;

        assert_return(s, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        s->time.next = usec;

        /* A previously collected trigger no longer applies to the new time. */
        source_set_pending(s, false);

        d = event_get_clock_data(s->event, s->type);
        assert(d);

        prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
        prioq_reshuffle(d->latest, s, &s->time.latest_index);
        d->needs_rearm = true;

        return 0;
}
1807
/* Returns the accuracy window of a timer source via *usec. */
_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.accuracy;
        return 0;
}
1817
/* Changes the accuracy window of a timer source (0 selects the default).
 * Only the "latest" prioq depends on accuracy, hence only it is reshuffled. */
_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
        struct clock_data *d;

        assert_return(s, -EINVAL);
        assert_return(usec != (uint64_t) -1, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (usec == 0)
                usec = DEFAULT_ACCURACY_USEC;

        s->time.accuracy = usec;

        source_set_pending(s, false);

        d = event_get_clock_data(s->event, s->type);
        assert(d);

        prioq_reshuffle(d->latest, s, &s->time.latest_index);
        d->needs_rearm = true;

        return 0;
}
1842
/* Returns the clockid the timer source runs on via *clock. */
_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
        assert_return(s, -EINVAL);
        assert_return(clock, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *clock = event_source_type_to_clock(s->type);
        return 0;
}
1852
/* Returns the PID a child event source watches via *pid. */
_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
        assert_return(s, -EINVAL);
        assert_return(pid, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *pid = s->child.pid;
        return 0;
}
1862
/* Installs (or, with NULL, removes) a prepare callback on the source, keeping
 * the loop's prepare prioq membership in sync. Not allowed on exit sources. */
_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->prepare == callback)
                return 0;

        /* Swapping one callback for another: prioq membership is unchanged. */
        if (callback && s->prepare) {
                s->prepare = callback;
                return 0;
        }

        r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
        if (r < 0)
                return r;

        s->prepare = callback;

        /* Setting a callback inserts into the prepare prioq, clearing removes. */
        if (callback) {
                r = prioq_put(s->event->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;
        } else
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        return 0;
}
1894
/* Returns the userdata pointer associated with the source (NULL if s is NULL). */
_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
        assert_return(s, NULL);

        return s->userdata;
}
1900
1901 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
1902         void *ret;
1903
1904         assert_return(s, NULL);
1905
1906         ret = s->userdata;
1907         s->userdata = userdata;
1908
1909         return ret;
1910 }
1911
1912 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
1913         usec_t c;
1914         assert(e);
1915         assert(a <= b);
1916
1917         if (a <= 0)
1918                 return 0;
1919         if (a >= USEC_INFINITY)
1920                 return USEC_INFINITY;
1921
1922         if (b <= a + 1)
1923                 return a;
1924
1925         initialize_perturb(e);
1926
1927         /*
1928           Find a good time to wake up again between times a and b. We
1929           have two goals here:
1930
1931           a) We want to wake up as seldom as possible, hence prefer
1932              later times over earlier times.
1933
1934           b) But if we have to wake up, then let's make sure to
1935              dispatch as much as possible on the entire system.
1936
1937           We implement this by waking up everywhere at the same time
1938           within any given minute if we can, synchronised via the
1939           perturbation value determined from the boot ID. If we can't,
1940           then we try to find the same spot in every 10s, then 1s and
1941           then 250ms step. Otherwise, we pick the last possible time
1942           to wake up.
1943         */
1944
1945         c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
1946         if (c >= b) {
1947                 if (_unlikely_(c < USEC_PER_MINUTE))
1948                         return b;
1949
1950                 c -= USEC_PER_MINUTE;
1951         }
1952
1953         if (c >= a)
1954                 return c;
1955
1956         c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
1957         if (c >= b) {
1958                 if (_unlikely_(c < USEC_PER_SEC*10))
1959                         return b;
1960
1961                 c -= USEC_PER_SEC*10;
1962         }
1963
1964         if (c >= a)
1965                 return c;
1966
1967         c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
1968         if (c >= b) {
1969                 if (_unlikely_(c < USEC_PER_SEC))
1970                         return b;
1971
1972                 c -= USEC_PER_SEC;
1973         }
1974
1975         if (c >= a)
1976                 return c;
1977
1978         c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
1979         if (c >= b) {
1980                 if (_unlikely_(c < USEC_PER_MSEC*250))
1981                         return b;
1982
1983                 c -= USEC_PER_MSEC*250;
1984         }
1985
1986         if (c >= a)
1987                 return c;
1988
1989         return b;
1990 }
1991
1992 static int event_arm_timer(
1993                 sd_event *e,
1994                 struct clock_data *d) {
1995
1996         struct itimerspec its = {};
1997         sd_event_source *a, *b;
1998         usec_t t;
1999         int r;
2000
2001         assert(e);
2002         assert(d);
2003
2004         if (!d->needs_rearm)
2005                 return 0;
2006         else
2007                 d->needs_rearm = false;
2008
2009         a = prioq_peek(d->earliest);
2010         if (!a || a->enabled == SD_EVENT_OFF || a->time.next == USEC_INFINITY) {
2011
2012                 if (d->fd < 0)
2013                         return 0;
2014
2015                 if (d->next == USEC_INFINITY)
2016                         return 0;
2017
2018                 /* disarm */
2019                 r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2020                 if (r < 0)
2021                         return r;
2022
2023                 d->next = USEC_INFINITY;
2024                 return 0;
2025         }
2026
2027         b = prioq_peek(d->latest);
2028         assert_se(b && b->enabled != SD_EVENT_OFF);
2029
2030         t = sleep_between(e, a->time.next, time_event_source_latest(b));
2031         if (d->next == t)
2032                 return 0;
2033
2034         assert_se(d->fd >= 0);
2035
2036         if (t == 0) {
2037                 /* We don' want to disarm here, just mean some time looooong ago. */
2038                 its.it_value.tv_sec = 0;
2039                 its.it_value.tv_nsec = 1;
2040         } else
2041                 timespec_store(&its.it_value, t);
2042
2043         r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2044         if (r < 0)
2045                 return -errno;
2046
2047         d->next = t;
2048         return 0;
2049 }
2050
2051 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
2052         assert(e);
2053         assert(s);
2054         assert(s->type == SOURCE_IO);
2055
2056         /* If the event source was already pending, we just OR in the
2057          * new revents, otherwise we reset the value. The ORing is
2058          * necessary to handle EPOLLONESHOT events properly where
2059          * readability might happen independently of writability, and
2060          * we need to keep track of both */
2061
2062         if (s->pending)
2063                 s->io.revents |= revents;
2064         else
2065                 s->io.revents = revents;
2066
2067         return source_set_pending(s, true);
2068 }
2069
2070 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
2071         uint64_t x;
2072         ssize_t ss;
2073
2074         assert(e);
2075         assert(fd >= 0);
2076
2077         assert_return(events == EPOLLIN, -EIO);
2078
2079         ss = read(fd, &x, sizeof(x));
2080         if (ss < 0) {
2081                 if (IN_SET(errno, EAGAIN, EINTR))
2082                         return 0;
2083
2084                 return -errno;
2085         }
2086
2087         if (_unlikely_(ss != sizeof(x)))
2088                 return -EIO;
2089
2090         if (next)
2091                 *next = USEC_INFINITY;
2092
2093         return 0;
2094 }
2095
2096 static int process_timer(
2097                 sd_event *e,
2098                 usec_t n,
2099                 struct clock_data *d) {
2100
2101         sd_event_source *s;
2102         int r;
2103
2104         assert(e);
2105         assert(d);
2106
2107         for (;;) {
2108                 s = prioq_peek(d->earliest);
2109                 if (!s ||
2110                     s->time.next > n ||
2111                     s->enabled == SD_EVENT_OFF ||
2112                     s->pending)
2113                         break;
2114
2115                 r = source_set_pending(s, true);
2116                 if (r < 0)
2117                         return r;
2118
2119                 prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
2120                 prioq_reshuffle(d->latest, s, &s->time.latest_index);
2121                 d->needs_rearm = true;
2122         }
2123
2124         return 0;
2125 }
2126
/* Polls every watched child process, via waitid(P_PID, ..., WNOHANG), for a
 * state change and marks the corresponding child event sources pending.
 * Invoked from sd_event_wait() whenever need_process_child was set (see
 * process_signal()). Returns 0 on success, negative errno on failure. */
static int process_child(sd_event *e) {
        sd_event_source *s;
        Iterator i;
        int r;

        assert(e);

        e->need_process_child = false;

        /*
           So, this is ugly. We iteratively invoke waitid() with P_PID
           + WNOHANG for each PID we wait for, instead of using
           P_ALL. This is because we only want to get child
           information of very specific child processes, and not all
           of them. We might not have processed the SIGCHLD event of a
           previous invocation and we don't want to maintain an
           unbounded *per-child* event queue, hence we really don't
           want anything flushed out of the kernel's queue that we
           don't care about. Since this is O(n) this means that if you
           have a lot of processes you probably want to handle SIGCHLD
           yourself.

           We do not reap the children here (by using WNOWAIT), this
           is only done after the event source is dispatched so that
           the callback still sees the process as a zombie.
        */

        HASHMAP_FOREACH(s, e->child_sources, i) {
                assert(s->type == SOURCE_CHILD);

                /* Already pending or disabled: nothing to poll */
                if (s->pending)
                        continue;

                if (s->enabled == SD_EVENT_OFF)
                        continue;

                zero(s->child.siginfo);
                /* WNOWAIT only when the caller asked for WEXITED, so that the
                 * zombie survives until source_dispatch() reaps it */
                r = waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
                if (r < 0)
                        return -errno;

                /* si_pid stays 0 when no state change was queued */
                if (s->child.siginfo.si_pid != 0) {
                        bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's
                                 * immediately remove the state change
                                 * from the queue, since there's no
                                 * benefit in leaving it queued */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
                        if (r < 0)
                                return r;
                }
        }

        return 0;
}
2190
/* Dequeues at most one signal from the signalfd of the given priority bucket
 * and marks the matching signal event source pending. While that source is
 * still undispatched (d->current set) no further signal is read for this
 * priority. Returns > 0 if a signal was consumed, 0 if the queue was empty,
 * negative errno on failure. */
static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
        bool read_one = false;
        int r;

        assert(e);
        assert_return(events == EPOLLIN, -EIO);

        /* If there's a signal queued on this priority and SIGCHLD is
           on this priority too, then make sure to recheck the
           children we watch. This is because we only ever dequeue
           the first signal per priority, and if we dequeue one, and
           SIGCHLD might be enqueued later we wouldn't know, but we
           might have higher priority children we care about hence we
           need to check that explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

        /* If there's already an event source pending for this
         * priority we don't read another */
        if (d->current)
                return 0;

        for (;;) {
                struct signalfd_siginfo si;
                ssize_t n;
                sd_event_source *s = NULL;

                n = read(d->fd, &si, sizeof(si));
                if (n < 0) {
                        /* Queue drained: report whether we dequeued anything at all */
                        if (IN_SET(errno, EAGAIN, EINTR))
                                return read_one;

                        return -errno;
                }

                if (_unlikely_(n != sizeof(si)))
                        return -EIO;

                assert(SIGNAL_VALID(si.ssi_signo));

                read_one = true;

                /* No source watches this signal, or it's already pending:
                 * drop the datum and read the next one */
                if (e->signal_sources)
                        s = e->signal_sources[si.ssi_signo];
                if (!s)
                        continue;
                if (s->pending)
                        continue;

                s->signal.siginfo = si;
                d->current = s;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                return 1;
        }
}
2251
/* Invokes the user callback of a pending event source, with all the
 * bookkeeping around it: clearing the pending flag, marking post sources
 * pending, turning off SD_EVENT_ONESHOT sources, reaping dead children, and
 * disabling (or freeing) the source if the callback failed (or dropped the
 * last reference). Returns 1 on success, negative errno on failure. */
static int source_dispatch(sd_event_source *s) {
        EventSourceType saved_type;
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        /* Save the event source type, here, so that we still know it after the event callback which might invalidate
         * the event. */
        saved_type = s->type;

        /* Defer and exit sources stay pending while enabled; all other types
         * are un-pended before their callback runs */
        if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        if (s->type != SOURCE_POST) {
                sd_event_source *z;
                Iterator i;

                /* If we execute a non-post source, let's mark all
                 * post sources as pending */

                SET_FOREACH(z, s->event->post_sources, i) {
                        if (z->enabled == SD_EVENT_OFF)
                                continue;

                        r = source_set_pending(z, true);
                        if (r < 0)
                                return r;
                }
        }

        /* One-shot sources are disabled before the callback runs, so the
         * callback may re-enable them if it wants another invocation */
        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        s->dispatching = true;

        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                /* Check before the callback: it may have reaped the child itself */
                zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie)
                        waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_POST:
                r = s->post.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        s->dispatching = false;

        if (r < 0)
                log_debug_errno(r, "Event source %s (type %s) returned error, disabling: %m",
                                strna(s->description), event_source_type_to_string(saved_type));

        /* The callback may have dropped the last reference, in which case the
         * source was kept alive only for the duration of this dispatch */
        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                sd_event_source_set_enabled(s, SD_EVENT_OFF);

        return 1;
}
2357
/* Runs the prepare callbacks of all event sources that registered one, in
 * priority order, at most once per loop iteration (tracked via
 * prepare_iteration). Called from sd_event_prepare() before the clocks are
 * rearmed. Returns 0 on success, negative errno on failure. */
static int event_prepare(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
                        break;

                /* Record the iteration and reshuffle so that the peek above
                 * eventually sees only already-prepared sources and stops */
                s->prepare_iteration = e->iteration;
                r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;

                assert(s->prepare);

                s->dispatching = true;
                r = s->prepare(s, s->userdata);
                s->dispatching = false;

                if (r < 0)
                        log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, disabling: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

                /* The callback may have dropped the last reference */
                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        sd_event_source_set_enabled(s, SD_EVENT_OFF);
        }

        return 0;
}
2393
2394 static int dispatch_exit(sd_event *e) {
2395         sd_event_source *p;
2396         _cleanup_(sd_event_unrefp) sd_event *ref = NULL;
2397         int r;
2398
2399         assert(e);
2400
2401         p = prioq_peek(e->exit);
2402         if (!p || p->enabled == SD_EVENT_OFF) {
2403                 e->state = SD_EVENT_FINISHED;
2404                 return 0;
2405         }
2406
2407         ref = sd_event_ref(e);
2408         e->iteration++;
2409         e->state = SD_EVENT_EXITING;
2410         r = source_dispatch(p);
2411         e->state = SD_EVENT_INITIAL;
2412         return r;
2413 }
2414
2415 static sd_event_source* event_next_pending(sd_event *e) {
2416         sd_event_source *p;
2417
2418         assert(e);
2419
2420         p = prioq_peek(e->pending);
2421         if (!p)
2422                 return NULL;
2423
2424         if (p->enabled == SD_EVENT_OFF)
2425                 return NULL;
2426
2427         return p;
2428 }
2429
2430 static int arm_watchdog(sd_event *e) {
2431         struct itimerspec its = {};
2432         usec_t t;
2433         int r;
2434
2435         assert(e);
2436         assert(e->watchdog_fd >= 0);
2437
2438         t = sleep_between(e,
2439                           e->watchdog_last + (e->watchdog_period / 2),
2440                           e->watchdog_last + (e->watchdog_period * 3 / 4));
2441
2442         timespec_store(&its.it_value, t);
2443
2444         /* Make sure we never set the watchdog to 0, which tells the
2445          * kernel to disable it. */
2446         if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
2447                 its.it_value.tv_nsec = 1;
2448
2449         r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
2450         if (r < 0)
2451                 return -errno;
2452
2453         return 0;
2454 }
2455
2456 static int process_watchdog(sd_event *e) {
2457         assert(e);
2458
2459         if (!e->watchdog)
2460                 return 0;
2461
2462         /* Don't notify watchdog too often */
2463         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
2464                 return 0;
2465
2466         sd_notify(false, "WATCHDOG=1");
2467         e->watchdog_last = e->timestamp.monotonic;
2468
2469         return arm_watchdog(e);
2470 }
2471
2472 _public_ int sd_event_prepare(sd_event *e) {
2473         int r;
2474
2475         assert_return(e, -EINVAL);
2476         assert_return(e = event_resolve(e), -ENOPKG);
2477         assert_return(!event_pid_changed(e), -ECHILD);
2478         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2479         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2480
2481         if (e->exit_requested)
2482                 goto pending;
2483
2484         e->iteration++;
2485
2486         e->state = SD_EVENT_PREPARING;
2487         r = event_prepare(e);
2488         e->state = SD_EVENT_INITIAL;
2489         if (r < 0)
2490                 return r;
2491
2492         r = event_arm_timer(e, &e->realtime);
2493         if (r < 0)
2494                 return r;
2495
2496         r = event_arm_timer(e, &e->boottime);
2497         if (r < 0)
2498                 return r;
2499
2500         r = event_arm_timer(e, &e->monotonic);
2501         if (r < 0)
2502                 return r;
2503
2504         r = event_arm_timer(e, &e->realtime_alarm);
2505         if (r < 0)
2506                 return r;
2507
2508         r = event_arm_timer(e, &e->boottime_alarm);
2509         if (r < 0)
2510                 return r;
2511
2512         if (event_next_pending(e) || e->need_process_child)
2513                 goto pending;
2514
2515         e->state = SD_EVENT_ARMED;
2516
2517         return 0;
2518
2519 pending:
2520         e->state = SD_EVENT_ARMED;
2521         r = sd_event_wait(e, 0);
2522         if (r == 0)
2523                 e->state = SD_EVENT_ARMED;
2524
2525         return r;
2526 }
2527
/* Second step of a loop iteration: waits (up to timeout µs, forever if
 * (uint64_t) -1) on the epoll fd, then turns timer, signal, child and I/O
 * wakeups into pending event sources. Must be called in SD_EVENT_ARMED
 * state. Returns 1 (state SD_EVENT_PENDING) if something is ready for
 * dispatching, 0 (state back to SD_EVENT_INITIAL) otherwise. */
_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
        struct epoll_event *ev_queue;
        unsigned ev_queue_max;
        int r, m, i;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        /* One epoll slot per source, and at least one */
        ev_queue_max = MAX(e->n_sources, 1u);
        ev_queue = newa(struct epoll_event, ev_queue_max);

        /* Round the µs timeout up to whole ms, epoll's granularity */
        m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max,
                       timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
        if (m < 0) {
                if (errno == EINTR) {
                        e->state = SD_EVENT_PENDING;
                        return 1;
                }

                r = -errno;
                goto finish;
        }

        /* Cache the wakeup time; process_timer() and sd_event_now() use it */
        triple_timestamp_get(&e->timestamp);

        for (i = 0; i < m; i++) {

                if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
                else {
                        /* All other registered data.ptr values point at an
                         * object whose first field is a WakeupType tag */
                        WakeupType *t = ev_queue[i].data.ptr;

                        switch (*t) {

                        case WAKEUP_EVENT_SOURCE:
                                r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = ev_queue[i].data.ptr;
                                r = flush_timer(e, d->fd, ev_queue[i].events, &d->next);
                                break;
                        }

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events);
                                break;

                        default:
                                assert_not_reached("Invalid wake-up pointer");
                        }
                }
                if (r < 0)
                        goto finish;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        /* The two alarm clocks are compared against CLOCK_REALTIME resp.
         * CLOCK_BOOTTIME, same as their non-alarm counterparts */
        r = process_timer(e, e->timestamp.realtime, &e->realtime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
        if (r < 0)
                goto finish;

        if (e->need_process_child) {
                r = process_child(e);
                if (r < 0)
                        goto finish;
        }

        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;

                return 1;
        }

        r = 0;

finish:
        e->state = SD_EVENT_INITIAL;

        return r;
}
2635
2636 _public_ int sd_event_dispatch(sd_event *e) {
2637         sd_event_source *p;
2638         int r;
2639
2640         assert_return(e, -EINVAL);
2641         assert_return(e = event_resolve(e), -ENOPKG);
2642         assert_return(!event_pid_changed(e), -ECHILD);
2643         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2644         assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
2645
2646         if (e->exit_requested)
2647                 return dispatch_exit(e);
2648
2649         p = event_next_pending(e);
2650         if (p) {
2651                 _cleanup_(sd_event_unrefp) sd_event *ref = NULL;
2652
2653                 ref = sd_event_ref(e);
2654                 e->state = SD_EVENT_RUNNING;
2655                 r = source_dispatch(p);
2656                 e->state = SD_EVENT_INITIAL;
2657                 return r;
2658         }
2659
2660         e->state = SD_EVENT_INITIAL;
2661
2662         return 1;
2663 }
2664
2665 static void event_log_delays(sd_event *e) {
2666         char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1];
2667         unsigned i;
2668         int o;
2669
2670         for (i = o = 0; i < ELEMENTSOF(e->delays); i++) {
2671                 o += snprintf(&b[o], sizeof(b) - o, "%u ", e->delays[i]);
2672                 e->delays[i] = 0;
2673         }
2674         log_debug("Event loop iterations: %.*s", o, b);
2675 }
2676
2677 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
2678         int r;
2679
2680         assert_return(e, -EINVAL);
2681         assert_return(e = event_resolve(e), -ENOPKG);
2682         assert_return(!event_pid_changed(e), -ECHILD);
2683         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2684         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2685
2686         if (e->profile_delays && e->last_run) {
2687                 usec_t this_run;
2688                 unsigned l;
2689
2690                 this_run = now(CLOCK_MONOTONIC);
2691
2692                 l = u64log2(this_run - e->last_run);
2693                 assert(l < sizeof(e->delays));
2694                 e->delays[l]++;
2695
2696                 if (this_run - e->last_log >= 5*USEC_PER_SEC) {
2697                         event_log_delays(e);
2698                         e->last_log = this_run;
2699                 }
2700         }
2701
2702         r = sd_event_prepare(e);
2703         if (r == 0)
2704                 /* There was nothing? Then wait... */
2705                 r = sd_event_wait(e, timeout);
2706
2707         if (e->profile_delays)
2708                 e->last_run = now(CLOCK_MONOTONIC);
2709
2710         if (r > 0) {
2711                 /* There's something now, then let's dispatch it */
2712                 r = sd_event_dispatch(e);
2713                 if (r < 0)
2714                         return r;
2715
2716                 return 1;
2717         }
2718
2719         return r;
2720 }
2721
2722 _public_ int sd_event_loop(sd_event *e) {
2723         _cleanup_(sd_event_unrefp) sd_event *ref = NULL;
2724         int r;
2725
2726         assert_return(e, -EINVAL);
2727         assert_return(e = event_resolve(e), -ENOPKG);
2728         assert_return(!event_pid_changed(e), -ECHILD);
2729         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
2730
2731         ref = sd_event_ref(e);
2732
2733         while (e->state != SD_EVENT_FINISHED) {
2734                 r = sd_event_run(e, (uint64_t) -1);
2735                 if (r < 0)
2736                         return r;
2737         }
2738
2739         return e->exit_code;
2740 }
2741
/* Returns the epoll fd the event loop multiplexes on. */
_public_ int sd_event_get_fd(sd_event *e) {

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->epoll_fd;
}
2750
/* Returns the current state machine state of the event loop
 * (SD_EVENT_INITIAL, _ARMED, _PENDING, _RUNNING, _EXITING, _FINISHED, ...). */
_public_ int sd_event_get_state(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->state;
}
2758
2759 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
2760         assert_return(e, -EINVAL);
2761         assert_return(e = event_resolve(e), -ENOPKG);
2762         assert_return(code, -EINVAL);
2763         assert_return(!event_pid_changed(e), -ECHILD);
2764
2765         if (!e->exit_requested)
2766                 return -ENODATA;
2767
2768         *code = e->exit_code;
2769         return 0;
2770 }
2771
2772 _public_ int sd_event_exit(sd_event *e, int code) {
2773         assert_return(e, -EINVAL);
2774         assert_return(e = event_resolve(e), -ENOPKG);
2775         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2776         assert_return(!event_pid_changed(e), -ECHILD);
2777
2778         e->exit_requested = true;
2779         e->exit_code = code;
2780
2781         return 0;
2782 }
2783
2784 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
2785         assert_return(e, -EINVAL);
2786         assert_return(e = event_resolve(e), -ENOPKG);
2787         assert_return(usec, -EINVAL);
2788         assert_return(!event_pid_changed(e), -ECHILD);
2789
2790         if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
2791                 return -EOPNOTSUPP;
2792
2793         /* Generate a clean error in case CLOCK_BOOTTIME is not available. Note that don't use clock_supported() here,
2794          * for a reason: there are systems where CLOCK_BOOTTIME is supported, but CLOCK_BOOTTIME_ALARM is not, but for
2795          * the purpose of getting the time this doesn't matter. */
2796         if (IN_SET(clock, CLOCK_BOOTTIME, CLOCK_BOOTTIME_ALARM) && !clock_boottime_supported())
2797                 return -EOPNOTSUPP;
2798
2799         if (!triple_timestamp_is_set(&e->timestamp)) {
2800                 /* Implicitly fall back to now() if we never ran
2801                  * before and thus have no cached time. */
2802                 *usec = now(clock);
2803                 return 1;
2804         }
2805
2806         *usec = triple_timestamp_by_clock(&e->timestamp, clock);
2807         return 0;
2808 }
2809
/* Returns the default event loop in *ret, creating and registering it on
 * first use. With ret == NULL merely reports (0/1) whether a default loop
 * already exists. Returns 1 if a new loop was created, 0 if an existing one
 * was referenced. (default_event is declared elsewhere — presumably
 * thread-local, matching the tid pinning below; confirm at its
 * declaration.) */
_public_ int sd_event_default(sd_event **ret) {
        sd_event *e = NULL;
        int r;

        if (!ret)
                return !!default_event;

        if (default_event) {
                *ret = sd_event_ref(default_event);
                return 0;
        }

        r = sd_event_new(&e);
        if (r < 0)
                return r;

        /* Remember where we registered ourselves, so destruction can
         * unregister, and pin the loop to the creating thread */
        e->default_event_ptr = &default_event;
        e->tid = gettid();
        default_event = e;

        *ret = e;
        return 1;
}
2833
2834 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
2835         assert_return(e, -EINVAL);
2836         assert_return(e = event_resolve(e), -ENOPKG);
2837         assert_return(tid, -EINVAL);
2838         assert_return(!event_pid_changed(e), -ECHILD);
2839
2840         if (e->tid != 0) {
2841                 *tid = e->tid;
2842                 return 0;
2843         }
2844
2845         return -ENXIO;
2846 }
2847
/* Enables (b != 0) or disables watchdog support: when enabled and
 * sd_watchdog_enabled() reports a period, the event loop periodically pings
 * the service manager with WATCHDOG=1 via a timerfd registered on the epoll
 * instance. Returns the new watchdog state, or a negative error. */
_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Already in the requested state? */
        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                struct epoll_event ev;

                /* r <= 0: watchdog not requested by the manager (or error);
                 * propagate as-is without enabling anything */
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                /* Unlike all other wakeups, the watchdog is identified in the
                 * epoll data by a plain tagged integer, not a WakeupType
                 * object (see sd_event_wait()) */
                ev = (struct epoll_event) {
                        .events = EPOLLIN,
                        .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
                };

                r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
                if (r < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                if (e->watchdog_fd >= 0) {
                        epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}
2902
/* Returns whether watchdog support was enabled via sd_event_set_watchdog(). */
_public_ int sd_event_get_watchdog(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->watchdog;
}
2910
2911 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
2912         assert_return(e, -EINVAL);
2913         assert_return(e = event_resolve(e), -ENOPKG);
2914         assert_return(!event_pid_changed(e), -ECHILD);
2915
2916         *ret = e->iteration;
2917         return 0;
2918 }