chiark / gitweb /
TODO: update
[elogind.git] / src / libsystemd / sd-event.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2013 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/epoll.h>
23 #include <sys/timerfd.h>
24 #include <sys/wait.h>
25 #include <pthread.h>
26
27 #include "sd-id128.h"
28 #include "sd-daemon.h"
29 #include "macro.h"
30 #include "prioq.h"
31 #include "hashmap.h"
32 #include "util.h"
33 #include "time-util.h"
34 #include "missing.h"
35
36 #include "sd-event.h"
37
38 #define EPOLL_QUEUE_MAX 512U
39 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
40
/* Discriminates which member of the per-type union in sd_event_source
 * is in use. Some values double as epoll data tags for the loop's own
 * fds (see the INT_TO_PTR() users below). */
typedef enum EventSourceType {
        SOURCE_IO,              /* file descriptor readiness events */
        SOURCE_MONOTONIC,       /* CLOCK_MONOTONIC timer events */
        SOURCE_REALTIME,        /* CLOCK_REALTIME timer events */
        SOURCE_SIGNAL,          /* signalfd-based signal events */
        SOURCE_CHILD,           /* SIGCHLD-driven child process state changes */
        SOURCE_DEFER,           /* deferred callback, marked pending immediately on creation */
        SOURCE_EXIT,            /* callbacks dispatched when the loop is asked to exit */
        SOURCE_WATCHDOG         /* tag for the internal watchdog fd; never a real source
                                 * (see the assert_not_reached() in source_free()) */
} EventSourceType;
51
/* One event source registered with an sd_event loop. The anonymous
 * union at the end carries per-type state; which member is valid is
 * selected by 'type'. */
struct sd_event_source {
        unsigned n_ref;                 /* reference counter; object freed when it drops to 0 */

        sd_event *event;                /* loop this source is attached to (we hold a ref on it) */
        void *userdata;                 /* opaque pointer handed back to the callback */
        sd_event_handler_t prepare;     /* optional callback, tracked via the prepare prioq; may be NULL */

        EventSourceType type:4;         /* selects the union member below */
        int enabled:3;                  /* SD_EVENT_OFF/ON/ONESHOT */
        bool pending:1;                 /* queued in event->pending, waiting to be dispatched */
        bool dispatching:1;             /* currently inside its callback (see sd_event_source_unref()) */

        int priority;                   /* lower values dispatch first */
        unsigned pending_index;         /* index in event->pending prioq, PRIOQ_IDX_NULL when absent */
        unsigned prepare_index;         /* index in event->prepare prioq, PRIOQ_IDX_NULL when absent */
        unsigned pending_iteration;     /* loop iteration in which the source became pending */
        unsigned prepare_iteration;     /* loop iteration in which prepare last ran */

        union {
                struct {
                        sd_event_io_handler_t callback;
                        int fd;
                        uint32_t events;        /* EPOLL* mask we are interested in */
                        uint32_t revents;       /* EPOLL* mask actually received */
                        bool registered:1;      /* fd currently added to the epoll instance */
                } io;
                struct {
                        sd_event_time_handler_t callback;
                        usec_t next, accuracy;  /* earliest trigger time, and slack past it */
                        unsigned earliest_index;        /* index in the per-clock "earliest" prioq */
                        unsigned latest_index;          /* index in the per-clock "latest" prioq */
                } time;
                struct {
                        sd_event_signal_handler_t callback;
                        struct signalfd_siginfo siginfo;        /* filled in when the signal arrives */
                        int sig;                /* signal number, 1.._NSIG-1 */
                } signal;
                struct {
                        sd_event_child_handler_t callback;
                        siginfo_t siginfo;      /* child state info; filled in outside this chunk */
                        pid_t pid;
                        int options;            /* WEXITED/WSTOPPED/WCONTINUED */
                } child;
                struct {
                        sd_event_handler_t callback;
                } defer;
                struct {
                        sd_event_handler_t callback;
                        unsigned prioq_index;   /* index in event->exit prioq */
                } exit;
        };
};
104
/* The event loop object proper. */
struct sd_event {
        unsigned n_ref;         /* reference counter */

        int epoll_fd;           /* epoll instance everything is registered with */
        int signal_fd;          /* signalfd covering 'sigset'; -1 until first signal/child source */
        int realtime_fd;        /* timerfd for CLOCK_REALTIME sources; -1 until needed */
        int monotonic_fd;       /* timerfd for CLOCK_MONOTONIC sources; -1 until needed */
        int watchdog_fd;        /* fd for watchdog wakeups; -1 until needed (set up outside this chunk) */

        Prioq *pending;         /* pending sources, ordered by pending_prioq_compare() */
        Prioq *prepare;         /* sources with a prepare callback, ordered by prepare_prioq_compare() */

        /* For both clocks we maintain two priority queues each, one
         * ordered for the earliest times the events may be
         * dispatched, and one ordered by the latest times they must
         * have been dispatched. The range between the top entries in
         * the two prioqs is the time window we can freely schedule
         * wakeups in */
        Prioq *monotonic_earliest;
        Prioq *monotonic_latest;
        Prioq *realtime_earliest;
        Prioq *realtime_latest;

        usec_t realtime_next, monotonic_next;   /* (usec_t) -1 when unset */
        usec_t perturb;         /* boot-id-derived offset to de-synchronize wakeups across machines */

        sigset_t sigset;        /* signals currently routed through the signalfd */
        sd_event_source **signal_sources;       /* _NSIG-sized array indexed by signal number; may be NULL */

        Hashmap *child_sources;                 /* pid -> SOURCE_CHILD source */
        unsigned n_enabled_child_sources;       /* child sources not currently SD_EVENT_OFF */

        Prioq *exit;            /* SOURCE_EXIT sources, ordered by exit_prioq_compare() */

        pid_t original_pid;     /* pid at creation time; used to detect use across fork() */

        unsigned iteration;     /* counts loop iterations */
        dual_timestamp timestamp;       /* NOTE(review): presumably when the current events were collected — set outside this chunk */
        int state;              /* SD_EVENT_* loop state */

        bool exit_requested:1;
        bool need_process_child:1;      /* SIGCHLD-related work outstanding */
        bool watchdog:1;

        int exit_code;          /* code to return once the loop finishes */

        pid_t tid;              /* owning thread when used as a default loop */
        sd_event **default_event_ptr;   /* per-thread default-loop variable; cleared in event_free() */

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;     /* number of attached sources; must be 0 at free time */
};
158
159 static int pending_prioq_compare(const void *a, const void *b) {
160         const sd_event_source *x = a, *y = b;
161
162         assert(x->pending);
163         assert(y->pending);
164
165         /* Enabled ones first */
166         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
167                 return -1;
168         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
169                 return 1;
170
171         /* Lower priority values first */
172         if (x->priority < y->priority)
173                 return -1;
174         if (x->priority > y->priority)
175                 return 1;
176
177         /* Older entries first */
178         if (x->pending_iteration < y->pending_iteration)
179                 return -1;
180         if (x->pending_iteration > y->pending_iteration)
181                 return 1;
182
183         /* Stability for the rest */
184         if (x < y)
185                 return -1;
186         if (x > y)
187                 return 1;
188
189         return 0;
190 }
191
192 static int prepare_prioq_compare(const void *a, const void *b) {
193         const sd_event_source *x = a, *y = b;
194
195         assert(x->prepare);
196         assert(y->prepare);
197
198         /* Move most recently prepared ones last, so that we can stop
199          * preparing as soon as we hit one that has already been
200          * prepared in the current iteration */
201         if (x->prepare_iteration < y->prepare_iteration)
202                 return -1;
203         if (x->prepare_iteration > y->prepare_iteration)
204                 return 1;
205
206         /* Enabled ones first */
207         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
208                 return -1;
209         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
210                 return 1;
211
212         /* Lower priority values first */
213         if (x->priority < y->priority)
214                 return -1;
215         if (x->priority > y->priority)
216                 return 1;
217
218         /* Stability for the rest */
219         if (x < y)
220                 return -1;
221         if (x > y)
222                 return 1;
223
224         return 0;
225 }
226
227 static int earliest_time_prioq_compare(const void *a, const void *b) {
228         const sd_event_source *x = a, *y = b;
229
230         assert(x->type == SOURCE_MONOTONIC || x->type == SOURCE_REALTIME);
231         assert(y->type == SOURCE_MONOTONIC || y->type == SOURCE_REALTIME);
232
233         /* Enabled ones first */
234         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
235                 return -1;
236         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
237                 return 1;
238
239         /* Move the pending ones to the end */
240         if (!x->pending && y->pending)
241                 return -1;
242         if (x->pending && !y->pending)
243                 return 1;
244
245         /* Order by time */
246         if (x->time.next < y->time.next)
247                 return -1;
248         if (x->time.next > y->time.next)
249                 return 1;
250
251         /* Stability for the rest */
252         if (x < y)
253                 return -1;
254         if (x > y)
255                 return 1;
256
257         return 0;
258 }
259
260 static int latest_time_prioq_compare(const void *a, const void *b) {
261         const sd_event_source *x = a, *y = b;
262
263         assert((x->type == SOURCE_MONOTONIC && y->type == SOURCE_MONOTONIC) ||
264                (x->type == SOURCE_REALTIME && y->type == SOURCE_REALTIME));
265
266         /* Enabled ones first */
267         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
268                 return -1;
269         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
270                 return 1;
271
272         /* Move the pending ones to the end */
273         if (!x->pending && y->pending)
274                 return -1;
275         if (x->pending && !y->pending)
276                 return 1;
277
278         /* Order by time */
279         if (x->time.next + x->time.accuracy < y->time.next + y->time.accuracy)
280                 return -1;
281         if (x->time.next + x->time.accuracy > y->time.next + y->time.accuracy)
282                 return 1;
283
284         /* Stability for the rest */
285         if (x < y)
286                 return -1;
287         if (x > y)
288                 return 1;
289
290         return 0;
291 }
292
293 static int exit_prioq_compare(const void *a, const void *b) {
294         const sd_event_source *x = a, *y = b;
295
296         assert(x->type == SOURCE_EXIT);
297         assert(y->type == SOURCE_EXIT);
298
299         /* Enabled ones first */
300         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
301                 return -1;
302         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
303                 return 1;
304
305         /* Lower priority values first */
306         if (x->priority < y->priority)
307                 return -1;
308         if (x->priority > y->priority)
309                 return 1;
310
311         /* Stability for the rest */
312         if (x < y)
313                 return -1;
314         if (x > y)
315                 return 1;
316
317         return 0;
318 }
319
/* Releases everything the loop owns. Invoked when the last reference is
 * dropped; every source must already have been freed (n_sources == 0). */
static void event_free(sd_event *e) {
        assert(e);
        assert(e->n_sources == 0);

        /* If installed as a default loop, clear the per-thread pointer
         * so it cannot dangle. */
        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        /* Close all fds the loop may have opened; each is -1 when it
         * was never set up. */
        if (e->epoll_fd >= 0)
                close_nointr_nofail(e->epoll_fd);

        if (e->signal_fd >= 0)
                close_nointr_nofail(e->signal_fd);

        if (e->realtime_fd >= 0)
                close_nointr_nofail(e->realtime_fd);

        if (e->monotonic_fd >= 0)
                close_nointr_nofail(e->monotonic_fd);

        if (e->watchdog_fd >= 0)
                close_nointr_nofail(e->watchdog_fd);

        /* prioq_free()/hashmap_free() free only the containers; the
         * sources they pointed at are gone already. */
        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->monotonic_earliest);
        prioq_free(e->monotonic_latest);
        prioq_free(e->realtime_earliest);
        prioq_free(e->realtime_latest);
        prioq_free(e->exit);

        free(e->signal_sources);

        hashmap_free(e->child_sources);
        free(e);
}
355
/* Allocates a new event loop with one reference and stores it in *ret.
 * Returns 0 on success, -EINVAL if ret is NULL, -ENOMEM on allocation
 * failure, or -errno if the epoll instance cannot be created. */
_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new0(sd_event, 1);
        if (!e)
                return -ENOMEM;

        e->n_ref = 1;
        /* All fds start out unset; they are created lazily. */
        e->signal_fd = e->realtime_fd = e->monotonic_fd = e->watchdog_fd = e->epoll_fd = -1;
        e->realtime_next = e->monotonic_next = (usec_t) -1;
        /* Remember the creating pid so we can detect use across fork(). */
        e->original_pid = getpid();

        assert_se(sigemptyset(&e->sigset) == 0);

        e->pending = prioq_new(pending_prioq_compare);
        if (!e->pending) {
                r = -ENOMEM;
                goto fail;
        }

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        *ret = e;
        return 0;

fail:
        /* event_free() copes with the partially-initialized object:
         * unset fds are -1 and prioq_free(NULL) is a no-op. */
        event_free(e);
        return r;
}
392
393 _public_ sd_event* sd_event_ref(sd_event *e) {
394         assert_return(e, NULL);
395
396         assert(e->n_ref >= 1);
397         e->n_ref++;
398
399         return e;
400 }
401
402 _public_ sd_event* sd_event_unref(sd_event *e) {
403
404         if (!e)
405                 return NULL;
406
407         assert(e->n_ref >= 1);
408         e->n_ref--;
409
410         if (e->n_ref <= 0)
411                 event_free(e);
412
413         return NULL;
414 }
415
/* Returns true if the current pid differs from the pid that created
 * the loop, i.e. we are on the child side of a fork(). */
static bool event_pid_changed(sd_event *e) {
        assert(e);

        /* We don't support people creating an event loop and keeping
         * it around over a fork(). Let's complain. */

        return e->original_pid != getpid();
}
424
425 static int source_io_unregister(sd_event_source *s) {
426         int r;
427
428         assert(s);
429         assert(s->type == SOURCE_IO);
430
431         if (!s->io.registered)
432                 return 0;
433
434         r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
435         if (r < 0)
436                 return -errno;
437
438         s->io.registered = false;
439         return 0;
440 }
441
442 static int source_io_register(
443                 sd_event_source *s,
444                 int enabled,
445                 uint32_t events) {
446
447         struct epoll_event ev = {};
448         int r;
449
450         assert(s);
451         assert(s->type == SOURCE_IO);
452         assert(enabled != SD_EVENT_OFF);
453
454         ev.events = events;
455         ev.data.ptr = s;
456
457         if (enabled == SD_EVENT_ONESHOT)
458                 ev.events |= EPOLLONESHOT;
459
460         if (s->io.registered)
461                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
462         else
463                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
464
465         if (r < 0)
466                 return -errno;
467
468         s->io.registered = true;
469
470         return 0;
471 }
472
/* Detaches a source from its loop and frees it. Handles each source
 * type's specific cleanup, then the cleanup common to all types. */
static void source_free(sd_event_source *s) {
        assert(s);

        if (s->event) {
                assert(s->event->n_sources > 0);

                switch (s->type) {

                case SOURCE_IO:
                        /* fd < 0 means the fd was already detached */
                        if (s->io.fd >= 0)
                                source_io_unregister(s);

                        break;

                case SOURCE_MONOTONIC:
                        prioq_remove(s->event->monotonic_earliest, s, &s->time.earliest_index);
                        prioq_remove(s->event->monotonic_latest, s, &s->time.latest_index);
                        break;

                case SOURCE_REALTIME:
                        prioq_remove(s->event->realtime_earliest, s, &s->time.earliest_index);
                        prioq_remove(s->event->realtime_latest, s, &s->time.latest_index);
                        break;

                case SOURCE_SIGNAL:
                        if (s->signal.sig > 0) {
                                /* Keep SIGCHLD in the sigset if child
                                 * sources still depend on it */
                                if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0)
                                        assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0);

                                if (s->event->signal_sources)
                                        s->event->signal_sources[s->signal.sig] = NULL;
                        }

                        break;

                case SOURCE_CHILD:
                        if (s->child.pid > 0) {
                                if (s->enabled != SD_EVENT_OFF) {
                                        assert(s->event->n_enabled_child_sources > 0);
                                        s->event->n_enabled_child_sources--;
                                }

                                /* Keep SIGCHLD in the sigset if an
                                 * explicit SIGCHLD source still needs it */
                                if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD])
                                        assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0);

                                hashmap_remove(s->event->child_sources, INT_TO_PTR(s->child.pid));
                        }

                        break;

                case SOURCE_DEFER:
                        /* nothing */
                        break;

                case SOURCE_EXIT:
                        prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                        break;

                case SOURCE_WATCHDOG:
                        /* The watchdog is not a real source; it must
                         * never reach this function */
                        assert_not_reached("Wut? I shouldn't exist.");
                }

                /* Cleanup common to all types */
                if (s->pending)
                        prioq_remove(s->event->pending, s, &s->pending_index);

                if (s->prepare)
                        prioq_remove(s->event->prepare, s, &s->prepare_index);

                s->event->n_sources--;
                sd_event_unref(s->event);
        }

        free(s);
}
547
/* Marks a source as pending (b=true) or not pending (b=false), adding
 * or removing it from the pending prioq accordingly. Idempotent; returns
 * 0 on success or a negative error if the prioq insertion fails. */
static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        /* Exit sources live in their own prioq and are never "pending" */
        assert(s->type != SOURCE_EXIT);

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                /* Record the iteration for the "older entries first"
                 * ordering in pending_prioq_compare() */
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        /* Roll back so the flag matches the prioq state */
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        /* The time prioqs sort pending entries to the end, so a change
         * of the pending bit requires a reshuffle */
        if (s->type == SOURCE_REALTIME) {
                prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
                prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
        } else if (s->type == SOURCE_MONOTONIC) {
                prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
                prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
        }

        return 0;
}
580
581 static sd_event_source *source_new(sd_event *e, EventSourceType type) {
582         sd_event_source *s;
583
584         assert(e);
585
586         s = new0(sd_event_source, 1);
587         if (!s)
588                 return NULL;
589
590         s->n_ref = 1;
591         s->event = sd_event_ref(e);
592         s->type = type;
593         s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;
594
595         e->n_sources ++;
596
597         return s;
598 }
599
600 _public_ int sd_event_add_io(
601                 sd_event *e,
602                 int fd,
603                 uint32_t events,
604                 sd_event_io_handler_t callback,
605                 void *userdata,
606                 sd_event_source **ret) {
607
608         sd_event_source *s;
609         int r;
610
611         assert_return(e, -EINVAL);
612         assert_return(fd >= 0, -EINVAL);
613         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
614         assert_return(callback, -EINVAL);
615         assert_return(ret, -EINVAL);
616         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
617         assert_return(!event_pid_changed(e), -ECHILD);
618
619         s = source_new(e, SOURCE_IO);
620         if (!s)
621                 return -ENOMEM;
622
623         s->io.fd = fd;
624         s->io.events = events;
625         s->io.callback = callback;
626         s->userdata = userdata;
627         s->enabled = SD_EVENT_ON;
628
629         r = source_io_register(s, s->enabled, events);
630         if (r < 0) {
631                 source_free(s);
632                 return -errno;
633         }
634
635         *ret = s;
636         return 0;
637 }
638
/* Lazily creates the timerfd for one clock, registers it with the epoll
 * instance (tagged with the source type so wakeups can be told apart),
 * and initializes the wakeup perturbation value. Returns 0 on success
 * (including when the fd already exists), -errno on failure. */
static int event_setup_timer_fd(
                sd_event *e,
                EventSourceType type,
                int *timer_fd,
                clockid_t id) {

        struct epoll_event ev = {};
        int r, fd;
        sd_id128_t bootid;

        assert(e);
        assert(timer_fd);

        if (_likely_(*timer_fd >= 0))
                return 0;

        fd = timerfd_create(id, TFD_NONBLOCK|TFD_CLOEXEC);
        if (fd < 0)
                return -errno;

        ev.events = EPOLLIN;
        /* Tag the epoll entry with the source type rather than a source
         * pointer, so the dispatcher can recognize the loop's own fds */
        ev.data.ptr = INT_TO_PTR(type);

        r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
        if (r < 0) {
                close_nointr_nofail(fd);
                return -errno;
        }

        /* When we sleep for longer, we try to realign the wakeup to
           the same time within each minute/second/250ms, so that
           events all across the system can be coalesced into a single
           CPU wakeup. However, let's take some system-specific
           randomness for this value, so that in a network of systems
           with synced clocks timer events are distributed a
           bit. Here, we calculate a perturbation usec offset from the
           boot ID. */

        if (sd_id128_get_boot(&bootid) >= 0)
                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;

        *timer_fd = fd;
        return 0;
}
683
/* Common implementation behind sd_event_add_monotonic() and
 * sd_event_add_realtime(): lazily allocates the clock's two prioqs and
 * its timerfd, then creates a SOURCE_MONOTONIC/SOURCE_REALTIME source
 * firing at 'usec' with the given accuracy (0 selects
 * DEFAULT_ACCURACY_USEC). New sources start out SD_EVENT_ONESHOT.
 * Returns 0 and the new source in *ret on success, a negative
 * errno-style error otherwise. */
static int event_add_time_internal(
                sd_event *e,
                EventSourceType type,
                int *timer_fd,
                clockid_t id,
                Prioq **earliest,
                Prioq **latest,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata,
                sd_event_source **ret) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(ret, -EINVAL);
        /* (uint64_t) -1 is the "unset" sentinel and not a valid time */
        assert_return(usec != (uint64_t) -1, -EINVAL);
        assert_return(accuracy != (uint64_t) -1, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        assert(timer_fd);
        assert(earliest);
        assert(latest);

        /* Allocate the two per-clock prioqs on first use */
        if (!*earliest) {
                *earliest = prioq_new(earliest_time_prioq_compare);
                if (!*earliest)
                        return -ENOMEM;
        }

        if (!*latest) {
                *latest = prioq_new(latest_time_prioq_compare);
                if (!*latest)
                        return -ENOMEM;
        }

        if (*timer_fd < 0) {
                r = event_setup_timer_fd(e, type, timer_fd, id);
                if (r < 0)
                        return r;
        }

        s = source_new(e, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* Insert into both scheduling queues; source_free() removes the
         * partial insertion on the failure path */
        r = prioq_put(*earliest, s, &s->time.earliest_index);
        if (r < 0)
                goto fail;

        r = prioq_put(*latest, s, &s->time.latest_index);
        if (r < 0)
                goto fail;

        *ret = s;
        return 0;

fail:
        source_free(s);
        return r;
}
756
757 _public_ int sd_event_add_monotonic(sd_event *e,
758                                     uint64_t usec,
759                                     uint64_t accuracy,
760                                     sd_event_time_handler_t callback,
761                                     void *userdata,
762                                     sd_event_source **ret) {
763
764         return event_add_time_internal(e, SOURCE_MONOTONIC, &e->monotonic_fd, CLOCK_MONOTONIC, &e->monotonic_earliest, &e->monotonic_latest, usec, accuracy, callback, userdata, ret);
765 }
766
767 _public_ int sd_event_add_realtime(sd_event *e,
768                                    uint64_t usec,
769                                    uint64_t accuracy,
770                                    sd_event_time_handler_t callback,
771                                    void *userdata,
772                                    sd_event_source **ret) {
773
774         return event_add_time_internal(e, SOURCE_REALTIME, &e->realtime_fd, CLOCK_REALTIME, &e->realtime_earliest, &e->monotonic_latest, usec, accuracy, callback, userdata, ret);
775 }
776
777 static int event_update_signal_fd(sd_event *e) {
778         struct epoll_event ev = {};
779         bool add_to_epoll;
780         int r;
781
782         assert(e);
783
784         add_to_epoll = e->signal_fd < 0;
785
786         r = signalfd(e->signal_fd, &e->sigset, SFD_NONBLOCK|SFD_CLOEXEC);
787         if (r < 0)
788                 return -errno;
789
790         e->signal_fd = r;
791
792         if (!add_to_epoll)
793                 return 0;
794
795         ev.events = EPOLLIN;
796         ev.data.ptr = INT_TO_PTR(SOURCE_SIGNAL);
797
798         r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->signal_fd, &ev);
799         if (r < 0) {
800                 close_nointr_nofail(e->signal_fd);
801                 e->signal_fd = -1;
802
803                 return -errno;
804         }
805
806         return 0;
807 }
808
809 _public_ int sd_event_add_signal(
810                 sd_event *e,
811                 int sig,
812                 sd_event_signal_handler_t callback,
813                 void *userdata,
814                 sd_event_source **ret) {
815
816         sd_event_source *s;
817         sigset_t ss;
818         int r;
819
820         assert_return(e, -EINVAL);
821         assert_return(sig > 0, -EINVAL);
822         assert_return(sig < _NSIG, -EINVAL);
823         assert_return(callback, -EINVAL);
824         assert_return(ret, -EINVAL);
825         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
826         assert_return(!event_pid_changed(e), -ECHILD);
827
828         r = pthread_sigmask(SIG_SETMASK, NULL, &ss);
829         if (r < 0)
830                 return -errno;
831
832         if (!sigismember(&ss, sig))
833                 return -EBUSY;
834
835         if (!e->signal_sources) {
836                 e->signal_sources = new0(sd_event_source*, _NSIG);
837                 if (!e->signal_sources)
838                         return -ENOMEM;
839         } else if (e->signal_sources[sig])
840                 return -EBUSY;
841
842         s = source_new(e, SOURCE_SIGNAL);
843         if (!s)
844                 return -ENOMEM;
845
846         s->signal.sig = sig;
847         s->signal.callback = callback;
848         s->userdata = userdata;
849         s->enabled = SD_EVENT_ON;
850
851         e->signal_sources[sig] = s;
852         assert_se(sigaddset(&e->sigset, sig) == 0);
853
854         if (sig != SIGCHLD || e->n_enabled_child_sources == 0) {
855                 r = event_update_signal_fd(e);
856                 if (r < 0) {
857                         source_free(s);
858                         return r;
859                 }
860         }
861
862         *ret = s;
863         return 0;
864 }
865
866 _public_ int sd_event_add_child(
867                 sd_event *e,
868                 pid_t pid,
869                 int options,
870                 sd_event_child_handler_t callback,
871                 void *userdata,
872                 sd_event_source **ret) {
873
874         sd_event_source *s;
875         int r;
876
877         assert_return(e, -EINVAL);
878         assert_return(pid > 1, -EINVAL);
879         assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
880         assert_return(options != 0, -EINVAL);
881         assert_return(callback, -EINVAL);
882         assert_return(ret, -EINVAL);
883         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
884         assert_return(!event_pid_changed(e), -ECHILD);
885
886         r = hashmap_ensure_allocated(&e->child_sources, trivial_hash_func, trivial_compare_func);
887         if (r < 0)
888                 return r;
889
890         if (hashmap_contains(e->child_sources, INT_TO_PTR(pid)))
891                 return -EBUSY;
892
893         s = source_new(e, SOURCE_CHILD);
894         if (!s)
895                 return -ENOMEM;
896
897         s->child.pid = pid;
898         s->child.options = options;
899         s->child.callback = callback;
900         s->userdata = userdata;
901         s->enabled = SD_EVENT_ONESHOT;
902
903         r = hashmap_put(e->child_sources, INT_TO_PTR(pid), s);
904         if (r < 0) {
905                 source_free(s);
906                 return r;
907         }
908
909         e->n_enabled_child_sources ++;
910
911         assert_se(sigaddset(&e->sigset, SIGCHLD) == 0);
912
913         if (!e->signal_sources || !e->signal_sources[SIGCHLD]) {
914                 r = event_update_signal_fd(e);
915                 if (r < 0) {
916                         source_free(s);
917                         return -errno;
918                 }
919         }
920
921         e->need_process_child = true;
922
923         *ret = s;
924         return 0;
925 }
926
927 _public_ int sd_event_add_defer(
928                 sd_event *e,
929                 sd_event_handler_t callback,
930                 void *userdata,
931                 sd_event_source **ret) {
932
933         sd_event_source *s;
934         int r;
935
936         assert_return(e, -EINVAL);
937         assert_return(callback, -EINVAL);
938         assert_return(ret, -EINVAL);
939         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
940         assert_return(!event_pid_changed(e), -ECHILD);
941
942         s = source_new(e, SOURCE_DEFER);
943         if (!s)
944                 return -ENOMEM;
945
946         s->defer.callback = callback;
947         s->userdata = userdata;
948         s->enabled = SD_EVENT_ONESHOT;
949
950         r = source_set_pending(s, true);
951         if (r < 0) {
952                 source_free(s);
953                 return r;
954         }
955
956         *ret = s;
957         return 0;
958 }
959
/* Adds an "exit" event source, dispatched when the event loop begins
 * shutting down (see dispatch_exit()). The per-loop exit prioq is
 * allocated lazily on first use. Created as SD_EVENT_ONESHOT; on
 * success the new source is stored in *ret. */
_public_ int sd_event_add_exit(
                sd_event *e,
                sd_event_handler_t callback,
                void *userdata,
                sd_event_source **ret) {

        sd_event_source *s;
        int r;

        assert_return(e, -EINVAL);
        assert_return(callback, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Lazily allocate the exit priority queue. */
        if (!e->exit) {
                e->exit = prioq_new(exit_prioq_compare);
                if (!e->exit)
                        return -ENOMEM;
        }

        s = source_new(e, SOURCE_EXIT);
        if (!s)
                return -ENOMEM;

        s->exit.callback = callback;
        s->userdata = userdata;
        s->exit.prioq_index = PRIOQ_IDX_NULL;
        s->enabled = SD_EVENT_ONESHOT;

        r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
        if (r < 0) {
                source_free(s);
                return r;
        }

        *ret = s;
        return 0;
}
999
/* Takes an additional reference on the event source. Returns the
 * source, or NULL if s is NULL. */
_public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
        assert_return(s, NULL);

        assert(s->n_ref >= 1);
        s->n_ref++;

        return s;
}
1008
/* Drops one reference on the event source; the object is freed when the
 * last reference goes away — except from within its own dispatch
 * callback, see the comment below. Always returns NULL, NULL input is
 * a no-op. */
_public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {

        if (!s)
                return NULL;

        assert(s->n_ref >= 1);
        s->n_ref--;

        if (s->n_ref <= 0) {
                /* Here's a special hack: when we are called from a
                 * dispatch handler we won't free the event source
                 * immediately, but we will detach the fd from the
                 * epoll. This way it is safe for the caller to unref
                 * the event source and immediately close the fd, but
                 * we still retain a valid event source object after
                 * the callback. */

                if (s->dispatching) {
                        if (s->type == SOURCE_IO)
                                source_io_unregister(s);
                } else
                        source_free(s);
        }

        return NULL;
}
1035
/* Returns the event loop this source is attached to (NULL if s is NULL). */
_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
        assert_return(s, NULL);

        return s->event;
}
1041
/* Returns whether the source is currently queued for dispatching.
 * Exit sources are rejected with -EDOM: they are dispatched straight
 * from the exit prioq and do not use the pending flag (cf. the assert
 * in source_dispatch()). */
_public_ int sd_event_source_get_pending(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->pending;
}
1050
/* Returns the file descriptor watched by an IO source (-EDOM for other
 * source types). */
_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->io.fd;
}
1058
/* Changes the file descriptor an IO source watches. If the source is
 * enabled, the new fd is registered with epoll first and the old one
 * removed only afterwards, so a registration failure leaves the source
 * fully intact on the old fd. */
_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(fd >= 0, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->io.fd == fd)
                return 0;

        if (s->enabled == SD_EVENT_OFF) {
                /* Not registered with epoll: just remember the new fd. */
                s->io.fd = fd;
                s->io.registered = false;
        } else {
                int saved_fd;

                saved_fd = s->io.fd;
                assert(s->io.registered);

                s->io.fd = fd;
                s->io.registered = false;

                r = source_io_register(s, s->enabled, s->io.events);
                if (r < 0) {
                        /* Roll back to the previous, still-registered fd. */
                        s->io.fd = saved_fd;
                        s->io.registered = true;
                        return r;
                }

                /* Best effort: detach the old fd from the epoll. */
                epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
        }

        return 0;
}
1094
/* Stores the epoll event mask of an IO source in *events. */
_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
        assert_return(s, -EINVAL);
        assert_return(events, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *events = s->io.events;
        return 0;
}
1104
/* Changes the epoll event mask of an IO source; only the whitelisted
 * epoll flags are allowed. If the source is enabled, it is
 * re-registered with the new mask before the change is committed. Any
 * already-collected revents are invalidated by clearing the pending
 * state. */
_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->io.events == events)
                return 0;

        if (s->enabled != SD_EVENT_OFF) {
                r = source_io_register(s, s->enabled, events);
                if (r < 0)
                        return r;
        }

        s->io.events = events;
        /* revents collected for the old mask are stale now. */
        source_set_pending(s, false);

        return 0;
}
1128
/* Stores the triggered epoll events of a *pending* IO source in
 * *revents; returns -ENODATA if the source is not currently pending. */
_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
        assert_return(s, -EINVAL);
        assert_return(revents, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(s->pending, -ENODATA);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *revents = s->io.revents;
        return 0;
}
1139
/* Returns the signal number a signal source watches (-EDOM for other
 * source types). */
_public_ int sd_event_source_get_signal(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->signal.sig;
}
1147
1148 _public_ int sd_event_source_get_priority(sd_event_source *s, int *priority) {
1149         assert_return(s, -EINVAL);
1150         assert_return(!event_pid_changed(s->event), -ECHILD);
1151
1152         return s->priority;
1153 }
1154
/* Changes the dispatch priority of the event source and reshuffles
 * every priority queue whose ordering depends on it: the pending and
 * prepare queues, and for exit sources the exit queue. */
_public_ int sd_event_source_set_priority(sd_event_source *s, int priority) {
        assert_return(s, -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->priority == priority)
                return 0;

        s->priority = priority;

        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);

        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        return 0;
}
1176
/* Stores the enablement state (SD_EVENT_OFF/ON/ONESHOT) in *m. */
_public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
        assert_return(s, -EINVAL);
        assert_return(m, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *m = s->enabled;
        return 0;
}
1185
1186 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
1187         int r;
1188
1189         assert_return(s, -EINVAL);
1190         assert_return(m == SD_EVENT_OFF || m == SD_EVENT_ON || m == SD_EVENT_ONESHOT, -EINVAL);
1191         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1192         assert_return(!event_pid_changed(s->event), -ECHILD);
1193
1194         if (s->enabled == m)
1195                 return 0;
1196
1197         if (m == SD_EVENT_OFF) {
1198
1199                 switch (s->type) {
1200
1201                 case SOURCE_IO:
1202                         r = source_io_unregister(s);
1203                         if (r < 0)
1204                                 return r;
1205
1206                         s->enabled = m;
1207                         break;
1208
1209                 case SOURCE_MONOTONIC:
1210                         s->enabled = m;
1211                         prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
1212                         prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
1213                         break;
1214
1215                 case SOURCE_REALTIME:
1216                         s->enabled = m;
1217                         prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
1218                         prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
1219                         break;
1220
1221                 case SOURCE_SIGNAL:
1222                         s->enabled = m;
1223                         if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0) {
1224                                 assert_se(sigdelset(&s->event->sigset, s->signal.sig) == 0);
1225                                 event_update_signal_fd(s->event);
1226                         }
1227
1228                         break;
1229
1230                 case SOURCE_CHILD:
1231                         s->enabled = m;
1232
1233                         assert(s->event->n_enabled_child_sources > 0);
1234                         s->event->n_enabled_child_sources--;
1235
1236                         if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) {
1237                                 assert_se(sigdelset(&s->event->sigset, SIGCHLD) == 0);
1238                                 event_update_signal_fd(s->event);
1239                         }
1240
1241                         break;
1242
1243                 case SOURCE_EXIT:
1244                         s->enabled = m;
1245                         prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1246                         break;
1247
1248                 case SOURCE_DEFER:
1249                         s->enabled = m;
1250                         break;
1251
1252                 case SOURCE_WATCHDOG:
1253                         assert_not_reached("Wut? I shouldn't exist.");
1254                 }
1255
1256         } else {
1257                 switch (s->type) {
1258
1259                 case SOURCE_IO:
1260                         r = source_io_register(s, m, s->io.events);
1261                         if (r < 0)
1262                                 return r;
1263
1264                         s->enabled = m;
1265                         break;
1266
1267                 case SOURCE_MONOTONIC:
1268                         s->enabled = m;
1269                         prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
1270                         prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
1271                         break;
1272
1273                 case SOURCE_REALTIME:
1274                         s->enabled = m;
1275                         prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
1276                         prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
1277                         break;
1278
1279                 case SOURCE_SIGNAL:
1280                         s->enabled = m;
1281
1282                         if (s->signal.sig != SIGCHLD || s->event->n_enabled_child_sources == 0)  {
1283                                 assert_se(sigaddset(&s->event->sigset, s->signal.sig) == 0);
1284                                 event_update_signal_fd(s->event);
1285                         }
1286                         break;
1287
1288                 case SOURCE_CHILD:
1289                         s->enabled = m;
1290
1291                         if (s->enabled == SD_EVENT_OFF) {
1292                                 s->event->n_enabled_child_sources++;
1293
1294                                 if (!s->event->signal_sources || !s->event->signal_sources[SIGCHLD]) {
1295                                         assert_se(sigaddset(&s->event->sigset, SIGCHLD) == 0);
1296                                         event_update_signal_fd(s->event);
1297                                 }
1298                         }
1299                         break;
1300
1301                 case SOURCE_EXIT:
1302                         s->enabled = m;
1303                         prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
1304                         break;
1305
1306                 case SOURCE_DEFER:
1307                         s->enabled = m;
1308                         break;
1309
1310                 case SOURCE_WATCHDOG:
1311                         assert_not_reached("Wut? I shouldn't exist.");
1312                 }
1313         }
1314
1315         if (s->pending)
1316                 prioq_reshuffle(s->event->pending, s, &s->pending_index);
1317
1318         if (s->prepare)
1319                 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
1320
1321         return 0;
1322 }
1323
/* Stores the next elapse time (in usec, on the source's clock) of a
 * timer source in *usec. */
_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.next;
        return 0;
}
1333
/* Changes the elapse time of a timer source. Clears any pending state
 * (a previously collected elapse is stale now) and reshuffles both the
 * earliest and latest prioqs of the source's clock so the timerfd can
 * be re-armed accordingly. (uint64_t)-1 is reserved as "unset". */
_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
        assert_return(s, -EINVAL);
        assert_return(usec != (uint64_t) -1, -EINVAL);
        assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        s->time.next = usec;

        source_set_pending(s, false);

        if (s->type == SOURCE_REALTIME) {
                prioq_reshuffle(s->event->realtime_earliest, s, &s->time.earliest_index);
                prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
        } else {
                prioq_reshuffle(s->event->monotonic_earliest, s, &s->time.earliest_index);
                prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);
        }

        return 0;
}
1355
/* Stores the dispatch accuracy (in usec) of a timer source in *usec. */
_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.accuracy;
        return 0;
}
1365
/* Changes the dispatch accuracy of a timer source; 0 selects the
 * default (DEFAULT_ACCURACY_USEC). Only the "latest" prioq needs
 * reshuffling, since accuracy affects the latest acceptable wake-up
 * time (time.next + accuracy) but not the earliest. */
_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
        assert_return(s, -EINVAL);
        assert_return(usec != (uint64_t) -1, -EINVAL);
        assert_return(s->type == SOURCE_REALTIME || s->type == SOURCE_MONOTONIC, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (usec == 0)
                usec = DEFAULT_ACCURACY_USEC;

        s->time.accuracy = usec;

        source_set_pending(s, false);

        if (s->type == SOURCE_REALTIME)
                prioq_reshuffle(s->event->realtime_latest, s, &s->time.latest_index);
        else
                prioq_reshuffle(s->event->monotonic_latest, s, &s->time.latest_index);

        return 0;
}
1387
/* Stores the PID a child source watches in *pid. */
_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
        assert_return(s, -EINVAL);
        assert_return(pid, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *pid = s->child.pid;
        return 0;
}
1397
/* Installs (or, with NULL, removes) a "prepare" callback invoked by
 * event_prepare() before each poll. The source is tracked in the
 * prepare prioq only while a callback is set. */
_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->prepare == callback)
                return 0;

        /* Replacing one callback with another: already queued, just swap. */
        if (callback && s->prepare) {
                s->prepare = callback;
                return 0;
        }

        r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
        if (r < 0)
                return r;

        s->prepare = callback;

        if (callback) {
                r = prioq_put(s->event->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;
        } else
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        return 0;
}
1429
/* Returns the opaque userdata pointer of the event source (NULL if s
 * is NULL). */
_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
        assert_return(s, NULL);

        return s->userdata;
}
1435
1436 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
1437         void *ret;
1438
1439         assert_return(s, NULL);
1440
1441         ret = s->userdata;
1442         s->userdata = userdata;
1443
1444         return ret;
1445 }
1446
/* Picks a wake-up time within the window [a, b] (requires a <= b).
 * Returns 0 immediately if a is 0, and a if the window is degenerate
 * (b <= a + 1); otherwise applies the coalescing strategy described in
 * the block comment below, falling back to b if no aligned spot fits
 * in the window. */
static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
        usec_t c;
        assert(e);
        assert(a <= b);

        if (a <= 0)
                return 0;

        if (b <= a + 1)
                return a;

        /*
          Find a good time to wake up again between times a and b. We
          have two goals here:

          a) We want to wake up as seldom as possible, hence prefer
             later times over earlier times.

          b) But if we have to wake up, then let's make sure to
             dispatch as much as possible on the entire system.

          We implement this by waking up everywhere at the same time
          within any given minute if we can, synchronised via the
          perturbation value determined from the boot ID. If we can't,
          then we try to find the same spot in every 10s, then 1s and
          then 250ms step. Otherwise, we pick the last possible time
          to wake up.
        */

        /* Candidate aligned to the minute; step back one interval if it
         * overshoots b (guarding against underflow). */
        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MINUTE))
                        return b;

                c -= USEC_PER_MINUTE;
        }

        if (c >= a)
                return c;

        /* Same idea with a 10s granularity. */
        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC*10))
                        return b;

                c -= USEC_PER_SEC*10;
        }

        if (c >= a)
                return c;

        /* 1s granularity. */
        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC))
                        return b;

                c -= USEC_PER_SEC;
        }

        if (c >= a)
                return c;

        /* 250ms granularity. */
        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MSEC*250))
                        return b;

                c -= USEC_PER_MSEC*250;
        }

        if (c >= a)
                return c;

        /* No aligned spot fits; wake at the last possible moment. */
        return b;
}
1522
1523 static int event_arm_timer(
1524                 sd_event *e,
1525                 int timer_fd,
1526                 Prioq *earliest,
1527                 Prioq *latest,
1528                 usec_t *next) {
1529
1530         struct itimerspec its = {};
1531         sd_event_source *a, *b;
1532         usec_t t;
1533         int r;
1534
1535         assert(e);
1536         assert(next);
1537
1538         a = prioq_peek(earliest);
1539         if (!a || a->enabled == SD_EVENT_OFF) {
1540
1541                 if (timer_fd < 0)
1542                         return 0;
1543
1544                 if (*next == (usec_t) -1)
1545                         return 0;
1546
1547                 /* disarm */
1548                 r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL);
1549                 if (r < 0)
1550                         return r;
1551
1552                 *next = (usec_t) -1;
1553
1554                 return 0;
1555         }
1556
1557         b = prioq_peek(latest);
1558         assert_se(b && b->enabled != SD_EVENT_OFF);
1559
1560         t = sleep_between(e, a->time.next, b->time.next + b->time.accuracy);
1561         if (*next == t)
1562                 return 0;
1563
1564         assert_se(timer_fd >= 0);
1565
1566         if (t == 0) {
1567                 /* We don' want to disarm here, just mean some time looooong ago. */
1568                 its.it_value.tv_sec = 0;
1569                 its.it_value.tv_nsec = 1;
1570         } else
1571                 timespec_store(&its.it_value, t);
1572
1573         r = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &its, NULL);
1574         if (r < 0)
1575                 return -errno;
1576
1577         *next = t;
1578         return 0;
1579 }
1580
1581 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
1582         assert(e);
1583         assert(s);
1584         assert(s->type == SOURCE_IO);
1585
1586         /* If the event source was already pending, we just OR in the
1587          * new revents, otherwise we reset the value. The ORing is
1588          * necessary to handle EPOLLONESHOT events properly where
1589          * readability might happen independently of writability, and
1590          * we need to keep track of both */
1591
1592         if (s->pending)
1593                 s->io.revents |= revents;
1594         else
1595                 s->io.revents = revents;
1596
1597         return source_set_pending(s, true);
1598 }
1599
/* Drains the 8-byte expiration counter of a timerfd after it fired.
 * Returns 0 on success (also when the read would block or was
 * interrupted), -EIO on short reads or unexpected epoll events,
 * -errno on other read failures. On success resets *next (if given)
 * so the timer will be re-armed on the next iteration. */
static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
        uint64_t x;
        ssize_t ss;

        assert(e);
        assert(fd >= 0);

        assert_return(events == EPOLLIN, -EIO);

        ss = read(fd, &x, sizeof(x));
        if (ss < 0) {
                if (errno == EAGAIN || errno == EINTR)
                        return 0;

                return -errno;
        }

        /* timerfd reads are all-or-nothing; anything else is corruption. */
        if (_unlikely_(ss != sizeof(x)))
                return -EIO;

        if (next)
                *next = (usec_t) -1;

        return 0;
}
1625
/* Walks the "earliest" timer prioq and marks every enabled,
 * not-yet-pending source whose elapse time has passed `n` as pending,
 * reshuffling it in both prioqs afterwards. Stops at the first source
 * that has not elapsed yet (or is disabled/already pending, which the
 * prioq ordering keeps at the head — TODO confirm comparator). */
static int process_timer(
                sd_event *e,
                usec_t n,
                Prioq *earliest,
                Prioq *latest) {

        sd_event_source *s;
        int r;

        assert(e);

        for (;;) {
                s = prioq_peek(earliest);
                if (!s ||
                    s->time.next > n ||
                    s->enabled == SD_EVENT_OFF ||
                    s->pending)
                        break;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                prioq_reshuffle(earliest, s, &s->time.earliest_index);
                prioq_reshuffle(latest, s, &s->time.latest_index);
        }

        return 0;
}
1655
/* Polls every registered child source with waitid() and marks those
 * with a state change as pending. Called after SIGCHLD is observed;
 * see the block comment below for why this is O(n) over all child
 * sources. Returns 0 on success, -errno if waitid() fails. */
static int process_child(sd_event *e) {
        sd_event_source *s;
        Iterator i;
        int r;

        assert(e);

        e->need_process_child = false;

        /*
           So, this is ugly. We iteratively invoke waitid() with P_PID
           + WNOHANG for each PID we wait for, instead of using
           P_ALL. This is because we only want to get child
           information of very specific child processes, and not all
           of them. We might not have processed the SIGCHLD even of a
           previous invocation and we don't want to maintain a
           unbounded *per-child* event queue, hence we really don't
           want anything flushed out of the kernel's queue that we
           don't care about. Since this is O(n) this means that if you
           have a lot of processes you probably want to handle SIGCHLD
           yourself.

           We do not reap the children here (by using WNOWAIT), this
           is only done after the event source is dispatched so that
           the callback still sees the process as a zombie.
        */

        HASHMAP_FOREACH(s, e->child_sources, i) {
                assert(s->type == SOURCE_CHILD);

                if (s->pending)
                        continue;

                if (s->enabled == SD_EVENT_OFF)
                        continue;

                zero(s->child.siginfo);
                r = waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
                if (r < 0)
                        return -errno;

                /* si_pid stays 0 when there was no state change (WNOHANG). */
                if (s->child.siginfo.si_pid != 0) {
                        bool zombie =
                                s->child.siginfo.si_code == CLD_EXITED ||
                                s->child.siginfo.si_code == CLD_KILLED ||
                                s->child.siginfo.si_code == CLD_DUMPED;

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's
                                 * immediately remove the state change
                                 * from the queue, since there's no
                                 * benefit in leaving it queued */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
                        if (r < 0)
                                return r;
                }
        }

        return 0;
}
1722
/* Drains the signalfd and marks the matching signal sources as
 * pending. SIGCHLD additionally triggers child-source processing.
 * Returns 1 if at least one siginfo was read before the fd ran dry,
 * 0 if it was already empty, negative errno on failure (-EIO for a
 * signal nobody subscribed to, which indicates inconsistent state). */
static int process_signal(sd_event *e, uint32_t events) {
        bool read_one = false;
        int r;

        assert(e);
        assert(e->signal_sources);

        assert_return(events == EPOLLIN, -EIO);

        for (;;) {
                struct signalfd_siginfo si;
                ssize_t ss;
                sd_event_source *s;

                ss = read(e->signal_fd, &si, sizeof(si));
                if (ss < 0) {
                        if (errno == EAGAIN || errno == EINTR)
                                return read_one;

                        return -errno;
                }

                /* signalfd reads are all-or-nothing. */
                if (_unlikely_(ss != sizeof(si)))
                        return -EIO;

                read_one = true;

                s = e->signal_sources[si.ssi_signo];
                if (si.ssi_signo == SIGCHLD) {
                        /* Update child sources first; an explicit SIGCHLD
                         * source (if any) is still notified below. */
                        r = process_child(e);
                        if (r < 0)
                                return r;
                        if (r > 0 || !s)
                                continue;
                } else
                        if (!s)
                                return -EIO;

                s->signal.siginfo = si;
                r = source_set_pending(s, true);
                if (r < 0)
                        return r;
        }

        return 0;
}
1769
/* Invokes the user callback of a single event source. Clears the
 * pending flag first (except for defer/exit sources, which manage it
 * differently) and disables ONESHOT sources *before* the callback so
 * the callback may re-enable them. The dispatching flag makes
 * sd_event_source_unref() defer freeing until after the callback (see
 * the hack there). A failing callback disables the source. Returns 1
 * on (attempted) dispatch, negative errno on internal failure. */
static int source_dispatch(sd_event_source *s) {
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        if (s->type != SOURCE_DEFER && s->type != SOURCE_EXIT) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        /* Disable before dispatching, so the callback can re-enable. */
        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        s->dispatching = true;

        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_MONOTONIC:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_REALTIME:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                zombie = s->child.siginfo.si_code == CLD_EXITED ||
                         s->child.siginfo.si_code == CLD_KILLED ||
                         s->child.siginfo.si_code == CLD_DUMPED;

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie)
                        waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_WATCHDOG:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        s->dispatching = false;

        if (r < 0)
                log_debug("Event source %p returned error, disabling: %s", s, strerror(-r));

        /* The callback may have dropped the last reference: free now. */
        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                sd_event_source_set_enabled(s, SD_EVENT_OFF);

        return 1;
}
1848
/* Runs the prepare callbacks of all enabled sources that have not been
 * prepared in this iteration yet. Each source is stamped with the
 * current iteration and reshuffled before its callback runs, so the
 * prioq head advances even if the callback fails. A failing callback
 * disables the source; a callback that dropped its last reference is
 * freed here (same deferred-free scheme as source_dispatch()). */
static int event_prepare(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
                        break;

                s->prepare_iteration = e->iteration;
                r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;

                assert(s->prepare);

                s->dispatching = true;
                r = s->prepare(s, s->userdata);
                s->dispatching = false;

                if (r < 0)
                        log_debug("Prepare callback of event source %p returned error, disabling: %s", s, strerror(-r));

                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        sd_event_source_set_enabled(s, SD_EVENT_OFF);
        }

        return 0;
}
1883
/* Dispatches the head of the exit prioq, one source per call. Once no
 * enabled exit source remains, the loop is marked SD_EVENT_FINISHED.
 * The loop object is referenced across the callback so it survives an
 * unref from within it. */
static int dispatch_exit(sd_event *e) {
        sd_event_source *p;
        int r;

        assert(e);

        p = prioq_peek(e->exit);
        if (!p || p->enabled == SD_EVENT_OFF) {
                e->state = SD_EVENT_FINISHED;
                return 0;
        }

        sd_event_ref(e);
        e->iteration++;
        e->state = SD_EVENT_EXITING;

        r = source_dispatch(p);

        e->state = SD_EVENT_PASSIVE;
        sd_event_unref(e);

        return r;
}
1907
1908 static sd_event_source* event_next_pending(sd_event *e) {
1909         sd_event_source *p;
1910
1911         assert(e);
1912
1913         p = prioq_peek(e->pending);
1914         if (!p)
1915                 return NULL;
1916
1917         if (p->enabled == SD_EVENT_OFF)
1918                 return NULL;
1919
1920         return p;
1921 }
1922
1923 static int arm_watchdog(sd_event *e) {
1924         struct itimerspec its = {};
1925         usec_t t;
1926         int r;
1927
1928         assert(e);
1929         assert(e->watchdog_fd >= 0);
1930
1931         t = sleep_between(e,
1932                           e->watchdog_last + (e->watchdog_period / 2),
1933                           e->watchdog_last + (e->watchdog_period * 3 / 4));
1934
1935         timespec_store(&its.it_value, t);
1936
1937         r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
1938         if (r < 0)
1939                 return -errno;
1940
1941         return 0;
1942 }
1943
1944 static int process_watchdog(sd_event *e) {
1945         assert(e);
1946
1947         if (!e->watchdog)
1948                 return 0;
1949
1950         /* Don't notify watchdog too often */
1951         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
1952                 return 0;
1953
1954         sd_notify(false, "WATCHDOG=1");
1955         e->watchdog_last = e->timestamp.monotonic;
1956
1957         return arm_watchdog(e);
1958 }
1959
/* Run a single iteration of the event loop: run prepare callbacks, arm the
 * clock timerfds, wait in epoll_wait(), process whatever fired, and dispatch
 * the single highest-priority pending event source.
 *
 * timeout is in µs; (uint64_t) -1 means block indefinitely. Returns a
 * negative errno-style error, otherwise a positive value. */
_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
        struct epoll_event *ev_queue;
        unsigned ev_queue_max;
        sd_event_source *p;
        int r, i, m;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY);

        /* After sd_event_exit() only the exit handlers are run. */
        if (e->exit_requested)
                return dispatch_exit(e);

        /* Pin the event object: callbacks may drop references to it. */
        sd_event_ref(e);
        e->iteration++;
        e->state = SD_EVENT_RUNNING;

        r = event_prepare(e);
        if (r < 0)
                goto finish;

        /* Re-arm the two clock timerfds from the earliest/latest prioqs. */
        r = event_arm_timer(e, e->monotonic_fd, e->monotonic_earliest, e->monotonic_latest, &e->monotonic_next);
        if (r < 0)
                goto finish;

        r = event_arm_timer(e, e->realtime_fd, e->realtime_earliest, e->realtime_latest, &e->realtime_next);
        if (r < 0)
                goto finish;

        /* If something is already dispatchable, don't block in epoll. */
        if (event_next_pending(e) || e->need_process_child)
                timeout = 0;
        ev_queue_max = CLAMP(e->n_sources, 1U, EPOLL_QUEUE_MAX);
        ev_queue = newa(struct epoll_event, ev_queue_max);

        /* Round the µs timeout up to whole ms so we never wake up early. */
        m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max,
                       timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
        if (m < 0) {
                /* EAGAIN/EINTR are not errors, just an empty iteration. */
                r = errno == EAGAIN || errno == EINTR ? 1 : -errno;
                goto finish;
        }

        dual_timestamp_get(&e->timestamp);

        for (i = 0; i < m; i++) {

                /* Internal fds are tagged with their SOURCE_* type in
                 * data.ptr; anything else is a user I/O source pointer. */
                if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_MONOTONIC))
                        r = flush_timer(e, e->monotonic_fd, ev_queue[i].events, &e->monotonic_next);
                else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_REALTIME))
                        r = flush_timer(e, e->realtime_fd, ev_queue[i].events, &e->realtime_next);
                else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_SIGNAL))
                        r = process_signal(e, ev_queue[i].events);
                else if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
                else
                        r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);

                if (r < 0)
                        goto finish;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        /* Queue timer sources whose deadline has passed. */
        r = process_timer(e, e->timestamp.monotonic, e->monotonic_earliest, e->monotonic_latest);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, e->realtime_earliest, e->realtime_latest);
        if (r < 0)
                goto finish;

        if (e->need_process_child) {
                r = process_child(e);
                if (r < 0)
                        goto finish;
        }

        /* Dispatch at most one source per iteration, best priority first. */
        p = event_next_pending(e);
        if (!p) {
                r = 1;
                goto finish;
        }

        r = source_dispatch(p);

finish:
        e->state = SD_EVENT_PASSIVE;
        sd_event_unref(e);

        return r;
}
2053
2054 _public_ int sd_event_loop(sd_event *e) {
2055         int r;
2056
2057         assert_return(e, -EINVAL);
2058         assert_return(!event_pid_changed(e), -ECHILD);
2059         assert_return(e->state == SD_EVENT_PASSIVE, -EBUSY);
2060
2061         sd_event_ref(e);
2062
2063         while (e->state != SD_EVENT_FINISHED) {
2064                 r = sd_event_run(e, (uint64_t) -1);
2065                 if (r < 0)
2066                         goto finish;
2067         }
2068
2069         r = e->exit_code;
2070
2071 finish:
2072         sd_event_unref(e);
2073         return r;
2074 }
2075
2076 _public_ int sd_event_get_state(sd_event *e) {
2077         assert_return(e, -EINVAL);
2078         assert_return(!event_pid_changed(e), -ECHILD);
2079
2080         return e->state;
2081 }
2082
2083 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
2084         assert_return(e, -EINVAL);
2085         assert_return(code, -EINVAL);
2086         assert_return(!event_pid_changed(e), -ECHILD);
2087
2088         if (!e->exit_requested)
2089                 return -ENODATA;
2090
2091         *code = e->exit_code;
2092         return 0;
2093 }
2094
2095 _public_ int sd_event_exit(sd_event *e, int code) {
2096         assert_return(e, -EINVAL);
2097         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2098         assert_return(!event_pid_changed(e), -ECHILD);
2099
2100         e->exit_requested = true;
2101         e->exit_code = code;
2102
2103         return 0;
2104 }
2105
2106 _public_ int sd_event_get_now_realtime(sd_event *e, uint64_t *usec) {
2107         assert_return(e, -EINVAL);
2108         assert_return(usec, -EINVAL);
2109         assert_return(dual_timestamp_is_set(&e->timestamp), -ENODATA);
2110         assert_return(!event_pid_changed(e), -ECHILD);
2111
2112         *usec = e->timestamp.realtime;
2113         return 0;
2114 }
2115
2116 _public_ int sd_event_get_now_monotonic(sd_event *e, uint64_t *usec) {
2117         assert_return(e, -EINVAL);
2118         assert_return(usec, -EINVAL);
2119         assert_return(dual_timestamp_is_set(&e->timestamp), -ENODATA);
2120         assert_return(!event_pid_changed(e), -ECHILD);
2121
2122         *usec = e->timestamp.monotonic;
2123         return 0;
2124 }
2125
2126 _public_ int sd_event_default(sd_event **ret) {
2127
2128         static thread_local sd_event *default_event = NULL;
2129         sd_event *e;
2130         int r;
2131
2132         if (!ret)
2133                 return !!default_event;
2134
2135         if (default_event) {
2136                 *ret = sd_event_ref(default_event);
2137                 return 0;
2138         }
2139
2140         r = sd_event_new(&e);
2141         if (r < 0)
2142                 return r;
2143
2144         e->default_event_ptr = &default_event;
2145         e->tid = gettid();
2146         default_event = e;
2147
2148         *ret = e;
2149         return 1;
2150 }
2151
2152 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
2153         assert_return(e, -EINVAL);
2154         assert_return(tid, -EINVAL);
2155         assert_return(!event_pid_changed(e), -ECHILD);
2156
2157         if (e->tid != 0) {
2158                 *tid = e->tid;
2159                 return 0;
2160         }
2161
2162         return -ENXIO;
2163 }
2164
/* Enable (b != 0) or disable (b == 0) automatic sd_notify("WATCHDOG=1")
 * pings driven from this event loop.
 *
 * Returns the resulting watchdog state (0 or 1); when enabling, returns 0
 * without changing anything if sd_watchdog_enabled() reports the watchdog
 * is not requested, or a negative errno-style error on failure. */
_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Already in the requested state: nothing to do. */
        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                struct epoll_event ev = {};

                /* Reads the period from the environment (WATCHDOG_USEC);
                 * r <= 0 means no watchdog was requested, or an error. */
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                /* Tag the fd with SOURCE_WATCHDOG so the epoll dispatch
                 * loop can tell it apart from user I/O sources. */
                ev.events = EPOLLIN;
                ev.data.ptr = INT_TO_PTR(SOURCE_WATCHDOG);

                r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
                if (r < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                /* Closing the fd removes it from the epoll set anyway; the
                 * explicit EPOLL_CTL_DEL is best-effort tidiness. */
                if (e->watchdog_fd >= 0) {
                        epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        close_nointr_nofail(e->watchdog_fd);
                        e->watchdog_fd = -1;
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        /* Undo the timerfd_create() above; the fd was not (successfully)
         * added to the epoll set at this point. */
        close_nointr_nofail(e->watchdog_fd);
        e->watchdog_fd = -1;
        return r;
}
2218
2219 _public_ int sd_event_get_watchdog(sd_event *e) {
2220         assert_return(e, -EINVAL);
2221         assert_return(!event_pid_changed(e), -ECHILD);
2222
2223         return e->watchdog;
2224 }