src/libelogind/sd-event/sd-event.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2013 Lennart Poettering
   6 ***/
   7
   8 #include <sys/epoll.h>
   9 #include <sys/timerfd.h>
  10 #include <sys/wait.h>
  11
  12 #include "sd-daemon.h"
  13 #include "sd-event.h"
  14 #include "sd-id128.h"
  15
  16 #include "alloc-util.h"
  17 #include "fd-util.h"
  18 //#include "fs-util.h"
  19 #include "hashmap.h"
  20 #include "list.h"
  21 #include "macro.h"
  22 #include "missing.h"
  23 #include "prioq.h"
  24 #include "process-util.h"
  25 #include "set.h"
  26 #include "signal-util.h"
  27 #include "string-table.h"
  28 #include "string-util.h"
  29 #include "time-util.h"
  30 #include "util.h"
  31
  32 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
  33
  34 typedef enum EventSourceType {
  35         SOURCE_IO,
  36         SOURCE_TIME_REALTIME,
  37         SOURCE_TIME_BOOTTIME,
  38         SOURCE_TIME_MONOTONIC,
  39         SOURCE_TIME_REALTIME_ALARM,
  40         SOURCE_TIME_BOOTTIME_ALARM,
  41         SOURCE_SIGNAL,
  42         SOURCE_CHILD,
  43         SOURCE_DEFER,
  44         SOURCE_POST,
  45         SOURCE_EXIT,
  46         SOURCE_WATCHDOG,
  47         SOURCE_INOTIFY,
  48         _SOURCE_EVENT_SOURCE_TYPE_MAX,
  49         _SOURCE_EVENT_SOURCE_TYPE_INVALID = -1
  50 } EventSourceType;
  51
  52 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
  53         [SOURCE_IO] = "io",
  54         [SOURCE_TIME_REALTIME] = "realtime",
  55         [SOURCE_TIME_BOOTTIME] = "bootime",
  56         [SOURCE_TIME_MONOTONIC] = "monotonic",
  57         [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
  58         [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
  59         [SOURCE_SIGNAL] = "signal",
  60         [SOURCE_CHILD] = "child",
  61         [SOURCE_DEFER] = "defer",
  62         [SOURCE_POST] = "post",
  63         [SOURCE_EXIT] = "exit",
  64         [SOURCE_WATCHDOG] = "watchdog",
  65         [SOURCE_INOTIFY] = "inotify",
  66 };
  67
  68 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
  69
  70 /* All objects we use in epoll events start with this value, so that
  71  * we know how to dispatch it */
  72 typedef enum WakeupType {
  73         WAKEUP_NONE,
  74         WAKEUP_EVENT_SOURCE,
  75         WAKEUP_CLOCK_DATA,
  76         WAKEUP_SIGNAL_DATA,
  77         WAKEUP_INOTIFY_DATA,
  78         _WAKEUP_TYPE_MAX,
  79         _WAKEUP_TYPE_INVALID = -1,
  80 } WakeupType;
  81
  82 #define EVENT_SOURCE_IS_TIME(t) IN_SET((t), SOURCE_TIME_REALTIME, SOURCE_TIME_BOOTTIME, SOURCE_TIME_MONOTONIC, SOURCE_TIME_REALTIME_ALARM, SOURCE_TIME_BOOTTIME_ALARM)
  83
  84 struct inode_data;
  85
  86 struct sd_event_source {
  87         WakeupType wakeup;
  88
  89         unsigned n_ref;
  90
  91         sd_event *event;
  92         void *userdata;
  93         sd_event_handler_t prepare;
  94
  95         char *description;
  96
  97         EventSourceType type:5;
  98         int enabled:3;
  99         bool pending:1;
 100         bool dispatching:1;
 101         bool floating:1;
 102
 103         int64_t priority;
 104         unsigned pending_index;
 105         unsigned prepare_index;
 106         uint64_t pending_iteration;
 107         uint64_t prepare_iteration;
 108
 109         LIST_FIELDS(sd_event_source, sources);
 110
 111         union {
 112                 struct {
 113                         sd_event_io_handler_t callback;
 114                         int fd;
 115                         uint32_t events;
 116                         uint32_t revents;
 117                         bool registered:1;
 118                         bool owned:1;
 119                 } io;
 120                 struct {
 121                         sd_event_time_handler_t callback;
 122                         usec_t next, accuracy;
 123                         unsigned earliest_index;
 124                         unsigned latest_index;
 125                 } time;
 126                 struct {
 127                         sd_event_signal_handler_t callback;
 128                         struct signalfd_siginfo siginfo;
 129                         int sig;
 130                 } signal;
 131                 struct {
 132                         sd_event_child_handler_t callback;
 133                         siginfo_t siginfo;
 134                         pid_t pid;
 135                         int options;
 136                 } child;
 137                 struct {
 138                         sd_event_handler_t callback;
 139                 } defer;
 140                 struct {
 141                         sd_event_handler_t callback;
 142                 } post;
 143                 struct {
 144                         sd_event_handler_t callback;
 145                         unsigned prioq_index;
 146                 } exit;
 147                 struct {
 148                         sd_event_inotify_handler_t callback;
 149                         uint32_t mask;
 150                         struct inode_data *inode_data;
 151                         LIST_FIELDS(sd_event_source, by_inode_data);
 152                 } inotify;
 153         };
 154 };
 155
 156 struct clock_data {
 157         WakeupType wakeup;
 158         int fd;
 159
 160         /* For all clocks we maintain two priority queues each, one
 161          * ordered for the earliest times the events may be
 162          * dispatched, and one ordered by the latest times they must
 163          * have been dispatched. The range between the top entries in
 164          * the two prioqs is the time window we can freely schedule
 165          * wakeups in */
 166
 167         Prioq *earliest;
 168         Prioq *latest;
 169         usec_t next;
 170
 171         bool needs_rearm:1;
 172 };
 173
 174 struct signal_data {
 175         WakeupType wakeup;
 176
 177         /* For each priority we maintain one signal fd, so that we
 178          * only have to dequeue a single event per priority at a
 179          * time. */
 180
 181         int fd;
 182         int64_t priority;
 183         sigset_t sigset;
 184         sd_event_source *current;
 185 };
 186
 187 /* A structure listing all event sources currently watching a specific inode */
 188 struct inode_data {
 189         /* The identifier for the inode, the combination of the .st_dev + .st_ino fields of the file */
 190         ino_t ino;
 191         dev_t dev;
 192
 193         /* An fd of the inode to watch. The fd is kept open until the next iteration of the loop, so that we can
 194          * rearrange the priority still until then, as we need the original inode to change the priority as we need to
 195          * add a watch descriptor to the right inotify for the priority which we can only do if we have a handle to the
 196          * original inode. We keep a list of all inode_data objects with an open fd in the to_close list (see below) of
 197          * the sd-event object, so that it is efficient to close everything, before entering the next event loop
 198          * iteration. */
 199         int fd;
 200
 201         /* The inotify "watch descriptor" */
 202         int wd;
 203
 204         /* The combination of the mask of all inotify watches on this inode we manage. This is also the mask that has
 205          * most recently been set on the watch descriptor. */
 206         uint32_t combined_mask;
 207
 208         /* All event sources subscribed to this inode */
 209         LIST_HEAD(sd_event_source, event_sources);
 210
 211         /* The inotify object we watch this inode with */
 212         struct inotify_data *inotify_data;
 213
 214         /* A linked list of all inode data objects with fds to close (see above) */
 215         LIST_FIELDS(struct inode_data, to_close);
 216 };
 217
 218 /* A structure encapsulating an inotify fd */
 219 struct inotify_data {
 220         WakeupType wakeup;
 221
 222         /* For each priority we maintain one inotify fd, so that we only have to dequeue a single event per priority at
 223          * a time */
 224
 225         int fd;
 226         int64_t priority;
 227
 228         Hashmap *inodes; /* The inode_data structures keyed by dev+ino */
 229         Hashmap *wd;     /* The inode_data structures keyed by the watch descriptor for each */
 230
 231         /* The buffer we read inotify events into */
 232         union inotify_event_buffer buffer;
 233         size_t buffer_filled; /* fill level of the buffer */
 234
 235         /* How many event sources are currently marked pending for this inotify. We won't read new events off the
 236          * inotify fd as long as there are still pending events on the inotify (because we have no strategy of queuing
 237          * the events locally if they can't be coalesced). */
 238         unsigned n_pending;
 239
 240         /* A linked list of all inotify objects with data already read, that still need processing. We keep this list
 241          * to make it efficient to figure out what inotify objects to process data on next. */
 242         LIST_FIELDS(struct inotify_data, buffered);
 243 };
 244
 245 struct sd_event {
 246         unsigned n_ref;
 247
 248         int epoll_fd;
 249         int watchdog_fd;
 250
 251         Prioq *pending;
 252         Prioq *prepare;
 253
 254         /* timerfd_create() only supports these five clocks so far. We
 255          * can add support for more clocks when the kernel learns to
 256          * deal with them, too. */
 257         struct clock_data realtime;
 258         struct clock_data boottime;
 259         struct clock_data monotonic;
 260         struct clock_data realtime_alarm;
 261         struct clock_data boottime_alarm;
 262
 263         usec_t perturb;
 264
 265         sd_event_source **signal_sources; /* indexed by signal number */
 266         Hashmap *signal_data; /* indexed by priority */
 267
 268         Hashmap *child_sources;
 269         unsigned n_enabled_child_sources;
 270
 271         Set *post_sources;
 272
 273         Prioq *exit;
 274
 275         Hashmap *inotify_data; /* indexed by priority */
 276
 277         /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
 278         LIST_HEAD(struct inode_data, inode_data_to_close);
 279
 280         /* A list of inotify objects that already have events buffered which aren't processed yet */
 281         LIST_HEAD(struct inotify_data, inotify_data_buffered);
 282
 283         pid_t original_pid;
 284
 285         uint64_t iteration;
 286         triple_timestamp timestamp;
 287         int state;
 288
 289         bool exit_requested:1;
 290         bool need_process_child:1;
 291         bool watchdog:1;
 292         bool profile_delays:1;
 293
 294         int exit_code;
 295
 296         pid_t tid;
 297         sd_event **default_event_ptr;
 298
 299         usec_t watchdog_last, watchdog_period;
 300
 301         unsigned n_sources;
 302
 303         LIST_HEAD(sd_event_source, sources);
 304
 305         usec_t last_run, last_log;
 306         unsigned delays[sizeof(usec_t) * 8];
 307 };
 308
 309 static thread_local sd_event *default_event = NULL;
 310
 311 static void source_disconnect(sd_event_source *s);
 312 static void event_gc_inode_data(sd_event *e, struct inode_data *d);
 313
 314 static sd_event *event_resolve(sd_event *e) {
 315         return e == SD_EVENT_DEFAULT ? default_event : e;
 316 }
 317
 318 static int pending_prioq_compare(const void *a, const void *b) {
 319         const sd_event_source *x = a, *y = b;
 320
 321         assert(x->pending);
 322         assert(y->pending);
 323
 324         /* Enabled ones first */
 325         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
 326                 return -1;
 327         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
 328                 return 1;
 329
 330         /* Lower priority values first */
 331         if (x->priority < y->priority)
 332                 return -1;
 333         if (x->priority > y->priority)
 334                 return 1;
 335
 336         /* Older entries first */
 337         if (x->pending_iteration < y->pending_iteration)
 338                 return -1;
 339         if (x->pending_iteration > y->pending_iteration)
 340                 return 1;
 341
 342         return 0;
 343 }
 344
 345 static int prepare_prioq_compare(const void *a, const void *b) {
 346         const sd_event_source *x = a, *y = b;
 347
 348         assert(x->prepare);
 349         assert(y->prepare);
 350
 351         /* Enabled ones first */
 352         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
 353                 return -1;
 354         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
 355                 return 1;
 356
 357         /* Move most recently prepared ones last, so that we can stop
 358          * preparing as soon as we hit one that has already been
 359          * prepared in the current iteration */
 360         if (x->prepare_iteration < y->prepare_iteration)
 361                 return -1;
 362         if (x->prepare_iteration > y->prepare_iteration)
 363                 return 1;
 364
 365         /* Lower priority values first */
 366         if (x->priority < y->priority)
 367                 return -1;
 368         if (x->priority > y->priority)
 369                 return 1;
 370
 371         return 0;
 372 }
 373
 374 static int earliest_time_prioq_compare(const void *a, const void *b) {
 375         const sd_event_source *x = a, *y = b;
 376
 377         assert(EVENT_SOURCE_IS_TIME(x->type));
 378         assert(x->type == y->type);
 379
 380         /* Enabled ones first */
 381         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
 382                 return -1;
 383         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
 384                 return 1;
 385
 386         /* Move the pending ones to the end */
 387         if (!x->pending && y->pending)
 388                 return -1;
 389         if (x->pending && !y->pending)
 390                 return 1;
 391
 392         /* Order by time */
 393         if (x->time.next < y->time.next)
 394                 return -1;
 395         if (x->time.next > y->time.next)
 396                 return 1;
 397
 398         return 0;
 399 }
 400
 401 static usec_t time_event_source_latest(const sd_event_source *s) {
 402         return usec_add(s->time.next, s->time.accuracy);
 403 }
 404
 405 static int latest_time_prioq_compare(const void *a, const void *b) {
 406         const sd_event_source *x = a, *y = b;
 407
 408         assert(EVENT_SOURCE_IS_TIME(x->type));
 409         assert(x->type == y->type);
 410
 411         /* Enabled ones first */
 412         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
 413                 return -1;
 414         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
 415                 return 1;
 416
 417         /* Move the pending ones to the end */
 418         if (!x->pending && y->pending)
 419                 return -1;
 420         if (x->pending && !y->pending)
 421                 return 1;
 422
 423         /* Order by time */
 424         if (time_event_source_latest(x) < time_event_source_latest(y))
 425                 return -1;
 426         if (time_event_source_latest(x) > time_event_source_latest(y))
 427                 return 1;
 428
 429         return 0;
 430 }
 431
 432 static int exit_prioq_compare(const void *a, const void *b) {
 433         const sd_event_source *x = a, *y = b;
 434
 435         assert(x->type == SOURCE_EXIT);
 436         assert(y->type == SOURCE_EXIT);
 437
 438         /* Enabled ones first */
 439         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
 440                 return -1;
 441         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
 442                 return 1;
 443
 444         /* Lower priority values first */
 445         if (x->priority < y->priority)
 446                 return -1;
 447         if (x->priority > y->priority)
 448                 return 1;
 449
 450         return 0;
 451 }
 452
 453 static void free_clock_data(struct clock_data *d) {
 454         assert(d);
 455         assert(d->wakeup == WAKEUP_CLOCK_DATA);
 456
 457         safe_close(d->fd);
 458         prioq_free(d->earliest);
 459         prioq_free(d->latest);
 460 }
 461
 462 static void event_free(sd_event *e) {
 463         sd_event_source *s;
 464
 465         assert(e);
 466
 467         while ((s = e->sources)) {
 468                 assert(s->floating);
 469                 source_disconnect(s);
 470                 sd_event_source_unref(s);
 471         }
 472
 473         assert(e->n_sources == 0);
 474
 475         if (e->default_event_ptr)
 476                 *(e->default_event_ptr) = NULL;
 477
 478         safe_close(e->epoll_fd);
 479         safe_close(e->watchdog_fd);
 480
 481         free_clock_data(&e->realtime);
 482         free_clock_data(&e->boottime);
 483         free_clock_data(&e->monotonic);
 484         free_clock_data(&e->realtime_alarm);
 485         free_clock_data(&e->boottime_alarm);
 486
 487         prioq_free(e->pending);
 488         prioq_free(e->prepare);
 489         prioq_free(e->exit);
 490
 491         free(e->signal_sources);
 492         hashmap_free(e->signal_data);
 493
 494         hashmap_free(e->inotify_data);
 495
 496         hashmap_free(e->child_sources);
 497         set_free(e->post_sources);
 498         free(e);
 499 }
 500
 501 _public_ int sd_event_new(sd_event** ret) {
 502         sd_event *e;
 503         int r;
 504
 505         assert_return(ret, -EINVAL);
 506
 507         e = new0(sd_event, 1);
 508         if (!e)
 509                 return -ENOMEM;
 510
 511         e->n_ref = 1;
 512         e->watchdog_fd = e->epoll_fd = e->realtime.fd = e->boottime.fd = e->monotonic.fd = e->realtime_alarm.fd = e->boottime_alarm.fd = -1;
 513         e->realtime.next = e->boottime.next = e->monotonic.next = e->realtime_alarm.next = e->boottime_alarm.next = USEC_INFINITY;
 514         e->realtime.wakeup = e->boottime.wakeup = e->monotonic.wakeup = e->realtime_alarm.wakeup = e->boottime_alarm.wakeup = WAKEUP_CLOCK_DATA;
 515         e->original_pid = getpid_cached();
 516         e->perturb = USEC_INFINITY;
 517
 518         r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
 519         if (r < 0)
 520                 goto fail;
 521
 522         e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
 523         if (e->epoll_fd < 0) {
 524                 r = -errno;
 525                 goto fail;
 526         }
 527
 528         e->epoll_fd = fd_move_above_stdio(e->epoll_fd);
 529
 530         if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
 531                 log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 ... 2^63 us will be logged every 5s.");
 532                 e->profile_delays = true;
 533         }
 534
 535         *ret = e;
 536         return 0;
 537
 538 fail:
 539         event_free(e);
 540         return r;
 541 }
 542
 543 _public_ sd_event* sd_event_ref(sd_event *e) {
 544
 545         if (!e)
 546                 return NULL;
 547
 548         assert(e->n_ref >= 1);
 549         e->n_ref++;
 550
 551         return e;
 552 }
 553
 554 _public_ sd_event* sd_event_unref(sd_event *e) {
 555
 556         if (!e)
 557                 return NULL;
 558
 559         assert(e->n_ref >= 1);
 560         e->n_ref--;
 561
 562         if (e->n_ref <= 0)
 563                 event_free(e);
 564
 565         return NULL;
 566 }
 567
 568 static bool event_pid_changed(sd_event *e) {
 569         assert(e);
 570
 571         /* We don't support people creating an event loop and keeping
 572          * it around over a fork(). Let's complain. */
 573
 574         return e->original_pid != getpid_cached();
 575 }
 576
 577 static void source_io_unregister(sd_event_source *s) {
 578         int r;
 579
 580         assert(s);
 581         assert(s->type == SOURCE_IO);
 582
 583         if (event_pid_changed(s->event))
 584                 return;
 585
 586         if (!s->io.registered)
 587                 return;
 588
 589         r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL);
 590         if (r < 0)
 591                 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m",
 592                                 strna(s->description), event_source_type_to_string(s->type));
 593
 594         s->io.registered = false;
 595 }
 596
 597 static int source_io_register(
 598                 sd_event_source *s,
 599                 int enabled,
 600                 uint32_t events) {
 601
 602         struct epoll_event ev;
 603         int r;
 604
 605         assert(s);
 606         assert(s->type == SOURCE_IO);
 607         assert(enabled != SD_EVENT_OFF);
 608
 609         ev = (struct epoll_event) {
 610                 .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
 611                 .data.ptr = s,
 612         };
 613
 614         if (s->io.registered)
 615                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->io.fd, &ev);
 616         else
 617                 r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->io.fd, &ev);
 618         if (r < 0)
 619                 return -errno;
 620
 621         s->io.registered = true;
 622
 623         return 0;
 624 }
 625
 626 static clockid_t event_source_type_to_clock(EventSourceType t) {
 627
 628         switch (t) {
 629
 630         case SOURCE_TIME_REALTIME:
 631                 return CLOCK_REALTIME;
 632
 633         case SOURCE_TIME_BOOTTIME:
 634                 return CLOCK_BOOTTIME;
 635
 636         case SOURCE_TIME_MONOTONIC:
 637                 return CLOCK_MONOTONIC;
 638
 639         case SOURCE_TIME_REALTIME_ALARM:
 640                 return CLOCK_REALTIME_ALARM;
 641
 642         case SOURCE_TIME_BOOTTIME_ALARM:
 643                 return CLOCK_BOOTTIME_ALARM;
 644
 645         default:
 646                 return (clockid_t) -1;
 647         }
 648 }
 649
 650 static EventSourceType clock_to_event_source_type(clockid_t clock) {
 651
 652         switch (clock) {
 653
 654         case CLOCK_REALTIME:
 655                 return SOURCE_TIME_REALTIME;
 656
 657         case CLOCK_BOOTTIME:
 658                 return SOURCE_TIME_BOOTTIME;
 659
 660         case CLOCK_MONOTONIC:
 661                 return SOURCE_TIME_MONOTONIC;
 662
 663         case CLOCK_REALTIME_ALARM:
 664                 return SOURCE_TIME_REALTIME_ALARM;
 665
 666         case CLOCK_BOOTTIME_ALARM:
 667                 return SOURCE_TIME_BOOTTIME_ALARM;
 668
 669         default:
 670                 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
 671         }
 672 }
 673
 674 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
 675         assert(e);
 676
 677         switch (t) {
 678
 679         case SOURCE_TIME_REALTIME:
 680                 return &e->realtime;
 681
 682         case SOURCE_TIME_BOOTTIME:
 683                 return &e->boottime;
 684
 685         case SOURCE_TIME_MONOTONIC:
 686                 return &e->monotonic;
 687
 688         case SOURCE_TIME_REALTIME_ALARM:
 689                 return &e->realtime_alarm;
 690
 691         case SOURCE_TIME_BOOTTIME_ALARM:
 692                 return &e->boottime_alarm;
 693
 694         default:
 695                 return NULL;
 696         }
 697 }
 698
 699 static int event_make_signal_data(
 700                 sd_event *e,
 701                 int sig,
 702                 struct signal_data **ret) {
 703
 704         struct epoll_event ev;
 705         struct signal_data *d;
 706         bool added = false;
 707         sigset_t ss_copy;
 708         int64_t priority;
 709         int r;
 710
 711         assert(e);
 712
 713         if (event_pid_changed(e))
 714                 return -ECHILD;
 715
 716         if (e->signal_sources && e->signal_sources[sig])
 717                 priority = e->signal_sources[sig]->priority;
 718         else
 719                 priority = SD_EVENT_PRIORITY_NORMAL;
 720
 721         d = hashmap_get(e->signal_data, &priority);
 722         if (d) {
 723                 if (sigismember(&d->sigset, sig) > 0) {
 724                         if (ret)
 725                                 *ret = d;
 726                         return 0;
 727                 }
 728         } else {
 729                 r = hashmap_ensure_allocated(&e->signal_data, &uint64_hash_ops);
 730                 if (r < 0)
 731                         return r;
 732
 733                 d = new0(struct signal_data, 1);
 734                 if (!d)
 735                         return -ENOMEM;
 736
 737                 d->wakeup = WAKEUP_SIGNAL_DATA;
 738                 d->fd  = -1;
 739                 d->priority = priority;
 740
 741                 r = hashmap_put(e->signal_data, &d->priority, d);
 742                 if (r < 0) {
 743                         free(d);
 744                         return r;
 745                 }
 746
 747                 added = true;
 748         }
 749
 750         ss_copy = d->sigset;
 751         assert_se(sigaddset(&ss_copy, sig) >= 0);
 752
 753         r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
 754         if (r < 0) {
 755                 r = -errno;
 756                 goto fail;
 757         }
 758
 759         d->sigset = ss_copy;
 760
 761         if (d->fd >= 0) {
 762                 if (ret)
 763                         *ret = d;
 764                 return 0;
 765         }
 766
 767         d->fd = fd_move_above_stdio(r);
 768
 769         ev = (struct epoll_event) {
 770                 .events = EPOLLIN,
 771                 .data.ptr = d,
 772         };
 773
 774         r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev);
 775         if (r < 0)  {
 776                 r = -errno;
 777                 goto fail;
 778         }
 779
 780         if (ret)
 781                 *ret = d;
 782
 783         return 0;
 784
 785 fail:
 786         if (added) {
 787                 d->fd = safe_close(d->fd);
 788                 hashmap_remove(e->signal_data, &d->priority);
 789                 free(d);
 790         }
 791
 792         return r;
 793 }
 794
 795 static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
 796         assert(e);
 797         assert(d);
 798
 799         /* Turns off the specified signal in the signal data
 800          * object. If the signal mask of the object becomes empty that
 801          * way removes it. */
 802
 803         if (sigismember(&d->sigset, sig) == 0)
 804                 return;
 805
 806         assert_se(sigdelset(&d->sigset, sig) >= 0);
 807
 808         if (sigisemptyset(&d->sigset)) {
 809
 810                 /* If all the mask is all-zero we can get rid of the structure */
 811                 hashmap_remove(e->signal_data, &d->priority);
 812                 safe_close(d->fd);
 813                 free(d);
 814                 return;
 815         }
 816
 817         assert(d->fd >= 0);
 818
 819         if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
 820                 log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
 821 }
 822
 823 static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
 824         struct signal_data *d;
 825         static const int64_t zero_priority = 0;
 826
 827         assert(e);
 828
 829         /* Rechecks if the specified signal is still something we are
 830          * interested in. If not, we'll unmask it, and possibly drop
 831          * the signalfd for it. */
 832
 833         if (sig == SIGCHLD &&
 834             e->n_enabled_child_sources > 0)
 835                 return;
 836
 837         if (e->signal_sources &&
 838             e->signal_sources[sig] &&
 839             e->signal_sources[sig]->enabled != SD_EVENT_OFF)
 840                 return;
 841
 842         /*
 843          * The specified signal might be enabled in three different queues:
 844          *
 845          * 1) the one that belongs to the priority passed (if it is non-NULL)
 846          * 2) the one that belongs to the priority of the event source of the signal (if there is one)
 847          * 3) the 0 priority (to cover the SIGCHLD case)
 848          *
 849          * Hence, let's remove it from all three here.
 850          */
 851
 852         if (priority) {
 853                 d = hashmap_get(e->signal_data, priority);
 854                 if (d)
 855                         event_unmask_signal_data(e, d, sig);
 856         }
 857
 858         if (e->signal_sources && e->signal_sources[sig]) {
 859                 d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
 860                 if (d)
 861                         event_unmask_signal_data(e, d, sig);
 862         }
 863
 864         d = hashmap_get(e->signal_data, &zero_priority);
 865         if (d)
 866                 event_unmask_signal_data(e, d, sig);
 867 }
 868
 869 static void source_disconnect(sd_event_source *s) {
 870         sd_event *event;
 871
 872         assert(s);
 873
 874         if (!s->event)
 875                 return;
 876
 877         assert(s->event->n_sources > 0);
 878
 879         switch (s->type) {
 880
 881         case SOURCE_IO:
 882                 if (s->io.fd >= 0)
 883                         source_io_unregister(s);
 884
 885                 break;
 886
 887         case SOURCE_TIME_REALTIME:
 888         case SOURCE_TIME_BOOTTIME:
 889         case SOURCE_TIME_MONOTONIC:
 890         case SOURCE_TIME_REALTIME_ALARM:
 891         case SOURCE_TIME_BOOTTIME_ALARM: {
 892                 struct clock_data *d;
 893
 894                 d = event_get_clock_data(s->event, s->type);
 895                 assert(d);
 896
 897                 prioq_remove(d->earliest, s, &s->time.earliest_index);
 898                 prioq_remove(d->latest, s, &s->time.latest_index);
 899                 d->needs_rearm = true;
 900                 break;
 901         }
 902
 903         case SOURCE_SIGNAL:
 904                 if (s->signal.sig > 0) {
 905
 906                         if (s->event->signal_sources)
 907                                 s->event->signal_sources[s->signal.sig] = NULL;
 908
 909                         event_gc_signal_data(s->event, &s->priority, s->signal.sig);
 910                 }
 911
 912                 break;
 913
 914         case SOURCE_CHILD:
 915                 if (s->child.pid > 0) {
 916                         if (s->enabled != SD_EVENT_OFF) {
 917                                 assert(s->event->n_enabled_child_sources > 0);
 918                                 s->event->n_enabled_child_sources--;
 919                         }
 920
 921                         (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
 922                         event_gc_signal_data(s->event, &s->priority, SIGCHLD);
 923                 }
 924
 925                 break;
 926
 927         case SOURCE_DEFER:
 928                 /* nothing */
 929                 break;
 930
 931         case SOURCE_POST:
 932                 set_remove(s->event->post_sources, s);
 933                 break;
 934
 935         case SOURCE_EXIT:
 936                 prioq_remove(s->event->exit, s, &s->exit.prioq_index);
 937                 break;
 938
 939         case SOURCE_INOTIFY: {
 940                 struct inode_data *inode_data;
 941
 942                 inode_data = s->inotify.inode_data;
 943                 if (inode_data) {
 944                         struct inotify_data *inotify_data;
 945                         assert_se(inotify_data = inode_data->inotify_data);
 946
 947                         /* Detach this event source from the inode object */
 948                         LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
 949                         s->inotify.inode_data = NULL;
 950
 951                         if (s->pending) {
 952                                 assert(inotify_data->n_pending > 0);
 953                                 inotify_data->n_pending--;
 954                         }
 955
 956                         /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode is
 957                          * continued to being watched. That's because inotify doesn't really have an API for that: we
 958                          * can only change watch masks with access to the original inode either by fd or by path. But
 959                          * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
 960                          * continously and keeping the mount busy which we can't really do. We could reconstruct the
 961                          * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
 962                          * there), but given the need for open_by_handle_at() which is privileged and not universally
 963                          * available this would be quite an incomplete solution. Hence we go the other way, leave the
 964                          * mask set, even if it is not minimized now, and ignore all events we aren't interested in
 965                          * anymore after reception. Yes, this sucks, but … Linux … */
 966
 967                         /* Maybe release the inode data (and its inotify) */
 968                         event_gc_inode_data(s->event, inode_data);
 969                 }
 970
 971                 break;
 972         }
 973
 974         default:
 975                 assert_not_reached("Wut? I shouldn't exist.");
 976         }
 977
 978         if (s->pending)
 979                 prioq_remove(s->event->pending, s, &s->pending_index);
 980
 981         if (s->prepare)
 982                 prioq_remove(s->event->prepare, s, &s->prepare_index);
 983
 984         event = s->event;
 985
 986         s->type = _SOURCE_EVENT_SOURCE_TYPE_INVALID;
 987         s->event = NULL;
 988         LIST_REMOVE(sources, event->sources, s);
 989         event->n_sources--;
 990
 991         if (!s->floating)
 992                 sd_event_unref(event);
 993 }
 994
 995 static void source_free(sd_event_source *s) {
 996         assert(s);
 997
 998         source_disconnect(s);
 999
1000         if (s->type == SOURCE_IO && s->io.owned)
1001                 safe_close(s->io.fd);
1002
1003         free(s->description);
1004         free(s);
1005 }
1006
1007 static int source_set_pending(sd_event_source *s, bool b) {
1008         int r;
1009
1010         assert(s);
1011         assert(s->type != SOURCE_EXIT);
1012
1013         if (s->pending == b)
1014                 return 0;
1015
1016         s->pending = b;
1017
1018         if (b) {
1019                 s->pending_iteration = s->event->iteration;
1020
1021                 r = prioq_put(s->event->pending, s, &s->pending_index);
1022                 if (r < 0) {
1023                         s->pending = false;
1024                         return r;
1025                 }
1026         } else
1027                 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
1028
1029         if (EVENT_SOURCE_IS_TIME(s->type)) {
1030                 struct clock_data *d;
1031
1032                 d = event_get_clock_data(s->event, s->type);
1033                 assert(d);
1034
1035                 prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
1036                 prioq_reshuffle(d->latest, s, &s->time.latest_index);
1037                 d->needs_rearm = true;
1038         }
1039
1040         if (s->type == SOURCE_SIGNAL && !b) {
1041                 struct signal_data *d;
1042
1043                 d = hashmap_get(s->event->signal_data, &s->priority);
1044                 if (d && d->current == s)
1045                         d->current = NULL;
1046         }
1047
1048         if (s->type == SOURCE_INOTIFY) {
1049
1050                 assert(s->inotify.inode_data);
1051                 assert(s->inotify.inode_data->inotify_data);
1052
1053                 if (b)
1054                         s->inotify.inode_data->inotify_data->n_pending ++;
1055                 else {
1056                         assert(s->inotify.inode_data->inotify_data->n_pending > 0);
1057                         s->inotify.inode_data->inotify_data->n_pending --;
1058                 }
1059         }
1060
1061         return 0;
1062 }
1063
1064 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
1065         sd_event_source *s;
1066
1067         assert(e);
1068
1069         s = new0(sd_event_source, 1);
1070         if (!s)
1071                 return NULL;
1072
1073         s->n_ref = 1;
1074         s->event = e;
1075         s->floating = floating;
1076         s->type = type;
1077         s->pending_index = s->prepare_index = PRIOQ_IDX_NULL;
1078
1079         if (!floating)
1080                 sd_event_ref(e);
1081
1082         LIST_PREPEND(sources, e->sources, s);
1083         e->n_sources++;
1084
1085         return s;
1086 }
1087
1088 _public_ int sd_event_add_io(
1089                 sd_event *e,
1090                 sd_event_source **ret,
1091                 int fd,
1092                 uint32_t events,
1093                 sd_event_io_handler_t callback,
1094                 void *userdata) {
1095
1096         sd_event_source *s;
1097         int r;
1098
1099         assert_return(e, -EINVAL);
1100         assert_return(e = event_resolve(e), -ENOPKG);
1101         assert_return(fd >= 0, -EBADF);
1102         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1103         assert_return(callback, -EINVAL);
1104         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1105         assert_return(!event_pid_changed(e), -ECHILD);
1106
1107         s = source_new(e, !ret, SOURCE_IO);
1108         if (!s)
1109                 return -ENOMEM;
1110
1111         s->wakeup = WAKEUP_EVENT_SOURCE;
1112         s->io.fd = fd;
1113         s->io.events = events;
1114         s->io.callback = callback;
1115         s->userdata = userdata;
1116         s->enabled = SD_EVENT_ON;
1117
1118         r = source_io_register(s, s->enabled, events);
1119         if (r < 0) {
1120                 source_free(s);
1121                 return r;
1122         }
1123
1124         if (ret)
1125                 *ret = s;
1126
1127         return 0;
1128 }
1129
1130 static void initialize_perturb(sd_event *e) {
1131         sd_id128_t bootid = {};
1132
1133         /* When we sleep for longer, we try to realign the wakeup to
1134            the same time wihtin each minute/second/250ms, so that
1135            events all across the system can be coalesced into a single
1136            CPU wakeup. However, let's take some system-specific
1137            randomness for this value, so that in a network of systems
1138            with synced clocks timer events are distributed a
1139            bit. Here, we calculate a perturbation usec offset from the
1140            boot ID. */
1141
1142         if (_likely_(e->perturb != USEC_INFINITY))
1143                 return;
1144
1145         if (sd_id128_get_boot(&bootid) >= 0)
1146                 e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
1147 }
1148
1149 static int event_setup_timer_fd(
1150                 sd_event *e,
1151                 struct clock_data *d,
1152                 clockid_t clock) {
1153
1154         struct epoll_event ev;
1155         int r, fd;
1156
1157         assert(e);
1158         assert(d);
1159
1160         if (_likely_(d->fd >= 0))
1161                 return 0;
1162
1163         fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1164         if (fd < 0)
1165                 return -errno;
1166
1167         fd = fd_move_above_stdio(fd);
1168
1169         ev = (struct epoll_event) {
1170                 .events = EPOLLIN,
1171                 .data.ptr = d,
1172         };
1173
1174         r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev);
1175         if (r < 0) {
1176                 safe_close(fd);
1177                 return -errno;
1178         }
1179
1180         d->fd = fd;
1181         return 0;
1182 }
1183
1184 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1185         assert(s);
1186
1187         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1188 }
1189
1190 _public_ int sd_event_add_time(
1191                 sd_event *e,
1192                 sd_event_source **ret,
1193                 clockid_t clock,
1194                 uint64_t usec,
1195                 uint64_t accuracy,
1196                 sd_event_time_handler_t callback,
1197                 void *userdata) {
1198
1199         EventSourceType type;
1200         sd_event_source *s;
1201         struct clock_data *d;
1202         int r;
1203
1204         assert_return(e, -EINVAL);
1205         assert_return(e = event_resolve(e), -ENOPKG);
1206         assert_return(accuracy != (uint64_t) -1, -EINVAL);
1207         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1208         assert_return(!event_pid_changed(e), -ECHILD);
1209
1210         if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
1211                 return -EOPNOTSUPP;
1212
1213         type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
1214         if (type < 0)
1215                 return -EOPNOTSUPP;
1216
1217         if (!callback)
1218                 callback = time_exit_callback;
1219
1220         d = event_get_clock_data(e, type);
1221         assert(d);
1222
1223         r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
1224         if (r < 0)
1225                 return r;
1226
1227         r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
1228         if (r < 0)
1229                 return r;
1230
1231         if (d->fd < 0) {
1232                 r = event_setup_timer_fd(e, d, clock);
1233                 if (r < 0)
1234                         return r;
1235         }
1236
1237         s = source_new(e, !ret, type);
1238         if (!s)
1239                 return -ENOMEM;
1240
1241         s->time.next = usec;
1242         s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
1243         s->time.callback = callback;
1244         s->time.earliest_index = s->time.latest_index = PRIOQ_IDX_NULL;
1245         s->userdata = userdata;
1246         s->enabled = SD_EVENT_ONESHOT;
1247
1248         d->needs_rearm = true;
1249
1250         r = prioq_put(d->earliest, s, &s->time.earliest_index);
1251         if (r < 0)
1252                 goto fail;
1253
1254         r = prioq_put(d->latest, s, &s->time.latest_index);
1255         if (r < 0)
1256                 goto fail;
1257
1258         if (ret)
1259                 *ret = s;
1260
1261         return 0;
1262
1263 fail:
1264         source_free(s);
1265         return r;
1266 }
1267
1268 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1269         assert(s);
1270
1271         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1272 }
1273
1274 _public_ int sd_event_add_signal(
1275                 sd_event *e,
1276                 sd_event_source **ret,
1277                 int sig,
1278                 sd_event_signal_handler_t callback,
1279                 void *userdata) {
1280
1281         sd_event_source *s;
1282         struct signal_data *d;
1283         sigset_t ss;
1284         int r;
1285
1286         assert_return(e, -EINVAL);
1287         assert_return(e = event_resolve(e), -ENOPKG);
1288         assert_return(SIGNAL_VALID(sig), -EINVAL);
1289         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1290         assert_return(!event_pid_changed(e), -ECHILD);
1291
1292         if (!callback)
1293                 callback = signal_exit_callback;
1294
1295         r = pthread_sigmask(SIG_SETMASK, NULL, &ss);
1296         if (r != 0)
1297                 return -r;
1298
1299         if (!sigismember(&ss, sig))
1300                 return -EBUSY;
1301
1302         if (!e->signal_sources) {
1303                 e->signal_sources = new0(sd_event_source*, _NSIG);
1304                 if (!e->signal_sources)
1305                         return -ENOMEM;
1306         } else if (e->signal_sources[sig])
1307                 return -EBUSY;
1308
1309         s = source_new(e, !ret, SOURCE_SIGNAL);
1310         if (!s)
1311                 return -ENOMEM;
1312
1313         s->signal.sig = sig;
1314         s->signal.callback = callback;
1315         s->userdata = userdata;
1316         s->enabled = SD_EVENT_ON;
1317
1318         e->signal_sources[sig] = s;
1319
1320         r = event_make_signal_data(e, sig, &d);
1321         if (r < 0) {
1322                 source_free(s);
1323                 return r;
1324         }
1325
1326         /* Use the signal name as description for the event source by default */
1327         (void) sd_event_source_set_description(s, signal_to_string(sig));
1328
1329         if (ret)
1330                 *ret = s;
1331
1332         return 0;
1333 }
1334
1335 _public_ int sd_event_add_child(
1336                 sd_event *e,
1337                 sd_event_source **ret,
1338                 pid_t pid,
1339                 int options,
1340                 sd_event_child_handler_t callback,
1341                 void *userdata) {
1342
1343         sd_event_source *s;
1344         int r;
1345
1346         assert_return(e, -EINVAL);
1347         assert_return(e = event_resolve(e), -ENOPKG);
1348         assert_return(pid > 1, -EINVAL);
1349         assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1350         assert_return(options != 0, -EINVAL);
1351         assert_return(callback, -EINVAL);
1352         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1353         assert_return(!event_pid_changed(e), -ECHILD);
1354
1355         r = hashmap_ensure_allocated(&e->child_sources, NULL);
1356         if (r < 0)
1357                 return r;
1358
1359         if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1360                 return -EBUSY;
1361
1362         s = source_new(e, !ret, SOURCE_CHILD);
1363         if (!s)
1364                 return -ENOMEM;
1365
1366         s->child.pid = pid;
1367         s->child.options = options;
1368         s->child.callback = callback;
1369         s->userdata = userdata;
1370         s->enabled = SD_EVENT_ONESHOT;
1371
1372         r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1373         if (r < 0) {
1374                 source_free(s);
1375                 return r;
1376         }
1377
1378         e->n_enabled_child_sources++;
1379
1380         r = event_make_signal_data(e, SIGCHLD, NULL);
1381         if (r < 0) {
1382                 e->n_enabled_child_sources--;
1383                 source_free(s);
1384                 return r;
1385         }
1386
1387         e->need_process_child = true;
1388
1389         if (ret)
1390                 *ret = s;
1391
1392         return 0;
1393 }
1394
1395 _public_ int sd_event_add_defer(
1396                 sd_event *e,
1397                 sd_event_source **ret,
1398                 sd_event_handler_t callback,
1399                 void *userdata) {
1400
1401         sd_event_source *s;
1402         int r;
1403
1404         assert_return(e, -EINVAL);
1405         assert_return(e = event_resolve(e), -ENOPKG);
1406         assert_return(callback, -EINVAL);
1407         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1408         assert_return(!event_pid_changed(e), -ECHILD);
1409
1410         s = source_new(e, !ret, SOURCE_DEFER);
1411         if (!s)
1412                 return -ENOMEM;
1413
1414         s->defer.callback = callback;
1415         s->userdata = userdata;
1416         s->enabled = SD_EVENT_ONESHOT;
1417
1418         r = source_set_pending(s, true);
1419         if (r < 0) {
1420                 source_free(s);
1421                 return r;
1422         }
1423
1424         if (ret)
1425                 *ret = s;
1426
1427         return 0;
1428 }
1429
1430 _public_ int sd_event_add_post(
1431                 sd_event *e,
1432                 sd_event_source **ret,
1433                 sd_event_handler_t callback,
1434                 void *userdata) {
1435
1436         sd_event_source *s;
1437         int r;
1438
1439         assert_return(e, -EINVAL);
1440         assert_return(e = event_resolve(e), -ENOPKG);
1441         assert_return(callback, -EINVAL);
1442         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1443         assert_return(!event_pid_changed(e), -ECHILD);
1444
1445         r = set_ensure_allocated(&e->post_sources, NULL);
1446         if (r < 0)
1447                 return r;
1448
1449         s = source_new(e, !ret, SOURCE_POST);
1450         if (!s)
1451                 return -ENOMEM;
1452
1453         s->post.callback = callback;
1454         s->userdata = userdata;
1455         s->enabled = SD_EVENT_ON;
1456
1457         r = set_put(e->post_sources, s);
1458         if (r < 0) {
1459                 source_free(s);
1460                 return r;
1461         }
1462
1463         if (ret)
1464                 *ret = s;
1465
1466         return 0;
1467 }
1468
1469 _public_ int sd_event_add_exit(
1470                 sd_event *e,
1471                 sd_event_source **ret,
1472                 sd_event_handler_t callback,
1473                 void *userdata) {
1474
1475         sd_event_source *s;
1476         int r;
1477
1478         assert_return(e, -EINVAL);
1479         assert_return(e = event_resolve(e), -ENOPKG);
1480         assert_return(callback, -EINVAL);
1481         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1482         assert_return(!event_pid_changed(e), -ECHILD);
1483
1484         r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1485         if (r < 0)
1486                 return r;
1487
1488         s = source_new(e, !ret, SOURCE_EXIT);
1489         if (!s)
1490                 return -ENOMEM;
1491
1492         s->exit.callback = callback;
1493         s->userdata = userdata;
1494         s->exit.prioq_index = PRIOQ_IDX_NULL;
1495         s->enabled = SD_EVENT_ONESHOT;
1496
1497         r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1498         if (r < 0) {
1499                 source_free(s);
1500                 return r;
1501         }
1502
1503         if (ret)
1504                 *ret = s;
1505
1506         return 0;
1507 }
1508
1509 static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
1510         assert(e);
1511
1512         if (!d)
1513                 return;
1514
1515         assert(hashmap_isempty(d->inodes));
1516         assert(hashmap_isempty(d->wd));
1517
1518         if (d->buffer_filled > 0)
1519                 LIST_REMOVE(buffered, e->inotify_data_buffered, d);
1520
1521         hashmap_free(d->inodes);
1522         hashmap_free(d->wd);
1523
1524         assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
1525
1526         if (d->fd >= 0) {
1527                 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
1528                         log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
1529
1530                 safe_close(d->fd);
1531         }
1532         free(d);
1533 }
1534
1535 static int event_make_inotify_data(
1536                 sd_event *e,
1537                 int64_t priority,
1538                 struct inotify_data **ret) {
1539
1540         _cleanup_close_ int fd = -1;
1541         struct inotify_data *d;
1542         struct epoll_event ev;
1543         int r;
1544
1545         assert(e);
1546
1547         d = hashmap_get(e->inotify_data, &priority);
1548         if (d) {
1549                 if (ret)
1550                         *ret = d;
1551                 return 0;
1552         }
1553
1554         fd = inotify_init1(IN_NONBLOCK|O_CLOEXEC);
1555         if (fd < 0)
1556                 return -errno;
1557
1558         fd = fd_move_above_stdio(fd);
1559
1560         r = hashmap_ensure_allocated(&e->inotify_data, &uint64_hash_ops);
1561         if (r < 0)
1562                 return r;
1563
1564         d = new(struct inotify_data, 1);
1565         if (!d)
1566                 return -ENOMEM;
1567
1568         *d = (struct inotify_data) {
1569                 .wakeup = WAKEUP_INOTIFY_DATA,
1570                 .fd = TAKE_FD(fd),
1571                 .priority = priority,
1572         };
1573
1574         r = hashmap_put(e->inotify_data, &d->priority, d);
1575         if (r < 0) {
1576                 d->fd = safe_close(d->fd);
1577                 free(d);
1578                 return r;
1579         }
1580
1581         ev = (struct epoll_event) {
1582                 .events = EPOLLIN,
1583                 .data.ptr = d,
1584         };
1585
1586         if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
1587                 r = -errno;
1588                 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
1589                                             * remove the fd from the epoll first, which we don't want as we couldn't
1590                                             * add it in the first place. */
1591                 event_free_inotify_data(e, d);
1592                 return r;
1593         }
1594
1595         if (ret)
1596                 *ret = d;
1597
1598         return 1;
1599 }
1600
1601 static int inode_data_compare(const void *a, const void *b) {
1602         const struct inode_data *x = a, *y = b;
1603
1604         assert(x);
1605         assert(y);
1606
1607         if (x->dev < y->dev)
1608                 return -1;
1609         if (x->dev > y->dev)
1610                 return 1;
1611
1612         if (x->ino < y->ino)
1613                 return -1;
1614         if (x->ino > y->ino)
1615                 return 1;
1616
1617         return 0;
1618 }
1619
1620 static void inode_data_hash_func(const void *p, struct siphash *state) {
1621         const struct inode_data *d = p;
1622
1623         assert(p);
1624
1625         siphash24_compress(&d->dev, sizeof(d->dev), state);
1626         siphash24_compress(&d->ino, sizeof(d->ino), state);
1627 }
1628
1629 const struct hash_ops inode_data_hash_ops = {
1630         .hash = inode_data_hash_func,
1631         .compare = inode_data_compare
1632 };
1633
1634 static void event_free_inode_data(
1635                 sd_event *e,
1636                 struct inode_data *d) {
1637
1638         assert(e);
1639
1640         if (!d)
1641                 return;
1642
1643         assert(!d->event_sources);
1644
1645         if (d->fd >= 0) {
1646                 LIST_REMOVE(to_close, e->inode_data_to_close, d);
1647                 safe_close(d->fd);
1648         }
1649
1650         if (d->inotify_data) {
1651
1652                 if (d->wd >= 0) {
1653                         if (d->inotify_data->fd >= 0) {
1654                                 /* So here's a problem. At the time this runs the watch descriptor might already be
1655                                  * invalidated, because an IN_IGNORED event might be queued right the moment we enter
1656                                  * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a very
1657                                  * likely case to happen. */
1658
1659                                 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
1660                                         log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
1661                         }
1662
1663                         assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
1664                 }
1665
1666                 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
1667         }
1668
1669         free(d);
1670 }
1671
1672 static void event_gc_inode_data(
1673                 sd_event *e,
1674                 struct inode_data *d) {
1675
1676         struct inotify_data *inotify_data;
1677
1678         assert(e);
1679
1680         if (!d)
1681                 return;
1682
1683         if (d->event_sources)
1684                 return;
1685
1686         inotify_data = d->inotify_data;
1687         event_free_inode_data(e, d);
1688
1689         if (inotify_data && hashmap_isempty(inotify_data->inodes))
1690                 event_free_inotify_data(e, inotify_data);
1691 }
1692
1693 static int event_make_inode_data(
1694                 sd_event *e,
1695                 struct inotify_data *inotify_data,
1696                 dev_t dev,
1697                 ino_t ino,
1698                 struct inode_data **ret) {
1699
1700         struct inode_data *d, key;
1701         int r;
1702
1703         assert(e);
1704         assert(inotify_data);
1705
1706         key = (struct inode_data) {
1707                 .ino = ino,
1708                 .dev = dev,
1709         };
1710
1711         d = hashmap_get(inotify_data->inodes, &key);
1712         if (d) {
1713                 if (ret)
1714                         *ret = d;
1715
1716                 return 0;
1717         }
1718
1719         r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
1720         if (r < 0)
1721                 return r;
1722
1723         d = new(struct inode_data, 1);
1724         if (!d)
1725                 return -ENOMEM;
1726
1727         *d = (struct inode_data) {
1728                 .dev = dev,
1729                 .ino = ino,
1730                 .wd = -1,
1731                 .fd = -1,
1732                 .inotify_data = inotify_data,
1733         };
1734
1735         r = hashmap_put(inotify_data->inodes, d, d);
1736         if (r < 0) {
1737                 free(d);
1738                 return r;
1739         }
1740
1741         if (ret)
1742                 *ret = d;
1743
1744         return 1;
1745 }
1746
1747 static uint32_t inode_data_determine_mask(struct inode_data *d) {
1748         bool excl_unlink = true;
1749         uint32_t combined = 0;
1750         sd_event_source *s;
1751
1752         assert(d);
1753
1754         /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
1755          * the IN_EXCL_UNLINK flag is ANDed instead.
1756          *
1757          * Note that we add all sources to the mask here, regardless whether enabled, disabled or oneshot. That's
1758          * because we cannot change the mask anymore after the event source was created once, since the kernel has no
1759          * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and supress
1760          * events we don't care for client-side. */
1761
1762         LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
1763
1764                 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
1765                         excl_unlink = false;
1766
1767                 combined |= s->inotify.mask;
1768         }
1769
1770         return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
1771 }
1772
1773 static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
1774         uint32_t combined_mask;
1775         int wd, r;
1776
1777         assert(d);
1778         assert(d->fd >= 0);
1779
1780         combined_mask = inode_data_determine_mask(d);
1781
1782         if (d->wd >= 0 && combined_mask == d->combined_mask)
1783                 return 0;
1784
1785         r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
1786         if (r < 0)
1787                 return r;
1788
1789         wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
1790         if (wd < 0)
1791                 return -errno;
1792
1793         if (d->wd < 0) {
1794                 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
1795                 if (r < 0) {
1796                         (void) inotify_rm_watch(d->inotify_data->fd, wd);
1797                         return r;
1798                 }
1799
1800                 d->wd = wd;
1801
1802         } else if (d->wd != wd) {
1803
1804                 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
1805                 (void) inotify_rm_watch(d->fd, wd);
1806                 return -EINVAL;
1807         }
1808
1809         d->combined_mask = combined_mask;
1810         return 1;
1811 }
1812
1813 _public_ int sd_event_add_inotify(
1814                 sd_event *e,
1815                 sd_event_source **ret,
1816                 const char *path,
1817                 uint32_t mask,
1818                 sd_event_inotify_handler_t callback,
1819                 void *userdata) {
1820
1821         bool rm_inotify = false, rm_inode = false;
1822         struct inotify_data *inotify_data = NULL;
1823         struct inode_data *inode_data = NULL;
1824         _cleanup_close_ int fd = -1;
1825         sd_event_source *s;
1826         struct stat st;
1827         int r;
1828
1829         assert_return(e, -EINVAL);
1830         assert_return(e = event_resolve(e), -ENOPKG);
1831         assert_return(path, -EINVAL);
1832         assert_return(callback, -EINVAL);
1833         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1834         assert_return(!event_pid_changed(e), -ECHILD);
1835
1836         /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
1837          * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
1838          * the user can't use them for us. */
1839         if (mask & IN_MASK_ADD)
1840                 return -EINVAL;
1841
1842         fd = open(path, O_PATH|O_CLOEXEC|
1843                   (mask & IN_ONLYDIR ? O_DIRECTORY : 0)|
1844                   (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
1845         if (fd < 0)
1846                 return -errno;
1847
1848         if (fstat(fd, &st) < 0)
1849                 return -errno;
1850
1851         s = source_new(e, !ret, SOURCE_INOTIFY);
1852         if (!s)
1853                 return -ENOMEM;
1854
1855         s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
1856         s->inotify.mask = mask;
1857         s->inotify.callback = callback;
1858         s->userdata = userdata;
1859
1860         /* Allocate an inotify object for this priority, and an inode object within it */
1861         r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
1862         if (r < 0)
1863                 goto fail;
1864         rm_inotify = r > 0;
1865
1866         r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
1867         if (r < 0)
1868                 goto fail;
1869         rm_inode = r > 0;
1870
1871         /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
1872          * the event source, until then, for which we need the original inode. */
1873         if (inode_data->fd < 0) {
1874                 inode_data->fd = TAKE_FD(fd);
1875                 LIST_PREPEND(to_close, e->inode_data_to_close, inode_data);
1876         }
1877
1878         /* Link our event source to the inode data object */
1879         LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
1880         s->inotify.inode_data = inode_data;
1881
1882         rm_inode = rm_inotify = false;
1883
1884         /* Actually realize the watch now */
1885         r = inode_data_realize_watch(e, inode_data);
1886         if (r < 0)
1887                 goto fail;
1888
1889         (void) sd_event_source_set_description(s, path);
1890
1891         if (ret)
1892                 *ret = s;
1893
1894         return 0;
1895
1896 fail:
1897         source_free(s);
1898
1899         if (rm_inode)
1900                 event_free_inode_data(e, inode_data);
1901
1902         if (rm_inotify)
1903                 event_free_inotify_data(e, inotify_data);
1904
1905         return r;
1906 }
1907
1908 _public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
1909
1910         if (!s)
1911                 return NULL;
1912
1913         assert(s->n_ref >= 1);
1914         s->n_ref++;
1915
1916         return s;
1917 }
1918
1919 _public_ sd_event_source* sd_event_source_unref(sd_event_source *s) {
1920
1921         if (!s)
1922                 return NULL;
1923
1924         assert(s->n_ref >= 1);
1925         s->n_ref--;
1926
1927         if (s->n_ref <= 0) {
1928                 /* Here's a special hack: when we are called from a
1929                  * dispatch handler we won't free the event source
1930                  * immediately, but we will detach the fd from the
1931                  * epoll. This way it is safe for the caller to unref
1932                  * the event source and immediately close the fd, but
1933                  * we still retain a valid event source object after
1934                  * the callback. */
1935
1936                 if (s->dispatching) {
1937                         if (s->type == SOURCE_IO)
1938                                 source_io_unregister(s);
1939
1940                         source_disconnect(s);
1941                 } else
1942                         source_free(s);
1943         }
1944
1945         return NULL;
1946 }
1947
1948 _public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
1949         assert_return(s, -EINVAL);
1950         assert_return(!event_pid_changed(s->event), -ECHILD);
1951
1952         return free_and_strdup(&s->description, description);
1953 }
1954
1955 _public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
1956         assert_return(s, -EINVAL);
1957         assert_return(description, -EINVAL);
1958         assert_return(s->description, -ENXIO);
1959         assert_return(!event_pid_changed(s->event), -ECHILD);
1960
1961         *description = s->description;
1962         return 0;
1963 }
1964
1965 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
1966         assert_return(s, NULL);
1967
1968         return s->event;
1969 }
1970
1971 _public_ int sd_event_source_get_pending(sd_event_source *s) {
1972         assert_return(s, -EINVAL);
1973         assert_return(s->type != SOURCE_EXIT, -EDOM);
1974         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
1975         assert_return(!event_pid_changed(s->event), -ECHILD);
1976
1977         return s->pending;
1978 }
1979
1980 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
1981         assert_return(s, -EINVAL);
1982         assert_return(s->type == SOURCE_IO, -EDOM);
1983         assert_return(!event_pid_changed(s->event), -ECHILD);
1984
1985         return s->io.fd;
1986 }
1987
1988 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
1989         int r;
1990
1991         assert_return(s, -EINVAL);
1992         assert_return(fd >= 0, -EBADF);
1993         assert_return(s->type == SOURCE_IO, -EDOM);
1994         assert_return(!event_pid_changed(s->event), -ECHILD);
1995
1996         if (s->io.fd == fd)
1997                 return 0;
1998
1999         if (s->enabled == SD_EVENT_OFF) {
2000                 s->io.fd = fd;
2001                 s->io.registered = false;
2002         } else {
2003                 int saved_fd;
2004
2005                 saved_fd = s->io.fd;
2006                 assert(s->io.registered);
2007
2008                 s->io.fd = fd;
2009                 s->io.registered = false;
2010
2011                 r = source_io_register(s, s->enabled, s->io.events);
2012                 if (r < 0) {
2013                         s->io.fd = saved_fd;
2014                         s->io.registered = true;
2015                         return r;
2016                 }
2017
2018                 epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
2019         }
2020
2021         return 0;
2022 }
2023
2024 _public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2025         assert_return(s, -EINVAL);
2026         assert_return(s->type == SOURCE_IO, -EDOM);
2027
2028         return s->io.owned;
2029 }
2030
2031 _public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2032         assert_return(s, -EINVAL);
2033         assert_return(s->type == SOURCE_IO, -EDOM);
2034
2035         s->io.owned = own;
2036         return 0;
2037 }
2038
2039 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
2040         assert_return(s, -EINVAL);
2041         assert_return(events, -EINVAL);
2042         assert_return(s->type == SOURCE_IO, -EDOM);
2043         assert_return(!event_pid_changed(s->event), -ECHILD);
2044
2045         *events = s->io.events;
2046         return 0;
2047 }
2048
2049 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
2050         int r;
2051
2052         assert_return(s, -EINVAL);
2053         assert_return(s->type == SOURCE_IO, -EDOM);
2054         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
2055         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2056         assert_return(!event_pid_changed(s->event), -ECHILD);
2057
2058         /* edge-triggered updates are never skipped, so we can reset edges */
2059         if (s->io.events == events && !(events & EPOLLET))
2060                 return 0;
2061
2062         r = source_set_pending(s, false);
2063         if (r < 0)
2064                 return r;
2065
2066         if (s->enabled != SD_EVENT_OFF) {
2067                 r = source_io_register(s, s->enabled, events);
2068                 if (r < 0)
2069                         return r;
2070         }
2071
2072         s->io.events = events;
2073
2074         return 0;
2075 }
2076
2077 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
2078         assert_return(s, -EINVAL);
2079         assert_return(revents, -EINVAL);
2080         assert_return(s->type == SOURCE_IO, -EDOM);
2081         assert_return(s->pending, -ENODATA);
2082         assert_return(!event_pid_changed(s->event), -ECHILD);
2083
2084         *revents = s->io.revents;
2085         return 0;
2086 }
2087
2088 _public_ int sd_event_source_get_signal(sd_event_source *s) {
2089         assert_return(s, -EINVAL);
2090         assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2091         assert_return(!event_pid_changed(s->event), -ECHILD);
2092
2093         return s->signal.sig;
2094 }
2095
2096 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
2097         assert_return(s, -EINVAL);
2098         assert_return(!event_pid_changed(s->event), -ECHILD);
2099
2100         *priority = s->priority;
2101         return 0;
2102 }
2103
2104 _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
2105         bool rm_inotify = false, rm_inode = false;
2106         struct inotify_data *new_inotify_data = NULL;
2107         struct inode_data *new_inode_data = NULL;
2108         int r;
2109
2110         assert_return(s, -EINVAL);
2111         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2112         assert_return(!event_pid_changed(s->event), -ECHILD);
2113
2114         if (s->priority == priority)
2115                 return 0;
2116
2117         if (s->type == SOURCE_INOTIFY) {
2118                 struct inode_data *old_inode_data;
2119
2120                 assert(s->inotify.inode_data);
2121                 old_inode_data = s->inotify.inode_data;
2122
2123                 /* We need the original fd to change the priority. If we don't have it we can't change the priority,
2124                  * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2125                  * events we allow priority changes only until the first following iteration. */
2126                 if (old_inode_data->fd < 0)
2127                         return -EOPNOTSUPP;
2128
2129                 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2130                 if (r < 0)
2131                         return r;
2132                 rm_inotify = r > 0;
2133
2134                 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2135                 if (r < 0)
2136                         goto fail;
2137                 rm_inode = r > 0;
2138
2139                 if (new_inode_data->fd < 0) {
2140                         /* Duplicate the fd for the new inode object if we don't have any yet */
2141                         new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2142                         if (new_inode_data->fd < 0) {
2143                                 r = -errno;
2144                                 goto fail;
2145                         }
2146
2147                         LIST_PREPEND(to_close, s->event->inode_data_to_close, new_inode_data);
2148                 }
2149
2150                 /* Move the event source to the new inode data structure */
2151                 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2152                 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2153                 s->inotify.inode_data = new_inode_data;
2154
2155                 /* Now create the new watch */
2156                 r = inode_data_realize_watch(s->event, new_inode_data);
2157                 if (r < 0) {
2158                         /* Move it back */
2159                         LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2160                         LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2161                         s->inotify.inode_data = old_inode_data;
2162                         goto fail;
2163                 }
2164
2165                 s->priority = priority;
2166
2167                 event_gc_inode_data(s->event, old_inode_data);
2168
2169         } else if (s->type == SOURCE_SIGNAL && s->enabled != SD_EVENT_OFF) {
2170                 struct signal_data *old, *d;
2171
2172                 /* Move us from the signalfd belonging to the old
2173                  * priority to the signalfd of the new priority */
2174
2175                 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2176
2177                 s->priority = priority;
2178
2179                 r = event_make_signal_data(s->event, s->signal.sig, &d);
2180                 if (r < 0) {
2181                         s->priority = old->priority;
2182                         return r;
2183                 }
2184
2185                 event_unmask_signal_data(s->event, old, s->signal.sig);
2186         } else
2187                 s->priority = priority;
2188
2189         if (s->pending)
2190                 prioq_reshuffle(s->event->pending, s, &s->pending_index);
2191
2192         if (s->prepare)
2193                 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
2194
2195         if (s->type == SOURCE_EXIT)
2196                 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2197
2198         return 0;
2199
2200 fail:
2201         if (rm_inode)
2202                 event_free_inode_data(s->event, new_inode_data);
2203
2204         if (rm_inotify)
2205                 event_free_inotify_data(s->event, new_inotify_data);
2206
2207         return r;
2208 }
2209
2210 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
2211         assert_return(s, -EINVAL);
2212         assert_return(m, -EINVAL);
2213         assert_return(!event_pid_changed(s->event), -ECHILD);
2214
2215         *m = s->enabled;
2216         return 0;
2217 }
2218
2219 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
2220         int r;
2221
2222         assert_return(s, -EINVAL);
2223         assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
2224         assert_return(!event_pid_changed(s->event), -ECHILD);
2225
2226         /* If we are dead anyway, we are fine with turning off
2227          * sources, but everything else needs to fail. */
2228         if (s->event->state == SD_EVENT_FINISHED)
2229                 return m == SD_EVENT_OFF ? 0 : -ESTALE;
2230
2231         if (s->enabled == m)
2232                 return 0;
2233
2234         if (m == SD_EVENT_OFF) {
2235
2236                 /* Unset the pending flag when this event source is disabled */
2237                 if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2238                         r = source_set_pending(s, false);
2239                         if (r < 0)
2240                                 return r;
2241                 }
2242
2243                 switch (s->type) {
2244
2245                 case SOURCE_IO:
2246                         source_io_unregister(s);
2247                         s->enabled = m;
2248                         break;
2249
2250                 case SOURCE_TIME_REALTIME:
2251                 case SOURCE_TIME_BOOTTIME:
2252                 case SOURCE_TIME_MONOTONIC:
2253                 case SOURCE_TIME_REALTIME_ALARM:
2254                 case SOURCE_TIME_BOOTTIME_ALARM: {
2255                         struct clock_data *d;
2256
2257                         s->enabled = m;
2258                         d = event_get_clock_data(s->event, s->type);
2259                         assert(d);
2260
2261                         prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
2262                         prioq_reshuffle(d->latest, s, &s->time.latest_index);
2263                         d->needs_rearm = true;
2264                         break;
2265                 }
2266
2267                 case SOURCE_SIGNAL:
2268                         s->enabled = m;
2269
2270                         event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2271                         break;
2272
2273                 case SOURCE_CHILD:
2274                         s->enabled = m;
2275
2276                         assert(s->event->n_enabled_child_sources > 0);
2277                         s->event->n_enabled_child_sources--;
2278
2279                         event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2280                         break;
2281
2282                 case SOURCE_EXIT:
2283                         s->enabled = m;
2284                         prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2285                         break;
2286
2287                 case SOURCE_DEFER:
2288                 case SOURCE_POST:
2289                 case SOURCE_INOTIFY:
2290                         s->enabled = m;
2291                         break;
2292
2293                 default:
2294                         assert_not_reached("Wut? I shouldn't exist.");
2295                 }
2296
2297         } else {
2298
2299                 /* Unset the pending flag when this event source is enabled */
2300                 if (s->enabled == SD_EVENT_OFF && !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2301                         r = source_set_pending(s, false);
2302                         if (r < 0)
2303                                 return r;
2304                 }
2305
2306                 switch (s->type) {
2307
2308                 case SOURCE_IO:
2309                         r = source_io_register(s, m, s->io.events);
2310                         if (r < 0)
2311                                 return r;
2312
2313                         s->enabled = m;
2314                         break;
2315
2316                 case SOURCE_TIME_REALTIME:
2317                 case SOURCE_TIME_BOOTTIME:
2318                 case SOURCE_TIME_MONOTONIC:
2319                 case SOURCE_TIME_REALTIME_ALARM:
2320                 case SOURCE_TIME_BOOTTIME_ALARM: {
2321                         struct clock_data *d;
2322
2323                         s->enabled = m;
2324                         d = event_get_clock_data(s->event, s->type);
2325                         assert(d);
2326
2327                         prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
2328                         prioq_reshuffle(d->latest, s, &s->time.latest_index);
2329                         d->needs_rearm = true;
2330                         break;
2331                 }
2332
2333                 case SOURCE_SIGNAL:
2334
2335                         s->enabled = m;
2336
2337                         r = event_make_signal_data(s->event, s->signal.sig, NULL);
2338                         if (r < 0) {
2339                                 s->enabled = SD_EVENT_OFF;
2340                                 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2341                                 return r;
2342                         }
2343
2344                         break;
2345
2346                 case SOURCE_CHILD:
2347
2348                         if (s->enabled == SD_EVENT_OFF)
2349                                 s->event->n_enabled_child_sources++;
2350
2351                         s->enabled = m;
2352
2353                         r = event_make_signal_data(s->event, SIGCHLD, NULL);
2354                         if (r < 0) {
2355                                 s->enabled = SD_EVENT_OFF;
2356                                 s->event->n_enabled_child_sources--;
2357                                 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2358                                 return r;
2359                         }
2360
2361                         break;
2362
2363                 case SOURCE_EXIT:
2364                         s->enabled = m;
2365                         prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2366                         break;
2367
2368                 case SOURCE_DEFER:
2369                 case SOURCE_POST:
2370                 case SOURCE_INOTIFY:
2371                         s->enabled = m;
2372                         break;
2373
2374                 default:
2375                         assert_not_reached("Wut? I shouldn't exist.");
2376                 }
2377         }
2378
2379         if (s->pending)
2380                 prioq_reshuffle(s->event->pending, s, &s->pending_index);
2381
2382         if (s->prepare)
2383                 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
2384
2385         return 0;
2386 }
2387
2388 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
2389         assert_return(s, -EINVAL);
2390         assert_return(usec, -EINVAL);
2391         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2392         assert_return(!event_pid_changed(s->event), -ECHILD);
2393
2394         *usec = s->time.next;
2395         return 0;
2396 }
2397
2398 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
2399         struct clock_data *d;
2400         int r;
2401
2402         assert_return(s, -EINVAL);
2403         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2404         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2405         assert_return(!event_pid_changed(s->event), -ECHILD);
2406
2407         r = source_set_pending(s, false);
2408         if (r < 0)
2409                 return r;
2410
2411         s->time.next = usec;
2412
2413         d = event_get_clock_data(s->event, s->type);
2414         assert(d);
2415
2416         prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
2417         prioq_reshuffle(d->latest, s, &s->time.latest_index);
2418         d->needs_rearm = true;
2419
2420         return 0;
2421 }
2422
2423 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
2424         assert_return(s, -EINVAL);
2425         assert_return(usec, -EINVAL);
2426         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2427         assert_return(!event_pid_changed(s->event), -ECHILD);
2428
2429         *usec = s->time.accuracy;
2430         return 0;
2431 }
2432
2433 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
2434         struct clock_data *d;
2435         int r;
2436
2437         assert_return(s, -EINVAL);
2438         assert_return(usec != (uint64_t) -1, -EINVAL);
2439         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2440         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2441         assert_return(!event_pid_changed(s->event), -ECHILD);
2442
2443         r = source_set_pending(s, false);
2444         if (r < 0)
2445                 return r;
2446
2447         if (usec == 0)
2448                 usec = DEFAULT_ACCURACY_USEC;
2449
2450         s->time.accuracy = usec;
2451
2452         d = event_get_clock_data(s->event, s->type);
2453         assert(d);
2454
2455         prioq_reshuffle(d->latest, s, &s->time.latest_index);
2456         d->needs_rearm = true;
2457
2458         return 0;
2459 }
2460
2461 _public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
2462         assert_return(s, -EINVAL);
2463         assert_return(clock, -EINVAL);
2464         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2465         assert_return(!event_pid_changed(s->event), -ECHILD);
2466
2467         *clock = event_source_type_to_clock(s->type);
2468         return 0;
2469 }
2470
2471 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
2472         assert_return(s, -EINVAL);
2473         assert_return(pid, -EINVAL);
2474         assert_return(s->type == SOURCE_CHILD, -EDOM);
2475         assert_return(!event_pid_changed(s->event), -ECHILD);
2476
2477         *pid = s->child.pid;
2478         return 0;
2479 }
2480
2481 _public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
2482         assert_return(s, -EINVAL);
2483         assert_return(mask, -EINVAL);
2484         assert_return(s->type == SOURCE_INOTIFY, -EDOM);
2485         assert_return(!event_pid_changed(s->event), -ECHILD);
2486
2487         *mask = s->inotify.mask;
2488         return 0;
2489 }
2490
2491 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
2492         int r;
2493
2494         assert_return(s, -EINVAL);
2495         assert_return(s->type != SOURCE_EXIT, -EDOM);
2496         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2497         assert_return(!event_pid_changed(s->event), -ECHILD);
2498
2499         if (s->prepare == callback)
2500                 return 0;
2501
2502         if (callback && s->prepare) {
2503                 s->prepare = callback;
2504                 return 0;
2505         }
2506
2507         r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
2508         if (r < 0)
2509                 return r;
2510
2511         s->prepare = callback;
2512
2513         if (callback) {
2514                 r = prioq_put(s->event->prepare, s, &s->prepare_index);
2515                 if (r < 0)
2516                         return r;
2517         } else
2518                 prioq_remove(s->event->prepare, s, &s->prepare_index);
2519
2520         return 0;
2521 }
2522
2523 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
2524         assert_return(s, NULL);
2525
2526         return s->userdata;
2527 }
2528
2529 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
2530         void *ret;
2531
2532         assert_return(s, NULL);
2533
2534         ret = s->userdata;
2535         s->userdata = userdata;
2536
2537         return ret;
2538 }
2539
2540 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
2541         usec_t c;
2542         assert(e);
2543         assert(a <= b);
2544
2545         if (a <= 0)
2546                 return 0;
2547         if (a >= USEC_INFINITY)
2548                 return USEC_INFINITY;
2549
2550         if (b <= a + 1)
2551                 return a;
2552
2553         initialize_perturb(e);
2554
2555         /*
2556           Find a good time to wake up again between times a and b. We
2557           have two goals here:
2558
2559           a) We want to wake up as seldom as possible, hence prefer
2560              later times over earlier times.
2561
2562           b) But if we have to wake up, then let's make sure to
2563              dispatch as much as possible on the entire system.
2564
2565           We implement this by waking up everywhere at the same time
2566           within any given minute if we can, synchronised via the
2567           perturbation value determined from the boot ID. If we can't,
2568           then we try to find the same spot in every 10s, then 1s and
2569           then 250ms step. Otherwise, we pick the last possible time
2570           to wake up.
2571         */
2572
2573         c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
2574         if (c >= b) {
2575                 if (_unlikely_(c < USEC_PER_MINUTE))
2576                         return b;
2577
2578                 c -= USEC_PER_MINUTE;
2579         }
2580
2581         if (c >= a)
2582                 return c;
2583
2584         c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
2585         if (c >= b) {
2586                 if (_unlikely_(c < USEC_PER_SEC*10))
2587                         return b;
2588
2589                 c -= USEC_PER_SEC*10;
2590         }
2591
2592         if (c >= a)
2593                 return c;
2594
2595         c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
2596         if (c >= b) {
2597                 if (_unlikely_(c < USEC_PER_SEC))
2598                         return b;
2599
2600                 c -= USEC_PER_SEC;
2601         }
2602
2603         if (c >= a)
2604                 return c;
2605
2606         c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
2607         if (c >= b) {
2608                 if (_unlikely_(c < USEC_PER_MSEC*250))
2609                         return b;
2610
2611                 c -= USEC_PER_MSEC*250;
2612         }
2613
2614         if (c >= a)
2615                 return c;
2616
2617         return b;
2618 }
2619
2620 static int event_arm_timer(
2621                 sd_event *e,
2622                 struct clock_data *d) {
2623
2624         struct itimerspec its = {};
2625         sd_event_source *a, *b;
2626         usec_t t;
2627         int r;
2628
2629         assert(e);
2630         assert(d);
2631
2632         if (!d->needs_rearm)
2633                 return 0;
2634         else
2635                 d->needs_rearm = false;
2636
2637         a = prioq_peek(d->earliest);
2638         if (!a || a->enabled == SD_EVENT_OFF || a->time.next == USEC_INFINITY) {
2639
2640                 if (d->fd < 0)
2641                         return 0;
2642
2643                 if (d->next == USEC_INFINITY)
2644                         return 0;
2645
2646                 /* disarm */
2647                 r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2648                 if (r < 0)
2649                         return r;
2650
2651                 d->next = USEC_INFINITY;
2652                 return 0;
2653         }
2654
2655         b = prioq_peek(d->latest);
2656         assert_se(b && b->enabled != SD_EVENT_OFF);
2657
2658         t = sleep_between(e, a->time.next, time_event_source_latest(b));
2659         if (d->next == t)
2660                 return 0;
2661
2662         assert_se(d->fd >= 0);
2663
2664         if (t == 0) {
2665                 /* We don' want to disarm here, just mean some time looooong ago. */
2666                 its.it_value.tv_sec = 0;
2667                 its.it_value.tv_nsec = 1;
2668         } else
2669                 timespec_store(&its.it_value, t);
2670
2671         r = timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL);
2672         if (r < 0)
2673                 return -errno;
2674
2675         d->next = t;
2676         return 0;
2677 }
2678
2679 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
2680         assert(e);
2681         assert(s);
2682         assert(s->type == SOURCE_IO);
2683
2684         /* If the event source was already pending, we just OR in the
2685          * new revents, otherwise we reset the value. The ORing is
2686          * necessary to handle EPOLLONESHOT events properly where
2687          * readability might happen independently of writability, and
2688          * we need to keep track of both */
2689
2690         if (s->pending)
2691                 s->io.revents |= revents;
2692         else
2693                 s->io.revents = revents;
2694
2695         return source_set_pending(s, true);
2696 }
2697
2698 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
2699         uint64_t x;
2700         ssize_t ss;
2701
2702         assert(e);
2703         assert(fd >= 0);
2704
2705         assert_return(events == EPOLLIN, -EIO);
2706
2707         ss = read(fd, &x, sizeof(x));
2708         if (ss < 0) {
2709                 if (IN_SET(errno, EAGAIN, EINTR))
2710                         return 0;
2711
2712                 return -errno;
2713         }
2714
2715         if (_unlikely_(ss != sizeof(x)))
2716                 return -EIO;
2717
2718         if (next)
2719                 *next = USEC_INFINITY;
2720
2721         return 0;
2722 }
2723
2724 static int process_timer(
2725                 sd_event *e,
2726                 usec_t n,
2727                 struct clock_data *d) {
2728
2729         sd_event_source *s;
2730         int r;
2731
2732         assert(e);
2733         assert(d);
2734
2735         for (;;) {
2736                 s = prioq_peek(d->earliest);
2737                 if (!s ||
2738                     s->time.next > n ||
2739                     s->enabled == SD_EVENT_OFF ||
2740                     s->pending)
2741                         break;
2742
2743                 r = source_set_pending(s, true);
2744                 if (r < 0)
2745                         return r;
2746
2747                 prioq_reshuffle(d->earliest, s, &s->time.earliest_index);
2748                 prioq_reshuffle(d->latest, s, &s->time.latest_index);
2749                 d->needs_rearm = true;
2750         }
2751
2752         return 0;
2753 }
2754
2755 static int process_child(sd_event *e) {
2756         sd_event_source *s;
2757         Iterator i;
2758         int r;
2759
2760         assert(e);
2761
2762         e->need_process_child = false;
2763
2764         /*
2765            So, this is ugly. We iteratively invoke waitid() with P_PID
2766            + WNOHANG for each PID we wait for, instead of using
2767            P_ALL. This is because we only want to get child
2768            information of very specific child processes, and not all
2769            of them. We might not have processed the SIGCHLD even of a
2770            previous invocation and we don't want to maintain a
2771            unbounded *per-child* event queue, hence we really don't
2772            want anything flushed out of the kernel's queue that we
2773            don't care about. Since this is O(n) this means that if you
2774            have a lot of processes you probably want to handle SIGCHLD
2775            yourself.
2776
2777            We do not reap the children here (by using WNOWAIT), this
2778            is only done after the event source is dispatched so that
2779            the callback still sees the process as a zombie.
2780         */
2781
2782         HASHMAP_FOREACH(s, e->child_sources, i) {
2783                 assert(s->type == SOURCE_CHILD);
2784
2785                 if (s->pending)
2786                         continue;
2787
2788                 if (s->enabled == SD_EVENT_OFF)
2789                         continue;
2790
2791                 zero(s->child.siginfo);
2792                 r = waitid(P_PID, s->child.pid, &s->child.siginfo,
2793                            WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
2794                 if (r < 0)
2795                         return -errno;
2796
2797                 if (s->child.siginfo.si_pid != 0) {
2798                         bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
2799
2800                         if (!zombie && (s->child.options & WEXITED)) {
2801                                 /* If the child isn't dead then let's
2802                                  * immediately remove the state change
2803                                  * from the queue, since there's no
2804                                  * benefit in leaving it queued */
2805
2806                                 assert(s->child.options & (WSTOPPED|WCONTINUED));
2807                                 waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
2808                         }
2809
2810                         r = source_set_pending(s, true);
2811                         if (r < 0)
2812                                 return r;
2813                 }
2814         }
2815
2816         return 0;
2817 }
2818
2819 static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
2820         bool read_one = false;
2821         int r;
2822
2823         assert(e);
2824         assert(d);
2825         assert_return(events == EPOLLIN, -EIO);
2826
2827         /* If there's a signal queued on this priority and SIGCHLD is
2828            on this priority too, then make sure to recheck the
2829            children we watch. This is because we only ever dequeue
2830            the first signal per priority, and if we dequeue one, and
2831            SIGCHLD might be enqueued later we wouldn't know, but we
2832            might have higher priority children we care about hence we
2833            need to check that explicitly. */
2834
2835         if (sigismember(&d->sigset, SIGCHLD))
2836                 e->need_process_child = true;
2837
2838         /* If there's already an event source pending for this
2839          * priority we don't read another */
2840         if (d->current)
2841                 return 0;
2842
2843         for (;;) {
2844                 struct signalfd_siginfo si;
2845                 ssize_t n;
2846                 sd_event_source *s = NULL;
2847
2848                 n = read(d->fd, &si, sizeof(si));
2849                 if (n < 0) {
2850                         if (IN_SET(errno, EAGAIN, EINTR))
2851                                 return read_one;
2852
2853                         return -errno;
2854                 }
2855
2856                 if (_unlikely_(n != sizeof(si)))
2857                         return -EIO;
2858
2859                 assert(SIGNAL_VALID(si.ssi_signo));
2860
2861                 read_one = true;
2862
2863                 if (e->signal_sources)
2864                         s = e->signal_sources[si.ssi_signo];
2865                 if (!s)
2866                         continue;
2867                 if (s->pending)
2868                         continue;
2869
2870                 s->signal.siginfo = si;
2871                 d->current = s;
2872
2873                 r = source_set_pending(s, true);
2874                 if (r < 0)
2875                         return r;
2876
2877                 return 1;
2878         }
2879 }
2880
2881 static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents) {
2882         ssize_t n;
2883
2884         assert(e);
2885         assert(d);
2886
2887         assert_return(revents == EPOLLIN, -EIO);
2888
2889         /* If there's already an event source pending for this priority, don't read another */
2890         if (d->n_pending > 0)
2891                 return 0;
2892
2893         /* Is the read buffer non-empty? If so, let's not read more */
2894         if (d->buffer_filled > 0)
2895                 return 0;
2896
2897         n = read(d->fd, &d->buffer, sizeof(d->buffer));
2898         if (n < 0) {
2899                 if (IN_SET(errno, EAGAIN, EINTR))
2900                         return 0;
2901
2902                 return -errno;
2903         }
2904
2905         assert(n > 0);
2906         d->buffer_filled = (size_t) n;
2907         LIST_PREPEND(buffered, e->inotify_data_buffered, d);
2908
2909         return 1;
2910 }
2911
2912 static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
2913         assert(e);
2914         assert(d);
2915         assert(sz <= d->buffer_filled);
2916
2917         if (sz == 0)
2918                 return;
2919
2920         /* Move the rest to the buffer to the front, in order to get things properly aligned again */
2921         memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
2922         d->buffer_filled -= sz;
2923
2924         if (d->buffer_filled == 0)
2925                 LIST_REMOVE(buffered, e->inotify_data_buffered, d);
2926 }
2927
2928 static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
2929         int r;
2930
2931         assert(e);
2932         assert(d);
2933
2934         /* If there's already an event source pending for this priority, don't read another */
2935         if (d->n_pending > 0)
2936                 return 0;
2937
2938         while (d->buffer_filled > 0) {
2939                 size_t sz;
2940
2941                 /* Let's validate that the event structures are complete */
2942                 if (d->buffer_filled < offsetof(struct inotify_event, name))
2943                         return -EIO;
2944
2945                 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
2946                 if (d->buffer_filled < sz)
2947                         return -EIO;
2948
2949                 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
2950                         struct inode_data *inode_data;
2951                         Iterator i;
2952
2953                         /* The queue overran, let's pass this event to all event sources connected to this inotify
2954                          * object */
2955
2956                         HASHMAP_FOREACH(inode_data, d->inodes, i) {
2957                                 sd_event_source *s;
2958
2959                                 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
2960
2961                                         if (s->enabled == SD_EVENT_OFF)
2962                                                 continue;
2963
2964                                         r = source_set_pending(s, true);
2965                                         if (r < 0)
2966                                                 return r;
2967                                 }
2968                         }
2969                 } else {
2970                         struct inode_data *inode_data;
2971                         sd_event_source *s;
2972
2973                         /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
2974                          * our watch descriptor table. */
2975                         if (d->buffer.ev.mask & IN_IGNORED) {
2976
2977                                 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
2978                                 if (!inode_data) {
2979                                         event_inotify_data_drop(e, d, sz);
2980                                         continue;
2981                                 }
2982
2983                                 /* The watch descriptor was removed by the kernel, let's drop it here too */
2984                                 inode_data->wd = -1;
2985                         } else {
2986                                 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
2987                                 if (!inode_data) {
2988                                         event_inotify_data_drop(e, d, sz);
2989                                         continue;
2990                                 }
2991                         }
2992
2993                         /* Trigger all event sources that are interested in these events. Also trigger all event
2994                          * sources if IN_IGNORED or IN_UNMOUNT is set. */
2995                         LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
2996
2997                                 if (s->enabled == SD_EVENT_OFF)
2998                                         continue;
2999
3000                                 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
3001                                     (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
3002                                         continue;
3003
3004                                 r = source_set_pending(s, true);
3005                                 if (r < 0)
3006                                         return r;
3007                         }
3008                 }
3009
3010                 /* Something pending now? If so, let's finish, otherwise let's read more. */
3011                 if (d->n_pending > 0)
3012                         return 1;
3013         }
3014
3015         return 0;
3016 }
3017
3018 static int process_inotify(sd_event *e) {
3019         struct inotify_data *d;
3020         int r, done = 0;
3021
3022         assert(e);
3023
3024         LIST_FOREACH(buffered, d, e->inotify_data_buffered) {
3025                 r = event_inotify_data_process(e, d);
3026                 if (r < 0)
3027                         return r;
3028                 if (r > 0)
3029                         done ++;
3030         }
3031
3032         return done;
3033 }
3034
3035 static int source_dispatch(sd_event_source *s) {
3036         EventSourceType saved_type;
3037         int r = 0;
3038
3039         assert(s);
3040         assert(s->pending || s->type == SOURCE_EXIT);
3041
3042         /* Save the event source type, here, so that we still know it after the event callback which might invalidate
3043          * the event. */
3044         saved_type = s->type;
3045
3046         if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
3047                 r = source_set_pending(s, false);
3048                 if (r < 0)
3049                         return r;
3050         }
3051
3052         if (s->type != SOURCE_POST) {
3053                 sd_event_source *z;
3054                 Iterator i;
3055
3056                 /* If we execute a non-post source, let's mark all
3057                  * post sources as pending */
3058
3059                 SET_FOREACH(z, s->event->post_sources, i) {
3060                         if (z->enabled == SD_EVENT_OFF)
3061                                 continue;
3062
3063                         r = source_set_pending(z, true);
3064                         if (r < 0)
3065                                 return r;
3066                 }
3067         }
3068
3069         if (s->enabled == SD_EVENT_ONESHOT) {
3070                 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
3071                 if (r < 0)
3072                         return r;
3073         }
3074
3075         s->dispatching = true;
3076
3077         switch (s->type) {
3078
3079         case SOURCE_IO:
3080                 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
3081                 break;
3082
3083         case SOURCE_TIME_REALTIME:
3084         case SOURCE_TIME_BOOTTIME:
3085         case SOURCE_TIME_MONOTONIC:
3086         case SOURCE_TIME_REALTIME_ALARM:
3087         case SOURCE_TIME_BOOTTIME_ALARM:
3088                 r = s->time.callback(s, s->time.next, s->userdata);
3089                 break;
3090
3091         case SOURCE_SIGNAL:
3092                 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
3093                 break;
3094
3095         case SOURCE_CHILD: {
3096                 bool zombie;
3097
3098                 zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
3099
3100                 r = s->child.callback(s, &s->child.siginfo, s->userdata);
3101
3102                 /* Now, reap the PID for good. */
3103                 if (zombie)
3104                         (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
3105
3106                 break;
3107         }
3108
3109         case SOURCE_DEFER:
3110                 r = s->defer.callback(s, s->userdata);
3111                 break;
3112
3113         case SOURCE_POST:
3114                 r = s->post.callback(s, s->userdata);
3115                 break;
3116
3117         case SOURCE_EXIT:
3118                 r = s->exit.callback(s, s->userdata);
3119                 break;
3120
3121         case SOURCE_INOTIFY: {
3122                 struct sd_event *e = s->event;
3123                 struct inotify_data *d;
3124                 size_t sz;
3125
3126                 assert(s->inotify.inode_data);
3127                 assert_se(d = s->inotify.inode_data->inotify_data);
3128
3129                 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
3130                 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3131                 assert(d->buffer_filled >= sz);
3132
3133                 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
3134
3135                 /* When no event is pending anymore on this inotify object, then let's drop the event from the
3136                  * buffer. */
3137                 if (d->n_pending == 0)
3138                         event_inotify_data_drop(e, d, sz);
3139
3140                 break;
3141         }
3142
3143         case SOURCE_WATCHDOG:
3144         case _SOURCE_EVENT_SOURCE_TYPE_MAX:
3145         case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
3146                 assert_not_reached("Wut? I shouldn't exist.");
3147         }
3148
3149         s->dispatching = false;
3150
3151         if (r < 0)
3152                 log_debug_errno(r, "Event source %s (type %s) returned error, disabling: %m",
3153                                 strna(s->description), event_source_type_to_string(saved_type));
3154
3155         if (s->n_ref == 0)
3156                 source_free(s);
3157         else if (r < 0)
3158                 sd_event_source_set_enabled(s, SD_EVENT_OFF);
3159
3160         return 1;
3161 }
3162
3163 static int event_prepare(sd_event *e) {
3164         int r;
3165
3166         assert(e);
3167
3168         for (;;) {
3169                 sd_event_source *s;
3170
3171                 s = prioq_peek(e->prepare);
3172                 if (!s || s->prepare_iteration == e->iteration || s->enabled == SD_EVENT_OFF)
3173                         break;
3174
3175                 s->prepare_iteration = e->iteration;
3176                 r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
3177                 if (r < 0)
3178                         return r;
3179
3180                 assert(s->prepare);
3181
3182                 s->dispatching = true;
3183                 r = s->prepare(s, s->userdata);
3184                 s->dispatching = false;
3185
3186                 if (r < 0)
3187                         log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, disabling: %m",
3188                                         strna(s->description), event_source_type_to_string(s->type));
3189
3190                 if (s->n_ref == 0)
3191                         source_free(s);
3192                 else if (r < 0)
3193                         sd_event_source_set_enabled(s, SD_EVENT_OFF);
3194         }
3195
3196         return 0;
3197 }
3198
3199 static int dispatch_exit(sd_event *e) {
3200         sd_event_source *p;
3201         _cleanup_(sd_event_unrefp) sd_event *ref = NULL;
3202         int r;
3203
3204         assert(e);
3205
3206         p = prioq_peek(e->exit);
3207         if (!p || p->enabled == SD_EVENT_OFF) {
3208                 e->state = SD_EVENT_FINISHED;
3209                 return 0;
3210         }
3211
3212         ref = sd_event_ref(e);
3213         e->iteration++;
3214         e->state = SD_EVENT_EXITING;
3215         r = source_dispatch(p);
3216         e->state = SD_EVENT_INITIAL;
3217         return r;
3218 }
3219
3220 static sd_event_source* event_next_pending(sd_event *e) {
3221         sd_event_source *p;
3222
3223         assert(e);
3224
3225         p = prioq_peek(e->pending);
3226         if (!p)
3227                 return NULL;
3228
3229         if (p->enabled == SD_EVENT_OFF)
3230                 return NULL;
3231
3232         return p;
3233 }
3234
3235 static int arm_watchdog(sd_event *e) {
3236         struct itimerspec its = {};
3237         usec_t t;
3238         int r;
3239
3240         assert(e);
3241         assert(e->watchdog_fd >= 0);
3242
3243         t = sleep_between(e,
3244                           e->watchdog_last + (e->watchdog_period / 2),
3245                           e->watchdog_last + (e->watchdog_period * 3 / 4));
3246
3247         timespec_store(&its.it_value, t);
3248
3249         /* Make sure we never set the watchdog to 0, which tells the
3250          * kernel to disable it. */
3251         if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
3252                 its.it_value.tv_nsec = 1;
3253
3254         r = timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL);
3255         if (r < 0)
3256                 return -errno;
3257
3258         return 0;
3259 }
3260
3261 static int process_watchdog(sd_event *e) {
3262         assert(e);
3263
3264         if (!e->watchdog)
3265                 return 0;
3266
3267         /* Don't notify watchdog too often */
3268         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
3269                 return 0;
3270
3271         sd_notify(false, "WATCHDOG=1");
3272         e->watchdog_last = e->timestamp.monotonic;
3273
3274         return arm_watchdog(e);
3275 }
3276
3277 static void event_close_inode_data_fds(sd_event *e) {
3278         struct inode_data *d;
3279
3280         assert(e);
3281
3282         /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
3283          * filesystems. But we can't close them right-away as we need them as long as the user still wants to make
3284          * adjustments to the even source, such as changing the priority (which requires us to remove and readd a watch
3285          * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
3286          * compromise. */
3287
3288         while ((d = e->inode_data_to_close)) {
3289                 assert(d->fd >= 0);
3290                 d->fd = safe_close(d->fd);
3291
3292                 LIST_REMOVE(to_close, e->inode_data_to_close, d);
3293         }
3294 }
3295
3296 _public_ int sd_event_prepare(sd_event *e) {
3297         int r;
3298
3299         assert_return(e, -EINVAL);
3300         assert_return(e = event_resolve(e), -ENOPKG);
3301         assert_return(!event_pid_changed(e), -ECHILD);
3302         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
3303         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
3304
3305         if (e->exit_requested)
3306                 goto pending;
3307
3308         e->iteration++;
3309
3310         e->state = SD_EVENT_PREPARING;
3311         r = event_prepare(e);
3312         e->state = SD_EVENT_INITIAL;
3313         if (r < 0)
3314                 return r;
3315
3316         r = event_arm_timer(e, &e->realtime);
3317         if (r < 0)
3318                 return r;
3319
3320         r = event_arm_timer(e, &e->boottime);
3321         if (r < 0)
3322                 return r;
3323
3324         r = event_arm_timer(e, &e->monotonic);
3325         if (r < 0)
3326                 return r;
3327
3328         r = event_arm_timer(e, &e->realtime_alarm);
3329         if (r < 0)
3330                 return r;
3331
3332         r = event_arm_timer(e, &e->boottime_alarm);
3333         if (r < 0)
3334                 return r;
3335
3336         event_close_inode_data_fds(e);
3337
3338         if (event_next_pending(e) || e->need_process_child)
3339                 goto pending;
3340
3341         e->state = SD_EVENT_ARMED;
3342
3343         return 0;
3344
3345 pending:
3346         e->state = SD_EVENT_ARMED;
3347         r = sd_event_wait(e, 0);
3348         if (r == 0)
3349                 e->state = SD_EVENT_ARMED;
3350
3351         return r;
3352 }
3353
3354 _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
3355         struct epoll_event *ev_queue;
3356         unsigned ev_queue_max;
3357         int r, m, i;
3358
3359         assert_return(e, -EINVAL);
3360         assert_return(e = event_resolve(e), -ENOPKG);
3361         assert_return(!event_pid_changed(e), -ECHILD);
3362         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
3363         assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
3364
3365         if (e->exit_requested) {
3366                 e->state = SD_EVENT_PENDING;
3367                 return 1;
3368         }
3369
3370         ev_queue_max = MAX(e->n_sources, 1u);
3371         ev_queue = newa(struct epoll_event, ev_queue_max);
3372
3373         /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
3374         if (e->inotify_data_buffered)
3375                 timeout = 0;
3376
3377         m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max,
3378                        timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
3379         if (m < 0) {
3380                 if (errno == EINTR) {
3381                         e->state = SD_EVENT_PENDING;
3382                         return 1;
3383                 }
3384
3385                 r = -errno;
3386                 goto finish;
3387         }
3388
3389         triple_timestamp_get(&e->timestamp);
3390
3391         for (i = 0; i < m; i++) {
3392
3393                 if (ev_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
3394                         r = flush_timer(e, e->watchdog_fd, ev_queue[i].events, NULL);
3395                 else {
3396                         WakeupType *t = ev_queue[i].data.ptr;
3397
3398                         switch (*t) {
3399
3400                         case WAKEUP_EVENT_SOURCE:
3401                                 r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
3402                                 break;
3403
3404                         case WAKEUP_CLOCK_DATA: {
3405                                 struct clock_data *d = ev_queue[i].data.ptr;
3406                                 r = flush_timer(e, d->fd, ev_queue[i].events, &d->next);
3407                                 break;
3408                         }
3409
3410                         case WAKEUP_SIGNAL_DATA:
3411                                 r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events);
3412                                 break;
3413
3414                         case WAKEUP_INOTIFY_DATA:
3415                                 r = event_inotify_data_read(e, ev_queue[i].data.ptr, ev_queue[i].events);
3416                                 break;
3417
3418                         default:
3419                                 assert_not_reached("Invalid wake-up pointer");
3420                         }
3421                 }
3422                 if (r < 0)
3423                         goto finish;
3424         }
3425
3426         r = process_watchdog(e);
3427         if (r < 0)
3428                 goto finish;
3429
3430         r = process_timer(e, e->timestamp.realtime, &e->realtime);
3431         if (r < 0)
3432                 goto finish;
3433
3434         r = process_timer(e, e->timestamp.boottime, &e->boottime);
3435         if (r < 0)
3436                 goto finish;
3437
3438         r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
3439         if (r < 0)
3440                 goto finish;
3441
3442         r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
3443         if (r < 0)
3444                 goto finish;
3445
3446         r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
3447         if (r < 0)
3448                 goto finish;
3449
3450         if (e->need_process_child) {
3451                 r = process_child(e);
3452                 if (r < 0)
3453                         goto finish;
3454         }
3455
3456         r = process_inotify(e);
3457         if (r < 0)
3458                 goto finish;
3459
3460         if (event_next_pending(e)) {
3461                 e->state = SD_EVENT_PENDING;
3462
3463                 return 1;
3464         }
3465
3466         r = 0;
3467
3468 finish:
3469         e->state = SD_EVENT_INITIAL;
3470
3471         return r;
3472 }
3473
3474 _public_ int sd_event_dispatch(sd_event *e) {
3475         sd_event_source *p;
3476         int r;
3477
3478         assert_return(e, -EINVAL);
3479         assert_return(e = event_resolve(e), -ENOPKG);
3480         assert_return(!event_pid_changed(e), -ECHILD);
3481         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
3482         assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
3483
3484         if (e->exit_requested)
3485                 return dispatch_exit(e);
3486
3487         p = event_next_pending(e);
3488         if (p) {
3489                 _cleanup_(sd_event_unrefp) sd_event *ref = NULL;
3490
3491                 ref = sd_event_ref(e);
3492                 e->state = SD_EVENT_RUNNING;
3493                 r = source_dispatch(p);
3494                 e->state = SD_EVENT_INITIAL;
3495                 return r;
3496         }
3497
3498         e->state = SD_EVENT_INITIAL;
3499
3500         return 1;
3501 }
3502
3503 static void event_log_delays(sd_event *e) {
3504         char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1];
3505         unsigned i;
3506         int o;
3507
3508         for (i = o = 0; i < ELEMENTSOF(e->delays); i++) {
3509                 o += snprintf(&b[o], sizeof(b) - o, "%u ", e->delays[i]);
3510                 e->delays[i] = 0;
3511         }
3512         log_debug("Event loop iterations: %.*s", o, b);
3513 }
3514
3515 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
3516         int r;
3517
3518         assert_return(e, -EINVAL);
3519         assert_return(e = event_resolve(e), -ENOPKG);
3520         assert_return(!event_pid_changed(e), -ECHILD);
3521         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
3522         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
3523
3524         if (e->profile_delays && e->last_run) {
3525                 usec_t this_run;
3526                 unsigned l;
3527
3528                 this_run = now(CLOCK_MONOTONIC);
3529
3530                 l = u64log2(this_run - e->last_run);
3531                 assert(l < sizeof(e->delays));
3532                 e->delays[l]++;
3533
3534                 if (this_run - e->last_log >= 5*USEC_PER_SEC) {
3535                         event_log_delays(e);
3536                         e->last_log = this_run;
3537                 }
3538         }
3539
3540         r = sd_event_prepare(e);
3541         if (r == 0)
3542                 /* There was nothing? Then wait... */
3543                 r = sd_event_wait(e, timeout);
3544
3545         if (e->profile_delays)
3546                 e->last_run = now(CLOCK_MONOTONIC);
3547
3548         if (r > 0) {
3549                 /* There's something now, then let's dispatch it */
3550                 r = sd_event_dispatch(e);
3551                 if (r < 0)
3552                         return r;
3553
3554                 return 1;
3555         }
3556
3557         return r;
3558 }
3559
3560 _public_ int sd_event_loop(sd_event *e) {
3561         _cleanup_(sd_event_unrefp) sd_event *ref = NULL;
3562         int r;
3563
3564         assert_return(e, -EINVAL);
3565         assert_return(e = event_resolve(e), -ENOPKG);
3566         assert_return(!event_pid_changed(e), -ECHILD);
3567         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
3568
3569         ref = sd_event_ref(e);
3570
3571         while (e->state != SD_EVENT_FINISHED) {
3572                 r = sd_event_run(e, (uint64_t) -1);
3573                 if (r < 0)
3574                         return r;
3575         }
3576
3577         return e->exit_code;
3578 }
3579
3580 _public_ int sd_event_get_fd(sd_event *e) {
3581
3582         assert_return(e, -EINVAL);
3583         assert_return(e = event_resolve(e), -ENOPKG);
3584         assert_return(!event_pid_changed(e), -ECHILD);
3585
3586         return e->epoll_fd;
3587 }
3588
3589 _public_ int sd_event_get_state(sd_event *e) {
3590         assert_return(e, -EINVAL);
3591         assert_return(e = event_resolve(e), -ENOPKG);
3592         assert_return(!event_pid_changed(e), -ECHILD);
3593
3594         return e->state;
3595 }
3596
3597 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
3598         assert_return(e, -EINVAL);
3599         assert_return(e = event_resolve(e), -ENOPKG);
3600         assert_return(code, -EINVAL);
3601         assert_return(!event_pid_changed(e), -ECHILD);
3602
3603         if (!e->exit_requested)
3604                 return -ENODATA;
3605
3606         *code = e->exit_code;
3607         return 0;
3608 }
3609
3610 _public_ int sd_event_exit(sd_event *e, int code) {
3611         assert_return(e, -EINVAL);
3612         assert_return(e = event_resolve(e), -ENOPKG);
3613         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
3614         assert_return(!event_pid_changed(e), -ECHILD);
3615
3616         e->exit_requested = true;
3617         e->exit_code = code;
3618
3619         return 0;
3620 }
3621
3622 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
3623         assert_return(e, -EINVAL);
3624         assert_return(e = event_resolve(e), -ENOPKG);
3625         assert_return(usec, -EINVAL);
3626         assert_return(!event_pid_changed(e), -ECHILD);
3627
3628         if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
3629                 return -EOPNOTSUPP;
3630
3631         /* Generate a clean error in case CLOCK_BOOTTIME is not available. Note that don't use clock_supported() here,
3632          * for a reason: there are systems where CLOCK_BOOTTIME is supported, but CLOCK_BOOTTIME_ALARM is not, but for
3633          * the purpose of getting the time this doesn't matter. */
3634         if (IN_SET(clock, CLOCK_BOOTTIME, CLOCK_BOOTTIME_ALARM) && !clock_boottime_supported())
3635                 return -EOPNOTSUPP;
3636
3637         if (!triple_timestamp_is_set(&e->timestamp)) {
3638                 /* Implicitly fall back to now() if we never ran
3639                  * before and thus have no cached time. */
3640                 *usec = now(clock);
3641                 return 1;
3642         }
3643
3644         *usec = triple_timestamp_by_clock(&e->timestamp, clock);
3645         return 0;
3646 }
3647
3648 _public_ int sd_event_default(sd_event **ret) {
3649         sd_event *e = NULL;
3650         int r;
3651
3652         if (!ret)
3653                 return !!default_event;
3654
3655         if (default_event) {
3656                 *ret = sd_event_ref(default_event);
3657                 return 0;
3658         }
3659
3660         r = sd_event_new(&e);
3661         if (r < 0)
3662                 return r;
3663
3664         e->default_event_ptr = &default_event;
3665         e->tid = gettid();
3666         default_event = e;
3667
3668         *ret = e;
3669         return 1;
3670 }
3671
3672 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
3673         assert_return(e, -EINVAL);
3674         assert_return(e = event_resolve(e), -ENOPKG);
3675         assert_return(tid, -EINVAL);
3676         assert_return(!event_pid_changed(e), -ECHILD);
3677
3678         if (e->tid != 0) {
3679                 *tid = e->tid;
3680                 return 0;
3681         }
3682
3683         return -ENXIO;
3684 }
3685
3686 _public_ int sd_event_set_watchdog(sd_event *e, int b) {
3687         int r;
3688
3689         assert_return(e, -EINVAL);
3690         assert_return(e = event_resolve(e), -ENOPKG);
3691         assert_return(!event_pid_changed(e), -ECHILD);
3692
3693         if (e->watchdog == !!b)
3694                 return e->watchdog;
3695
3696         if (b) {
3697                 struct epoll_event ev;
3698
3699                 r = sd_watchdog_enabled(false, &e->watchdog_period);
3700                 if (r <= 0)
3701                         return r;
3702
3703                 /* Issue first ping immediately */
3704                 sd_notify(false, "WATCHDOG=1");
3705                 e->watchdog_last = now(CLOCK_MONOTONIC);
3706
3707                 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
3708                 if (e->watchdog_fd < 0)
3709                         return -errno;
3710
3711                 r = arm_watchdog(e);
3712                 if (r < 0)
3713                         goto fail;
3714
3715                 ev = (struct epoll_event) {
3716                         .events = EPOLLIN,
3717                         .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
3718                 };
3719
3720                 r = epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev);
3721                 if (r < 0) {
3722                         r = -errno;
3723                         goto fail;
3724                 }
3725
3726         } else {
3727                 if (e->watchdog_fd >= 0) {
3728                         epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
3729                         e->watchdog_fd = safe_close(e->watchdog_fd);
3730                 }
3731         }
3732
3733         e->watchdog = !!b;
3734         return e->watchdog;
3735
3736 fail:
3737         e->watchdog_fd = safe_close(e->watchdog_fd);
3738         return r;
3739 }
3740
3741 _public_ int sd_event_get_watchdog(sd_event *e) {
3742         assert_return(e, -EINVAL);
3743         assert_return(e = event_resolve(e), -ENOPKG);
3744         assert_return(!event_pid_changed(e), -ECHILD);
3745
3746         return e->watchdog;
3747 }
3748
3749 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
3750         assert_return(e, -EINVAL);
3751         assert_return(e = event_resolve(e), -ENOPKG);
3752         assert_return(!event_pid_changed(e), -ECHILD);
3753
3754         *ret = e->iteration;
3755         return 0;
3756 }