chiark / gitweb /
journald: move stream protocol into its own .c file
[elogind.git] / src / journal / journald.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/epoll.h>
23 #include <sys/socket.h>
24 #include <errno.h>
25 #include <sys/signalfd.h>
26 #include <unistd.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29 #include <sys/ioctl.h>
30 #include <linux/sockios.h>
31 #include <sys/statvfs.h>
32 #include <sys/mman.h>
33
34 #include <libudev.h>
35 #include <systemd/sd-journal.h>
36 #include <systemd/sd-messages.h>
37 #include <systemd/sd-daemon.h>
38
39 #ifdef HAVE_LOGIND
40 #include <systemd/sd-login.h>
41 #endif
42
43 #include "mkdir.h"
44 #include "hashmap.h"
45 #include "journal-file.h"
46 #include "socket-util.h"
47 #include "cgroup-util.h"
48 #include "list.h"
49 #include "virt.h"
50 #include "missing.h"
51 #include "conf-parser.h"
52 #include "journal-rate-limit.h"
53 #include "journal-internal.h"
54 #include "journal-vacuum.h"
55 #include "journal-authenticate.h"
56 #include "journald.h"
57 #include "journald-kmsg.h"
58 #include "journald-syslog.h"
59 #include "journald-stream.h"
60
61 #ifdef HAVE_ACL
62 #include <sys/acl.h>
63 #include <acl/libacl.h>
64 #include "acl-util.h"
65 #endif
66
67 #ifdef HAVE_SELINUX
68 #include <selinux/selinux.h>
69 #endif
70
71 #define USER_JOURNALS_MAX 1024
72
73 #define DEFAULT_RATE_LIMIT_INTERVAL (10*USEC_PER_SEC)
74 #define DEFAULT_RATE_LIMIT_BURST 200
75
76 #define RECHECK_AVAILABLE_SPACE_USEC (30*USEC_PER_SEC)
77
78 #define ENTRY_SIZE_MAX (1024*1024*32)
79
80 static const char* const storage_table[] = {
81         [STORAGE_AUTO] = "auto",
82         [STORAGE_VOLATILE] = "volatile",
83         [STORAGE_PERSISTENT] = "persistent",
84         [STORAGE_NONE] = "none"
85 };
86
87 DEFINE_STRING_TABLE_LOOKUP(storage, Storage);
88 DEFINE_CONFIG_PARSE_ENUM(config_parse_storage, storage, Storage, "Failed to parse storage setting");
89
90 static uint64_t available_space(Server *s) {
91         char ids[33], *p;
92         const char *f;
93         sd_id128_t machine;
94         struct statvfs ss;
95         uint64_t sum = 0, avail = 0, ss_avail = 0;
96         int r;
97         DIR *d;
98         usec_t ts;
99         JournalMetrics *m;
100
101         ts = now(CLOCK_MONOTONIC);
102
103         if (s->cached_available_space_timestamp + RECHECK_AVAILABLE_SPACE_USEC > ts)
104                 return s->cached_available_space;
105
106         r = sd_id128_get_machine(&machine);
107         if (r < 0)
108                 return 0;
109
110         if (s->system_journal) {
111                 f = "/var/log/journal/";
112                 m = &s->system_metrics;
113         } else {
114                 f = "/run/log/journal/";
115                 m = &s->runtime_metrics;
116         }
117
118         assert(m);
119
120         p = strappend(f, sd_id128_to_string(machine, ids));
121         if (!p)
122                 return 0;
123
124         d = opendir(p);
125         free(p);
126
127         if (!d)
128                 return 0;
129
130         if (fstatvfs(dirfd(d), &ss) < 0)
131                 goto finish;
132
133         for (;;) {
134                 struct stat st;
135                 struct dirent buf, *de;
136
137                 r = readdir_r(d, &buf, &de);
138                 if (r != 0)
139                         break;
140
141                 if (!de)
142                         break;
143
144                 if (!endswith(de->d_name, ".journal") &&
145                     !endswith(de->d_name, ".journal~"))
146                         continue;
147
148                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
149                         continue;
150
151                 if (!S_ISREG(st.st_mode))
152                         continue;
153
154                 sum += (uint64_t) st.st_blocks * 512UL;
155         }
156
157         avail = sum >= m->max_use ? 0 : m->max_use - sum;
158
159         ss_avail = ss.f_bsize * ss.f_bavail;
160
161         ss_avail = ss_avail < m->keep_free ? 0 : ss_avail - m->keep_free;
162
163         if (ss_avail < avail)
164                 avail = ss_avail;
165
166         s->cached_available_space = avail;
167         s->cached_available_space_timestamp = ts;
168
169 finish:
170         closedir(d);
171
172         return avail;
173 }
174
175 static void server_read_file_gid(Server *s) {
176         const char *adm = "adm";
177         int r;
178
179         assert(s);
180
181         if (s->file_gid_valid)
182                 return;
183
184         r = get_group_creds(&adm, &s->file_gid);
185         if (r < 0)
186                 log_warning("Failed to resolve 'adm' group: %s", strerror(-r));
187
188         /* if we couldn't read the gid, then it will be 0, but that's
189          * fine and we shouldn't try to resolve the group again, so
190          * let's just pretend it worked right-away. */
191         s->file_gid_valid = true;
192 }
193
194 static void server_fix_perms(Server *s, JournalFile *f, uid_t uid) {
195         int r;
196 #ifdef HAVE_ACL
197         acl_t acl;
198         acl_entry_t entry;
199         acl_permset_t permset;
200 #endif
201
202         assert(f);
203
204         server_read_file_gid(s);
205
206         r = fchmod_and_fchown(f->fd, 0640, 0, s->file_gid);
207         if (r < 0)
208                 log_warning("Failed to fix access mode/rights on %s, ignoring: %s", f->path, strerror(-r));
209
210 #ifdef HAVE_ACL
211         if (uid <= 0)
212                 return;
213
214         acl = acl_get_fd(f->fd);
215         if (!acl) {
216                 log_warning("Failed to read ACL on %s, ignoring: %m", f->path);
217                 return;
218         }
219
220         r = acl_find_uid(acl, uid, &entry);
221         if (r <= 0) {
222
223                 if (acl_create_entry(&acl, &entry) < 0 ||
224                     acl_set_tag_type(entry, ACL_USER) < 0 ||
225                     acl_set_qualifier(entry, &uid) < 0) {
226                         log_warning("Failed to patch ACL on %s, ignoring: %m", f->path);
227                         goto finish;
228                 }
229         }
230
231         if (acl_get_permset(entry, &permset) < 0 ||
232             acl_add_perm(permset, ACL_READ) < 0 ||
233             acl_calc_mask(&acl) < 0) {
234                 log_warning("Failed to patch ACL on %s, ignoring: %m", f->path);
235                 goto finish;
236         }
237
238         if (acl_set_fd(f->fd, acl) < 0)
239                 log_warning("Failed to set ACL on %s, ignoring: %m", f->path);
240
241 finish:
242         acl_free(acl);
243 #endif
244 }
245
246 static JournalFile* find_journal(Server *s, uid_t uid) {
247         char *p;
248         int r;
249         JournalFile *f;
250         sd_id128_t machine;
251
252         assert(s);
253
254         /* We split up user logs only on /var, not on /run. If the
255          * runtime file is open, we write to it exclusively, in order
256          * to guarantee proper order as soon as we flush /run to
257          * /var and close the runtime file. */
258
259         if (s->runtime_journal)
260                 return s->runtime_journal;
261
262         if (uid <= 0)
263                 return s->system_journal;
264
265         r = sd_id128_get_machine(&machine);
266         if (r < 0)
267                 return s->system_journal;
268
269         f = hashmap_get(s->user_journals, UINT32_TO_PTR(uid));
270         if (f)
271                 return f;
272
273         if (asprintf(&p, "/var/log/journal/" SD_ID128_FORMAT_STR "/user-%lu.journal",
274                      SD_ID128_FORMAT_VAL(machine), (unsigned long) uid) < 0)
275                 return s->system_journal;
276
277         while (hashmap_size(s->user_journals) >= USER_JOURNALS_MAX) {
278                 /* Too many open? Then let's close one */
279                 f = hashmap_steal_first(s->user_journals);
280                 assert(f);
281                 journal_file_close(f);
282         }
283
284         r = journal_file_open_reliably(p, O_RDWR|O_CREAT, 0640, s->compress, s->seal, &s->system_metrics, s->mmap, s->system_journal, &f);
285         free(p);
286
287         if (r < 0)
288                 return s->system_journal;
289
290         server_fix_perms(s, f, uid);
291
292         r = hashmap_put(s->user_journals, UINT32_TO_PTR(uid), f);
293         if (r < 0) {
294                 journal_file_close(f);
295                 return s->system_journal;
296         }
297
298         return f;
299 }
300
301 static void server_rotate(Server *s) {
302         JournalFile *f;
303         void *k;
304         Iterator i;
305         int r;
306
307         log_info("Rotating...");
308
309         if (s->runtime_journal) {
310                 r = journal_file_rotate(&s->runtime_journal, s->compress, false);
311                 if (r < 0)
312                         if (s->runtime_journal)
313                                 log_error("Failed to rotate %s: %s", s->runtime_journal->path, strerror(-r));
314                         else
315                                 log_error("Failed to create new runtime journal: %s", strerror(-r));
316                 else
317                         server_fix_perms(s, s->runtime_journal, 0);
318         }
319
320         if (s->system_journal) {
321                 r = journal_file_rotate(&s->system_journal, s->compress, s->seal);
322                 if (r < 0)
323                         if (s->system_journal)
324                                 log_error("Failed to rotate %s: %s", s->system_journal->path, strerror(-r));
325                         else
326                                 log_error("Failed to create new system journal: %s", strerror(-r));
327
328                 else
329                         server_fix_perms(s, s->system_journal, 0);
330         }
331
332         HASHMAP_FOREACH_KEY(f, k, s->user_journals, i) {
333                 r = journal_file_rotate(&f, s->compress, s->seal);
334                 if (r < 0)
335                         if (f->path)
336                                 log_error("Failed to rotate %s: %s", f->path, strerror(-r));
337                         else
338                                 log_error("Failed to create user journal: %s", strerror(-r));
339                 else {
340                         hashmap_replace(s->user_journals, k, f);
341                         server_fix_perms(s, s->system_journal, PTR_TO_UINT32(k));
342                 }
343         }
344 }
345
346 static void server_vacuum(Server *s) {
347         char *p;
348         char ids[33];
349         sd_id128_t machine;
350         int r;
351
352         log_info("Vacuuming...");
353
354         r = sd_id128_get_machine(&machine);
355         if (r < 0) {
356                 log_error("Failed to get machine ID: %s", strerror(-r));
357                 return;
358         }
359
360         sd_id128_to_string(machine, ids);
361
362         if (s->system_journal) {
363                 if (asprintf(&p, "/var/log/journal/%s", ids) < 0) {
364                         log_oom();
365                         return;
366                 }
367
368                 r = journal_directory_vacuum(p, s->system_metrics.max_use, s->system_metrics.keep_free);
369                 if (r < 0 && r != -ENOENT)
370                         log_error("Failed to vacuum %s: %s", p, strerror(-r));
371                 free(p);
372         }
373
374         if (s->runtime_journal) {
375                 if (asprintf(&p, "/run/log/journal/%s", ids) < 0) {
376                         log_oom();
377                         return;
378                 }
379
380                 r = journal_directory_vacuum(p, s->runtime_metrics.max_use, s->runtime_metrics.keep_free);
381                 if (r < 0 && r != -ENOENT)
382                         log_error("Failed to vacuum %s: %s", p, strerror(-r));
383                 free(p);
384         }
385
386         s->cached_available_space_timestamp = 0;
387 }
388
389 static char *shortened_cgroup_path(pid_t pid) {
390         int r;
391         char *process_path, *init_path, *path;
392
393         assert(pid > 0);
394
395         r = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, pid, &process_path);
396         if (r < 0)
397                 return NULL;
398
399         r = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 1, &init_path);
400         if (r < 0) {
401                 free(process_path);
402                 return NULL;
403         }
404
405         if (endswith(init_path, "/system"))
406                 init_path[strlen(init_path) - 7] = 0;
407         else if (streq(init_path, "/"))
408                 init_path[0] = 0;
409
410         if (startswith(process_path, init_path)) {
411                 char *p;
412
413                 p = strdup(process_path + strlen(init_path));
414                 if (!p) {
415                         free(process_path);
416                         free(init_path);
417                         return NULL;
418                 }
419                 path = p;
420         } else {
421                 path = process_path;
422                 process_path = NULL;
423         }
424
425         free(process_path);
426         free(init_path);
427
428         return path;
429 }
430
431 static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, unsigned n) {
432         JournalFile *f;
433         bool vacuumed = false;
434         int r;
435
436         assert(s);
437         assert(iovec);
438         assert(n > 0);
439
440         f = find_journal(s, uid);
441         if (!f)
442                 return;
443
444         if (journal_file_rotate_suggested(f)) {
445                 log_info("Journal header limits reached or header out-of-date, rotating.");
446                 server_rotate(s);
447                 server_vacuum(s);
448                 vacuumed = true;
449
450                 f = find_journal(s, uid);
451                 if (!f)
452                         return;
453         }
454
455         for (;;) {
456                 r = journal_file_append_entry(f, NULL, iovec, n, &s->seqnum, NULL, NULL);
457                 if (r >= 0)
458                         return;
459
460                 if (vacuumed ||
461                     (r != -E2BIG && /* hit limit */
462                      r != -EFBIG && /* hit fs limit */
463                      r != -EDQUOT && /* quota hit */
464                      r != -ENOSPC && /* disk full */
465                      r != -EBADMSG && /* corrupted */
466                      r != -ENODATA && /* truncated */
467                      r != -EHOSTDOWN && /* other machine */
468                      r != -EPROTONOSUPPORT && /* unsupported feature */
469                      r != -EBUSY && /* unclean shutdown */
470                      r != -ESHUTDOWN /* already archived */)) {
471                         log_error("Failed to write entry, ignoring: %s", strerror(-r));
472                         return;
473                 }
474
475                 if (r == -E2BIG || r == -EFBIG || r == EDQUOT || r == ENOSPC)
476                         log_info("Allocation limit reached, rotating.");
477                 else if (r == -EHOSTDOWN)
478                         log_info("Journal file from other machine, rotating.");
479                 else if (r == -EBUSY)
480                         log_info("Unlcean shutdown, rotating.");
481                 else
482                         log_warning("Journal file corrupted, rotating.");
483
484                 server_rotate(s);
485                 server_vacuum(s);
486                 vacuumed = true;
487
488                 f = find_journal(s, uid);
489                 if (!f)
490                         return;
491
492                 log_info("Retrying write.");
493         }
494 }
495
496 static void dispatch_message_real(
497                 Server *s,
498                 struct iovec *iovec, unsigned n, unsigned m,
499                 struct ucred *ucred,
500                 struct timeval *tv,
501                 const char *label, size_t label_len,
502                 const char *unit_id) {
503
504         char *pid = NULL, *uid = NULL, *gid = NULL,
505                 *source_time = NULL, *boot_id = NULL, *machine_id = NULL,
506                 *comm = NULL, *cmdline = NULL, *hostname = NULL,
507                 *audit_session = NULL, *audit_loginuid = NULL,
508                 *exe = NULL, *cgroup = NULL, *session = NULL,
509                 *owner_uid = NULL, *unit = NULL, *selinux_context = NULL;
510
511         char idbuf[33];
512         sd_id128_t id;
513         int r;
514         char *t;
515         uid_t loginuid = 0, realuid = 0;
516
517         assert(s);
518         assert(iovec);
519         assert(n > 0);
520         assert(n + N_IOVEC_META_FIELDS <= m);
521
522         if (ucred) {
523                 uint32_t audit;
524 #ifdef HAVE_LOGIND
525                 uid_t owner;
526 #endif
527
528                 realuid = ucred->uid;
529
530                 if (asprintf(&pid, "_PID=%lu", (unsigned long) ucred->pid) >= 0)
531                         IOVEC_SET_STRING(iovec[n++], pid);
532
533                 if (asprintf(&uid, "_UID=%lu", (unsigned long) ucred->uid) >= 0)
534                         IOVEC_SET_STRING(iovec[n++], uid);
535
536                 if (asprintf(&gid, "_GID=%lu", (unsigned long) ucred->gid) >= 0)
537                         IOVEC_SET_STRING(iovec[n++], gid);
538
539                 r = get_process_comm(ucred->pid, &t);
540                 if (r >= 0) {
541                         comm = strappend("_COMM=", t);
542                         free(t);
543
544                         if (comm)
545                                 IOVEC_SET_STRING(iovec[n++], comm);
546                 }
547
548                 r = get_process_exe(ucred->pid, &t);
549                 if (r >= 0) {
550                         exe = strappend("_EXE=", t);
551                         free(t);
552
553                         if (exe)
554                                 IOVEC_SET_STRING(iovec[n++], exe);
555                 }
556
557                 r = get_process_cmdline(ucred->pid, LINE_MAX, false, &t);
558                 if (r >= 0) {
559                         cmdline = strappend("_CMDLINE=", t);
560                         free(t);
561
562                         if (cmdline)
563                                 IOVEC_SET_STRING(iovec[n++], cmdline);
564                 }
565
566                 r = audit_session_from_pid(ucred->pid, &audit);
567                 if (r >= 0)
568                         if (asprintf(&audit_session, "_AUDIT_SESSION=%lu", (unsigned long) audit) >= 0)
569                                 IOVEC_SET_STRING(iovec[n++], audit_session);
570
571                 r = audit_loginuid_from_pid(ucred->pid, &loginuid);
572                 if (r >= 0)
573                         if (asprintf(&audit_loginuid, "_AUDIT_LOGINUID=%lu", (unsigned long) loginuid) >= 0)
574                                 IOVEC_SET_STRING(iovec[n++], audit_loginuid);
575
576                 t = shortened_cgroup_path(ucred->pid);
577                 if (t) {
578                         cgroup = strappend("_SYSTEMD_CGROUP=", t);
579                         free(t);
580
581                         if (cgroup)
582                                 IOVEC_SET_STRING(iovec[n++], cgroup);
583                 }
584
585 #ifdef HAVE_LOGIND
586                 if (sd_pid_get_session(ucred->pid, &t) >= 0) {
587                         session = strappend("_SYSTEMD_SESSION=", t);
588                         free(t);
589
590                         if (session)
591                                 IOVEC_SET_STRING(iovec[n++], session);
592                 }
593
594                 if (sd_pid_get_owner_uid(ucred->uid, &owner) >= 0)
595                         if (asprintf(&owner_uid, "_SYSTEMD_OWNER_UID=%lu", (unsigned long) owner) >= 0)
596                                 IOVEC_SET_STRING(iovec[n++], owner_uid);
597 #endif
598
599                 if (cg_pid_get_unit(ucred->pid, &t) >= 0) {
600                         unit = strappend("_SYSTEMD_UNIT=", t);
601                         free(t);
602                 } else if (unit_id)
603                         unit = strappend("_SYSTEMD_UNIT=", unit_id);
604
605                 if (unit)
606                         IOVEC_SET_STRING(iovec[n++], unit);
607
608 #ifdef HAVE_SELINUX
609                 if (label) {
610                         selinux_context = malloc(sizeof("_SELINUX_CONTEXT=") + label_len);
611                         if (selinux_context) {
612                                 memcpy(selinux_context, "_SELINUX_CONTEXT=", sizeof("_SELINUX_CONTEXT=")-1);
613                                 memcpy(selinux_context+sizeof("_SELINUX_CONTEXT=")-1, label, label_len);
614                                 selinux_context[sizeof("_SELINUX_CONTEXT=")-1+label_len] = 0;
615                                 IOVEC_SET_STRING(iovec[n++], selinux_context);
616                         }
617                 } else {
618                         security_context_t con;
619
620                         if (getpidcon(ucred->pid, &con) >= 0) {
621                                 selinux_context = strappend("_SELINUX_CONTEXT=", con);
622                                 if (selinux_context)
623                                         IOVEC_SET_STRING(iovec[n++], selinux_context);
624
625                                 freecon(con);
626                         }
627                 }
628 #endif
629         }
630
631         if (tv) {
632                 if (asprintf(&source_time, "_SOURCE_REALTIME_TIMESTAMP=%llu",
633                              (unsigned long long) timeval_load(tv)) >= 0)
634                         IOVEC_SET_STRING(iovec[n++], source_time);
635         }
636
637         /* Note that strictly speaking storing the boot id here is
638          * redundant since the entry includes this in-line
639          * anyway. However, we need this indexed, too. */
640         r = sd_id128_get_boot(&id);
641         if (r >= 0)
642                 if (asprintf(&boot_id, "_BOOT_ID=%s", sd_id128_to_string(id, idbuf)) >= 0)
643                         IOVEC_SET_STRING(iovec[n++], boot_id);
644
645         r = sd_id128_get_machine(&id);
646         if (r >= 0)
647                 if (asprintf(&machine_id, "_MACHINE_ID=%s", sd_id128_to_string(id, idbuf)) >= 0)
648                         IOVEC_SET_STRING(iovec[n++], machine_id);
649
650         t = gethostname_malloc();
651         if (t) {
652                 hostname = strappend("_HOSTNAME=", t);
653                 free(t);
654                 if (hostname)
655                         IOVEC_SET_STRING(iovec[n++], hostname);
656         }
657
658         assert(n <= m);
659
660         write_to_journal(s, realuid == 0 ? 0 : loginuid, iovec, n);
661
662         free(pid);
663         free(uid);
664         free(gid);
665         free(comm);
666         free(exe);
667         free(cmdline);
668         free(source_time);
669         free(boot_id);
670         free(machine_id);
671         free(hostname);
672         free(audit_session);
673         free(audit_loginuid);
674         free(cgroup);
675         free(session);
676         free(owner_uid);
677         free(unit);
678         free(selinux_context);
679 }
680
681 void server_driver_message(Server *s, sd_id128_t message_id, const char *format, ...) {
682         char mid[11 + 32 + 1];
683         char buffer[16 + LINE_MAX + 1];
684         struct iovec iovec[N_IOVEC_META_FIELDS + 4];
685         int n = 0;
686         va_list ap;
687         struct ucred ucred;
688
689         assert(s);
690         assert(format);
691
692         IOVEC_SET_STRING(iovec[n++], "PRIORITY=6");
693         IOVEC_SET_STRING(iovec[n++], "_TRANSPORT=driver");
694
695         memcpy(buffer, "MESSAGE=", 8);
696         va_start(ap, format);
697         vsnprintf(buffer + 8, sizeof(buffer) - 8, format, ap);
698         va_end(ap);
699         char_array_0(buffer);
700         IOVEC_SET_STRING(iovec[n++], buffer);
701
702         snprintf(mid, sizeof(mid), "MESSAGE_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(message_id));
703         char_array_0(mid);
704         IOVEC_SET_STRING(iovec[n++], mid);
705
706         zero(ucred);
707         ucred.pid = getpid();
708         ucred.uid = getuid();
709         ucred.gid = getgid();
710
711         dispatch_message_real(s, iovec, n, ELEMENTSOF(iovec), &ucred, NULL, NULL, 0, NULL);
712 }
713
714 void server_dispatch_message(
715                 Server *s,
716                 struct iovec *iovec, unsigned n, unsigned m,
717                 struct ucred *ucred,
718                 struct timeval *tv,
719                 const char *label, size_t label_len,
720                 const char *unit_id,
721                 int priority) {
722
723         int rl;
724         char *path = NULL, *c;
725
726         assert(s);
727         assert(iovec || n == 0);
728
729         if (n == 0)
730                 return;
731
732         if (LOG_PRI(priority) > s->max_level_store)
733                 return;
734
735         if (!ucred)
736                 goto finish;
737
738         path = shortened_cgroup_path(ucred->pid);
739         if (!path)
740                 goto finish;
741
742         /* example: /user/lennart/3/foobar
743          *          /system/dbus.service/foobar
744          *
745          * So let's cut of everything past the third /, since that is
746          * wher user directories start */
747
748         c = strchr(path, '/');
749         if (c) {
750                 c = strchr(c+1, '/');
751                 if (c) {
752                         c = strchr(c+1, '/');
753                         if (c)
754                                 *c = 0;
755                 }
756         }
757
758         rl = journal_rate_limit_test(s->rate_limit, path, priority & LOG_PRIMASK, available_space(s));
759
760         if (rl == 0) {
761                 free(path);
762                 return;
763         }
764
765         /* Write a suppression message if we suppressed something */
766         if (rl > 1)
767                 server_driver_message(s, SD_MESSAGE_JOURNAL_DROPPED, "Suppressed %u messages from %s", rl - 1, path);
768
769         free(path);
770
771 finish:
772         dispatch_message_real(s, iovec, n, m, ucred, tv, label, label_len, unit_id);
773 }
774
775 void server_forward_console(Server *s, int priority, const char *identifier, const char *message, struct ucred *ucred) {
776         struct iovec iovec[4];
777         char header_pid[16];
778         int n = 0, fd;
779         char *ident_buf = NULL;
780         const char *tty;
781
782         assert(s);
783         assert(message);
784
785         if (LOG_PRI(priority) > s->max_level_console)
786                 return;
787
788         /* First: identifier and PID */
789         if (ucred) {
790                 if (!identifier) {
791                         get_process_comm(ucred->pid, &ident_buf);
792                         identifier = ident_buf;
793                 }
794
795                 snprintf(header_pid, sizeof(header_pid), "[%lu]: ", (unsigned long) ucred->pid);
796                 char_array_0(header_pid);
797
798                 if (identifier)
799                         IOVEC_SET_STRING(iovec[n++], identifier);
800
801                 IOVEC_SET_STRING(iovec[n++], header_pid);
802         } else if (identifier) {
803                 IOVEC_SET_STRING(iovec[n++], identifier);
804                 IOVEC_SET_STRING(iovec[n++], ": ");
805         }
806
807         /* Third: message */
808         IOVEC_SET_STRING(iovec[n++], message);
809         IOVEC_SET_STRING(iovec[n++], "\n");
810
811         tty = s->tty_path ? s->tty_path : "/dev/console";
812
813         fd = open_terminal(tty, O_WRONLY|O_NOCTTY|O_CLOEXEC);
814         if (fd < 0) {
815                 log_debug("Failed to open %s for logging: %s", tty, strerror(errno));
816                 goto finish;
817         }
818
819         if (writev(fd, iovec, n) < 0)
820                 log_debug("Failed to write to %s for logging: %s", tty, strerror(errno));
821
822         close_nointr_nofail(fd);
823
824 finish:
825         free(ident_buf);
826 }
827
828
829
/* Checks whether the l bytes at p form a field name that clients are
 * allowed to set.
 *
 * This roughly follows the POSIX syntax recommendations for environment
 * variable names, with a few additional restrictions:
 *
 *   http://pubs.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html
 *
 * A valid name is 1 to 64 characters long, consists solely of A-Z, 0-9 and
 * '_', and starts with neither a digit nor an underscore (names with a
 * leading underscore are protected). */
static bool valid_user_field(const char *p, size_t l) {
        size_t i;

        /* Neither empty nor overly long names */
        if (l == 0 || l > 64)
                return false;

        /* Protected namespace, or a leading digit */
        if (p[0] == '_' || (p[0] >= '0' && p[0] <= '9'))
                return false;

        for (i = 0; i < l; i++) {
                char c = p[i];
                bool upper = c >= 'A' && c <= 'Z';
                bool digit = c >= '0' && c <= '9';

                if (!upper && !digit && c != '_')
                        return false;
        }

        return true;
}
864
865 static void process_native_message(
866                 Server *s,
867                 const void *buffer, size_t buffer_size,
868                 struct ucred *ucred,
869                 struct timeval *tv,
870                 const char *label, size_t label_len) {
871
872         struct iovec *iovec = NULL;
873         unsigned n = 0, m = 0, j, tn = (unsigned) -1;
874         const char *p;
875         size_t remaining;
876         int priority = LOG_INFO;
877         char *identifier = NULL, *message = NULL;
878
879         assert(s);
880         assert(buffer || buffer_size == 0);
881
882         p = buffer;
883         remaining = buffer_size;
884
885         while (remaining > 0) {
886                 const char *e, *q;
887
888                 e = memchr(p, '\n', remaining);
889
890                 if (!e) {
891                         /* Trailing noise, let's ignore it, and flush what we collected */
892                         log_debug("Received message with trailing noise, ignoring.");
893                         break;
894                 }
895
896                 if (e == p) {
897                         /* Entry separator */
898                         server_dispatch_message(s, iovec, n, m, ucred, tv, label, label_len, NULL, priority);
899                         n = 0;
900                         priority = LOG_INFO;
901
902                         p++;
903                         remaining--;
904                         continue;
905                 }
906
907                 if (*p == '.' || *p == '#') {
908                         /* Ignore control commands for now, and
909                          * comments too. */
910                         remaining -= (e - p) + 1;
911                         p = e + 1;
912                         continue;
913                 }
914
915                 /* A property follows */
916
917                 if (n+N_IOVEC_META_FIELDS >= m) {
918                         struct iovec *c;
919                         unsigned u;
920
921                         u = MAX((n+N_IOVEC_META_FIELDS+1) * 2U, 4U);
922                         c = realloc(iovec, u * sizeof(struct iovec));
923                         if (!c) {
924                                 log_oom();
925                                 break;
926                         }
927
928                         iovec = c;
929                         m = u;
930                 }
931
932                 q = memchr(p, '=', e - p);
933                 if (q) {
934                         if (valid_user_field(p, q - p)) {
935                                 size_t l;
936
937                                 l = e - p;
938
939                                 /* If the field name starts with an
940                                  * underscore, skip the variable,
941                                  * since that indidates a trusted
942                                  * field */
943                                 iovec[n].iov_base = (char*) p;
944                                 iovec[n].iov_len = l;
945                                 n++;
946
947                                 /* We need to determine the priority
948                                  * of this entry for the rate limiting
949                                  * logic */
950                                 if (l == 10 &&
951                                     memcmp(p, "PRIORITY=", 9) == 0 &&
952                                     p[9] >= '0' && p[9] <= '9')
953                                         priority = (priority & LOG_FACMASK) | (p[9] - '0');
954
955                                 else if (l == 17 &&
956                                          memcmp(p, "SYSLOG_FACILITY=", 16) == 0 &&
957                                          p[16] >= '0' && p[16] <= '9')
958                                         priority = (priority & LOG_PRIMASK) | ((p[16] - '0') << 3);
959
960                                 else if (l == 18 &&
961                                          memcmp(p, "SYSLOG_FACILITY=", 16) == 0 &&
962                                          p[16] >= '0' && p[16] <= '9' &&
963                                          p[17] >= '0' && p[17] <= '9')
964                                         priority = (priority & LOG_PRIMASK) | (((p[16] - '0')*10 + (p[17] - '0')) << 3);
965
966                                 else if (l >= 19 &&
967                                          memcmp(p, "SYSLOG_IDENTIFIER=", 18) == 0) {
968                                         char *t;
969
970                                         t = strndup(p + 18, l - 18);
971                                         if (t) {
972                                                 free(identifier);
973                                                 identifier = t;
974                                         }
975                                 } else if (l >= 8 &&
976                                            memcmp(p, "MESSAGE=", 8) == 0) {
977                                         char *t;
978
979                                         t = strndup(p + 8, l - 8);
980                                         if (t) {
981                                                 free(message);
982                                                 message = t;
983                                         }
984                                 }
985                         }
986
987                         remaining -= (e - p) + 1;
988                         p = e + 1;
989                         continue;
990                 } else {
991                         le64_t l_le;
992                         uint64_t l;
993                         char *k;
994
995                         if (remaining < e - p + 1 + sizeof(uint64_t) + 1) {
996                                 log_debug("Failed to parse message, ignoring.");
997                                 break;
998                         }
999
1000                         memcpy(&l_le, e + 1, sizeof(uint64_t));
1001                         l = le64toh(l_le);
1002
1003                         if (remaining < e - p + 1 + sizeof(uint64_t) + l + 1 ||
1004                             e[1+sizeof(uint64_t)+l] != '\n') {
1005                                 log_debug("Failed to parse message, ignoring.");
1006                                 break;
1007                         }
1008
1009                         k = malloc((e - p) + 1 + l);
1010                         if (!k) {
1011                                 log_oom();
1012                                 break;
1013                         }
1014
1015                         memcpy(k, p, e - p);
1016                         k[e - p] = '=';
1017                         memcpy(k + (e - p) + 1, e + 1 + sizeof(uint64_t), l);
1018
1019                         if (valid_user_field(p, e - p)) {
1020                                 iovec[n].iov_base = k;
1021                                 iovec[n].iov_len = (e - p) + 1 + l;
1022                                 n++;
1023                         } else
1024                                 free(k);
1025
1026                         remaining -= (e - p) + 1 + sizeof(uint64_t) + l + 1;
1027                         p = e + 1 + sizeof(uint64_t) + l + 1;
1028                 }
1029         }
1030
1031         if (n <= 0)
1032                 goto finish;
1033
1034         tn = n++;
1035         IOVEC_SET_STRING(iovec[tn], "_TRANSPORT=journal");
1036
1037         if (message) {
1038                 if (s->forward_to_syslog)
1039                         server_forward_syslog(s, priority, identifier, message, ucred, tv);
1040
1041                 if (s->forward_to_kmsg)
1042                         server_forward_kmsg(s, priority, identifier, message, ucred);
1043
1044                 if (s->forward_to_console)
1045                         server_forward_console(s, priority, identifier, message, ucred);
1046         }
1047
1048         server_dispatch_message(s, iovec, n, m, ucred, tv, label, label_len, NULL, priority);
1049
1050 finish:
1051         for (j = 0; j < n; j++)  {
1052                 if (j == tn)
1053                         continue;
1054
1055                 if (iovec[j].iov_base < buffer ||
1056                     (const uint8_t*) iovec[j].iov_base >= (const uint8_t*) buffer + buffer_size)
1057                         free(iovec[j].iov_base);
1058         }
1059
1060         free(iovec);
1061         free(identifier);
1062         free(message);
1063 }
1064
1065 static void process_native_file(
1066                 Server *s,
1067                 int fd,
1068                 struct ucred *ucred,
1069                 struct timeval *tv,
1070                 const char *label, size_t label_len) {
1071
1072         struct stat st;
1073         void *p;
1074         ssize_t n;
1075
1076         assert(s);
1077         assert(fd >= 0);
1078
1079         /* Data is in the passed file, since it didn't fit in a
1080          * datagram. We can't map the file here, since clients might
1081          * then truncate it and trigger a SIGBUS for us. So let's
1082          * stupidly read it */
1083
1084         if (fstat(fd, &st) < 0) {
1085                 log_error("Failed to stat passed file, ignoring: %m");
1086                 return;
1087         }
1088
1089         if (!S_ISREG(st.st_mode)) {
1090                 log_error("File passed is not regular. Ignoring.");
1091                 return;
1092         }
1093
1094         if (st.st_size <= 0)
1095                 return;
1096
1097         if (st.st_size > ENTRY_SIZE_MAX) {
1098                 log_error("File passed too large. Ignoring.");
1099                 return;
1100         }
1101
1102         p = malloc(st.st_size);
1103         if (!p) {
1104                 log_oom();
1105                 return;
1106         }
1107
1108         n = pread(fd, p, st.st_size, 0);
1109         if (n < 0)
1110                 log_error("Failed to read file, ignoring: %s", strerror(-n));
1111         else if (n > 0)
1112                 process_native_message(s, p, n, ucred, tv, label, label_len);
1113
1114         free(p);
1115 }
1116
1117 static int system_journal_open(Server *s) {
1118         int r;
1119         char *fn;
1120         sd_id128_t machine;
1121         char ids[33];
1122
1123         r = sd_id128_get_machine(&machine);
1124         if (r < 0)
1125                 return r;
1126
1127         sd_id128_to_string(machine, ids);
1128
1129         if (!s->system_journal &&
1130             (s->storage == STORAGE_PERSISTENT || s->storage == STORAGE_AUTO) &&
1131             access("/run/systemd/journal/flushed", F_OK) >= 0) {
1132
1133                 /* If in auto mode: first try to create the machine
1134                  * path, but not the prefix.
1135                  *
1136                  * If in persistent mode: create /var/log/journal and
1137                  * the machine path */
1138
1139                 if (s->storage == STORAGE_PERSISTENT)
1140                         (void) mkdir("/var/log/journal/", 0755);
1141
1142                 fn = strappend("/var/log/journal/", ids);
1143                 if (!fn)
1144                         return -ENOMEM;
1145
1146                 (void) mkdir(fn, 0755);
1147                 free(fn);
1148
1149                 fn = strjoin("/var/log/journal/", ids, "/system.journal", NULL);
1150                 if (!fn)
1151                         return -ENOMEM;
1152
1153                 r = journal_file_open_reliably(fn, O_RDWR|O_CREAT, 0640, s->compress, s->seal, &s->system_metrics, s->mmap, NULL, &s->system_journal);
1154                 free(fn);
1155
1156                 if (r >= 0)
1157                         server_fix_perms(s, s->system_journal, 0);
1158                 else if (r < 0) {
1159
1160                         if (r != -ENOENT && r != -EROFS)
1161                                 log_warning("Failed to open system journal: %s", strerror(-r));
1162
1163                         r = 0;
1164                 }
1165         }
1166
1167         if (!s->runtime_journal &&
1168             (s->storage != STORAGE_NONE)) {
1169
1170                 fn = strjoin("/run/log/journal/", ids, "/system.journal", NULL);
1171                 if (!fn)
1172                         return -ENOMEM;
1173
1174                 if (s->system_journal) {
1175
1176                         /* Try to open the runtime journal, but only
1177                          * if it already exists, so that we can flush
1178                          * it into the system journal */
1179
1180                         r = journal_file_open(fn, O_RDWR, 0640, s->compress, false, &s->runtime_metrics, s->mmap, NULL, &s->runtime_journal);
1181                         free(fn);
1182
1183                         if (r < 0) {
1184                                 if (r != -ENOENT)
1185                                         log_warning("Failed to open runtime journal: %s", strerror(-r));
1186
1187                                 r = 0;
1188                         }
1189
1190                 } else {
1191
1192                         /* OK, we really need the runtime journal, so create
1193                          * it if necessary. */
1194
1195                         (void) mkdir_parents(fn, 0755);
1196                         r = journal_file_open_reliably(fn, O_RDWR|O_CREAT, 0640, s->compress, false, &s->runtime_metrics, s->mmap, NULL, &s->runtime_journal);
1197                         free(fn);
1198
1199                         if (r < 0) {
1200                                 log_error("Failed to open runtime journal: %s", strerror(-r));
1201                                 return r;
1202                         }
1203                 }
1204
1205                 if (s->runtime_journal)
1206                         server_fix_perms(s, s->runtime_journal, 0);
1207         }
1208
1209         return r;
1210 }
1211
1212 static int server_flush_to_var(Server *s) {
1213         Object *o = NULL;
1214         int r;
1215         sd_id128_t machine;
1216         sd_journal *j;
1217
1218         assert(s);
1219
1220         if (s->storage != STORAGE_AUTO &&
1221             s->storage != STORAGE_PERSISTENT)
1222                 return 0;
1223
1224         if (!s->runtime_journal)
1225                 return 0;
1226
1227         system_journal_open(s);
1228
1229         if (!s->system_journal)
1230                 return 0;
1231
1232         log_info("Flushing to /var...");
1233
1234         r = sd_id128_get_machine(&machine);
1235         if (r < 0) {
1236                 log_error("Failed to get machine id: %s", strerror(-r));
1237                 return r;
1238         }
1239
1240         r = sd_journal_open(&j, SD_JOURNAL_RUNTIME_ONLY);
1241         if (r < 0) {
1242                 log_error("Failed to read runtime journal: %s", strerror(-r));
1243                 return r;
1244         }
1245
1246         SD_JOURNAL_FOREACH(j) {
1247                 JournalFile *f;
1248
1249                 f = j->current_file;
1250                 assert(f && f->current_offset > 0);
1251
1252                 r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o);
1253                 if (r < 0) {
1254                         log_error("Can't read entry: %s", strerror(-r));
1255                         goto finish;
1256                 }
1257
1258                 r = journal_file_copy_entry(f, s->system_journal, o, f->current_offset, NULL, NULL, NULL);
1259                 if (r == -E2BIG) {
1260                         log_info("Allocation limit reached.");
1261
1262                         journal_file_post_change(s->system_journal);
1263                         server_rotate(s);
1264                         server_vacuum(s);
1265
1266                         r = journal_file_copy_entry(f, s->system_journal, o, f->current_offset, NULL, NULL, NULL);
1267                 }
1268
1269                 if (r < 0) {
1270                         log_error("Can't write entry: %s", strerror(-r));
1271                         goto finish;
1272                 }
1273         }
1274
1275 finish:
1276         journal_file_post_change(s->system_journal);
1277
1278         journal_file_close(s->runtime_journal);
1279         s->runtime_journal = NULL;
1280
1281         if (r >= 0)
1282                 rm_rf("/run/log/journal", false, true, false);
1283
1284         return r;
1285 }
1286
1287 static int process_event(Server *s, struct epoll_event *ev) {
1288         assert(s);
1289         assert(ev);
1290
1291         if (ev->data.fd == s->signal_fd) {
1292                 struct signalfd_siginfo sfsi;
1293                 ssize_t n;
1294
1295                 if (ev->events != EPOLLIN) {
1296                         log_info("Got invalid event from epoll.");
1297                         return -EIO;
1298                 }
1299
1300                 n = read(s->signal_fd, &sfsi, sizeof(sfsi));
1301                 if (n != sizeof(sfsi)) {
1302
1303                         if (n >= 0)
1304                                 return -EIO;
1305
1306                         if (errno == EINTR || errno == EAGAIN)
1307                                 return 1;
1308
1309                         return -errno;
1310                 }
1311
1312                 log_info("Received SIG%s", signal_to_string(sfsi.ssi_signo));
1313
1314                 if (sfsi.ssi_signo == SIGUSR1) {
1315                         touch("/run/systemd/journal/flushed");
1316                         server_flush_to_var(s);
1317                         return 1;
1318                 }
1319
1320                 if (sfsi.ssi_signo == SIGUSR2) {
1321                         server_rotate(s);
1322                         server_vacuum(s);
1323                         return 1;
1324                 }
1325
1326                 return 0;
1327
1328         } else if (ev->data.fd == s->dev_kmsg_fd) {
1329                 int r;
1330
1331                 if (ev->events != EPOLLIN) {
1332                         log_info("Got invalid event from epoll.");
1333                         return -EIO;
1334                 }
1335
1336                 r = server_read_dev_kmsg(s);
1337                 if (r < 0)
1338                         return r;
1339
1340                 return 1;
1341
1342         } else if (ev->data.fd == s->native_fd ||
1343                    ev->data.fd == s->syslog_fd) {
1344
1345                 if (ev->events != EPOLLIN) {
1346                         log_info("Got invalid event from epoll.");
1347                         return -EIO;
1348                 }
1349
1350                 for (;;) {
1351                         struct msghdr msghdr;
1352                         struct iovec iovec;
1353                         struct ucred *ucred = NULL;
1354                         struct timeval *tv = NULL;
1355                         struct cmsghdr *cmsg;
1356                         char *label = NULL;
1357                         size_t label_len = 0;
1358                         union {
1359                                 struct cmsghdr cmsghdr;
1360
1361                                 /* We use NAME_MAX space for the
1362                                  * SELinux label here. The kernel
1363                                  * currently enforces no limit, but
1364                                  * according to suggestions from the
1365                                  * SELinux people this will change and
1366                                  * it will probably be identical to
1367                                  * NAME_MAX. For now we use that, but
1368                                  * this should be updated one day when
1369                                  * the final limit is known.*/
1370                                 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
1371                                             CMSG_SPACE(sizeof(struct timeval)) +
1372                                             CMSG_SPACE(sizeof(int)) + /* fd */
1373                                             CMSG_SPACE(NAME_MAX)]; /* selinux label */
1374                         } control;
1375                         ssize_t n;
1376                         int v;
1377                         int *fds = NULL;
1378                         unsigned n_fds = 0;
1379
1380                         if (ioctl(ev->data.fd, SIOCINQ, &v) < 0) {
1381                                 log_error("SIOCINQ failed: %m");
1382                                 return -errno;
1383                         }
1384
1385                         if (s->buffer_size < (size_t) v) {
1386                                 void *b;
1387                                 size_t l;
1388
1389                                 l = MAX(LINE_MAX + (size_t) v, s->buffer_size * 2);
1390                                 b = realloc(s->buffer, l+1);
1391
1392                                 if (!b) {
1393                                         log_error("Couldn't increase buffer.");
1394                                         return -ENOMEM;
1395                                 }
1396
1397                                 s->buffer_size = l;
1398                                 s->buffer = b;
1399                         }
1400
1401                         zero(iovec);
1402                         iovec.iov_base = s->buffer;
1403                         iovec.iov_len = s->buffer_size;
1404
1405                         zero(control);
1406                         zero(msghdr);
1407                         msghdr.msg_iov = &iovec;
1408                         msghdr.msg_iovlen = 1;
1409                         msghdr.msg_control = &control;
1410                         msghdr.msg_controllen = sizeof(control);
1411
1412                         n = recvmsg(ev->data.fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
1413                         if (n < 0) {
1414
1415                                 if (errno == EINTR || errno == EAGAIN)
1416                                         return 1;
1417
1418                                 log_error("recvmsg() failed: %m");
1419                                 return -errno;
1420                         }
1421
1422                         for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg; cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
1423
1424                                 if (cmsg->cmsg_level == SOL_SOCKET &&
1425                                     cmsg->cmsg_type == SCM_CREDENTIALS &&
1426                                     cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)))
1427                                         ucred = (struct ucred*) CMSG_DATA(cmsg);
1428                                 else if (cmsg->cmsg_level == SOL_SOCKET &&
1429                                          cmsg->cmsg_type == SCM_SECURITY) {
1430                                         label = (char*) CMSG_DATA(cmsg);
1431                                         label_len = cmsg->cmsg_len - CMSG_LEN(0);
1432                                 } else if (cmsg->cmsg_level == SOL_SOCKET &&
1433                                          cmsg->cmsg_type == SO_TIMESTAMP &&
1434                                          cmsg->cmsg_len == CMSG_LEN(sizeof(struct timeval)))
1435                                         tv = (struct timeval*) CMSG_DATA(cmsg);
1436                                 else if (cmsg->cmsg_level == SOL_SOCKET &&
1437                                          cmsg->cmsg_type == SCM_RIGHTS) {
1438                                         fds = (int*) CMSG_DATA(cmsg);
1439                                         n_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
1440                                 }
1441                         }
1442
1443                         if (ev->data.fd == s->syslog_fd) {
1444                                 char *e;
1445
1446                                 if (n > 0 && n_fds == 0) {
1447                                         e = memchr(s->buffer, '\n', n);
1448                                         if (e)
1449                                                 *e = 0;
1450                                         else
1451                                                 s->buffer[n] = 0;
1452
1453                                         server_process_syslog_message(s, strstrip(s->buffer), ucred, tv, label, label_len);
1454                                 } else if (n_fds > 0)
1455                                         log_warning("Got file descriptors via syslog socket. Ignoring.");
1456
1457                         } else {
1458                                 if (n > 0 && n_fds == 0)
1459                                         process_native_message(s, s->buffer, n, ucred, tv, label, label_len);
1460                                 else if (n == 0 && n_fds == 1)
1461                                         process_native_file(s, fds[0], ucred, tv, label, label_len);
1462                                 else if (n_fds > 0)
1463                                         log_warning("Got too many file descriptors via native socket. Ignoring.");
1464                         }
1465
1466                         close_many(fds, n_fds);
1467                 }
1468
1469                 return 1;
1470
1471         } else if (ev->data.fd == s->stdout_fd) {
1472
1473                 if (ev->events != EPOLLIN) {
1474                         log_info("Got invalid event from epoll.");
1475                         return -EIO;
1476                 }
1477
1478                 stdout_stream_new(s);
1479                 return 1;
1480
1481         } else {
1482                 StdoutStream *stream;
1483
1484                 if ((ev->events|EPOLLIN|EPOLLHUP) != (EPOLLIN|EPOLLHUP)) {
1485                         log_info("Got invalid event from epoll.");
1486                         return -EIO;
1487                 }
1488
1489                 /* If it is none of the well-known fds, it must be an
1490                  * stdout stream fd. Note that this is a bit ugly here
1491                  * (since we rely that none of the well-known fds
1492                  * could be interpreted as pointer), but nonetheless
1493                  * safe, since the well-known fds would never get an
1494                  * fd > 4096, i.e. beyond the first memory page */
1495
1496                 stream = ev->data.ptr;
1497
1498                 if (stdout_stream_process(stream) <= 0)
1499                         stdout_stream_free(stream);
1500
1501                 return 1;
1502         }
1503
1504         log_error("Unknown event.");
1505         return 0;
1506 }
1507
1508
1509 static int open_native_socket(Server*s) {
1510         union sockaddr_union sa;
1511         int one, r;
1512         struct epoll_event ev;
1513
1514         assert(s);
1515
1516         if (s->native_fd < 0) {
1517
1518                 s->native_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
1519                 if (s->native_fd < 0) {
1520                         log_error("socket() failed: %m");
1521                         return -errno;
1522                 }
1523
1524                 zero(sa);
1525                 sa.un.sun_family = AF_UNIX;
1526                 strncpy(sa.un.sun_path, "/run/systemd/journal/socket", sizeof(sa.un.sun_path));
1527
1528                 unlink(sa.un.sun_path);
1529
1530                 r = bind(s->native_fd, &sa.sa, offsetof(union sockaddr_union, un.sun_path) + strlen(sa.un.sun_path));
1531                 if (r < 0) {
1532                         log_error("bind() failed: %m");
1533                         return -errno;
1534                 }
1535
1536                 chmod(sa.un.sun_path, 0666);
1537         } else
1538                 fd_nonblock(s->native_fd, 1);
1539
1540         one = 1;
1541         r = setsockopt(s->native_fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
1542         if (r < 0) {
1543                 log_error("SO_PASSCRED failed: %m");
1544                 return -errno;
1545         }
1546
1547 #ifdef HAVE_SELINUX
1548         one = 1;
1549         r = setsockopt(s->syslog_fd, SOL_SOCKET, SO_PASSSEC, &one, sizeof(one));
1550         if (r < 0)
1551                 log_warning("SO_PASSSEC failed: %m");
1552 #endif
1553
1554         one = 1;
1555         r = setsockopt(s->native_fd, SOL_SOCKET, SO_TIMESTAMP, &one, sizeof(one));
1556         if (r < 0) {
1557                 log_error("SO_TIMESTAMP failed: %m");
1558                 return -errno;
1559         }
1560
1561         zero(ev);
1562         ev.events = EPOLLIN;
1563         ev.data.fd = s->native_fd;
1564         if (epoll_ctl(s->epoll_fd, EPOLL_CTL_ADD, s->native_fd, &ev) < 0) {
1565                 log_error("Failed to add native server fd to epoll object: %m");
1566                 return -errno;
1567         }
1568
1569         return 0;
1570 }
1571
1572
1573 static int open_signalfd(Server *s) {
1574         sigset_t mask;
1575         struct epoll_event ev;
1576
1577         assert(s);
1578
1579         assert_se(sigemptyset(&mask) == 0);
1580         sigset_add_many(&mask, SIGINT, SIGTERM, SIGUSR1, SIGUSR2, -1);
1581         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1582
1583         s->signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC);
1584         if (s->signal_fd < 0) {
1585                 log_error("signalfd(): %m");
1586                 return -errno;
1587         }
1588
1589         zero(ev);
1590         ev.events = EPOLLIN;
1591         ev.data.fd = s->signal_fd;
1592
1593         if (epoll_ctl(s->epoll_fd, EPOLL_CTL_ADD, s->signal_fd, &ev) < 0) {
1594                 log_error("epoll_ctl(): %m");
1595                 return -errno;
1596         }
1597
1598         return 0;
1599 }
1600
1601 static int server_parse_proc_cmdline(Server *s) {
1602         char *line, *w, *state;
1603         int r;
1604         size_t l;
1605
1606         if (detect_container(NULL) > 0)
1607                 return 0;
1608
1609         r = read_one_line_file("/proc/cmdline", &line);
1610         if (r < 0) {
1611                 log_warning("Failed to read /proc/cmdline, ignoring: %s", strerror(-r));
1612                 return 0;
1613         }
1614
1615         FOREACH_WORD_QUOTED(w, l, line, state) {
1616                 char *word;
1617
1618                 word = strndup(w, l);
1619                 if (!word) {
1620                         r = -ENOMEM;
1621                         goto finish;
1622                 }
1623
1624                 if (startswith(word, "systemd.journald.forward_to_syslog=")) {
1625                         r = parse_boolean(word + 35);
1626                         if (r < 0)
1627                                 log_warning("Failed to parse forward to syslog switch %s. Ignoring.", word + 35);
1628                         else
1629                                 s->forward_to_syslog = r;
1630                 } else if (startswith(word, "systemd.journald.forward_to_kmsg=")) {
1631                         r = parse_boolean(word + 33);
1632                         if (r < 0)
1633                                 log_warning("Failed to parse forward to kmsg switch %s. Ignoring.", word + 33);
1634                         else
1635                                 s->forward_to_kmsg = r;
1636                 } else if (startswith(word, "systemd.journald.forward_to_console=")) {
1637                         r = parse_boolean(word + 36);
1638                         if (r < 0)
1639                                 log_warning("Failed to parse forward to console switch %s. Ignoring.", word + 36);
1640                         else
1641                                 s->forward_to_console = r;
1642                 } else if (startswith(word, "systemd.journald"))
1643                         log_warning("Invalid systemd.journald parameter. Ignoring.");
1644
1645                 free(word);
1646         }
1647
1648         r = 0;
1649
1650 finish:
1651         free(line);
1652         return r;
1653 }
1654
1655 static int server_parse_config_file(Server *s) {
1656         FILE *f;
1657         const char *fn;
1658         int r;
1659
1660         assert(s);
1661
1662         fn = "/etc/systemd/journald.conf";
1663         f = fopen(fn, "re");
1664         if (!f) {
1665                 if (errno == ENOENT)
1666                         return 0;
1667
1668                 log_warning("Failed to open configuration file %s: %m", fn);
1669                 return -errno;
1670         }
1671
1672         r = config_parse(fn, f, "Journal\0", config_item_perf_lookup, (void*) journald_gperf_lookup, false, s);
1673         if (r < 0)
1674                 log_warning("Failed to parse configuration file: %s", strerror(-r));
1675
1676         fclose(f);
1677
1678         return r;
1679 }
1680
1681 static int server_init(Server *s) {
1682         int n, r, fd;
1683
1684         assert(s);
1685
1686         zero(*s);
1687         s->syslog_fd = s->native_fd = s->stdout_fd = s->signal_fd = s->epoll_fd = s->dev_kmsg_fd = -1;
1688         s->compress = true;
1689         s->seal = true;
1690
1691         s->rate_limit_interval = DEFAULT_RATE_LIMIT_INTERVAL;
1692         s->rate_limit_burst = DEFAULT_RATE_LIMIT_BURST;
1693
1694         s->forward_to_syslog = true;
1695
1696         s->max_level_store = LOG_DEBUG;
1697         s->max_level_syslog = LOG_DEBUG;
1698         s->max_level_kmsg = LOG_NOTICE;
1699         s->max_level_console = LOG_INFO;
1700
1701         memset(&s->system_metrics, 0xFF, sizeof(s->system_metrics));
1702         memset(&s->runtime_metrics, 0xFF, sizeof(s->runtime_metrics));
1703
1704         server_parse_config_file(s);
1705         server_parse_proc_cmdline(s);
1706
1707         mkdir_p("/run/systemd/journal", 0755);
1708
1709         s->user_journals = hashmap_new(trivial_hash_func, trivial_compare_func);
1710         if (!s->user_journals)
1711                 return log_oom();
1712
1713         s->mmap = mmap_cache_new();
1714         if (!s->mmap)
1715                 return log_oom();
1716
1717         s->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
1718         if (s->epoll_fd < 0) {
1719                 log_error("Failed to create epoll object: %m");
1720                 return -errno;
1721         }
1722
1723         n = sd_listen_fds(true);
1724         if (n < 0) {
1725                 log_error("Failed to read listening file descriptors from environment: %s", strerror(-n));
1726                 return n;
1727         }
1728
1729         for (fd = SD_LISTEN_FDS_START; fd < SD_LISTEN_FDS_START + n; fd++) {
1730
1731                 if (sd_is_socket_unix(fd, SOCK_DGRAM, -1, "/run/systemd/journal/socket", 0) > 0) {
1732
1733                         if (s->native_fd >= 0) {
1734                                 log_error("Too many native sockets passed.");
1735                                 return -EINVAL;
1736                         }
1737
1738                         s->native_fd = fd;
1739
1740                 } else if (sd_is_socket_unix(fd, SOCK_STREAM, 1, "/run/systemd/journal/stdout", 0) > 0) {
1741
1742                         if (s->stdout_fd >= 0) {
1743                                 log_error("Too many stdout sockets passed.");
1744                                 return -EINVAL;
1745                         }
1746
1747                         s->stdout_fd = fd;
1748
1749                 } else if (sd_is_socket_unix(fd, SOCK_DGRAM, -1, "/dev/log", 0) > 0) {
1750
1751                         if (s->syslog_fd >= 0) {
1752                                 log_error("Too many /dev/log sockets passed.");
1753                                 return -EINVAL;
1754                         }
1755
1756                         s->syslog_fd = fd;
1757
1758                 } else {
1759                         log_error("Unknown socket passed.");
1760                         return -EINVAL;
1761                 }
1762         }
1763
1764         r = server_open_syslog_socket(s);
1765         if (r < 0)
1766                 return r;
1767
1768         r = open_native_socket(s);
1769         if (r < 0)
1770                 return r;
1771
1772         r = server_open_stdout_socket(s);
1773         if (r < 0)
1774                 return r;
1775
1776         r = server_open_dev_kmsg(s);
1777         if (r < 0)
1778                 return r;
1779
1780         r = server_open_kernel_seqnum(s);
1781         if (r < 0)
1782                 return r;
1783
1784         r = open_signalfd(s);
1785         if (r < 0)
1786                 return r;
1787
1788         s->udev = udev_new();
1789         if (!s->udev)
1790                 return -ENOMEM;
1791
1792         s->rate_limit = journal_rate_limit_new(s->rate_limit_interval, s->rate_limit_burst);
1793         if (!s->rate_limit)
1794                 return -ENOMEM;
1795
1796         r = system_journal_open(s);
1797         if (r < 0)
1798                 return r;
1799
1800         return 0;
1801 }
1802
1803 static void maybe_append_tags(Server *s) {
1804 #ifdef HAVE_GCRYPT
1805         JournalFile *f;
1806         Iterator i;
1807         usec_t n;
1808
1809         n = now(CLOCK_REALTIME);
1810
1811         if (s->system_journal)
1812                 journal_file_maybe_append_tag(s->system_journal, n);
1813
1814         HASHMAP_FOREACH(f, s->user_journals, i)
1815                 journal_file_maybe_append_tag(f, n);
1816 #endif
1817 }
1818
1819 static void server_done(Server *s) {
1820         JournalFile *f;
1821         assert(s);
1822
1823         while (s->stdout_streams)
1824                 stdout_stream_free(s->stdout_streams);
1825
1826         if (s->system_journal)
1827                 journal_file_close(s->system_journal);
1828
1829         if (s->runtime_journal)
1830                 journal_file_close(s->runtime_journal);
1831
1832         while ((f = hashmap_steal_first(s->user_journals)))
1833                 journal_file_close(f);
1834
1835         hashmap_free(s->user_journals);
1836
1837         if (s->epoll_fd >= 0)
1838                 close_nointr_nofail(s->epoll_fd);
1839
1840         if (s->signal_fd >= 0)
1841                 close_nointr_nofail(s->signal_fd);
1842
1843         if (s->syslog_fd >= 0)
1844                 close_nointr_nofail(s->syslog_fd);
1845
1846         if (s->native_fd >= 0)
1847                 close_nointr_nofail(s->native_fd);
1848
1849         if (s->stdout_fd >= 0)
1850                 close_nointr_nofail(s->stdout_fd);
1851
1852         if (s->dev_kmsg_fd >= 0)
1853                 close_nointr_nofail(s->dev_kmsg_fd);
1854
1855         if (s->rate_limit)
1856                 journal_rate_limit_free(s->rate_limit);
1857
1858         if (s->kernel_seqnum)
1859                 munmap(s->kernel_seqnum, sizeof(uint64_t));
1860
1861         free(s->buffer);
1862         free(s->tty_path);
1863
1864         if (s->mmap)
1865                 mmap_cache_unref(s->mmap);
1866
1867         if (s->udev)
1868                 udev_unref(s->udev);
1869 }
1870
1871 int main(int argc, char *argv[]) {
1872         Server server;
1873         int r;
1874
1875         /* if (getppid() != 1) { */
1876         /*         log_error("This program should be invoked by init only."); */
1877         /*         return EXIT_FAILURE; */
1878         /* } */
1879
1880         if (argc > 1) {
1881                 log_error("This program does not take arguments.");
1882                 return EXIT_FAILURE;
1883         }
1884
1885         log_set_target(LOG_TARGET_SAFE);
1886         log_set_facility(LOG_SYSLOG);
1887         log_set_max_level(LOG_DEBUG);
1888         log_parse_environment();
1889         log_open();
1890
1891         umask(0022);
1892
1893         r = server_init(&server);
1894         if (r < 0)
1895                 goto finish;
1896
1897         server_vacuum(&server);
1898         server_flush_to_var(&server);
1899         server_flush_dev_kmsg(&server);
1900
1901         log_debug("systemd-journald running as pid %lu", (unsigned long) getpid());
1902         server_driver_message(&server, SD_MESSAGE_JOURNAL_START, "Journal started");
1903
1904         sd_notify(false,
1905                   "READY=1\n"
1906                   "STATUS=Processing requests...");
1907
1908         for (;;) {
1909                 struct epoll_event event;
1910                 int t;
1911
1912 #ifdef HAVE_GCRYPT
1913                 usec_t u;
1914
1915                 if (server.system_journal &&
1916                     journal_file_next_evolve_usec(server.system_journal, &u)) {
1917                         usec_t n;
1918
1919                         n = now(CLOCK_REALTIME);
1920
1921                         if (n >= u)
1922                                 t = 0;
1923                         else
1924                                 t = (int) ((u - n + USEC_PER_MSEC - 1) / USEC_PER_MSEC);
1925                 } else
1926 #endif
1927                         t = -1;
1928
1929                 r = epoll_wait(server.epoll_fd, &event, 1, t);
1930                 if (r < 0) {
1931
1932                         if (errno == EINTR)
1933                                 continue;
1934
1935                         log_error("epoll_wait() failed: %m");
1936                         r = -errno;
1937                         goto finish;
1938                 }
1939
1940                 if (r > 0) {
1941                         r = process_event(&server, &event);
1942                         if (r < 0)
1943                                 goto finish;
1944                         else if (r == 0)
1945                                 break;
1946                 }
1947
1948                 maybe_append_tags(&server);
1949         }
1950
1951         log_debug("systemd-journald stopped as pid %lu", (unsigned long) getpid());
1952         server_driver_message(&server, SD_MESSAGE_JOURNAL_STOP, "Journal stopped");
1953
1954 finish:
1955         sd_notify(false,
1956                   "STATUS=Shutting down...");
1957
1958         server_done(&server);
1959
1960         return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
1961 }