chiark / gitweb /
docs: install README files into /var/log and /etc/rc.d/init.d
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
/* Default sizes, in bytes, of the data and field hash tables of a new
 * journal file */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads of at least this many bytes are candidates for
 * compression when compression is enabled for the file */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the file system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* n_data was the first entry we added after the initial file format
 * design, hence every valid header is at least large enough to reach
 * up to (but not including) that field */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
67
68 void journal_file_close(JournalFile *f) {
69         assert(f);
70
71 #ifdef HAVE_GCRYPT
72         /* Write the final tag */
73         if (f->seal && f->writable)
74                 journal_file_append_tag(f);
75 #endif
76
77         /* Sync everything to disk, before we mark the file offline */
78         if (f->mmap && f->fd >= 0)
79                 mmap_cache_close_fd(f->mmap, f->fd);
80
81         if (f->writable && f->fd >= 0)
82                 fdatasync(f->fd);
83
84         if (f->header) {
85                 /* Mark the file offline. Don't override the archived state if it already is set */
86                 if (f->writable && f->header->state == STATE_ONLINE)
87                         f->header->state = STATE_OFFLINE;
88
89                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
90         }
91
92         if (f->fd >= 0)
93                 close_nointr_nofail(f->fd);
94
95         free(f->path);
96
97         if (f->mmap)
98                 mmap_cache_unref(f->mmap);
99
100 #ifdef HAVE_XZ
101         free(f->compress_buffer);
102 #endif
103
104 #ifdef HAVE_GCRYPT
105         if (f->fss_file)
106                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
107         else if (f->fsprg_state)
108                 free(f->fsprg_state);
109
110         free(f->fsprg_seed);
111
112         if (f->hmac)
113                 gcry_md_close(f->hmac);
114 #endif
115
116         free(f);
117 }
118
119 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
120         Header h;
121         ssize_t k;
122         int r;
123
124         assert(f);
125
126         zero(h);
127         memcpy(h.signature, HEADER_SIGNATURE, 8);
128         h.header_size = htole64(ALIGN64(sizeof(h)));
129
130         h.incompatible_flags =
131                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
132
133         h.compatible_flags =
134                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
135
136         r = sd_id128_randomize(&h.file_id);
137         if (r < 0)
138                 return r;
139
140         if (template) {
141                 h.seqnum_id = template->header->seqnum_id;
142                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
143         } else
144                 h.seqnum_id = h.file_id;
145
146         k = pwrite(f->fd, &h, sizeof(h), 0);
147         if (k < 0)
148                 return -errno;
149
150         if (k != sizeof(h))
151                 return -EIO;
152
153         return 0;
154 }
155
156 static int journal_file_refresh_header(JournalFile *f) {
157         int r;
158         sd_id128_t boot_id;
159
160         assert(f);
161
162         r = sd_id128_get_machine(&f->header->machine_id);
163         if (r < 0)
164                 return r;
165
166         r = sd_id128_get_boot(&boot_id);
167         if (r < 0)
168                 return r;
169
170         if (sd_id128_equal(boot_id, f->header->boot_id))
171                 f->tail_entry_monotonic_valid = true;
172
173         f->header->boot_id = boot_id;
174
175         f->header->state = STATE_ONLINE;
176
177         /* Sync the online state to disk */
178         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
179         fdatasync(f->fd);
180
181         return 0;
182 }
183
184 static int journal_file_verify_header(JournalFile *f) {
185         assert(f);
186
187         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
188                 return -EBADMSG;
189
190         /* In both read and write mode we refuse to open files with
191          * incompatible flags we don't know */
192 #ifdef HAVE_XZ
193         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
194                 return -EPROTONOSUPPORT;
195 #else
196         if (f->header->incompatible_flags != 0)
197                 return -EPROTONOSUPPORT;
198 #endif
199
200         /* When open for writing we refuse to open files with
201          * compatible flags, too */
202         if (f->writable) {
203 #ifdef HAVE_GCRYPT
204                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
205                         return -EPROTONOSUPPORT;
206 #else
207                 if (f->header->compatible_flags != 0)
208                         return -EPROTONOSUPPORT;
209 #endif
210         }
211
212         if (f->header->state >= _STATE_MAX)
213                 return -EBADMSG;
214
215         /* The first addition was n_data, so check that we are at least this large */
216         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
217                 return -EBADMSG;
218
219         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
220                 return -EBADMSG;
221
222         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
223                 return -ENODATA;
224
225         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
226                 return -ENODATA;
227
228         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
229             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
230             !VALID64(le64toh(f->header->tail_object_offset)) ||
231             !VALID64(le64toh(f->header->entry_array_offset)))
232                 return -ENODATA;
233
234         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
235             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
236             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
237             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
238                 return -ENODATA;
239
240         if (f->writable) {
241                 uint8_t state;
242                 sd_id128_t machine_id;
243                 int r;
244
245                 r = sd_id128_get_machine(&machine_id);
246                 if (r < 0)
247                         return r;
248
249                 if (!sd_id128_equal(machine_id, f->header->machine_id))
250                         return -EHOSTDOWN;
251
252                 state = f->header->state;
253
254                 if (state == STATE_ONLINE) {
255                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
256                         return -EBUSY;
257                 } else if (state == STATE_ARCHIVED)
258                         return -ESHUTDOWN;
259                 else if (state != STATE_OFFLINE) {
260                         log_debug("Journal file %s has unknown state %u.", f->path, state);
261                         return -EBUSY;
262                 }
263         }
264
265         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
266
267         f->seal = JOURNAL_HEADER_SEALED(f->header);
268
269         return 0;
270 }
271
272 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
273         uint64_t old_size, new_size;
274         int r;
275
276         assert(f);
277
278         /* We assume that this file is not sparse, and we know that
279          * for sure, since we always call posix_fallocate()
280          * ourselves */
281
282         old_size =
283                 le64toh(f->header->header_size) +
284                 le64toh(f->header->arena_size);
285
286         new_size = PAGE_ALIGN(offset + size);
287         if (new_size < le64toh(f->header->header_size))
288                 new_size = le64toh(f->header->header_size);
289
290         if (new_size <= old_size)
291                 return 0;
292
293         if (f->metrics.max_size > 0 &&
294             new_size > f->metrics.max_size)
295                 return -E2BIG;
296
297         if (new_size > f->metrics.min_size &&
298             f->metrics.keep_free > 0) {
299                 struct statvfs svfs;
300
301                 if (fstatvfs(f->fd, &svfs) >= 0) {
302                         uint64_t available;
303
304                         available = svfs.f_bfree * svfs.f_bsize;
305
306                         if (available >= f->metrics.keep_free)
307                                 available -= f->metrics.keep_free;
308                         else
309                                 available = 0;
310
311                         if (new_size - old_size > available)
312                                 return -E2BIG;
313                 }
314         }
315
316         /* Note that the glibc fallocate() fallback is very
317            inefficient, hence we try to minimize the allocation area
318            as we can. */
319         r = posix_fallocate(f->fd, old_size, new_size - old_size);
320         if (r != 0)
321                 return -r;
322
323         if (fstat(f->fd, &f->last_stat) < 0)
324                 return -errno;
325
326         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
327
328         return 0;
329 }
330
331 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
332         assert(f);
333         assert(ret);
334
335         if (size <= 0)
336                 return -EINVAL;
337
338         /* Avoid SIGBUS on invalid accesses */
339         if (offset + size > (uint64_t) f->last_stat.st_size) {
340                 /* Hmm, out of range? Let's refresh the fstat() data
341                  * first, before we trust that check. */
342
343                 if (fstat(f->fd, &f->last_stat) < 0 ||
344                     offset + size > (uint64_t) f->last_stat.st_size)
345                         return -EADDRNOTAVAIL;
346         }
347
348         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
349 }
350
351 static uint64_t minimum_header_size(Object *o) {
352
353         static uint64_t table[] = {
354                 [OBJECT_DATA] = sizeof(DataObject),
355                 [OBJECT_FIELD] = sizeof(FieldObject),
356                 [OBJECT_ENTRY] = sizeof(EntryObject),
357                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
358                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
359                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
360                 [OBJECT_TAG] = sizeof(TagObject),
361         };
362
363         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
364                 return sizeof(ObjectHeader);
365
366         return table[o->object.type];
367 }
368
369 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
370         int r;
371         void *t;
372         Object *o;
373         uint64_t s;
374         unsigned context;
375
376         assert(f);
377         assert(ret);
378
379         /* Objects may only be located at multiple of 64 bit */
380         if (!VALID64(offset))
381                 return -EFAULT;
382
383         /* One context for each type, plus one catch-all for the rest */
384         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
385
386         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
387         if (r < 0)
388                 return r;
389
390         o = (Object*) t;
391         s = le64toh(o->object.size);
392
393         if (s < sizeof(ObjectHeader))
394                 return -EBADMSG;
395
396         if (o->object.type <= OBJECT_UNUSED)
397                 return -EBADMSG;
398
399         if (s < minimum_header_size(o))
400                 return -EBADMSG;
401
402         if (type >= 0 && o->object.type != type)
403                 return -EBADMSG;
404
405         if (s > sizeof(ObjectHeader)) {
406                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
407                 if (r < 0)
408                         return r;
409
410                 o = (Object*) t;
411         }
412
413         *ret = o;
414         return 0;
415 }
416
417 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
418         uint64_t r;
419
420         assert(f);
421
422         r = le64toh(f->header->tail_entry_seqnum) + 1;
423
424         if (seqnum) {
425                 /* If an external seqnum counter was passed, we update
426                  * both the local and the external one, and set it to
427                  * the maximum of both */
428
429                 if (*seqnum + 1 > r)
430                         r = *seqnum + 1;
431
432                 *seqnum = r;
433         }
434
435         f->header->tail_entry_seqnum = htole64(r);
436
437         if (f->header->head_entry_seqnum == 0)
438                 f->header->head_entry_seqnum = htole64(r);
439
440         return r;
441 }
442
443 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
444         int r;
445         uint64_t p;
446         Object *tail, *o;
447         void *t;
448
449         assert(f);
450         assert(type > 0 && type < _OBJECT_TYPE_MAX);
451         assert(size >= sizeof(ObjectHeader));
452         assert(offset);
453         assert(ret);
454
455         p = le64toh(f->header->tail_object_offset);
456         if (p == 0)
457                 p = le64toh(f->header->header_size);
458         else {
459                 r = journal_file_move_to_object(f, -1, p, &tail);
460                 if (r < 0)
461                         return r;
462
463                 p += ALIGN64(le64toh(tail->object.size));
464         }
465
466         r = journal_file_allocate(f, p, size);
467         if (r < 0)
468                 return r;
469
470         r = journal_file_move_to(f, type, false, p, size, &t);
471         if (r < 0)
472                 return r;
473
474         o = (Object*) t;
475
476         zero(o->object);
477         o->object.type = type;
478         o->object.size = htole64(size);
479
480         f->header->tail_object_offset = htole64(p);
481         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
482
483         *ret = o;
484         *offset = p;
485
486         return 0;
487 }
488
489 static int journal_file_setup_data_hash_table(JournalFile *f) {
490         uint64_t s, p;
491         Object *o;
492         int r;
493
494         assert(f);
495
496         /* We estimate that we need 1 hash table entry per 768 of
497            journal file and we want to make sure we never get beyond
498            75% fill level. Calculate the hash table size for the
499            maximum file size based on these metrics. */
500
501         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
502         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
503                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
504
505         log_debug("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
506
507         r = journal_file_append_object(f,
508                                        OBJECT_DATA_HASH_TABLE,
509                                        offsetof(Object, hash_table.items) + s,
510                                        &o, &p);
511         if (r < 0)
512                 return r;
513
514         memset(o->hash_table.items, 0, s);
515
516         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
517         f->header->data_hash_table_size = htole64(s);
518
519         return 0;
520 }
521
522 static int journal_file_setup_field_hash_table(JournalFile *f) {
523         uint64_t s, p;
524         Object *o;
525         int r;
526
527         assert(f);
528
529         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
530         r = journal_file_append_object(f,
531                                        OBJECT_FIELD_HASH_TABLE,
532                                        offsetof(Object, hash_table.items) + s,
533                                        &o, &p);
534         if (r < 0)
535                 return r;
536
537         memset(o->hash_table.items, 0, s);
538
539         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
540         f->header->field_hash_table_size = htole64(s);
541
542         return 0;
543 }
544
545 static int journal_file_map_data_hash_table(JournalFile *f) {
546         uint64_t s, p;
547         void *t;
548         int r;
549
550         assert(f);
551
552         p = le64toh(f->header->data_hash_table_offset);
553         s = le64toh(f->header->data_hash_table_size);
554
555         r = journal_file_move_to(f,
556                                  OBJECT_DATA_HASH_TABLE,
557                                  true,
558                                  p, s,
559                                  &t);
560         if (r < 0)
561                 return r;
562
563         f->data_hash_table = t;
564         return 0;
565 }
566
567 static int journal_file_map_field_hash_table(JournalFile *f) {
568         uint64_t s, p;
569         void *t;
570         int r;
571
572         assert(f);
573
574         p = le64toh(f->header->field_hash_table_offset);
575         s = le64toh(f->header->field_hash_table_size);
576
577         r = journal_file_move_to(f,
578                                  OBJECT_FIELD_HASH_TABLE,
579                                  true,
580                                  p, s,
581                                  &t);
582         if (r < 0)
583                 return r;
584
585         f->field_hash_table = t;
586         return 0;
587 }
588
589 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
590         uint64_t p, h;
591         int r;
592
593         assert(f);
594         assert(o);
595         assert(offset > 0);
596
597         if (o->object.type != OBJECT_DATA)
598                 return -EINVAL;
599
600         /* This might alter the window we are looking at */
601
602         o->data.next_hash_offset = o->data.next_field_offset = 0;
603         o->data.entry_offset = o->data.entry_array_offset = 0;
604         o->data.n_entries = 0;
605
606         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
607         p = le64toh(f->data_hash_table[h].tail_hash_offset);
608         if (p == 0) {
609                 /* Only entry in the hash table is easy */
610                 f->data_hash_table[h].head_hash_offset = htole64(offset);
611         } else {
612                 /* Move back to the previous data object, to patch in
613                  * pointer */
614
615                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
616                 if (r < 0)
617                         return r;
618
619                 o->data.next_hash_offset = htole64(offset);
620         }
621
622         f->data_hash_table[h].tail_hash_offset = htole64(offset);
623
624         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
625                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
626
627         return 0;
628 }
629
630 int journal_file_find_data_object_with_hash(
631                 JournalFile *f,
632                 const void *data, uint64_t size, uint64_t hash,
633                 Object **ret, uint64_t *offset) {
634
635         uint64_t p, osize, h;
636         int r;
637
638         assert(f);
639         assert(data || size == 0);
640
641         osize = offsetof(Object, data.payload) + size;
642
643         if (f->header->data_hash_table_size == 0)
644                 return -EBADMSG;
645
646         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
647         p = le64toh(f->data_hash_table[h].head_hash_offset);
648
649         while (p > 0) {
650                 Object *o;
651
652                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
653                 if (r < 0)
654                         return r;
655
656                 if (le64toh(o->data.hash) != hash)
657                         goto next;
658
659                 if (o->object.flags & OBJECT_COMPRESSED) {
660 #ifdef HAVE_XZ
661                         uint64_t l, rsize;
662
663                         l = le64toh(o->object.size);
664                         if (l <= offsetof(Object, data.payload))
665                                 return -EBADMSG;
666
667                         l -= offsetof(Object, data.payload);
668
669                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
670                                 return -EBADMSG;
671
672                         if (rsize == size &&
673                             memcmp(f->compress_buffer, data, size) == 0) {
674
675                                 if (ret)
676                                         *ret = o;
677
678                                 if (offset)
679                                         *offset = p;
680
681                                 return 1;
682                         }
683 #else
684                         return -EPROTONOSUPPORT;
685 #endif
686
687                 } else if (le64toh(o->object.size) == osize &&
688                            memcmp(o->data.payload, data, size) == 0) {
689
690                         if (ret)
691                                 *ret = o;
692
693                         if (offset)
694                                 *offset = p;
695
696                         return 1;
697                 }
698
699         next:
700                 p = le64toh(o->data.next_hash_offset);
701         }
702
703         return 0;
704 }
705
706 int journal_file_find_data_object(
707                 JournalFile *f,
708                 const void *data, uint64_t size,
709                 Object **ret, uint64_t *offset) {
710
711         uint64_t hash;
712
713         assert(f);
714         assert(data || size == 0);
715
716         hash = hash64(data, size);
717
718         return journal_file_find_data_object_with_hash(f,
719                                                        data, size, hash,
720                                                        ret, offset);
721 }
722
723 static int journal_file_append_data(
724                 JournalFile *f,
725                 const void *data, uint64_t size,
726                 Object **ret, uint64_t *offset) {
727
728         uint64_t hash, p;
729         uint64_t osize;
730         Object *o;
731         int r;
732         bool compressed = false;
733
734         assert(f);
735         assert(data || size == 0);
736
737         hash = hash64(data, size);
738
739         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
740         if (r < 0)
741                 return r;
742         else if (r > 0) {
743
744                 if (ret)
745                         *ret = o;
746
747                 if (offset)
748                         *offset = p;
749
750                 return 0;
751         }
752
753         osize = offsetof(Object, data.payload) + size;
754         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
755         if (r < 0)
756                 return r;
757
758         o->data.hash = htole64(hash);
759
760 #ifdef HAVE_XZ
761         if (f->compress &&
762             size >= COMPRESSION_SIZE_THRESHOLD) {
763                 uint64_t rsize;
764
765                 compressed = compress_blob(data, size, o->data.payload, &rsize);
766
767                 if (compressed) {
768                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
769                         o->object.flags |= OBJECT_COMPRESSED;
770
771                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
772                 }
773         }
774 #endif
775
776         if (!compressed && size > 0)
777                 memcpy(o->data.payload, data, size);
778
779         r = journal_file_link_data(f, o, p, hash);
780         if (r < 0)
781                 return r;
782
783         /* The linking might have altered the window, so let's
784          * refresh our pointer */
785         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
786         if (r < 0)
787                 return r;
788
789 #ifdef HAVE_GCRYPT
790         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
791         if (r < 0)
792                 return r;
793 #endif
794
795         if (ret)
796                 *ret = o;
797
798         if (offset)
799                 *offset = p;
800
801         return 0;
802 }
803
804 uint64_t journal_file_entry_n_items(Object *o) {
805         assert(o);
806
807         if (o->object.type != OBJECT_ENTRY)
808                 return 0;
809
810         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
811 }
812
813 uint64_t journal_file_entry_array_n_items(Object *o) {
814         assert(o);
815
816         if (o->object.type != OBJECT_ENTRY_ARRAY)
817                 return 0;
818
819         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
820 }
821
822 uint64_t journal_file_hash_table_n_items(Object *o) {
823         assert(o);
824
825         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
826             o->object.type != OBJECT_FIELD_HASH_TABLE)
827                 return 0;
828
829         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
830 }
831
832 static int link_entry_into_array(JournalFile *f,
833                                  le64_t *first,
834                                  le64_t *idx,
835                                  uint64_t p) {
836         int r;
837         uint64_t n = 0, ap = 0, q, i, a, hidx;
838         Object *o;
839
840         assert(f);
841         assert(first);
842         assert(idx);
843         assert(p > 0);
844
845         a = le64toh(*first);
846         i = hidx = le64toh(*idx);
847         while (a > 0) {
848
849                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
850                 if (r < 0)
851                         return r;
852
853                 n = journal_file_entry_array_n_items(o);
854                 if (i < n) {
855                         o->entry_array.items[i] = htole64(p);
856                         *idx = htole64(hidx + 1);
857                         return 0;
858                 }
859
860                 i -= n;
861                 ap = a;
862                 a = le64toh(o->entry_array.next_entry_array_offset);
863         }
864
865         if (hidx > n)
866                 n = (hidx+1) * 2;
867         else
868                 n = n * 2;
869
870         if (n < 4)
871                 n = 4;
872
873         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
874                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
875                                        &o, &q);
876         if (r < 0)
877                 return r;
878
879 #ifdef HAVE_GCRYPT
880         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
881         if (r < 0)
882                 return r;
883 #endif
884
885         o->entry_array.items[i] = htole64(p);
886
887         if (ap == 0)
888                 *first = htole64(q);
889         else {
890                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
891                 if (r < 0)
892                         return r;
893
894                 o->entry_array.next_entry_array_offset = htole64(q);
895         }
896
897         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
898                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
899
900         *idx = htole64(hidx + 1);
901
902         return 0;
903 }
904
905 static int link_entry_into_array_plus_one(JournalFile *f,
906                                           le64_t *extra,
907                                           le64_t *first,
908                                           le64_t *idx,
909                                           uint64_t p) {
910
911         int r;
912
913         assert(f);
914         assert(extra);
915         assert(first);
916         assert(idx);
917         assert(p > 0);
918
919         if (*idx == 0)
920                 *extra = htole64(p);
921         else {
922                 le64_t i;
923
924                 i = htole64(le64toh(*idx) - 1);
925                 r = link_entry_into_array(f, first, &i, p);
926                 if (r < 0)
927                         return r;
928         }
929
930         *idx = htole64(le64toh(*idx) + 1);
931         return 0;
932 }
933
934 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
935         uint64_t p;
936         int r;
937         assert(f);
938         assert(o);
939         assert(offset > 0);
940
941         p = le64toh(o->entry.items[i].object_offset);
942         if (p == 0)
943                 return -EINVAL;
944
945         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
946         if (r < 0)
947                 return r;
948
949         return link_entry_into_array_plus_one(f,
950                                               &o->data.entry_offset,
951                                               &o->data.entry_array_offset,
952                                               &o->data.n_entries,
953                                               offset);
954 }
955
956 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
957         uint64_t n, i;
958         int r;
959
960         assert(f);
961         assert(o);
962         assert(offset > 0);
963
964         if (o->object.type != OBJECT_ENTRY)
965                 return -EINVAL;
966
967         __sync_synchronize();
968
969         /* Link up the entry itself */
970         r = link_entry_into_array(f,
971                                   &f->header->entry_array_offset,
972                                   &f->header->n_entries,
973                                   offset);
974         if (r < 0)
975                 return r;
976
977         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
978
979         if (f->header->head_entry_realtime == 0)
980                 f->header->head_entry_realtime = o->entry.realtime;
981
982         f->header->tail_entry_realtime = o->entry.realtime;
983         f->header->tail_entry_monotonic = o->entry.monotonic;
984
985         f->tail_entry_monotonic_valid = true;
986
987         /* Link up the items */
988         n = journal_file_entry_n_items(o);
989         for (i = 0; i < n; i++) {
990                 r = journal_file_link_entry_item(f, o, offset, i);
991                 if (r < 0)
992                         return r;
993         }
994
995         return 0;
996 }
997
998 static int journal_file_append_entry_internal(
999                 JournalFile *f,
1000                 const dual_timestamp *ts,
1001                 uint64_t xor_hash,
1002                 const EntryItem items[], unsigned n_items,
1003                 uint64_t *seqnum,
1004                 Object **ret, uint64_t *offset) {
1005         uint64_t np;
1006         uint64_t osize;
1007         Object *o;
1008         int r;
1009
1010         assert(f);
1011         assert(items || n_items == 0);
1012         assert(ts);
1013
1014         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1015
1016         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1017         if (r < 0)
1018                 return r;
1019
1020         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1021         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1022         o->entry.realtime = htole64(ts->realtime);
1023         o->entry.monotonic = htole64(ts->monotonic);
1024         o->entry.xor_hash = htole64(xor_hash);
1025         o->entry.boot_id = f->header->boot_id;
1026
1027 #ifdef HAVE_GCRYPT
1028         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1029         if (r < 0)
1030                 return r;
1031 #endif
1032
1033         r = journal_file_link_entry(f, o, np);
1034         if (r < 0)
1035                 return r;
1036
1037         if (ret)
1038                 *ret = o;
1039
1040         if (offset)
1041                 *offset = np;
1042
1043         return 0;
1044 }
1045
1046 void journal_file_post_change(JournalFile *f) {
1047         assert(f);
1048
1049         /* inotify() does not receive IN_MODIFY events from file
1050          * accesses done via mmap(). After each access we hence
1051          * trigger IN_MODIFY by truncating the journal file to its
1052          * current size which triggers IN_MODIFY. */
1053
1054         __sync_synchronize();
1055
1056         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1057                 log_error("Failed to truncate file to its own size: %m");
1058 }
1059
1060 static int entry_item_cmp(const void *_a, const void *_b) {
1061         const EntryItem *a = _a, *b = _b;
1062
1063         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1064                 return -1;
1065         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1066                 return 1;
1067         return 0;
1068 }
1069
1070 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1071         unsigned i;
1072         EntryItem *items;
1073         int r;
1074         uint64_t xor_hash = 0;
1075         struct dual_timestamp _ts;
1076
1077         assert(f);
1078         assert(iovec || n_iovec == 0);
1079
1080         if (!f->writable)
1081                 return -EPERM;
1082
1083         if (!ts) {
1084                 dual_timestamp_get(&_ts);
1085                 ts = &_ts;
1086         }
1087
1088         if (f->tail_entry_monotonic_valid &&
1089             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1090                 return -EINVAL;
1091
1092 #ifdef HAVE_GCRYPT
1093         r = journal_file_maybe_append_tag(f, ts->realtime);
1094         if (r < 0)
1095                 return r;
1096 #endif
1097
1098         /* alloca() can't take 0, hence let's allocate at least one */
1099         items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1100
1101         for (i = 0; i < n_iovec; i++) {
1102                 uint64_t p;
1103                 Object *o;
1104
1105                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1106                 if (r < 0)
1107                         return r;
1108
1109                 xor_hash ^= le64toh(o->data.hash);
1110                 items[i].object_offset = htole64(p);
1111                 items[i].hash = o->data.hash;
1112         }
1113
1114         /* Order by the position on disk, in order to improve seek
1115          * times for rotating media. */
1116         qsort(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1117
1118         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1119
1120         journal_file_post_change(f);
1121
1122         return r;
1123 }
1124
1125 static int generic_array_get(JournalFile *f,
1126                              uint64_t first,
1127                              uint64_t i,
1128                              Object **ret, uint64_t *offset) {
1129
1130         Object *o;
1131         uint64_t p = 0, a;
1132         int r;
1133
1134         assert(f);
1135
1136         a = first;
1137         while (a > 0) {
1138                 uint64_t n;
1139
1140                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1141                 if (r < 0)
1142                         return r;
1143
1144                 n = journal_file_entry_array_n_items(o);
1145                 if (i < n) {
1146                         p = le64toh(o->entry_array.items[i]);
1147                         break;
1148                 }
1149
1150                 i -= n;
1151                 a = le64toh(o->entry_array.next_entry_array_offset);
1152         }
1153
1154         if (a <= 0 || p <= 0)
1155                 return 0;
1156
1157         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1158         if (r < 0)
1159                 return r;
1160
1161         if (ret)
1162                 *ret = o;
1163
1164         if (offset)
1165                 *offset = p;
1166
1167         return 1;
1168 }
1169
1170 static int generic_array_get_plus_one(JournalFile *f,
1171                                       uint64_t extra,
1172                                       uint64_t first,
1173                                       uint64_t i,
1174                                       Object **ret, uint64_t *offset) {
1175
1176         Object *o;
1177
1178         assert(f);
1179
1180         if (i == 0) {
1181                 int r;
1182
1183                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1184                 if (r < 0)
1185                         return r;
1186
1187                 if (ret)
1188                         *ret = o;
1189
1190                 if (offset)
1191                         *offset = extra;
1192
1193                 return 1;
1194         }
1195
1196         return generic_array_get(f, first, i-1, ret, offset);
1197 }
1198
/* Results of the test_object_xxx() callbacks used for bisection */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object is smaller than the needle */
        TEST_RIGHT = 2  /* the tested object is larger than the needle */
};
1204
1205 static int generic_array_bisect(JournalFile *f,
1206                                 uint64_t first,
1207                                 uint64_t n,
1208                                 uint64_t needle,
1209                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1210                                 direction_t direction,
1211                                 Object **ret,
1212                                 uint64_t *offset,
1213                                 uint64_t *idx) {
1214
1215         uint64_t a, p, t = 0, i = 0, last_p = 0;
1216         bool subtract_one = false;
1217         Object *o, *array = NULL;
1218         int r;
1219
1220         assert(f);
1221         assert(test_object);
1222
1223         a = first;
1224         while (a > 0) {
1225                 uint64_t left, right, k, lp;
1226
1227                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1228                 if (r < 0)
1229                         return r;
1230
1231                 k = journal_file_entry_array_n_items(array);
1232                 right = MIN(k, n);
1233                 if (right <= 0)
1234                         return 0;
1235
1236                 i = right - 1;
1237                 lp = p = le64toh(array->entry_array.items[i]);
1238                 if (p <= 0)
1239                         return -EBADMSG;
1240
1241                 r = test_object(f, p, needle);
1242                 if (r < 0)
1243                         return r;
1244
1245                 if (r == TEST_FOUND)
1246                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1247
1248                 if (r == TEST_RIGHT) {
1249                         left = 0;
1250                         right -= 1;
1251                         for (;;) {
1252                                 if (left == right) {
1253                                         if (direction == DIRECTION_UP)
1254                                                 subtract_one = true;
1255
1256                                         i = left;
1257                                         goto found;
1258                                 }
1259
1260                                 assert(left < right);
1261
1262                                 i = (left + right) / 2;
1263                                 p = le64toh(array->entry_array.items[i]);
1264                                 if (p <= 0)
1265                                         return -EBADMSG;
1266
1267                                 r = test_object(f, p, needle);
1268                                 if (r < 0)
1269                                         return r;
1270
1271                                 if (r == TEST_FOUND)
1272                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1273
1274                                 if (r == TEST_RIGHT)
1275                                         right = i;
1276                                 else
1277                                         left = i + 1;
1278                         }
1279                 }
1280
1281                 if (k > n) {
1282                         if (direction == DIRECTION_UP) {
1283                                 i = n;
1284                                 subtract_one = true;
1285                                 goto found;
1286                         }
1287
1288                         return 0;
1289                 }
1290
1291                 last_p = lp;
1292
1293                 n -= k;
1294                 t += k;
1295                 a = le64toh(array->entry_array.next_entry_array_offset);
1296         }
1297
1298         return 0;
1299
1300 found:
1301         if (subtract_one && t == 0 && i == 0)
1302                 return 0;
1303
1304         if (subtract_one && i == 0)
1305                 p = last_p;
1306         else if (subtract_one)
1307                 p = le64toh(array->entry_array.items[i-1]);
1308         else
1309                 p = le64toh(array->entry_array.items[i]);
1310
1311         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1312         if (r < 0)
1313                 return r;
1314
1315         if (ret)
1316                 *ret = o;
1317
1318         if (offset)
1319                 *offset = p;
1320
1321         if (idx)
1322                 *idx = t + i + (subtract_one ? -1 : 0);
1323
1324         return 1;
1325 }
1326
1327 static int generic_array_bisect_plus_one(JournalFile *f,
1328                                          uint64_t extra,
1329                                          uint64_t first,
1330                                          uint64_t n,
1331                                          uint64_t needle,
1332                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1333                                          direction_t direction,
1334                                          Object **ret,
1335                                          uint64_t *offset,
1336                                          uint64_t *idx) {
1337
1338         int r;
1339         bool step_back = false;
1340         Object *o;
1341
1342         assert(f);
1343         assert(test_object);
1344
1345         if (n <= 0)
1346                 return 0;
1347
1348         /* This bisects the array in object 'first', but first checks
1349          * an extra  */
1350         r = test_object(f, extra, needle);
1351         if (r < 0)
1352                 return r;
1353
1354         if (r == TEST_FOUND)
1355                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1356
1357         /* if we are looking with DIRECTION_UP then we need to first
1358            see if in the actual array there is a matching entry, and
1359            return the last one of that. But if there isn't any we need
1360            to return this one. Hence remember this, and return it
1361            below. */
1362         if (r == TEST_LEFT)
1363                 step_back = direction == DIRECTION_UP;
1364
1365         if (r == TEST_RIGHT) {
1366                 if (direction == DIRECTION_DOWN)
1367                         goto found;
1368                 else
1369                         return 0;
1370         }
1371
1372         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1373
1374         if (r == 0 && step_back)
1375                 goto found;
1376
1377         if (r > 0 && idx)
1378                 (*idx) ++;
1379
1380         return r;
1381
1382 found:
1383         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1384         if (r < 0)
1385                 return r;
1386
1387         if (ret)
1388                 *ret = o;
1389
1390         if (offset)
1391                 *offset = extra;
1392
1393         if (idx)
1394                 *idx = 0;
1395
1396         return 1;
1397 }
1398
1399 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1400         assert(f);
1401         assert(p > 0);
1402
1403         if (p == needle)
1404                 return TEST_FOUND;
1405         else if (p < needle)
1406                 return TEST_LEFT;
1407         else
1408                 return TEST_RIGHT;
1409 }
1410
1411 int journal_file_move_to_entry_by_offset(
1412                 JournalFile *f,
1413                 uint64_t p,
1414                 direction_t direction,
1415                 Object **ret,
1416                 uint64_t *offset) {
1417
1418         return generic_array_bisect(f,
1419                                     le64toh(f->header->entry_array_offset),
1420                                     le64toh(f->header->n_entries),
1421                                     p,
1422                                     test_object_offset,
1423                                     direction,
1424                                     ret, offset, NULL);
1425 }
1426
1427
1428 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1429         Object *o;
1430         int r;
1431
1432         assert(f);
1433         assert(p > 0);
1434
1435         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1436         if (r < 0)
1437                 return r;
1438
1439         if (le64toh(o->entry.seqnum) == needle)
1440                 return TEST_FOUND;
1441         else if (le64toh(o->entry.seqnum) < needle)
1442                 return TEST_LEFT;
1443         else
1444                 return TEST_RIGHT;
1445 }
1446
1447 int journal_file_move_to_entry_by_seqnum(
1448                 JournalFile *f,
1449                 uint64_t seqnum,
1450                 direction_t direction,
1451                 Object **ret,
1452                 uint64_t *offset) {
1453
1454         return generic_array_bisect(f,
1455                                     le64toh(f->header->entry_array_offset),
1456                                     le64toh(f->header->n_entries),
1457                                     seqnum,
1458                                     test_object_seqnum,
1459                                     direction,
1460                                     ret, offset, NULL);
1461 }
1462
1463 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1464         Object *o;
1465         int r;
1466
1467         assert(f);
1468         assert(p > 0);
1469
1470         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1471         if (r < 0)
1472                 return r;
1473
1474         if (le64toh(o->entry.realtime) == needle)
1475                 return TEST_FOUND;
1476         else if (le64toh(o->entry.realtime) < needle)
1477                 return TEST_LEFT;
1478         else
1479                 return TEST_RIGHT;
1480 }
1481
1482 int journal_file_move_to_entry_by_realtime(
1483                 JournalFile *f,
1484                 uint64_t realtime,
1485                 direction_t direction,
1486                 Object **ret,
1487                 uint64_t *offset) {
1488
1489         return generic_array_bisect(f,
1490                                     le64toh(f->header->entry_array_offset),
1491                                     le64toh(f->header->n_entries),
1492                                     realtime,
1493                                     test_object_realtime,
1494                                     direction,
1495                                     ret, offset, NULL);
1496 }
1497
1498 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1499         Object *o;
1500         int r;
1501
1502         assert(f);
1503         assert(p > 0);
1504
1505         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1506         if (r < 0)
1507                 return r;
1508
1509         if (le64toh(o->entry.monotonic) == needle)
1510                 return TEST_FOUND;
1511         else if (le64toh(o->entry.monotonic) < needle)
1512                 return TEST_LEFT;
1513         else
1514                 return TEST_RIGHT;
1515 }
1516
1517 int journal_file_move_to_entry_by_monotonic(
1518                 JournalFile *f,
1519                 sd_id128_t boot_id,
1520                 uint64_t monotonic,
1521                 direction_t direction,
1522                 Object **ret,
1523                 uint64_t *offset) {
1524
1525         char t[9+32+1] = "_BOOT_ID=";
1526         Object *o;
1527         int r;
1528
1529         assert(f);
1530
1531         sd_id128_to_string(boot_id, t + 9);
1532         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1533         if (r < 0)
1534                 return r;
1535         if (r == 0)
1536                 return -ENOENT;
1537
1538         return generic_array_bisect_plus_one(f,
1539                                              le64toh(o->data.entry_offset),
1540                                              le64toh(o->data.entry_array_offset),
1541                                              le64toh(o->data.n_entries),
1542                                              monotonic,
1543                                              test_object_monotonic,
1544                                              direction,
1545                                              ret, offset, NULL);
1546 }
1547
1548 int journal_file_next_entry(
1549                 JournalFile *f,
1550                 Object *o, uint64_t p,
1551                 direction_t direction,
1552                 Object **ret, uint64_t *offset) {
1553
1554         uint64_t i, n;
1555         int r;
1556
1557         assert(f);
1558         assert(p > 0 || !o);
1559
1560         n = le64toh(f->header->n_entries);
1561         if (n <= 0)
1562                 return 0;
1563
1564         if (!o)
1565                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1566         else {
1567                 if (o->object.type != OBJECT_ENTRY)
1568                         return -EINVAL;
1569
1570                 r = generic_array_bisect(f,
1571                                          le64toh(f->header->entry_array_offset),
1572                                          le64toh(f->header->n_entries),
1573                                          p,
1574                                          test_object_offset,
1575                                          DIRECTION_DOWN,
1576                                          NULL, NULL,
1577                                          &i);
1578                 if (r <= 0)
1579                         return r;
1580
1581                 if (direction == DIRECTION_DOWN) {
1582                         if (i >= n - 1)
1583                                 return 0;
1584
1585                         i++;
1586                 } else {
1587                         if (i <= 0)
1588                                 return 0;
1589
1590                         i--;
1591                 }
1592         }
1593
1594         /* And jump to it */
1595         return generic_array_get(f,
1596                                  le64toh(f->header->entry_array_offset),
1597                                  i,
1598                                  ret, offset);
1599 }
1600
1601 int journal_file_skip_entry(
1602                 JournalFile *f,
1603                 Object *o, uint64_t p,
1604                 int64_t skip,
1605                 Object **ret, uint64_t *offset) {
1606
1607         uint64_t i, n;
1608         int r;
1609
1610         assert(f);
1611         assert(o);
1612         assert(p > 0);
1613
1614         if (o->object.type != OBJECT_ENTRY)
1615                 return -EINVAL;
1616
1617         r = generic_array_bisect(f,
1618                                  le64toh(f->header->entry_array_offset),
1619                                  le64toh(f->header->n_entries),
1620                                  p,
1621                                  test_object_offset,
1622                                  DIRECTION_DOWN,
1623                                  NULL, NULL,
1624                                  &i);
1625         if (r <= 0)
1626                 return r;
1627
1628         /* Calculate new index */
1629         if (skip < 0) {
1630                 if ((uint64_t) -skip >= i)
1631                         i = 0;
1632                 else
1633                         i = i - (uint64_t) -skip;
1634         } else
1635                 i  += (uint64_t) skip;
1636
1637         n = le64toh(f->header->n_entries);
1638         if (n <= 0)
1639                 return -EBADMSG;
1640
1641         if (i >= n)
1642                 i = n-1;
1643
1644         return generic_array_get(f,
1645                                  le64toh(f->header->entry_array_offset),
1646                                  i,
1647                                  ret, offset);
1648 }
1649
1650 int journal_file_next_entry_for_data(
1651                 JournalFile *f,
1652                 Object *o, uint64_t p,
1653                 uint64_t data_offset,
1654                 direction_t direction,
1655                 Object **ret, uint64_t *offset) {
1656
1657         uint64_t n, i;
1658         int r;
1659         Object *d;
1660
1661         assert(f);
1662         assert(p > 0 || !o);
1663
1664         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1665         if (r < 0)
1666                 return r;
1667
1668         n = le64toh(d->data.n_entries);
1669         if (n <= 0)
1670                 return n;
1671
1672         if (!o)
1673                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1674         else {
1675                 if (o->object.type != OBJECT_ENTRY)
1676                         return -EINVAL;
1677
1678                 r = generic_array_bisect_plus_one(f,
1679                                                   le64toh(d->data.entry_offset),
1680                                                   le64toh(d->data.entry_array_offset),
1681                                                   le64toh(d->data.n_entries),
1682                                                   p,
1683                                                   test_object_offset,
1684                                                   DIRECTION_DOWN,
1685                                                   NULL, NULL,
1686                                                   &i);
1687
1688                 if (r <= 0)
1689                         return r;
1690
1691                 if (direction == DIRECTION_DOWN) {
1692                         if (i >= n - 1)
1693                                 return 0;
1694
1695                         i++;
1696                 } else {
1697                         if (i <= 0)
1698                                 return 0;
1699
1700                         i--;
1701                 }
1702
1703         }
1704
1705         return generic_array_get_plus_one(f,
1706                                           le64toh(d->data.entry_offset),
1707                                           le64toh(d->data.entry_array_offset),
1708                                           i,
1709                                           ret, offset);
1710 }
1711
1712 int journal_file_move_to_entry_by_offset_for_data(
1713                 JournalFile *f,
1714                 uint64_t data_offset,
1715                 uint64_t p,
1716                 direction_t direction,
1717                 Object **ret, uint64_t *offset) {
1718
1719         int r;
1720         Object *d;
1721
1722         assert(f);
1723
1724         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1725         if (r < 0)
1726                 return r;
1727
1728         return generic_array_bisect_plus_one(f,
1729                                              le64toh(d->data.entry_offset),
1730                                              le64toh(d->data.entry_array_offset),
1731                                              le64toh(d->data.n_entries),
1732                                              p,
1733                                              test_object_offset,
1734                                              direction,
1735                                              ret, offset, NULL);
1736 }
1737
1738 int journal_file_move_to_entry_by_monotonic_for_data(
1739                 JournalFile *f,
1740                 uint64_t data_offset,
1741                 sd_id128_t boot_id,
1742                 uint64_t monotonic,
1743                 direction_t direction,
1744                 Object **ret, uint64_t *offset) {
1745
1746         char t[9+32+1] = "_BOOT_ID=";
1747         Object *o, *d;
1748         int r;
1749         uint64_t b, z;
1750
1751         assert(f);
1752
1753         /* First, seek by time */
1754         sd_id128_to_string(boot_id, t + 9);
1755         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1756         if (r < 0)
1757                 return r;
1758         if (r == 0)
1759                 return -ENOENT;
1760
1761         r = generic_array_bisect_plus_one(f,
1762                                           le64toh(o->data.entry_offset),
1763                                           le64toh(o->data.entry_array_offset),
1764                                           le64toh(o->data.n_entries),
1765                                           monotonic,
1766                                           test_object_monotonic,
1767                                           direction,
1768                                           NULL, &z, NULL);
1769         if (r <= 0)
1770                 return r;
1771
1772         /* And now, continue seeking until we find an entry that
1773          * exists in both bisection arrays */
1774
1775         for (;;) {
1776                 Object *qo;
1777                 uint64_t p, q;
1778
1779                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1780                 if (r < 0)
1781                         return r;
1782
1783                 r = generic_array_bisect_plus_one(f,
1784                                                   le64toh(d->data.entry_offset),
1785                                                   le64toh(d->data.entry_array_offset),
1786                                                   le64toh(d->data.n_entries),
1787                                                   z,
1788                                                   test_object_offset,
1789                                                   direction,
1790                                                   NULL, &p, NULL);
1791                 if (r <= 0)
1792                         return r;
1793
1794                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1795                 if (r < 0)
1796                         return r;
1797
1798                 r = generic_array_bisect_plus_one(f,
1799                                                   le64toh(o->data.entry_offset),
1800                                                   le64toh(o->data.entry_array_offset),
1801                                                   le64toh(o->data.n_entries),
1802                                                   p,
1803                                                   test_object_offset,
1804                                                   direction,
1805                                                   &qo, &q, NULL);
1806
1807                 if (r <= 0)
1808                         return r;
1809
1810                 if (p == q) {
1811                         if (ret)
1812                                 *ret = qo;
1813                         if (offset)
1814                                 *offset = q;
1815
1816                         return 1;
1817                 }
1818
1819                 z = q;
1820         }
1821
1822         return 0;
1823 }
1824
1825 int journal_file_move_to_entry_by_seqnum_for_data(
1826                 JournalFile *f,
1827                 uint64_t data_offset,
1828                 uint64_t seqnum,
1829                 direction_t direction,
1830                 Object **ret, uint64_t *offset) {
1831
1832         Object *d;
1833         int r;
1834
1835         assert(f);
1836
1837         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1838         if (r < 0)
1839                 return r;
1840
1841         return generic_array_bisect_plus_one(f,
1842                                              le64toh(d->data.entry_offset),
1843                                              le64toh(d->data.entry_array_offset),
1844                                              le64toh(d->data.n_entries),
1845                                              seqnum,
1846                                              test_object_seqnum,
1847                                              direction,
1848                                              ret, offset, NULL);
1849 }
1850
1851 int journal_file_move_to_entry_by_realtime_for_data(
1852                 JournalFile *f,
1853                 uint64_t data_offset,
1854                 uint64_t realtime,
1855                 direction_t direction,
1856                 Object **ret, uint64_t *offset) {
1857
1858         Object *d;
1859         int r;
1860
1861         assert(f);
1862
1863         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1864         if (r < 0)
1865                 return r;
1866
1867         return generic_array_bisect_plus_one(f,
1868                                              le64toh(d->data.entry_offset),
1869                                              le64toh(d->data.entry_array_offset),
1870                                              le64toh(d->data.n_entries),
1871                                              realtime,
1872                                              test_object_realtime,
1873                                              direction,
1874                                              ret, offset, NULL);
1875 }
1876
1877 void journal_file_dump(JournalFile *f) {
1878         Object *o;
1879         int r;
1880         uint64_t p;
1881
1882         assert(f);
1883
1884         journal_file_print_header(f);
1885
1886         p = le64toh(f->header->header_size);
1887         while (p != 0) {
1888                 r = journal_file_move_to_object(f, -1, p, &o);
1889                 if (r < 0)
1890                         goto fail;
1891
1892                 switch (o->object.type) {
1893
1894                 case OBJECT_UNUSED:
1895                         printf("Type: OBJECT_UNUSED\n");
1896                         break;
1897
1898                 case OBJECT_DATA:
1899                         printf("Type: OBJECT_DATA\n");
1900                         break;
1901
1902                 case OBJECT_ENTRY:
1903                         printf("Type: OBJECT_ENTRY seqnum=%llu monotonic=%llu realtime=%llu\n",
1904                                (unsigned long long) le64toh(o->entry.seqnum),
1905                                (unsigned long long) le64toh(o->entry.monotonic),
1906                                (unsigned long long) le64toh(o->entry.realtime));
1907                         break;
1908
1909                 case OBJECT_FIELD_HASH_TABLE:
1910                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1911                         break;
1912
1913                 case OBJECT_DATA_HASH_TABLE:
1914                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1915                         break;
1916
1917                 case OBJECT_ENTRY_ARRAY:
1918                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1919                         break;
1920
1921                 case OBJECT_TAG:
1922                         printf("Type: OBJECT_TAG seqnum=%llu epoch=%llu\n",
1923                                (unsigned long long) le64toh(o->tag.seqnum),
1924                                (unsigned long long) le64toh(o->tag.epoch));
1925                         break;
1926                 }
1927
1928                 if (o->object.flags & OBJECT_COMPRESSED)
1929                         printf("Flags: COMPRESSED\n");
1930
1931                 if (p == le64toh(f->header->tail_object_offset))
1932                         p = 0;
1933                 else
1934                         p = p + ALIGN64(le64toh(o->object.size));
1935         }
1936
1937         return;
1938 fail:
1939         log_error("File corrupt");
1940 }
1941
1942 void journal_file_print_header(JournalFile *f) {
1943         char a[33], b[33], c[33];
1944         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
1945         struct stat st;
1946         char bytes[FORMAT_BYTES_MAX];
1947
1948         assert(f);
1949
1950         printf("File Path: %s\n"
1951                "File ID: %s\n"
1952                "Machine ID: %s\n"
1953                "Boot ID: %s\n"
1954                "Sequential Number ID: %s\n"
1955                "State: %s\n"
1956                "Compatible Flags:%s%s\n"
1957                "Incompatible Flags:%s%s\n"
1958                "Header size: %llu\n"
1959                "Arena size: %llu\n"
1960                "Data Hash Table Size: %llu\n"
1961                "Field Hash Table Size: %llu\n"
1962                "Rotate Suggested: %s\n"
1963                "Head Sequential Number: %llu\n"
1964                "Tail Sequential Number: %llu\n"
1965                "Head Realtime Timestamp: %s\n"
1966                "Tail Realtime Timestamp: %s\n"
1967                "Objects: %llu\n"
1968                "Entry Objects: %llu\n",
1969                f->path,
1970                sd_id128_to_string(f->header->file_id, a),
1971                sd_id128_to_string(f->header->machine_id, b),
1972                sd_id128_to_string(f->header->boot_id, c),
1973                sd_id128_to_string(f->header->seqnum_id, c),
1974                f->header->state == STATE_OFFLINE ? "OFFLINE" :
1975                f->header->state == STATE_ONLINE ? "ONLINE" :
1976                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
1977                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
1978                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
1979                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
1980                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
1981                (unsigned long long) le64toh(f->header->header_size),
1982                (unsigned long long) le64toh(f->header->arena_size),
1983                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1984                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
1985                yes_no(journal_file_rotate_suggested(f, 0)),
1986                (unsigned long long) le64toh(f->header->head_entry_seqnum),
1987                (unsigned long long) le64toh(f->header->tail_entry_seqnum),
1988                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
1989                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
1990                (unsigned long long) le64toh(f->header->n_objects),
1991                (unsigned long long) le64toh(f->header->n_entries));
1992
1993         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1994                 printf("Data Objects: %llu\n"
1995                        "Data Hash Table Fill: %.1f%%\n",
1996                        (unsigned long long) le64toh(f->header->n_data),
1997                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
1998
1999         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2000                 printf("Field Objects: %llu\n"
2001                        "Field Hash Table Fill: %.1f%%\n",
2002                        (unsigned long long) le64toh(f->header->n_fields),
2003                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2004
2005         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2006                 printf("Tag Objects: %llu\n",
2007                        (unsigned long long) le64toh(f->header->n_tags));
2008         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2009                 printf("Entry Array Objects: %llu\n",
2010                        (unsigned long long) le64toh(f->header->n_entry_arrays));
2011
2012         if (fstat(f->fd, &st) >= 0)
2013                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2014 }
2015
2016 int journal_file_open(
2017                 const char *fname,
2018                 int flags,
2019                 mode_t mode,
2020                 bool compress,
2021                 bool seal,
2022                 JournalMetrics *metrics,
2023                 MMapCache *mmap_cache,
2024                 JournalFile *template,
2025                 JournalFile **ret) {
2026
2027         JournalFile *f;
2028         int r;
2029         bool newly_created = false;
2030
2031         assert(fname);
2032         assert(ret);
2033
2034         if ((flags & O_ACCMODE) != O_RDONLY &&
2035             (flags & O_ACCMODE) != O_RDWR)
2036                 return -EINVAL;
2037
2038         if (!endswith(fname, ".journal") &&
2039             !endswith(fname, ".journal~"))
2040                 return -EINVAL;
2041
2042         f = new0(JournalFile, 1);
2043         if (!f)
2044                 return -ENOMEM;
2045
2046         f->fd = -1;
2047         f->mode = mode;
2048
2049         f->flags = flags;
2050         f->prot = prot_from_flags(flags);
2051         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2052 #ifdef HAVE_XZ
2053         f->compress = compress;
2054 #endif
2055 #ifdef HAVE_GCRYPT
2056         f->seal = seal;
2057 #endif
2058
2059         if (mmap_cache)
2060                 f->mmap = mmap_cache_ref(mmap_cache);
2061         else {
2062                 f->mmap = mmap_cache_new();
2063                 if (!f->mmap) {
2064                         r = -ENOMEM;
2065                         goto fail;
2066                 }
2067         }
2068
2069         f->path = strdup(fname);
2070         if (!f->path) {
2071                 r = -ENOMEM;
2072                 goto fail;
2073         }
2074
2075         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2076         if (f->fd < 0) {
2077                 r = -errno;
2078                 goto fail;
2079         }
2080
2081         if (fstat(f->fd, &f->last_stat) < 0) {
2082                 r = -errno;
2083                 goto fail;
2084         }
2085
2086         if (f->last_stat.st_size == 0 && f->writable) {
2087 #ifdef HAVE_XATTR
2088                 uint64_t crtime;
2089
2090                 /* Let's attach the creation time to the journal file,
2091                  * so that the vacuuming code knows the age of this
2092                  * file even if the file might end up corrupted one
2093                  * day... Ideally we'd just use the creation time many
2094                  * file systems maintain for each file, but there is
2095                  * currently no usable API to query this, hence let's
2096                  * emulate this via extended attributes. If extended
2097                  * attributes are not supported we'll just skip this,
2098                  * and rely solely on mtime/atime/ctime of the file.*/
2099
2100                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2101                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2102 #endif
2103
2104 #ifdef HAVE_GCRYPT
2105                 /* Try to load the FSPRG state, and if we can't, then
2106                  * just don't do sealing */
2107                 if (f->seal) {
2108                         r = journal_file_fss_load(f);
2109                         if (r < 0)
2110                                 f->seal = false;
2111                 }
2112 #endif
2113
2114                 r = journal_file_init_header(f, template);
2115                 if (r < 0)
2116                         goto fail;
2117
2118                 if (fstat(f->fd, &f->last_stat) < 0) {
2119                         r = -errno;
2120                         goto fail;
2121                 }
2122
2123                 newly_created = true;
2124         }
2125
2126         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2127                 r = -EIO;
2128                 goto fail;
2129         }
2130
2131         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2132         if (f->header == MAP_FAILED) {
2133                 f->header = NULL;
2134                 r = -errno;
2135                 goto fail;
2136         }
2137
2138         if (!newly_created) {
2139                 r = journal_file_verify_header(f);
2140                 if (r < 0)
2141                         goto fail;
2142         }
2143
2144 #ifdef HAVE_GCRYPT
2145         if (!newly_created && f->writable) {
2146                 r = journal_file_fss_load(f);
2147                 if (r < 0)
2148                         goto fail;
2149         }
2150 #endif
2151
2152         if (f->writable) {
2153                 if (metrics) {
2154                         journal_default_metrics(metrics, f->fd);
2155                         f->metrics = *metrics;
2156                 } else if (template)
2157                         f->metrics = template->metrics;
2158
2159                 r = journal_file_refresh_header(f);
2160                 if (r < 0)
2161                         goto fail;
2162         }
2163
2164 #ifdef HAVE_GCRYPT
2165         r = journal_file_hmac_setup(f);
2166         if (r < 0)
2167                 goto fail;
2168 #endif
2169
2170         if (newly_created) {
2171                 r = journal_file_setup_field_hash_table(f);
2172                 if (r < 0)
2173                         goto fail;
2174
2175                 r = journal_file_setup_data_hash_table(f);
2176                 if (r < 0)
2177                         goto fail;
2178
2179 #ifdef HAVE_GCRYPT
2180                 r = journal_file_append_first_tag(f);
2181                 if (r < 0)
2182                         goto fail;
2183 #endif
2184         }
2185
2186         r = journal_file_map_field_hash_table(f);
2187         if (r < 0)
2188                 goto fail;
2189
2190         r = journal_file_map_data_hash_table(f);
2191         if (r < 0)
2192                 goto fail;
2193
2194         *ret = f;
2195         return 0;
2196
2197 fail:
2198         journal_file_close(f);
2199
2200         return r;
2201 }
2202
2203 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2204         char *p;
2205         size_t l;
2206         JournalFile *old_file, *new_file = NULL;
2207         int r;
2208
2209         assert(f);
2210         assert(*f);
2211
2212         old_file = *f;
2213
2214         if (!old_file->writable)
2215                 return -EINVAL;
2216
2217         if (!endswith(old_file->path, ".journal"))
2218                 return -EINVAL;
2219
2220         l = strlen(old_file->path);
2221
2222         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2223         if (!p)
2224                 return -ENOMEM;
2225
2226         memcpy(p, old_file->path, l - 8);
2227         p[l-8] = '@';
2228         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2229         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2230                  "-%016llx-%016llx.journal",
2231                  (unsigned long long) le64toh((*f)->header->head_entry_seqnum),
2232                  (unsigned long long) le64toh((*f)->header->head_entry_realtime));
2233
2234         r = rename(old_file->path, p);
2235         free(p);
2236
2237         if (r < 0)
2238                 return -errno;
2239
2240         old_file->header->state = STATE_ARCHIVED;
2241
2242         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2243         journal_file_close(old_file);
2244
2245         *f = new_file;
2246         return r;
2247 }
2248
2249 int journal_file_open_reliably(
2250                 const char *fname,
2251                 int flags,
2252                 mode_t mode,
2253                 bool compress,
2254                 bool seal,
2255                 JournalMetrics *metrics,
2256                 MMapCache *mmap_cache,
2257                 JournalFile *template,
2258                 JournalFile **ret) {
2259
2260         int r;
2261         size_t l;
2262         char *p;
2263
2264         r = journal_file_open(fname, flags, mode, compress, seal,
2265                               metrics, mmap_cache, template, ret);
2266         if (r != -EBADMSG && /* corrupted */
2267             r != -ENODATA && /* truncated */
2268             r != -EHOSTDOWN && /* other machine */
2269             r != -EPROTONOSUPPORT && /* incompatible feature */
2270             r != -EBUSY && /* unclean shutdown */
2271             r != -ESHUTDOWN /* already archived */)
2272                 return r;
2273
2274         if ((flags & O_ACCMODE) == O_RDONLY)
2275                 return r;
2276
2277         if (!(flags & O_CREAT))
2278                 return r;
2279
2280         if (!endswith(fname, ".journal"))
2281                 return r;
2282
2283         /* The file is corrupted. Rotate it away and try it again (but only once) */
2284
2285         l = strlen(fname);
2286         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2287                      (int) (l-8), fname,
2288                      (unsigned long long) now(CLOCK_REALTIME),
2289                      random_ull()) < 0)
2290                 return -ENOMEM;
2291
2292         r = rename(fname, p);
2293         free(p);
2294         if (r < 0)
2295                 return -errno;
2296
2297         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2298
2299         return journal_file_open(fname, flags, mode, compress, seal,
2300                                  metrics, mmap_cache, template, ret);
2301 }
2302
2303
2304 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2305         uint64_t i, n;
2306         uint64_t q, xor_hash = 0;
2307         int r;
2308         EntryItem *items;
2309         dual_timestamp ts;
2310
2311         assert(from);
2312         assert(to);
2313         assert(o);
2314         assert(p);
2315
2316         if (!to->writable)
2317                 return -EPERM;
2318
2319         ts.monotonic = le64toh(o->entry.monotonic);
2320         ts.realtime = le64toh(o->entry.realtime);
2321
2322         if (to->tail_entry_monotonic_valid &&
2323             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2324                 return -EINVAL;
2325
2326         n = journal_file_entry_n_items(o);
2327         items = alloca(sizeof(EntryItem) * n);
2328
2329         for (i = 0; i < n; i++) {
2330                 uint64_t l, h;
2331                 le64_t le_hash;
2332                 size_t t;
2333                 void *data;
2334                 Object *u;
2335
2336                 q = le64toh(o->entry.items[i].object_offset);
2337                 le_hash = o->entry.items[i].hash;
2338
2339                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2340                 if (r < 0)
2341                         return r;
2342
2343                 if (le_hash != o->data.hash)
2344                         return -EBADMSG;
2345
2346                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2347                 t = (size_t) l;
2348
2349                 /* We hit the limit on 32bit machines */
2350                 if ((uint64_t) t != l)
2351                         return -E2BIG;
2352
2353                 if (o->object.flags & OBJECT_COMPRESSED) {
2354 #ifdef HAVE_XZ
2355                         uint64_t rsize;
2356
2357                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2358                                 return -EBADMSG;
2359
2360                         data = from->compress_buffer;
2361                         l = rsize;
2362 #else
2363                         return -EPROTONOSUPPORT;
2364 #endif
2365                 } else
2366                         data = o->data.payload;
2367
2368                 r = journal_file_append_data(to, data, l, &u, &h);
2369                 if (r < 0)
2370                         return r;
2371
2372                 xor_hash ^= le64toh(u->data.hash);
2373                 items[i].object_offset = htole64(h);
2374                 items[i].hash = u->data.hash;
2375
2376                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2377                 if (r < 0)
2378                         return r;
2379         }
2380
2381         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2382 }
2383
2384 void journal_default_metrics(JournalMetrics *m, int fd) {
2385         uint64_t fs_size = 0;
2386         struct statvfs ss;
2387         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2388
2389         assert(m);
2390         assert(fd >= 0);
2391
2392         if (fstatvfs(fd, &ss) >= 0)
2393                 fs_size = ss.f_frsize * ss.f_blocks;
2394
2395         if (m->max_use == (uint64_t) -1) {
2396
2397                 if (fs_size > 0) {
2398                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2399
2400                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2401                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2402
2403                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2404                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2405                 } else
2406                         m->max_use = DEFAULT_MAX_USE_LOWER;
2407         } else {
2408                 m->max_use = PAGE_ALIGN(m->max_use);
2409
2410                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2411                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2412         }
2413
2414         if (m->max_size == (uint64_t) -1) {
2415                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2416
2417                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2418                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2419         } else
2420                 m->max_size = PAGE_ALIGN(m->max_size);
2421
2422         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2423                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2424
2425         if (m->max_size*2 > m->max_use)
2426                 m->max_use = m->max_size*2;
2427
2428         if (m->min_size == (uint64_t) -1)
2429                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2430         else {
2431                 m->min_size = PAGE_ALIGN(m->min_size);
2432
2433                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2434                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2435
2436                 if (m->min_size > m->max_size)
2437                         m->max_size = m->min_size;
2438         }
2439
2440         if (m->keep_free == (uint64_t) -1) {
2441
2442                 if (fs_size > 0) {
2443                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2444
2445                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2446                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2447
2448                 } else
2449                         m->keep_free = DEFAULT_KEEP_FREE;
2450         }
2451
2452         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2453                   format_bytes(a, sizeof(a), m->max_use),
2454                   format_bytes(b, sizeof(b), m->max_size),
2455                   format_bytes(c, sizeof(c), m->min_size),
2456                   format_bytes(d, sizeof(d), m->keep_free));
2457 }
2458
2459 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2460         assert(f);
2461         assert(from || to);
2462
2463         if (from) {
2464                 if (f->header->head_entry_realtime == 0)
2465                         return -ENOENT;
2466
2467                 *from = le64toh(f->header->head_entry_realtime);
2468         }
2469
2470         if (to) {
2471                 if (f->header->tail_entry_realtime == 0)
2472                         return -ENOENT;
2473
2474                 *to = le64toh(f->header->tail_entry_realtime);
2475         }
2476
2477         return 1;
2478 }
2479
2480 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2481         char t[9+32+1] = "_BOOT_ID=";
2482         Object *o;
2483         uint64_t p;
2484         int r;
2485
2486         assert(f);
2487         assert(from || to);
2488
2489         sd_id128_to_string(boot_id, t + 9);
2490
2491         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2492         if (r <= 0)
2493                 return r;
2494
2495         if (le64toh(o->data.n_entries) <= 0)
2496                 return 0;
2497
2498         if (from) {
2499                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2500                 if (r < 0)
2501                         return r;
2502
2503                 *from = le64toh(o->entry.monotonic);
2504         }
2505
2506         if (to) {
2507                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2508                 if (r < 0)
2509                         return r;
2510
2511                 r = generic_array_get_plus_one(f,
2512                                                le64toh(o->data.entry_offset),
2513                                                le64toh(o->data.entry_array_offset),
2514                                                le64toh(o->data.n_entries)-1,
2515                                                &o, NULL);
2516                 if (r <= 0)
2517                         return r;
2518
2519                 *to = le64toh(o->entry.monotonic);
2520         }
2521
2522         return 1;
2523 }
2524
2525 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2526         assert(f);
2527
2528         /* If we gained new header fields we gained new features,
2529          * hence suggest a rotation */
2530         if (le64toh(f->header->header_size) < sizeof(Header)) {
2531                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2532                 return true;
2533         }
2534
2535         /* Let's check if the hash tables grew over a certain fill
2536          * level (75%, borrowing this value from Java's hash table
2537          * implementation), and if so suggest a rotation. To calculate
2538          * the fill level we need the n_data field, which only exists
2539          * in newer versions. */
2540
2541         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2542                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2543                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2544                                   f->path,
2545                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2546                                   (unsigned long long) le64toh(f->header->n_data),
2547                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2548                                   (unsigned long long) (f->last_stat.st_size),
2549                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2550                         return true;
2551                 }
2552
2553         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2554                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2555                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2556                                   f->path,
2557                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2558                                   (unsigned long long) le64toh(f->header->n_fields),
2559                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2560                         return true;
2561                 }
2562
2563         if (max_file_usec > 0) {
2564                 usec_t t, h;
2565
2566                 h = le64toh(f->header->head_entry_realtime);
2567                 t = now(CLOCK_REALTIME);
2568
2569                 if (h > 0 && t > h + max_file_usec)
2570                         return true;
2571         }
2572
2573         return false;
2574 }