chiark / gitweb /
journald: don't reposition window if we don't have to
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
33 #include "lookup3.h"
34 #include "compress.h"
35 #include "fsprg.h"
36
37 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem)) /* in bytes; lower bound used when sizing the data hash table */
38 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem)) /* in bytes; size used for the field hash table */
39
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL) /* data payloads smaller than this many bytes are never compressed */
41
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */
44
45 /* These are the lower and upper bounds if we deduce the max_use value
46  * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
49
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
52
53 /* This is the upper bound if we deduce the keep_free value from the
54  * file system size */
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
56
57 /* This is the keep_free value when we can't determine the file system
58  * size */
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */
60
61 /* n_data was the first entry we added after the initial file format design */
62 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
63
64 void journal_file_close(JournalFile *f) {
65         assert(f);
66
67 #ifdef HAVE_GCRYPT
68         /* Write the final tag */
69         if (f->seal && f->writable)
70                 journal_file_append_tag(f);
71 #endif
72
73         /* Sync everything to disk, before we mark the file offline */
74         if (f->mmap && f->fd >= 0)
75                 mmap_cache_close_fd(f->mmap, f->fd);
76
77         if (f->writable && f->fd >= 0)
78                 fdatasync(f->fd);
79
80         if (f->header) {
81                 /* Mark the file offline. Don't override the archived state if it already is set */
82                 if (f->writable && f->header->state == STATE_ONLINE)
83                         f->header->state = STATE_OFFLINE;
84
85                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
86         }
87
88         if (f->fd >= 0)
89                 close_nointr_nofail(f->fd);
90
91         free(f->path);
92
93         if (f->mmap)
94                 mmap_cache_unref(f->mmap);
95
96 #ifdef HAVE_XZ
97         free(f->compress_buffer);
98 #endif
99
100 #ifdef HAVE_GCRYPT
101         if (f->fss_file)
102                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
103         else if (f->fsprg_state)
104                 free(f->fsprg_state);
105
106         free(f->fsprg_seed);
107
108         if (f->hmac)
109                 gcry_md_close(f->hmac);
110 #endif
111
112         free(f);
113 }
114
115 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
116         Header h;
117         ssize_t k;
118         int r;
119
120         assert(f);
121
122         zero(h);
123         memcpy(h.signature, HEADER_SIGNATURE, 8);
124         h.header_size = htole64(ALIGN64(sizeof(h)));
125
126         h.incompatible_flags =
127                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
128
129         h.compatible_flags =
130                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
131
132         r = sd_id128_randomize(&h.file_id);
133         if (r < 0)
134                 return r;
135
136         if (template) {
137                 h.seqnum_id = template->header->seqnum_id;
138                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
139         } else
140                 h.seqnum_id = h.file_id;
141
142         k = pwrite(f->fd, &h, sizeof(h), 0);
143         if (k < 0)
144                 return -errno;
145
146         if (k != sizeof(h))
147                 return -EIO;
148
149         return 0;
150 }
151
152 static int journal_file_refresh_header(JournalFile *f) {
153         int r;
154         sd_id128_t boot_id;
155
156         assert(f);
157
158         r = sd_id128_get_machine(&f->header->machine_id);
159         if (r < 0)
160                 return r;
161
162         r = sd_id128_get_boot(&boot_id);
163         if (r < 0)
164                 return r;
165
166         if (sd_id128_equal(boot_id, f->header->boot_id))
167                 f->tail_entry_monotonic_valid = true;
168
169         f->header->boot_id = boot_id;
170
171         f->header->state = STATE_ONLINE;
172
173         /* Sync the online state to disk */
174         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
175         fdatasync(f->fd);
176
177         return 0;
178 }
179
180 static int journal_file_verify_header(JournalFile *f) {
181         assert(f);
182
183         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
184                 return -EBADMSG;
185
186         /* In both read and write mode we refuse to open files with
187          * incompatible flags we don't know */
188 #ifdef HAVE_XZ
189         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
190                 return -EPROTONOSUPPORT;
191 #else
192         if (f->header->incompatible_flags != 0)
193                 return -EPROTONOSUPPORT;
194 #endif
195
196         /* When open for writing we refuse to open files with
197          * compatible flags, too */
198         if (f->writable) {
199 #ifdef HAVE_GCRYPT
200                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
201                         return -EPROTONOSUPPORT;
202 #else
203                 if (f->header->compatible_flags != 0)
204                         return -EPROTONOSUPPORT;
205 #endif
206         }
207
208         if (f->header->state >= _STATE_MAX)
209                 return -EBADMSG;
210
211         /* The first addition was n_data, so check that we are at least this large */
212         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
213                 return -EBADMSG;
214
215         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
216                 return -EBADMSG;
217
218         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
219                 return -ENODATA;
220
221         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
222                 return -ENODATA;
223
224         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
225             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
226             !VALID64(le64toh(f->header->tail_object_offset)) ||
227             !VALID64(le64toh(f->header->entry_array_offset)))
228                 return -ENODATA;
229
230         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
231             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
232             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
233             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
234                 return -ENODATA;
235
236         if (f->writable) {
237                 uint8_t state;
238                 sd_id128_t machine_id;
239                 int r;
240
241                 r = sd_id128_get_machine(&machine_id);
242                 if (r < 0)
243                         return r;
244
245                 if (!sd_id128_equal(machine_id, f->header->machine_id))
246                         return -EHOSTDOWN;
247
248                 state = f->header->state;
249
250                 if (state == STATE_ONLINE) {
251                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
252                         return -EBUSY;
253                 } else if (state == STATE_ARCHIVED)
254                         return -ESHUTDOWN;
255                 else if (state != STATE_OFFLINE) {
256                         log_debug("Journal file %s has unknown state %u.", f->path, state);
257                         return -EBUSY;
258                 }
259         }
260
261         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
262
263         if (f->writable)
264                 f->seal = JOURNAL_HEADER_SEALED(f->header);
265
266         return 0;
267 }
268
269 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
270         uint64_t old_size, new_size;
271         int r;
272
273         assert(f);
274
275         /* We assume that this file is not sparse, and we know that
276          * for sure, since we always call posix_fallocate()
277          * ourselves */
278
279         old_size =
280                 le64toh(f->header->header_size) +
281                 le64toh(f->header->arena_size);
282
283         new_size = PAGE_ALIGN(offset + size);
284         if (new_size < le64toh(f->header->header_size))
285                 new_size = le64toh(f->header->header_size);
286
287         if (new_size <= old_size)
288                 return 0;
289
290         if (f->metrics.max_size > 0 &&
291             new_size > f->metrics.max_size)
292                 return -E2BIG;
293
294         if (new_size > f->metrics.min_size &&
295             f->metrics.keep_free > 0) {
296                 struct statvfs svfs;
297
298                 if (fstatvfs(f->fd, &svfs) >= 0) {
299                         uint64_t available;
300
301                         available = svfs.f_bfree * svfs.f_bsize;
302
303                         if (available >= f->metrics.keep_free)
304                                 available -= f->metrics.keep_free;
305                         else
306                                 available = 0;
307
308                         if (new_size - old_size > available)
309                                 return -E2BIG;
310                 }
311         }
312
313         /* Note that the glibc fallocate() fallback is very
314            inefficient, hence we try to minimize the allocation area
315            as we can. */
316         r = posix_fallocate(f->fd, old_size, new_size - old_size);
317         if (r != 0)
318                 return -r;
319
320         if (fstat(f->fd, &f->last_stat) < 0)
321                 return -errno;
322
323         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
324
325         return 0;
326 }
327
328 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
329         assert(f);
330         assert(ret);
331
332         if (size <= 0)
333                 return -EINVAL;
334
335         /* Avoid SIGBUS on invalid accesses */
336         if (offset + size > (uint64_t) f->last_stat.st_size) {
337                 /* Hmm, out of range? Let's refresh the fstat() data
338                  * first, before we trust that check. */
339
340                 if (fstat(f->fd, &f->last_stat) < 0 ||
341                     offset + size > (uint64_t) f->last_stat.st_size)
342                         return -EADDRNOTAVAIL;
343         }
344
345         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
346 }
347
348 static uint64_t minimum_header_size(Object *o) {
349
350         static uint64_t table[] = {
351                 [OBJECT_DATA] = sizeof(DataObject),
352                 [OBJECT_FIELD] = sizeof(FieldObject),
353                 [OBJECT_ENTRY] = sizeof(EntryObject),
354                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
355                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
356                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
357                 [OBJECT_TAG] = sizeof(TagObject),
358         };
359
360         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
361                 return sizeof(ObjectHeader);
362
363         return table[o->object.type];
364 }
365
366 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
367         int r;
368         void *t;
369         Object *o;
370         uint64_t s;
371         unsigned context;
372
373         assert(f);
374         assert(ret);
375
376         /* Objects may only be located at multiple of 64 bit */
377         if (!VALID64(offset))
378                 return -EFAULT;
379
380         /* One context for each type, plus one catch-all for the rest */
381         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
382
383         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
384         if (r < 0)
385                 return r;
386
387         o = (Object*) t;
388         s = le64toh(o->object.size);
389
390         if (s < sizeof(ObjectHeader))
391                 return -EBADMSG;
392
393         if (o->object.type <= OBJECT_UNUSED)
394                 return -EBADMSG;
395
396         if (s < minimum_header_size(o))
397                 return -EBADMSG;
398
399         if (type >= 0 && o->object.type != type)
400                 return -EBADMSG;
401
402         if (s > sizeof(ObjectHeader)) {
403                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
404                 if (r < 0)
405                         return r;
406
407                 o = (Object*) t;
408         }
409
410         *ret = o;
411         return 0;
412 }
413
414 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
415         uint64_t r;
416
417         assert(f);
418
419         r = le64toh(f->header->tail_entry_seqnum) + 1;
420
421         if (seqnum) {
422                 /* If an external seqnum counter was passed, we update
423                  * both the local and the external one, and set it to
424                  * the maximum of both */
425
426                 if (*seqnum + 1 > r)
427                         r = *seqnum + 1;
428
429                 *seqnum = r;
430         }
431
432         f->header->tail_entry_seqnum = htole64(r);
433
434         if (f->header->head_entry_seqnum == 0)
435                 f->header->head_entry_seqnum = htole64(r);
436
437         return r;
438 }
439
440 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
441         int r;
442         uint64_t p;
443         Object *tail, *o;
444         void *t;
445
446         assert(f);
447         assert(type > 0 && type < _OBJECT_TYPE_MAX);
448         assert(size >= sizeof(ObjectHeader));
449         assert(offset);
450         assert(ret);
451
452         p = le64toh(f->header->tail_object_offset);
453         if (p == 0)
454                 p = le64toh(f->header->header_size);
455         else {
456                 r = journal_file_move_to_object(f, -1, p, &tail);
457                 if (r < 0)
458                         return r;
459
460                 p += ALIGN64(le64toh(tail->object.size));
461         }
462
463         r = journal_file_allocate(f, p, size);
464         if (r < 0)
465                 return r;
466
467         r = journal_file_move_to(f, type, false, p, size, &t);
468         if (r < 0)
469                 return r;
470
471         o = (Object*) t;
472
473         zero(o->object);
474         o->object.type = type;
475         o->object.size = htole64(size);
476
477         f->header->tail_object_offset = htole64(p);
478         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
479
480         *ret = o;
481         *offset = p;
482
483         return 0;
484 }
485
486 static int journal_file_setup_data_hash_table(JournalFile *f) {
487         uint64_t s, p;
488         Object *o;
489         int r;
490
491         assert(f);
492
493         /* We estimate that we need 1 hash table entry per 768 of
494            journal file and we want to make sure we never get beyond
495            75% fill level. Calculate the hash table size for the
496            maximum file size based on these metrics. */
497
498         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
499         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
500                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
501
502         log_debug("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
503
504         r = journal_file_append_object(f,
505                                        OBJECT_DATA_HASH_TABLE,
506                                        offsetof(Object, hash_table.items) + s,
507                                        &o, &p);
508         if (r < 0)
509                 return r;
510
511         memset(o->hash_table.items, 0, s);
512
513         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
514         f->header->data_hash_table_size = htole64(s);
515
516         return 0;
517 }
518
519 static int journal_file_setup_field_hash_table(JournalFile *f) {
520         uint64_t s, p;
521         Object *o;
522         int r;
523
524         assert(f);
525
526         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
527         r = journal_file_append_object(f,
528                                        OBJECT_FIELD_HASH_TABLE,
529                                        offsetof(Object, hash_table.items) + s,
530                                        &o, &p);
531         if (r < 0)
532                 return r;
533
534         memset(o->hash_table.items, 0, s);
535
536         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
537         f->header->field_hash_table_size = htole64(s);
538
539         return 0;
540 }
541
542 static int journal_file_map_data_hash_table(JournalFile *f) {
543         uint64_t s, p;
544         void *t;
545         int r;
546
547         assert(f);
548
549         p = le64toh(f->header->data_hash_table_offset);
550         s = le64toh(f->header->data_hash_table_size);
551
552         r = journal_file_move_to(f,
553                                  OBJECT_DATA_HASH_TABLE,
554                                  true,
555                                  p, s,
556                                  &t);
557         if (r < 0)
558                 return r;
559
560         f->data_hash_table = t;
561         return 0;
562 }
563
564 static int journal_file_map_field_hash_table(JournalFile *f) {
565         uint64_t s, p;
566         void *t;
567         int r;
568
569         assert(f);
570
571         p = le64toh(f->header->field_hash_table_offset);
572         s = le64toh(f->header->field_hash_table_size);
573
574         r = journal_file_move_to(f,
575                                  OBJECT_FIELD_HASH_TABLE,
576                                  true,
577                                  p, s,
578                                  &t);
579         if (r < 0)
580                 return r;
581
582         f->field_hash_table = t;
583         return 0;
584 }
585
586 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
587         uint64_t p, h;
588         int r;
589
590         assert(f);
591         assert(o);
592         assert(offset > 0);
593         assert(o->object.type == OBJECT_DATA);
594
595         /* This might alter the window we are looking at */
596
597         o->data.next_hash_offset = o->data.next_field_offset = 0;
598         o->data.entry_offset = o->data.entry_array_offset = 0;
599         o->data.n_entries = 0;
600
601         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
602         p = le64toh(f->data_hash_table[h].tail_hash_offset);
603         if (p == 0) {
604                 /* Only entry in the hash table is easy */
605                 f->data_hash_table[h].head_hash_offset = htole64(offset);
606         } else {
607                 /* Move back to the previous data object, to patch in
608                  * pointer */
609
610                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
611                 if (r < 0)
612                         return r;
613
614                 o->data.next_hash_offset = htole64(offset);
615         }
616
617         f->data_hash_table[h].tail_hash_offset = htole64(offset);
618
619         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
620                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
621
622         return 0;
623 }
624
625 int journal_file_find_data_object_with_hash(
626                 JournalFile *f,
627                 const void *data, uint64_t size, uint64_t hash,
628                 Object **ret, uint64_t *offset) {
629
630         uint64_t p, osize, h;
631         int r;
632
633         assert(f);
634         assert(data || size == 0);
635
636         osize = offsetof(Object, data.payload) + size;
637
638         if (f->header->data_hash_table_size == 0)
639                 return -EBADMSG;
640
641         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
642         p = le64toh(f->data_hash_table[h].head_hash_offset);
643
644         while (p > 0) {
645                 Object *o;
646
647                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
648                 if (r < 0)
649                         return r;
650
651                 if (le64toh(o->data.hash) != hash)
652                         goto next;
653
654                 if (o->object.flags & OBJECT_COMPRESSED) {
655 #ifdef HAVE_XZ
656                         uint64_t l, rsize;
657
658                         l = le64toh(o->object.size);
659                         if (l <= offsetof(Object, data.payload))
660                                 return -EBADMSG;
661
662                         l -= offsetof(Object, data.payload);
663
664                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
665                                 return -EBADMSG;
666
667                         if (rsize == size &&
668                             memcmp(f->compress_buffer, data, size) == 0) {
669
670                                 if (ret)
671                                         *ret = o;
672
673                                 if (offset)
674                                         *offset = p;
675
676                                 return 1;
677                         }
678 #else
679                         return -EPROTONOSUPPORT;
680 #endif
681
682                 } else if (le64toh(o->object.size) == osize &&
683                            memcmp(o->data.payload, data, size) == 0) {
684
685                         if (ret)
686                                 *ret = o;
687
688                         if (offset)
689                                 *offset = p;
690
691                         return 1;
692                 }
693
694         next:
695                 p = le64toh(o->data.next_hash_offset);
696         }
697
698         return 0;
699 }
700
701 int journal_file_find_data_object(
702                 JournalFile *f,
703                 const void *data, uint64_t size,
704                 Object **ret, uint64_t *offset) {
705
706         uint64_t hash;
707
708         assert(f);
709         assert(data || size == 0);
710
711         hash = hash64(data, size);
712
713         return journal_file_find_data_object_with_hash(f,
714                                                        data, size, hash,
715                                                        ret, offset);
716 }
717
718 static int journal_file_append_data(
719                 JournalFile *f,
720                 const void *data, uint64_t size,
721                 Object **ret, uint64_t *offset) {
722
723         uint64_t hash, p;
724         uint64_t osize;
725         Object *o;
726         int r;
727         bool compressed = false;
728
729         assert(f);
730         assert(data || size == 0);
731
732         hash = hash64(data, size);
733
734         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
735         if (r < 0)
736                 return r;
737         else if (r > 0) {
738
739                 if (ret)
740                         *ret = o;
741
742                 if (offset)
743                         *offset = p;
744
745                 return 0;
746         }
747
748         osize = offsetof(Object, data.payload) + size;
749         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
750         if (r < 0)
751                 return r;
752
753         o->data.hash = htole64(hash);
754
755 #ifdef HAVE_XZ
756         if (f->compress &&
757             size >= COMPRESSION_SIZE_THRESHOLD) {
758                 uint64_t rsize;
759
760                 compressed = compress_blob(data, size, o->data.payload, &rsize);
761
762                 if (compressed) {
763                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
764                         o->object.flags |= OBJECT_COMPRESSED;
765
766                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
767                 }
768         }
769 #endif
770
771         if (!compressed && size > 0)
772                 memcpy(o->data.payload, data, size);
773
774         r = journal_file_link_data(f, o, p, hash);
775         if (r < 0)
776                 return r;
777
778         /* The linking might have altered the window, so let's
779          * refresh our pointer */
780         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
781         if (r < 0)
782                 return r;
783
784 #ifdef HAVE_GCRYPT
785         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
786         if (r < 0)
787                 return r;
788 #endif
789
790         if (ret)
791                 *ret = o;
792
793         if (offset)
794                 *offset = p;
795
796         return 0;
797 }
798
799 uint64_t journal_file_entry_n_items(Object *o) {
800         assert(o);
801         assert(o->object.type == OBJECT_ENTRY);
802
803         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
804 }
805
806 uint64_t journal_file_entry_array_n_items(Object *o) {
807         assert(o);
808         assert(o->object.type == OBJECT_ENTRY_ARRAY);
809
810         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
811 }
812
813 uint64_t journal_file_hash_table_n_items(Object *o) {
814         assert(o);
815         assert(o->object.type == OBJECT_DATA_HASH_TABLE ||
816                o->object.type == OBJECT_FIELD_HASH_TABLE);
817
818         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
819 }
820
821 static int link_entry_into_array(JournalFile *f,
822                                  le64_t *first,
823                                  le64_t *idx,
824                                  uint64_t p) {
825         int r;
826         uint64_t n = 0, ap = 0, q, i, a, hidx;
827         Object *o;
828
829         assert(f);
830         assert(first);
831         assert(idx);
832         assert(p > 0);
833
834         a = le64toh(*first);
835         i = hidx = le64toh(*idx);
836         while (a > 0) {
837
838                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
839                 if (r < 0)
840                         return r;
841
842                 n = journal_file_entry_array_n_items(o);
843                 if (i < n) {
844                         o->entry_array.items[i] = htole64(p);
845                         *idx = htole64(hidx + 1);
846                         return 0;
847                 }
848
849                 i -= n;
850                 ap = a;
851                 a = le64toh(o->entry_array.next_entry_array_offset);
852         }
853
854         if (hidx > n)
855                 n = (hidx+1) * 2;
856         else
857                 n = n * 2;
858
859         if (n < 4)
860                 n = 4;
861
862         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
863                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
864                                        &o, &q);
865         if (r < 0)
866                 return r;
867
868 #ifdef HAVE_GCRYPT
869         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
870         if (r < 0)
871                 return r;
872 #endif
873
874         o->entry_array.items[i] = htole64(p);
875
876         if (ap == 0)
877                 *first = htole64(q);
878         else {
879                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
880                 if (r < 0)
881                         return r;
882
883                 o->entry_array.next_entry_array_offset = htole64(q);
884         }
885
886         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
887                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
888
889         *idx = htole64(hidx + 1);
890
891         return 0;
892 }
893
894 static int link_entry_into_array_plus_one(JournalFile *f,
895                                           le64_t *extra,
896                                           le64_t *first,
897                                           le64_t *idx,
898                                           uint64_t p) {
899
900         int r;
901
902         assert(f);
903         assert(extra);
904         assert(first);
905         assert(idx);
906         assert(p > 0);
907
908         if (*idx == 0)
909                 *extra = htole64(p);
910         else {
911                 le64_t i;
912
913                 i = htole64(le64toh(*idx) - 1);
914                 r = link_entry_into_array(f, first, &i, p);
915                 if (r < 0)
916                         return r;
917         }
918
919         *idx = htole64(le64toh(*idx) + 1);
920         return 0;
921 }
922
923 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
924         uint64_t p;
925         int r;
926         assert(f);
927         assert(o);
928         assert(offset > 0);
929
930         p = le64toh(o->entry.items[i].object_offset);
931         if (p == 0)
932                 return -EINVAL;
933
934         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
935         if (r < 0)
936                 return r;
937
938         return link_entry_into_array_plus_one(f,
939                                               &o->data.entry_offset,
940                                               &o->data.entry_array_offset,
941                                               &o->data.n_entries,
942                                               offset);
943 }
944
945 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
946         uint64_t n, i;
947         int r;
948
949         assert(f);
950         assert(o);
951         assert(offset > 0);
952         assert(o->object.type == OBJECT_ENTRY);
953
954         __sync_synchronize();
955
956         /* Link up the entry itself */
957         r = link_entry_into_array(f,
958                                   &f->header->entry_array_offset,
959                                   &f->header->n_entries,
960                                   offset);
961         if (r < 0)
962                 return r;
963
964         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
965
966         if (f->header->head_entry_realtime == 0)
967                 f->header->head_entry_realtime = o->entry.realtime;
968
969         f->header->tail_entry_realtime = o->entry.realtime;
970         f->header->tail_entry_monotonic = o->entry.monotonic;
971
972         f->tail_entry_monotonic_valid = true;
973
974         /* Link up the items */
975         n = journal_file_entry_n_items(o);
976         for (i = 0; i < n; i++) {
977                 r = journal_file_link_entry_item(f, o, offset, i);
978                 if (r < 0)
979                         return r;
980         }
981
982         return 0;
983 }
984
985 static int journal_file_append_entry_internal(
986                 JournalFile *f,
987                 const dual_timestamp *ts,
988                 uint64_t xor_hash,
989                 const EntryItem items[], unsigned n_items,
990                 uint64_t *seqnum,
991                 Object **ret, uint64_t *offset) {
992         uint64_t np;
993         uint64_t osize;
994         Object *o;
995         int r;
996
997         assert(f);
998         assert(items || n_items == 0);
999         assert(ts);
1000
1001         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1002
1003         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1004         if (r < 0)
1005                 return r;
1006
1007         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1008         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1009         o->entry.realtime = htole64(ts->realtime);
1010         o->entry.monotonic = htole64(ts->monotonic);
1011         o->entry.xor_hash = htole64(xor_hash);
1012         o->entry.boot_id = f->header->boot_id;
1013
1014 #ifdef HAVE_GCRYPT
1015         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1016         if (r < 0)
1017                 return r;
1018 #endif
1019
1020         r = journal_file_link_entry(f, o, np);
1021         if (r < 0)
1022                 return r;
1023
1024         if (ret)
1025                 *ret = o;
1026
1027         if (offset)
1028                 *offset = np;
1029
1030         return 0;
1031 }
1032
1033 void journal_file_post_change(JournalFile *f) {
1034         assert(f);
1035
1036         /* inotify() does not receive IN_MODIFY events from file
1037          * accesses done via mmap(). After each access we hence
1038          * trigger IN_MODIFY by truncating the journal file to its
1039          * current size which triggers IN_MODIFY. */
1040
1041         __sync_synchronize();
1042
1043         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1044                 log_error("Failed to to truncate file to its own size: %m");
1045 }
1046
1047 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1048         unsigned i;
1049         EntryItem *items;
1050         int r;
1051         uint64_t xor_hash = 0;
1052         struct dual_timestamp _ts;
1053
1054         assert(f);
1055         assert(iovec || n_iovec == 0);
1056
1057         if (!f->writable)
1058                 return -EPERM;
1059
1060         if (!ts) {
1061                 dual_timestamp_get(&_ts);
1062                 ts = &_ts;
1063         }
1064
1065         if (f->tail_entry_monotonic_valid &&
1066             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1067                 return -EINVAL;
1068
1069 #ifdef HAVE_GCRYPT
1070         r = journal_file_maybe_append_tag(f, ts->realtime);
1071         if (r < 0)
1072                 return r;
1073 #endif
1074
1075         /* alloca() can't take 0, hence let's allocate at least one */
1076         items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1077
1078         for (i = 0; i < n_iovec; i++) {
1079                 uint64_t p;
1080                 Object *o;
1081
1082                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1083                 if (r < 0)
1084                         return r;
1085
1086                 xor_hash ^= le64toh(o->data.hash);
1087                 items[i].object_offset = htole64(p);
1088                 items[i].hash = o->data.hash;
1089         }
1090
1091         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1092
1093         journal_file_post_change(f);
1094
1095         return r;
1096 }
1097
1098 static int generic_array_get(JournalFile *f,
1099                              uint64_t first,
1100                              uint64_t i,
1101                              Object **ret, uint64_t *offset) {
1102
1103         Object *o;
1104         uint64_t p = 0, a;
1105         int r;
1106
1107         assert(f);
1108
1109         a = first;
1110         while (a > 0) {
1111                 uint64_t n;
1112
1113                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1114                 if (r < 0)
1115                         return r;
1116
1117                 n = journal_file_entry_array_n_items(o);
1118                 if (i < n) {
1119                         p = le64toh(o->entry_array.items[i]);
1120                         break;
1121                 }
1122
1123                 i -= n;
1124                 a = le64toh(o->entry_array.next_entry_array_offset);
1125         }
1126
1127         if (a <= 0 || p <= 0)
1128                 return 0;
1129
1130         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1131         if (r < 0)
1132                 return r;
1133
1134         if (ret)
1135                 *ret = o;
1136
1137         if (offset)
1138                 *offset = p;
1139
1140         return 1;
1141 }
1142
1143 static int generic_array_get_plus_one(JournalFile *f,
1144                                       uint64_t extra,
1145                                       uint64_t first,
1146                                       uint64_t i,
1147                                       Object **ret, uint64_t *offset) {
1148
1149         Object *o;
1150
1151         assert(f);
1152
1153         if (i == 0) {
1154                 int r;
1155
1156                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1157                 if (r < 0)
1158                         return r;
1159
1160                 if (ret)
1161                         *ret = o;
1162
1163                 if (offset)
1164                         *offset = extra;
1165
1166                 return 1;
1167         }
1168
1169         return generic_array_get(f, first, i-1, ret, offset);
1170 }
1171
/* Verdicts returned by the test_object callbacks used for bisection */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object sorts before the needle */
        TEST_RIGHT = 2  /* the tested object sorts after the needle */
};
1177
1178 static int generic_array_bisect(JournalFile *f,
1179                                 uint64_t first,
1180                                 uint64_t n,
1181                                 uint64_t needle,
1182                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1183                                 direction_t direction,
1184                                 Object **ret,
1185                                 uint64_t *offset,
1186                                 uint64_t *idx) {
1187
1188         uint64_t a, p, t = 0, i = 0, last_p = 0;
1189         bool subtract_one = false;
1190         Object *o, *array = NULL;
1191         int r;
1192
1193         assert(f);
1194         assert(test_object);
1195
1196         a = first;
1197         while (a > 0) {
1198                 uint64_t left, right, k, lp;
1199
1200                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1201                 if (r < 0)
1202                         return r;
1203
1204                 k = journal_file_entry_array_n_items(array);
1205                 right = MIN(k, n);
1206                 if (right <= 0)
1207                         return 0;
1208
1209                 i = right - 1;
1210                 lp = p = le64toh(array->entry_array.items[i]);
1211                 if (p <= 0)
1212                         return -EBADMSG;
1213
1214                 r = test_object(f, p, needle);
1215                 if (r < 0)
1216                         return r;
1217
1218                 if (r == TEST_FOUND)
1219                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1220
1221                 if (r == TEST_RIGHT) {
1222                         left = 0;
1223                         right -= 1;
1224                         for (;;) {
1225                                 if (left == right) {
1226                                         if (direction == DIRECTION_UP)
1227                                                 subtract_one = true;
1228
1229                                         i = left;
1230                                         goto found;
1231                                 }
1232
1233                                 assert(left < right);
1234
1235                                 i = (left + right) / 2;
1236                                 p = le64toh(array->entry_array.items[i]);
1237                                 if (p <= 0)
1238                                         return -EBADMSG;
1239
1240                                 r = test_object(f, p, needle);
1241                                 if (r < 0)
1242                                         return r;
1243
1244                                 if (r == TEST_FOUND)
1245                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1246
1247                                 if (r == TEST_RIGHT)
1248                                         right = i;
1249                                 else
1250                                         left = i + 1;
1251                         }
1252                 }
1253
1254                 if (k > n) {
1255                         if (direction == DIRECTION_UP) {
1256                                 i = n;
1257                                 subtract_one = true;
1258                                 goto found;
1259                         }
1260
1261                         return 0;
1262                 }
1263
1264                 last_p = lp;
1265
1266                 n -= k;
1267                 t += k;
1268                 a = le64toh(array->entry_array.next_entry_array_offset);
1269         }
1270
1271         return 0;
1272
1273 found:
1274         if (subtract_one && t == 0 && i == 0)
1275                 return 0;
1276
1277         if (subtract_one && i == 0)
1278                 p = last_p;
1279         else if (subtract_one)
1280                 p = le64toh(array->entry_array.items[i-1]);
1281         else
1282                 p = le64toh(array->entry_array.items[i]);
1283
1284         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1285         if (r < 0)
1286                 return r;
1287
1288         if (ret)
1289                 *ret = o;
1290
1291         if (offset)
1292                 *offset = p;
1293
1294         if (idx)
1295                 *idx = t + i + (subtract_one ? -1 : 0);
1296
1297         return 1;
1298 }
1299
1300 static int generic_array_bisect_plus_one(JournalFile *f,
1301                                          uint64_t extra,
1302                                          uint64_t first,
1303                                          uint64_t n,
1304                                          uint64_t needle,
1305                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1306                                          direction_t direction,
1307                                          Object **ret,
1308                                          uint64_t *offset,
1309                                          uint64_t *idx) {
1310
1311         int r;
1312         bool step_back = false;
1313         Object *o;
1314
1315         assert(f);
1316         assert(test_object);
1317
1318         if (n <= 0)
1319                 return 0;
1320
1321         /* This bisects the array in object 'first', but first checks
1322          * an extra  */
1323         r = test_object(f, extra, needle);
1324         if (r < 0)
1325                 return r;
1326
1327         if (r == TEST_FOUND)
1328                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1329
1330         /* if we are looking with DIRECTION_UP then we need to first
1331            see if in the actual array there is a matching entry, and
1332            return the last one of that. But if there isn't any we need
1333            to return this one. Hence remember this, and return it
1334            below. */
1335         if (r == TEST_LEFT)
1336                 step_back = direction == DIRECTION_UP;
1337
1338         if (r == TEST_RIGHT) {
1339                 if (direction == DIRECTION_DOWN)
1340                         goto found;
1341                 else
1342                         return 0;
1343         }
1344
1345         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1346
1347         if (r == 0 && step_back)
1348                 goto found;
1349
1350         if (r > 0 && idx)
1351                 (*idx) ++;
1352
1353         return r;
1354
1355 found:
1356         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1357         if (r < 0)
1358                 return r;
1359
1360         if (ret)
1361                 *ret = o;
1362
1363         if (offset)
1364                 *offset = extra;
1365
1366         if (idx)
1367                 *idx = 0;
1368
1369         return 1;
1370 }
1371
1372 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1373         assert(f);
1374         assert(p > 0);
1375
1376         if (p == needle)
1377                 return TEST_FOUND;
1378         else if (p < needle)
1379                 return TEST_LEFT;
1380         else
1381                 return TEST_RIGHT;
1382 }
1383
1384 int journal_file_move_to_entry_by_offset(
1385                 JournalFile *f,
1386                 uint64_t p,
1387                 direction_t direction,
1388                 Object **ret,
1389                 uint64_t *offset) {
1390
1391         return generic_array_bisect(f,
1392                                     le64toh(f->header->entry_array_offset),
1393                                     le64toh(f->header->n_entries),
1394                                     p,
1395                                     test_object_offset,
1396                                     direction,
1397                                     ret, offset, NULL);
1398 }
1399
1400
1401 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1402         Object *o;
1403         int r;
1404
1405         assert(f);
1406         assert(p > 0);
1407
1408         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1409         if (r < 0)
1410                 return r;
1411
1412         if (le64toh(o->entry.seqnum) == needle)
1413                 return TEST_FOUND;
1414         else if (le64toh(o->entry.seqnum) < needle)
1415                 return TEST_LEFT;
1416         else
1417                 return TEST_RIGHT;
1418 }
1419
1420 int journal_file_move_to_entry_by_seqnum(
1421                 JournalFile *f,
1422                 uint64_t seqnum,
1423                 direction_t direction,
1424                 Object **ret,
1425                 uint64_t *offset) {
1426
1427         return generic_array_bisect(f,
1428                                     le64toh(f->header->entry_array_offset),
1429                                     le64toh(f->header->n_entries),
1430                                     seqnum,
1431                                     test_object_seqnum,
1432                                     direction,
1433                                     ret, offset, NULL);
1434 }
1435
1436 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1437         Object *o;
1438         int r;
1439
1440         assert(f);
1441         assert(p > 0);
1442
1443         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1444         if (r < 0)
1445                 return r;
1446
1447         if (le64toh(o->entry.realtime) == needle)
1448                 return TEST_FOUND;
1449         else if (le64toh(o->entry.realtime) < needle)
1450                 return TEST_LEFT;
1451         else
1452                 return TEST_RIGHT;
1453 }
1454
1455 int journal_file_move_to_entry_by_realtime(
1456                 JournalFile *f,
1457                 uint64_t realtime,
1458                 direction_t direction,
1459                 Object **ret,
1460                 uint64_t *offset) {
1461
1462         return generic_array_bisect(f,
1463                                     le64toh(f->header->entry_array_offset),
1464                                     le64toh(f->header->n_entries),
1465                                     realtime,
1466                                     test_object_realtime,
1467                                     direction,
1468                                     ret, offset, NULL);
1469 }
1470
1471 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1472         Object *o;
1473         int r;
1474
1475         assert(f);
1476         assert(p > 0);
1477
1478         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1479         if (r < 0)
1480                 return r;
1481
1482         if (le64toh(o->entry.monotonic) == needle)
1483                 return TEST_FOUND;
1484         else if (le64toh(o->entry.monotonic) < needle)
1485                 return TEST_LEFT;
1486         else
1487                 return TEST_RIGHT;
1488 }
1489
1490 int journal_file_move_to_entry_by_monotonic(
1491                 JournalFile *f,
1492                 sd_id128_t boot_id,
1493                 uint64_t monotonic,
1494                 direction_t direction,
1495                 Object **ret,
1496                 uint64_t *offset) {
1497
1498         char t[9+32+1] = "_BOOT_ID=";
1499         Object *o;
1500         int r;
1501
1502         assert(f);
1503
1504         sd_id128_to_string(boot_id, t + 9);
1505         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1506         if (r < 0)
1507                 return r;
1508         if (r == 0)
1509                 return -ENOENT;
1510
1511         return generic_array_bisect_plus_one(f,
1512                                              le64toh(o->data.entry_offset),
1513                                              le64toh(o->data.entry_array_offset),
1514                                              le64toh(o->data.n_entries),
1515                                              monotonic,
1516                                              test_object_monotonic,
1517                                              direction,
1518                                              ret, offset, NULL);
1519 }
1520
1521 int journal_file_next_entry(
1522                 JournalFile *f,
1523                 Object *o, uint64_t p,
1524                 direction_t direction,
1525                 Object **ret, uint64_t *offset) {
1526
1527         uint64_t i, n;
1528         int r;
1529
1530         assert(f);
1531         assert(p > 0 || !o);
1532
1533         n = le64toh(f->header->n_entries);
1534         if (n <= 0)
1535                 return 0;
1536
1537         if (!o)
1538                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1539         else {
1540                 if (o->object.type != OBJECT_ENTRY)
1541                         return -EINVAL;
1542
1543                 r = generic_array_bisect(f,
1544                                          le64toh(f->header->entry_array_offset),
1545                                          le64toh(f->header->n_entries),
1546                                          p,
1547                                          test_object_offset,
1548                                          DIRECTION_DOWN,
1549                                          NULL, NULL,
1550                                          &i);
1551                 if (r <= 0)
1552                         return r;
1553
1554                 if (direction == DIRECTION_DOWN) {
1555                         if (i >= n - 1)
1556                                 return 0;
1557
1558                         i++;
1559                 } else {
1560                         if (i <= 0)
1561                                 return 0;
1562
1563                         i--;
1564                 }
1565         }
1566
1567         /* And jump to it */
1568         return generic_array_get(f,
1569                                  le64toh(f->header->entry_array_offset),
1570                                  i,
1571                                  ret, offset);
1572 }
1573
1574 int journal_file_skip_entry(
1575                 JournalFile *f,
1576                 Object *o, uint64_t p,
1577                 int64_t skip,
1578                 Object **ret, uint64_t *offset) {
1579
1580         uint64_t i, n;
1581         int r;
1582
1583         assert(f);
1584         assert(o);
1585         assert(p > 0);
1586
1587         if (o->object.type != OBJECT_ENTRY)
1588                 return -EINVAL;
1589
1590         r = generic_array_bisect(f,
1591                                  le64toh(f->header->entry_array_offset),
1592                                  le64toh(f->header->n_entries),
1593                                  p,
1594                                  test_object_offset,
1595                                  DIRECTION_DOWN,
1596                                  NULL, NULL,
1597                                  &i);
1598         if (r <= 0)
1599                 return r;
1600
1601         /* Calculate new index */
1602         if (skip < 0) {
1603                 if ((uint64_t) -skip >= i)
1604                         i = 0;
1605                 else
1606                         i = i - (uint64_t) -skip;
1607         } else
1608                 i  += (uint64_t) skip;
1609
1610         n = le64toh(f->header->n_entries);
1611         if (n <= 0)
1612                 return -EBADMSG;
1613
1614         if (i >= n)
1615                 i = n-1;
1616
1617         return generic_array_get(f,
1618                                  le64toh(f->header->entry_array_offset),
1619                                  i,
1620                                  ret, offset);
1621 }
1622
1623 int journal_file_next_entry_for_data(
1624                 JournalFile *f,
1625                 Object *o, uint64_t p,
1626                 uint64_t data_offset,
1627                 direction_t direction,
1628                 Object **ret, uint64_t *offset) {
1629
1630         uint64_t n, i;
1631         int r;
1632         Object *d;
1633
1634         assert(f);
1635         assert(p > 0 || !o);
1636
1637         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1638         if (r < 0)
1639                 return r;
1640
1641         n = le64toh(d->data.n_entries);
1642         if (n <= 0)
1643                 return n;
1644
1645         if (!o)
1646                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1647         else {
1648                 if (o->object.type != OBJECT_ENTRY)
1649                         return -EINVAL;
1650
1651                 r = generic_array_bisect_plus_one(f,
1652                                                   le64toh(d->data.entry_offset),
1653                                                   le64toh(d->data.entry_array_offset),
1654                                                   le64toh(d->data.n_entries),
1655                                                   p,
1656                                                   test_object_offset,
1657                                                   DIRECTION_DOWN,
1658                                                   NULL, NULL,
1659                                                   &i);
1660
1661                 if (r <= 0)
1662                         return r;
1663
1664                 if (direction == DIRECTION_DOWN) {
1665                         if (i >= n - 1)
1666                                 return 0;
1667
1668                         i++;
1669                 } else {
1670                         if (i <= 0)
1671                                 return 0;
1672
1673                         i--;
1674                 }
1675
1676         }
1677
1678         return generic_array_get_plus_one(f,
1679                                           le64toh(d->data.entry_offset),
1680                                           le64toh(d->data.entry_array_offset),
1681                                           i,
1682                                           ret, offset);
1683 }
1684
1685 int journal_file_move_to_entry_by_offset_for_data(
1686                 JournalFile *f,
1687                 uint64_t data_offset,
1688                 uint64_t p,
1689                 direction_t direction,
1690                 Object **ret, uint64_t *offset) {
1691
1692         int r;
1693         Object *d;
1694
1695         assert(f);
1696
1697         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1698         if (r < 0)
1699                 return r;
1700
1701         return generic_array_bisect_plus_one(f,
1702                                              le64toh(d->data.entry_offset),
1703                                              le64toh(d->data.entry_array_offset),
1704                                              le64toh(d->data.n_entries),
1705                                              p,
1706                                              test_object_offset,
1707                                              direction,
1708                                              ret, offset, NULL);
1709 }
1710
1711 int journal_file_move_to_entry_by_monotonic_for_data(
1712                 JournalFile *f,
1713                 uint64_t data_offset,
1714                 sd_id128_t boot_id,
1715                 uint64_t monotonic,
1716                 direction_t direction,
1717                 Object **ret, uint64_t *offset) {
1718
1719         char t[9+32+1] = "_BOOT_ID=";
1720         Object *o, *d;
1721         int r;
1722         uint64_t b, z;
1723
1724         assert(f);
1725
1726         /* First, seek by time */
1727         sd_id128_to_string(boot_id, t + 9);
1728         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1729         if (r < 0)
1730                 return r;
1731         if (r == 0)
1732                 return -ENOENT;
1733
1734         r = generic_array_bisect_plus_one(f,
1735                                           le64toh(o->data.entry_offset),
1736                                           le64toh(o->data.entry_array_offset),
1737                                           le64toh(o->data.n_entries),
1738                                           monotonic,
1739                                           test_object_monotonic,
1740                                           direction,
1741                                           NULL, &z, NULL);
1742         if (r <= 0)
1743                 return r;
1744
1745         /* And now, continue seeking until we find an entry that
1746          * exists in both bisection arrays */
1747
1748         for (;;) {
1749                 Object *qo;
1750                 uint64_t p, q;
1751
1752                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1753                 if (r < 0)
1754                         return r;
1755
1756                 r = generic_array_bisect_plus_one(f,
1757                                                   le64toh(d->data.entry_offset),
1758                                                   le64toh(d->data.entry_array_offset),
1759                                                   le64toh(d->data.n_entries),
1760                                                   z,
1761                                                   test_object_offset,
1762                                                   direction,
1763                                                   NULL, &p, NULL);
1764                 if (r <= 0)
1765                         return r;
1766
1767                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1768                 if (r < 0)
1769                         return r;
1770
1771                 r = generic_array_bisect_plus_one(f,
1772                                                   le64toh(o->data.entry_offset),
1773                                                   le64toh(o->data.entry_array_offset),
1774                                                   le64toh(o->data.n_entries),
1775                                                   p,
1776                                                   test_object_offset,
1777                                                   direction,
1778                                                   &qo, &q, NULL);
1779
1780                 if (r <= 0)
1781                         return r;
1782
1783                 if (p == q) {
1784                         if (ret)
1785                                 *ret = qo;
1786                         if (offset)
1787                                 *offset = q;
1788
1789                         return 1;
1790                 }
1791
1792                 z = q;
1793         }
1794
1795         return 0;
1796 }
1797
1798 int journal_file_move_to_entry_by_seqnum_for_data(
1799                 JournalFile *f,
1800                 uint64_t data_offset,
1801                 uint64_t seqnum,
1802                 direction_t direction,
1803                 Object **ret, uint64_t *offset) {
1804
1805         Object *d;
1806         int r;
1807
1808         assert(f);
1809
1810         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1811         if (r < 0)
1812                 return r;
1813
1814         return generic_array_bisect_plus_one(f,
1815                                              le64toh(d->data.entry_offset),
1816                                              le64toh(d->data.entry_array_offset),
1817                                              le64toh(d->data.n_entries),
1818                                              seqnum,
1819                                              test_object_seqnum,
1820                                              direction,
1821                                              ret, offset, NULL);
1822 }
1823
1824 int journal_file_move_to_entry_by_realtime_for_data(
1825                 JournalFile *f,
1826                 uint64_t data_offset,
1827                 uint64_t realtime,
1828                 direction_t direction,
1829                 Object **ret, uint64_t *offset) {
1830
1831         Object *d;
1832         int r;
1833
1834         assert(f);
1835
1836         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1837         if (r < 0)
1838                 return r;
1839
1840         return generic_array_bisect_plus_one(f,
1841                                              le64toh(d->data.entry_offset),
1842                                              le64toh(d->data.entry_array_offset),
1843                                              le64toh(d->data.n_entries),
1844                                              realtime,
1845                                              test_object_realtime,
1846                                              direction,
1847                                              ret, offset, NULL);
1848 }
1849
1850 void journal_file_dump(JournalFile *f) {
1851         Object *o;
1852         int r;
1853         uint64_t p;
1854
1855         assert(f);
1856
1857         journal_file_print_header(f);
1858
1859         p = le64toh(f->header->header_size);
1860         while (p != 0) {
1861                 r = journal_file_move_to_object(f, -1, p, &o);
1862                 if (r < 0)
1863                         goto fail;
1864
1865                 switch (o->object.type) {
1866
1867                 case OBJECT_UNUSED:
1868                         printf("Type: OBJECT_UNUSED\n");
1869                         break;
1870
1871                 case OBJECT_DATA:
1872                         printf("Type: OBJECT_DATA\n");
1873                         break;
1874
1875                 case OBJECT_ENTRY:
1876                         printf("Type: OBJECT_ENTRY seqnum=%llu monotonic=%llu realtime=%llu\n",
1877                                (unsigned long long) le64toh(o->entry.seqnum),
1878                                (unsigned long long) le64toh(o->entry.monotonic),
1879                                (unsigned long long) le64toh(o->entry.realtime));
1880                         break;
1881
1882                 case OBJECT_FIELD_HASH_TABLE:
1883                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1884                         break;
1885
1886                 case OBJECT_DATA_HASH_TABLE:
1887                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1888                         break;
1889
1890                 case OBJECT_ENTRY_ARRAY:
1891                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1892                         break;
1893
1894                 case OBJECT_TAG:
1895                         printf("Type: OBJECT_TAG seqnum=%llu epoch=%llu\n",
1896                                (unsigned long long) le64toh(o->tag.seqnum),
1897                                (unsigned long long) le64toh(o->tag.epoch));
1898                         break;
1899                 }
1900
1901                 if (o->object.flags & OBJECT_COMPRESSED)
1902                         printf("Flags: COMPRESSED\n");
1903
1904                 if (p == le64toh(f->header->tail_object_offset))
1905                         p = 0;
1906                 else
1907                         p = p + ALIGN64(le64toh(o->object.size));
1908         }
1909
1910         return;
1911 fail:
1912         log_error("File corrupt");
1913 }
1914
1915 void journal_file_print_header(JournalFile *f) {
1916         char a[33], b[33], c[33];
1917         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
1918         struct stat st;
1919         char bytes[FORMAT_BYTES_MAX];
1920
1921         assert(f);
1922
1923         printf("File Path: %s\n"
1924                "File ID: %s\n"
1925                "Machine ID: %s\n"
1926                "Boot ID: %s\n"
1927                "Sequential Number ID: %s\n"
1928                "State: %s\n"
1929                "Compatible Flags:%s%s\n"
1930                "Incompatible Flags:%s%s\n"
1931                "Header size: %llu\n"
1932                "Arena size: %llu\n"
1933                "Data Hash Table Size: %llu\n"
1934                "Field Hash Table Size: %llu\n"
1935                "Rotate Suggested: %s\n"
1936                "Head Sequential Number: %llu\n"
1937                "Tail Sequential Number: %llu\n"
1938                "Head Realtime Timestamp: %s\n"
1939                "Tail Realtime Timestamp: %s\n"
1940                "Objects: %llu\n"
1941                "Entry Objects: %llu\n",
1942                f->path,
1943                sd_id128_to_string(f->header->file_id, a),
1944                sd_id128_to_string(f->header->machine_id, b),
1945                sd_id128_to_string(f->header->boot_id, c),
1946                sd_id128_to_string(f->header->seqnum_id, c),
1947                f->header->state == STATE_OFFLINE ? "OFFLINE" :
1948                f->header->state == STATE_ONLINE ? "ONLINE" :
1949                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
1950                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
1951                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
1952                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
1953                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
1954                (unsigned long long) le64toh(f->header->header_size),
1955                (unsigned long long) le64toh(f->header->arena_size),
1956                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1957                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
1958                yes_no(journal_file_rotate_suggested(f)),
1959                (unsigned long long) le64toh(f->header->head_entry_seqnum),
1960                (unsigned long long) le64toh(f->header->tail_entry_seqnum),
1961                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
1962                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
1963                (unsigned long long) le64toh(f->header->n_objects),
1964                (unsigned long long) le64toh(f->header->n_entries));
1965
1966         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1967                 printf("Data Objects: %llu\n"
1968                        "Data Hash Table Fill: %.1f%%\n",
1969                        (unsigned long long) le64toh(f->header->n_data),
1970                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
1971
1972         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1973                 printf("Field Objects: %llu\n"
1974                        "Field Hash Table Fill: %.1f%%\n",
1975                        (unsigned long long) le64toh(f->header->n_fields),
1976                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
1977
1978         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
1979                 printf("Tag Objects: %llu\n",
1980                        (unsigned long long) le64toh(f->header->n_tags));
1981         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1982                 printf("Entry Array Objects: %llu\n",
1983                        (unsigned long long) le64toh(f->header->n_entry_arrays));
1984
1985         if (fstat(f->fd, &st) >= 0)
1986                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
1987 }
1988
1989 int journal_file_open(
1990                 const char *fname,
1991                 int flags,
1992                 mode_t mode,
1993                 bool compress,
1994                 bool seal,
1995                 JournalMetrics *metrics,
1996                 MMapCache *mmap_cache,
1997                 JournalFile *template,
1998                 JournalFile **ret) {
1999
2000         JournalFile *f;
2001         int r;
2002         bool newly_created = false;
2003
2004         assert(fname);
2005
2006         if ((flags & O_ACCMODE) != O_RDONLY &&
2007             (flags & O_ACCMODE) != O_RDWR)
2008                 return -EINVAL;
2009
2010         if (!endswith(fname, ".journal") &&
2011             !endswith(fname, ".journal~"))
2012                 return -EINVAL;
2013
2014         f = new0(JournalFile, 1);
2015         if (!f)
2016                 return -ENOMEM;
2017
2018         f->fd = -1;
2019         f->mode = mode;
2020
2021         f->flags = flags;
2022         f->prot = prot_from_flags(flags);
2023         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2024 #ifdef HAVE_XZ
2025         f->compress = compress;
2026 #endif
2027 #ifdef HAVE_GCRYPT
2028         f->seal = seal;
2029 #endif
2030
2031         if (mmap_cache)
2032                 f->mmap = mmap_cache_ref(mmap_cache);
2033         else {
2034                 f->mmap = mmap_cache_new();
2035                 if (!f->mmap) {
2036                         r = -ENOMEM;
2037                         goto fail;
2038                 }
2039         }
2040
2041         f->path = strdup(fname);
2042         if (!f->path) {
2043                 r = -ENOMEM;
2044                 goto fail;
2045         }
2046
2047         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2048         if (f->fd < 0) {
2049                 r = -errno;
2050                 goto fail;
2051         }
2052
2053         if (fstat(f->fd, &f->last_stat) < 0) {
2054                 r = -errno;
2055                 goto fail;
2056         }
2057
2058         if (f->last_stat.st_size == 0 && f->writable) {
2059                 newly_created = true;
2060
2061 #ifdef HAVE_GCRYPT
2062                 /* Try to load the FSPRG state, and if we can't, then
2063                  * just don't do sealing */
2064                 if (f->seal) {
2065                         r = journal_file_fss_load(f);
2066                         if (r < 0)
2067                                 f->seal = false;
2068                 }
2069 #endif
2070
2071                 r = journal_file_init_header(f, template);
2072                 if (r < 0)
2073                         goto fail;
2074
2075                 if (fstat(f->fd, &f->last_stat) < 0) {
2076                         r = -errno;
2077                         goto fail;
2078                 }
2079         }
2080
2081         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2082                 r = -EIO;
2083                 goto fail;
2084         }
2085
2086         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2087         if (f->header == MAP_FAILED) {
2088                 f->header = NULL;
2089                 r = -errno;
2090                 goto fail;
2091         }
2092
2093         if (!newly_created) {
2094                 r = journal_file_verify_header(f);
2095                 if (r < 0)
2096                         goto fail;
2097         }
2098
2099 #ifdef HAVE_GCRYPT
2100         if (!newly_created && f->writable) {
2101                 r = journal_file_fss_load(f);
2102                 if (r < 0)
2103                         goto fail;
2104         }
2105 #endif
2106
2107         if (f->writable) {
2108                 if (metrics) {
2109                         journal_default_metrics(metrics, f->fd);
2110                         f->metrics = *metrics;
2111                 } else if (template)
2112                         f->metrics = template->metrics;
2113
2114                 r = journal_file_refresh_header(f);
2115                 if (r < 0)
2116                         goto fail;
2117         }
2118
2119 #ifdef HAVE_GCRYPT
2120         r = journal_file_hmac_setup(f);
2121         if (r < 0)
2122                 goto fail;
2123 #endif
2124
2125         if (newly_created) {
2126                 r = journal_file_setup_field_hash_table(f);
2127                 if (r < 0)
2128                         goto fail;
2129
2130                 r = journal_file_setup_data_hash_table(f);
2131                 if (r < 0)
2132                         goto fail;
2133
2134 #ifdef HAVE_GCRYPT
2135                 r = journal_file_append_first_tag(f);
2136                 if (r < 0)
2137                         goto fail;
2138 #endif
2139         }
2140
2141         r = journal_file_map_field_hash_table(f);
2142         if (r < 0)
2143                 goto fail;
2144
2145         r = journal_file_map_data_hash_table(f);
2146         if (r < 0)
2147                 goto fail;
2148
2149         if (ret)
2150                 *ret = f;
2151
2152         return 0;
2153
2154 fail:
2155         journal_file_close(f);
2156
2157         return r;
2158 }
2159
2160 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2161         char *p;
2162         size_t l;
2163         JournalFile *old_file, *new_file = NULL;
2164         int r;
2165
2166         assert(f);
2167         assert(*f);
2168
2169         old_file = *f;
2170
2171         if (!old_file->writable)
2172                 return -EINVAL;
2173
2174         if (!endswith(old_file->path, ".journal"))
2175                 return -EINVAL;
2176
2177         l = strlen(old_file->path);
2178
2179         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2180         if (!p)
2181                 return -ENOMEM;
2182
2183         memcpy(p, old_file->path, l - 8);
2184         p[l-8] = '@';
2185         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2186         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2187                  "-%016llx-%016llx.journal",
2188                  (unsigned long long) le64toh((*f)->header->tail_entry_seqnum),
2189                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2190
2191         r = rename(old_file->path, p);
2192         free(p);
2193
2194         if (r < 0)
2195                 return -errno;
2196
2197         old_file->header->state = STATE_ARCHIVED;
2198
2199         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2200         journal_file_close(old_file);
2201
2202         *f = new_file;
2203         return r;
2204 }
2205
2206 int journal_file_open_reliably(
2207                 const char *fname,
2208                 int flags,
2209                 mode_t mode,
2210                 bool compress,
2211                 bool seal,
2212                 JournalMetrics *metrics,
2213                 MMapCache *mmap_cache,
2214                 JournalFile *template,
2215                 JournalFile **ret) {
2216
2217         int r;
2218         size_t l;
2219         char *p;
2220
2221         r = journal_file_open(fname, flags, mode, compress, seal,
2222                               metrics, mmap_cache, template, ret);
2223         if (r != -EBADMSG && /* corrupted */
2224             r != -ENODATA && /* truncated */
2225             r != -EHOSTDOWN && /* other machine */
2226             r != -EPROTONOSUPPORT && /* incompatible feature */
2227             r != -EBUSY && /* unclean shutdown */
2228             r != -ESHUTDOWN /* already archived */)
2229                 return r;
2230
2231         if ((flags & O_ACCMODE) == O_RDONLY)
2232                 return r;
2233
2234         if (!(flags & O_CREAT))
2235                 return r;
2236
2237         if (!endswith(fname, ".journal"))
2238                 return r;
2239
2240         /* The file is corrupted. Rotate it away and try it again (but only once) */
2241
2242         l = strlen(fname);
2243         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2244                      (int) (l-8), fname,
2245                      (unsigned long long) now(CLOCK_REALTIME),
2246                      random_ull()) < 0)
2247                 return -ENOMEM;
2248
2249         r = rename(fname, p);
2250         free(p);
2251         if (r < 0)
2252                 return -errno;
2253
2254         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2255
2256         return journal_file_open(fname, flags, mode, compress, seal,
2257                                  metrics, mmap_cache, template, ret);
2258 }
2259
2260
2261 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2262         uint64_t i, n;
2263         uint64_t q, xor_hash = 0;
2264         int r;
2265         EntryItem *items;
2266         dual_timestamp ts;
2267
2268         assert(from);
2269         assert(to);
2270         assert(o);
2271         assert(p);
2272
2273         if (!to->writable)
2274                 return -EPERM;
2275
2276         ts.monotonic = le64toh(o->entry.monotonic);
2277         ts.realtime = le64toh(o->entry.realtime);
2278
2279         if (to->tail_entry_monotonic_valid &&
2280             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2281                 return -EINVAL;
2282
2283         n = journal_file_entry_n_items(o);
2284         items = alloca(sizeof(EntryItem) * n);
2285
2286         for (i = 0; i < n; i++) {
2287                 uint64_t l, h;
2288                 le64_t le_hash;
2289                 size_t t;
2290                 void *data;
2291                 Object *u;
2292
2293                 q = le64toh(o->entry.items[i].object_offset);
2294                 le_hash = o->entry.items[i].hash;
2295
2296                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2297                 if (r < 0)
2298                         return r;
2299
2300                 if (le_hash != o->data.hash)
2301                         return -EBADMSG;
2302
2303                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2304                 t = (size_t) l;
2305
2306                 /* We hit the limit on 32bit machines */
2307                 if ((uint64_t) t != l)
2308                         return -E2BIG;
2309
2310                 if (o->object.flags & OBJECT_COMPRESSED) {
2311 #ifdef HAVE_XZ
2312                         uint64_t rsize;
2313
2314                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2315                                 return -EBADMSG;
2316
2317                         data = from->compress_buffer;
2318                         l = rsize;
2319 #else
2320                         return -EPROTONOSUPPORT;
2321 #endif
2322                 } else
2323                         data = o->data.payload;
2324
2325                 r = journal_file_append_data(to, data, l, &u, &h);
2326                 if (r < 0)
2327                         return r;
2328
2329                 xor_hash ^= le64toh(u->data.hash);
2330                 items[i].object_offset = htole64(h);
2331                 items[i].hash = u->data.hash;
2332
2333                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2334                 if (r < 0)
2335                         return r;
2336         }
2337
2338         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2339 }
2340
2341 void journal_default_metrics(JournalMetrics *m, int fd) {
2342         uint64_t fs_size = 0;
2343         struct statvfs ss;
2344         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2345
2346         assert(m);
2347         assert(fd >= 0);
2348
2349         if (fstatvfs(fd, &ss) >= 0)
2350                 fs_size = ss.f_frsize * ss.f_blocks;
2351
2352         if (m->max_use == (uint64_t) -1) {
2353
2354                 if (fs_size > 0) {
2355                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2356
2357                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2358                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2359
2360                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2361                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2362                 } else
2363                         m->max_use = DEFAULT_MAX_USE_LOWER;
2364         } else {
2365                 m->max_use = PAGE_ALIGN(m->max_use);
2366
2367                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2368                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2369         }
2370
2371         if (m->max_size == (uint64_t) -1) {
2372                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2373
2374                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2375                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2376         } else
2377                 m->max_size = PAGE_ALIGN(m->max_size);
2378
2379         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2380                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2381
2382         if (m->max_size*2 > m->max_use)
2383                 m->max_use = m->max_size*2;
2384
2385         if (m->min_size == (uint64_t) -1)
2386                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2387         else {
2388                 m->min_size = PAGE_ALIGN(m->min_size);
2389
2390                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2391                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2392
2393                 if (m->min_size > m->max_size)
2394                         m->max_size = m->min_size;
2395         }
2396
2397         if (m->keep_free == (uint64_t) -1) {
2398
2399                 if (fs_size > 0) {
2400                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2401
2402                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2403                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2404
2405                 } else
2406                         m->keep_free = DEFAULT_KEEP_FREE;
2407         }
2408
2409         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2410                   format_bytes(a, sizeof(a), m->max_use),
2411                   format_bytes(b, sizeof(b), m->max_size),
2412                   format_bytes(c, sizeof(c), m->min_size),
2413                   format_bytes(d, sizeof(d), m->keep_free));
2414 }
2415
2416 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2417         assert(f);
2418         assert(from || to);
2419
2420         if (from) {
2421                 if (f->header->head_entry_realtime == 0)
2422                         return -ENOENT;
2423
2424                 *from = le64toh(f->header->head_entry_realtime);
2425         }
2426
2427         if (to) {
2428                 if (f->header->tail_entry_realtime == 0)
2429                         return -ENOENT;
2430
2431                 *to = le64toh(f->header->tail_entry_realtime);
2432         }
2433
2434         return 1;
2435 }
2436
2437 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2438         char t[9+32+1] = "_BOOT_ID=";
2439         Object *o;
2440         uint64_t p;
2441         int r;
2442
2443         assert(f);
2444         assert(from || to);
2445
2446         sd_id128_to_string(boot_id, t + 9);
2447
2448         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2449         if (r <= 0)
2450                 return r;
2451
2452         if (le64toh(o->data.n_entries) <= 0)
2453                 return 0;
2454
2455         if (from) {
2456                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2457                 if (r < 0)
2458                         return r;
2459
2460                 *from = le64toh(o->entry.monotonic);
2461         }
2462
2463         if (to) {
2464                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2465                 if (r < 0)
2466                         return r;
2467
2468                 r = generic_array_get_plus_one(f,
2469                                                le64toh(o->data.entry_offset),
2470                                                le64toh(o->data.entry_array_offset),
2471                                                le64toh(o->data.n_entries)-1,
2472                                                &o, NULL);
2473                 if (r <= 0)
2474                         return r;
2475
2476                 *to = le64toh(o->entry.monotonic);
2477         }
2478
2479         return 1;
2480 }
2481
2482 bool journal_file_rotate_suggested(JournalFile *f) {
2483         assert(f);
2484
2485         /* If we gained new header fields we gained new features,
2486          * hence suggest a rotation */
2487         if (le64toh(f->header->header_size) < sizeof(Header)) {
2488                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2489                 return true;
2490         }
2491
2492         /* Let's check if the hash tables grew over a certain fill
2493          * level (75%, borrowing this value from Java's hash table
2494          * implementation), and if so suggest a rotation. To calculate
2495          * the fill level we need the n_data field, which only exists
2496          * in newer versions. */
2497
2498         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2499                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2500                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2501                                   f->path,
2502                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2503                                   (unsigned long long) le64toh(f->header->n_data),
2504                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2505                                   (unsigned long long) (f->last_stat.st_size),
2506                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2507                         return true;
2508                 }
2509
2510         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2511                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2512                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2513                                   f->path,
2514                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2515                                   (unsigned long long) le64toh(f->header->n_fields),
2516                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2517                         return true;
2518                 }
2519
2520         return false;
2521 }