chiark / gitweb /
journald: detect invalid header pointers correctly
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
33 #include "lookup3.h"
34 #include "compress.h"
35 #include "fsprg.h"
36
/* Default sizes, in bytes, of the data and field hash tables */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads of at least this many bytes are candidates for compression */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
63
64 void journal_file_close(JournalFile *f) {
65         assert(f);
66
67 #ifdef HAVE_GCRYPT
68         /* Write the final tag */
69         if (f->seal && f->writable)
70                 journal_file_append_tag(f);
71 #endif
72
73         /* Sync everything to disk, before we mark the file offline */
74         if (f->mmap && f->fd >= 0)
75                 mmap_cache_close_fd(f->mmap, f->fd);
76
77         if (f->writable && f->fd >= 0)
78                 fdatasync(f->fd);
79
80         if (f->header) {
81                 /* Mark the file offline. Don't override the archived state if it already is set */
82                 if (f->writable && f->header->state == STATE_ONLINE)
83                         f->header->state = STATE_OFFLINE;
84
85                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
86         }
87
88         if (f->fd >= 0)
89                 close_nointr_nofail(f->fd);
90
91         free(f->path);
92
93         if (f->mmap)
94                 mmap_cache_unref(f->mmap);
95
96 #ifdef HAVE_XZ
97         free(f->compress_buffer);
98 #endif
99
100 #ifdef HAVE_GCRYPT
101         if (f->fss_file)
102                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
103         else if (f->fsprg_state)
104                 free(f->fsprg_state);
105
106         free(f->fsprg_seed);
107
108         if (f->hmac)
109                 gcry_md_close(f->hmac);
110 #endif
111
112         free(f);
113 }
114
115 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
116         Header h;
117         ssize_t k;
118         int r;
119
120         assert(f);
121
122         zero(h);
123         memcpy(h.signature, HEADER_SIGNATURE, 8);
124         h.header_size = htole64(ALIGN64(sizeof(h)));
125
126         h.incompatible_flags =
127                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
128
129         h.compatible_flags =
130                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
131
132         r = sd_id128_randomize(&h.file_id);
133         if (r < 0)
134                 return r;
135
136         if (template) {
137                 h.seqnum_id = template->header->seqnum_id;
138                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
139         } else
140                 h.seqnum_id = h.file_id;
141
142         k = pwrite(f->fd, &h, sizeof(h), 0);
143         if (k < 0)
144                 return -errno;
145
146         if (k != sizeof(h))
147                 return -EIO;
148
149         return 0;
150 }
151
152 static int journal_file_refresh_header(JournalFile *f) {
153         int r;
154         sd_id128_t boot_id;
155
156         assert(f);
157
158         r = sd_id128_get_machine(&f->header->machine_id);
159         if (r < 0)
160                 return r;
161
162         r = sd_id128_get_boot(&boot_id);
163         if (r < 0)
164                 return r;
165
166         if (sd_id128_equal(boot_id, f->header->boot_id))
167                 f->tail_entry_monotonic_valid = true;
168
169         f->header->boot_id = boot_id;
170
171         f->header->state = STATE_ONLINE;
172
173         /* Sync the online state to disk */
174         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
175         fdatasync(f->fd);
176
177         return 0;
178 }
179
180 static int journal_file_verify_header(JournalFile *f) {
181         assert(f);
182
183         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
184                 return -EBADMSG;
185
186         /* In both read and write mode we refuse to open files with
187          * incompatible flags we don't know */
188 #ifdef HAVE_XZ
189         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
190                 return -EPROTONOSUPPORT;
191 #else
192         if (f->header->incompatible_flags != 0)
193                 return -EPROTONOSUPPORT;
194 #endif
195
196         /* When open for writing we refuse to open files with
197          * compatible flags, too */
198         if (f->writable) {
199 #ifdef HAVE_GCRYPT
200                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
201                         return -EPROTONOSUPPORT;
202 #else
203                 if (f->header->compatible_flags != 0)
204                         return -EPROTONOSUPPORT;
205 #endif
206         }
207
208         if (f->header->state >= _STATE_MAX)
209                 return -EBADMSG;
210
211         /* The first addition was n_data, so check that we are at least this large */
212         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
213                 return -EBADMSG;
214
215         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
216                 return -EBADMSG;
217
218         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
219                 return -ENODATA;
220
221         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
222                 return -ENODATA;
223
224         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
225             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
226             !VALID64(le64toh(f->header->tail_object_offset)) ||
227             !VALID64(le64toh(f->header->entry_array_offset)))
228                 return -ENODATA;
229
230         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
231             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
232             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
233             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
234                 return -ENODATA;
235
236         if (f->writable) {
237                 uint8_t state;
238                 sd_id128_t machine_id;
239                 int r;
240
241                 r = sd_id128_get_machine(&machine_id);
242                 if (r < 0)
243                         return r;
244
245                 if (!sd_id128_equal(machine_id, f->header->machine_id))
246                         return -EHOSTDOWN;
247
248                 state = f->header->state;
249
250                 if (state == STATE_ONLINE) {
251                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
252                         return -EBUSY;
253                 } else if (state == STATE_ARCHIVED)
254                         return -ESHUTDOWN;
255                 else if (state != STATE_OFFLINE) {
256                         log_debug("Journal file %s has unknown state %u.", f->path, state);
257                         return -EBUSY;
258                 }
259         }
260
261         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
262
263         if (f->writable)
264                 f->seal = JOURNAL_HEADER_SEALED(f->header);
265
266         return 0;
267 }
268
269 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
270         uint64_t old_size, new_size;
271         int r;
272
273         assert(f);
274
275         /* We assume that this file is not sparse, and we know that
276          * for sure, since we always call posix_fallocate()
277          * ourselves */
278
279         old_size =
280                 le64toh(f->header->header_size) +
281                 le64toh(f->header->arena_size);
282
283         new_size = PAGE_ALIGN(offset + size);
284         if (new_size < le64toh(f->header->header_size))
285                 new_size = le64toh(f->header->header_size);
286
287         if (new_size <= old_size)
288                 return 0;
289
290         if (f->metrics.max_size > 0 &&
291             new_size > f->metrics.max_size)
292                 return -E2BIG;
293
294         if (new_size > f->metrics.min_size &&
295             f->metrics.keep_free > 0) {
296                 struct statvfs svfs;
297
298                 if (fstatvfs(f->fd, &svfs) >= 0) {
299                         uint64_t available;
300
301                         available = svfs.f_bfree * svfs.f_bsize;
302
303                         if (available >= f->metrics.keep_free)
304                                 available -= f->metrics.keep_free;
305                         else
306                                 available = 0;
307
308                         if (new_size - old_size > available)
309                                 return -E2BIG;
310                 }
311         }
312
313         /* Note that the glibc fallocate() fallback is very
314            inefficient, hence we try to minimize the allocation area
315            as we can. */
316         r = posix_fallocate(f->fd, old_size, new_size - old_size);
317         if (r != 0)
318                 return -r;
319
320         if (fstat(f->fd, &f->last_stat) < 0)
321                 return -errno;
322
323         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
324
325         return 0;
326 }
327
328 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
329         assert(f);
330         assert(ret);
331
332         if (size <= 0)
333                 return -EINVAL;
334
335         /* Avoid SIGBUS on invalid accesses */
336         if (offset + size > (uint64_t) f->last_stat.st_size) {
337                 /* Hmm, out of range? Let's refresh the fstat() data
338                  * first, before we trust that check. */
339
340                 if (fstat(f->fd, &f->last_stat) < 0 ||
341                     offset + size > (uint64_t) f->last_stat.st_size)
342                         return -EADDRNOTAVAIL;
343         }
344
345         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
346 }
347
348 static uint64_t minimum_header_size(Object *o) {
349
350         static uint64_t table[] = {
351                 [OBJECT_DATA] = sizeof(DataObject),
352                 [OBJECT_FIELD] = sizeof(FieldObject),
353                 [OBJECT_ENTRY] = sizeof(EntryObject),
354                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
355                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
356                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
357                 [OBJECT_TAG] = sizeof(TagObject),
358         };
359
360         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
361                 return sizeof(ObjectHeader);
362
363         return table[o->object.type];
364 }
365
366 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
367         int r;
368         void *t;
369         Object *o;
370         uint64_t s;
371         unsigned context;
372
373         assert(f);
374         assert(ret);
375
376         /* Objects may only be located at multiple of 64 bit */
377         if (!VALID64(offset))
378                 return -EFAULT;
379
380         /* One context for each type, plus one catch-all for the rest */
381         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
382
383         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
384         if (r < 0)
385                 return r;
386
387         o = (Object*) t;
388         s = le64toh(o->object.size);
389
390         if (s < sizeof(ObjectHeader))
391                 return -EBADMSG;
392
393         if (o->object.type <= OBJECT_UNUSED)
394                 return -EBADMSG;
395
396         if (s < minimum_header_size(o))
397                 return -EBADMSG;
398
399         if (type >= 0 && o->object.type != type)
400                 return -EBADMSG;
401
402         if (s > sizeof(ObjectHeader)) {
403                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
404                 if (r < 0)
405                         return r;
406
407                 o = (Object*) t;
408         }
409
410         *ret = o;
411         return 0;
412 }
413
414 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
415         uint64_t r;
416
417         assert(f);
418
419         r = le64toh(f->header->tail_entry_seqnum) + 1;
420
421         if (seqnum) {
422                 /* If an external seqnum counter was passed, we update
423                  * both the local and the external one, and set it to
424                  * the maximum of both */
425
426                 if (*seqnum + 1 > r)
427                         r = *seqnum + 1;
428
429                 *seqnum = r;
430         }
431
432         f->header->tail_entry_seqnum = htole64(r);
433
434         if (f->header->head_entry_seqnum == 0)
435                 f->header->head_entry_seqnum = htole64(r);
436
437         return r;
438 }
439
440 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
441         int r;
442         uint64_t p;
443         Object *tail, *o;
444         void *t;
445
446         assert(f);
447         assert(type > 0 && type < _OBJECT_TYPE_MAX);
448         assert(size >= sizeof(ObjectHeader));
449         assert(offset);
450         assert(ret);
451
452         p = le64toh(f->header->tail_object_offset);
453         if (p == 0)
454                 p = le64toh(f->header->header_size);
455         else {
456                 r = journal_file_move_to_object(f, -1, p, &tail);
457                 if (r < 0)
458                         return r;
459
460                 p += ALIGN64(le64toh(tail->object.size));
461         }
462
463         r = journal_file_allocate(f, p, size);
464         if (r < 0)
465                 return r;
466
467         r = journal_file_move_to(f, type, false, p, size, &t);
468         if (r < 0)
469                 return r;
470
471         o = (Object*) t;
472
473         zero(o->object);
474         o->object.type = type;
475         o->object.size = htole64(size);
476
477         f->header->tail_object_offset = htole64(p);
478         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
479
480         *ret = o;
481         *offset = p;
482
483         return 0;
484 }
485
486 static int journal_file_setup_data_hash_table(JournalFile *f) {
487         uint64_t s, p;
488         Object *o;
489         int r;
490
491         assert(f);
492
493         /* We estimate that we need 1 hash table entry per 768 of
494            journal file and we want to make sure we never get beyond
495            75% fill level. Calculate the hash table size for the
496            maximum file size based on these metrics. */
497
498         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
499         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
500                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
501
502         log_debug("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
503
504         r = journal_file_append_object(f,
505                                        OBJECT_DATA_HASH_TABLE,
506                                        offsetof(Object, hash_table.items) + s,
507                                        &o, &p);
508         if (r < 0)
509                 return r;
510
511         memset(o->hash_table.items, 0, s);
512
513         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
514         f->header->data_hash_table_size = htole64(s);
515
516         return 0;
517 }
518
519 static int journal_file_setup_field_hash_table(JournalFile *f) {
520         uint64_t s, p;
521         Object *o;
522         int r;
523
524         assert(f);
525
526         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
527         r = journal_file_append_object(f,
528                                        OBJECT_FIELD_HASH_TABLE,
529                                        offsetof(Object, hash_table.items) + s,
530                                        &o, &p);
531         if (r < 0)
532                 return r;
533
534         memset(o->hash_table.items, 0, s);
535
536         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
537         f->header->field_hash_table_size = htole64(s);
538
539         return 0;
540 }
541
542 static int journal_file_map_data_hash_table(JournalFile *f) {
543         uint64_t s, p;
544         void *t;
545         int r;
546
547         assert(f);
548
549         p = le64toh(f->header->data_hash_table_offset);
550         s = le64toh(f->header->data_hash_table_size);
551
552         r = journal_file_move_to(f,
553                                  OBJECT_DATA_HASH_TABLE,
554                                  true,
555                                  p, s,
556                                  &t);
557         if (r < 0)
558                 return r;
559
560         f->data_hash_table = t;
561         return 0;
562 }
563
564 static int journal_file_map_field_hash_table(JournalFile *f) {
565         uint64_t s, p;
566         void *t;
567         int r;
568
569         assert(f);
570
571         p = le64toh(f->header->field_hash_table_offset);
572         s = le64toh(f->header->field_hash_table_size);
573
574         r = journal_file_move_to(f,
575                                  OBJECT_FIELD_HASH_TABLE,
576                                  true,
577                                  p, s,
578                                  &t);
579         if (r < 0)
580                 return r;
581
582         f->field_hash_table = t;
583         return 0;
584 }
585
586 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
587         uint64_t p, h;
588         int r;
589
590         assert(f);
591         assert(o);
592         assert(offset > 0);
593         assert(o->object.type == OBJECT_DATA);
594
595         /* This might alter the window we are looking at */
596
597         o->data.next_hash_offset = o->data.next_field_offset = 0;
598         o->data.entry_offset = o->data.entry_array_offset = 0;
599         o->data.n_entries = 0;
600
601         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
602         p = le64toh(f->data_hash_table[h].tail_hash_offset);
603         if (p == 0) {
604                 /* Only entry in the hash table is easy */
605                 f->data_hash_table[h].head_hash_offset = htole64(offset);
606         } else {
607                 /* Move back to the previous data object, to patch in
608                  * pointer */
609
610                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
611                 if (r < 0)
612                         return r;
613
614                 o->data.next_hash_offset = htole64(offset);
615         }
616
617         f->data_hash_table[h].tail_hash_offset = htole64(offset);
618
619         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
620                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
621
622         return 0;
623 }
624
625 int journal_file_find_data_object_with_hash(
626                 JournalFile *f,
627                 const void *data, uint64_t size, uint64_t hash,
628                 Object **ret, uint64_t *offset) {
629
630         uint64_t p, osize, h;
631         int r;
632
633         assert(f);
634         assert(data || size == 0);
635
636         osize = offsetof(Object, data.payload) + size;
637
638         if (f->header->data_hash_table_size == 0)
639                 return -EBADMSG;
640
641         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
642         p = le64toh(f->data_hash_table[h].head_hash_offset);
643
644         while (p > 0) {
645                 Object *o;
646
647                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
648                 if (r < 0)
649                         return r;
650
651                 if (le64toh(o->data.hash) != hash)
652                         goto next;
653
654                 if (o->object.flags & OBJECT_COMPRESSED) {
655 #ifdef HAVE_XZ
656                         uint64_t l, rsize;
657
658                         l = le64toh(o->object.size);
659                         if (l <= offsetof(Object, data.payload))
660                                 return -EBADMSG;
661
662                         l -= offsetof(Object, data.payload);
663
664                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
665                                 return -EBADMSG;
666
667                         if (rsize == size &&
668                             memcmp(f->compress_buffer, data, size) == 0) {
669
670                                 if (ret)
671                                         *ret = o;
672
673                                 if (offset)
674                                         *offset = p;
675
676                                 return 1;
677                         }
678 #else
679                         return -EPROTONOSUPPORT;
680 #endif
681
682                 } else if (le64toh(o->object.size) == osize &&
683                            memcmp(o->data.payload, data, size) == 0) {
684
685                         if (ret)
686                                 *ret = o;
687
688                         if (offset)
689                                 *offset = p;
690
691                         return 1;
692                 }
693
694         next:
695                 p = le64toh(o->data.next_hash_offset);
696         }
697
698         return 0;
699 }
700
701 int journal_file_find_data_object(
702                 JournalFile *f,
703                 const void *data, uint64_t size,
704                 Object **ret, uint64_t *offset) {
705
706         uint64_t hash;
707
708         assert(f);
709         assert(data || size == 0);
710
711         hash = hash64(data, size);
712
713         return journal_file_find_data_object_with_hash(f,
714                                                        data, size, hash,
715                                                        ret, offset);
716 }
717
718 static int journal_file_append_data(
719                 JournalFile *f,
720                 const void *data, uint64_t size,
721                 Object **ret, uint64_t *offset) {
722
723         uint64_t hash, p;
724         uint64_t osize;
725         Object *o;
726         int r;
727         bool compressed = false;
728
729         assert(f);
730         assert(data || size == 0);
731
732         hash = hash64(data, size);
733
734         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
735         if (r < 0)
736                 return r;
737         else if (r > 0) {
738
739                 if (ret)
740                         *ret = o;
741
742                 if (offset)
743                         *offset = p;
744
745                 return 0;
746         }
747
748         osize = offsetof(Object, data.payload) + size;
749         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
750         if (r < 0)
751                 return r;
752
753         o->data.hash = htole64(hash);
754
755 #ifdef HAVE_XZ
756         if (f->compress &&
757             size >= COMPRESSION_SIZE_THRESHOLD) {
758                 uint64_t rsize;
759
760                 compressed = compress_blob(data, size, o->data.payload, &rsize);
761
762                 if (compressed) {
763                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
764                         o->object.flags |= OBJECT_COMPRESSED;
765
766                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
767                 }
768         }
769 #endif
770
771         if (!compressed && size > 0)
772                 memcpy(o->data.payload, data, size);
773
774         r = journal_file_link_data(f, o, p, hash);
775         if (r < 0)
776                 return r;
777
778 #ifdef HAVE_GCRYPT
779         r = journal_file_hmac_put_object(f, OBJECT_DATA, p);
780         if (r < 0)
781                 return r;
782 #endif
783
784         /* The linking might have altered the window, so let's
785          * refresh our pointer */
786         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
787         if (r < 0)
788                 return r;
789
790         if (ret)
791                 *ret = o;
792
793         if (offset)
794                 *offset = p;
795
796         return 0;
797 }
798
799 uint64_t journal_file_entry_n_items(Object *o) {
800         assert(o);
801         assert(o->object.type == OBJECT_ENTRY);
802
803         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
804 }
805
806 uint64_t journal_file_entry_array_n_items(Object *o) {
807         assert(o);
808         assert(o->object.type == OBJECT_ENTRY_ARRAY);
809
810         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
811 }
812
813 uint64_t journal_file_hash_table_n_items(Object *o) {
814         assert(o);
815         assert(o->object.type == OBJECT_DATA_HASH_TABLE ||
816                o->object.type == OBJECT_FIELD_HASH_TABLE);
817
818         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
819 }
820
821 static int link_entry_into_array(JournalFile *f,
822                                  le64_t *first,
823                                  le64_t *idx,
824                                  uint64_t p) {
825         int r;
826         uint64_t n = 0, ap = 0, q, i, a, hidx;
827         Object *o;
828
829         assert(f);
830         assert(first);
831         assert(idx);
832         assert(p > 0);
833
834         a = le64toh(*first);
835         i = hidx = le64toh(*idx);
836         while (a > 0) {
837
838                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
839                 if (r < 0)
840                         return r;
841
842                 n = journal_file_entry_array_n_items(o);
843                 if (i < n) {
844                         o->entry_array.items[i] = htole64(p);
845                         *idx = htole64(hidx + 1);
846                         return 0;
847                 }
848
849                 i -= n;
850                 ap = a;
851                 a = le64toh(o->entry_array.next_entry_array_offset);
852         }
853
854         if (hidx > n)
855                 n = (hidx+1) * 2;
856         else
857                 n = n * 2;
858
859         if (n < 4)
860                 n = 4;
861
862         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
863                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
864                                        &o, &q);
865         if (r < 0)
866                 return r;
867
868 #ifdef HAVE_GCRYPT
869         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, q);
870         if (r < 0)
871                 return r;
872 #endif
873
874         o->entry_array.items[i] = htole64(p);
875
876         if (ap == 0)
877                 *first = htole64(q);
878         else {
879                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
880                 if (r < 0)
881                         return r;
882
883                 o->entry_array.next_entry_array_offset = htole64(q);
884         }
885
886         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
887                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
888
889         *idx = htole64(hidx + 1);
890
891         return 0;
892 }
893
894 static int link_entry_into_array_plus_one(JournalFile *f,
895                                           le64_t *extra,
896                                           le64_t *first,
897                                           le64_t *idx,
898                                           uint64_t p) {
899
900         int r;
901
902         assert(f);
903         assert(extra);
904         assert(first);
905         assert(idx);
906         assert(p > 0);
907
908         if (*idx == 0)
909                 *extra = htole64(p);
910         else {
911                 le64_t i;
912
913                 i = htole64(le64toh(*idx) - 1);
914                 r = link_entry_into_array(f, first, &i, p);
915                 if (r < 0)
916                         return r;
917         }
918
919         *idx = htole64(le64toh(*idx) + 1);
920         return 0;
921 }
922
923 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
924         uint64_t p;
925         int r;
926         assert(f);
927         assert(o);
928         assert(offset > 0);
929
930         p = le64toh(o->entry.items[i].object_offset);
931         if (p == 0)
932                 return -EINVAL;
933
934         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
935         if (r < 0)
936                 return r;
937
938         return link_entry_into_array_plus_one(f,
939                                               &o->data.entry_offset,
940                                               &o->data.entry_array_offset,
941                                               &o->data.n_entries,
942                                               offset);
943 }
944
945 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
946         uint64_t n, i;
947         int r;
948
949         assert(f);
950         assert(o);
951         assert(offset > 0);
952         assert(o->object.type == OBJECT_ENTRY);
953
954         __sync_synchronize();
955
956         /* Link up the entry itself */
957         r = link_entry_into_array(f,
958                                   &f->header->entry_array_offset,
959                                   &f->header->n_entries,
960                                   offset);
961         if (r < 0)
962                 return r;
963
964         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
965
966         if (f->header->head_entry_realtime == 0)
967                 f->header->head_entry_realtime = o->entry.realtime;
968
969         f->header->tail_entry_realtime = o->entry.realtime;
970         f->header->tail_entry_monotonic = o->entry.monotonic;
971
972         f->tail_entry_monotonic_valid = true;
973
974         /* Link up the items */
975         n = journal_file_entry_n_items(o);
976         for (i = 0; i < n; i++) {
977                 r = journal_file_link_entry_item(f, o, offset, i);
978                 if (r < 0)
979                         return r;
980         }
981
982         return 0;
983 }
984
985 static int journal_file_append_entry_internal(
986                 JournalFile *f,
987                 const dual_timestamp *ts,
988                 uint64_t xor_hash,
989                 const EntryItem items[], unsigned n_items,
990                 uint64_t *seqnum,
991                 Object **ret, uint64_t *offset) {
992         uint64_t np;
993         uint64_t osize;
994         Object *o;
995         int r;
996
997         assert(f);
998         assert(items || n_items == 0);
999         assert(ts);
1000
1001         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1002
1003         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1004         if (r < 0)
1005                 return r;
1006
1007         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1008         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1009         o->entry.realtime = htole64(ts->realtime);
1010         o->entry.monotonic = htole64(ts->monotonic);
1011         o->entry.xor_hash = htole64(xor_hash);
1012         o->entry.boot_id = f->header->boot_id;
1013
1014 #ifdef HAVE_GCRYPT
1015         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, np);
1016         if (r < 0)
1017                 return r;
1018 #endif
1019
1020         r = journal_file_link_entry(f, o, np);
1021         if (r < 0)
1022                 return r;
1023
1024         if (ret)
1025                 *ret = o;
1026
1027         if (offset)
1028                 *offset = np;
1029
1030         return 0;
1031 }
1032
1033 void journal_file_post_change(JournalFile *f) {
1034         assert(f);
1035
1036         /* inotify() does not receive IN_MODIFY events from file
1037          * accesses done via mmap(). After each access we hence
1038          * trigger IN_MODIFY by truncating the journal file to its
1039          * current size which triggers IN_MODIFY. */
1040
1041         __sync_synchronize();
1042
1043         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1044                 log_error("Failed to to truncate file to its own size: %m");
1045 }
1046
1047 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1048         unsigned i;
1049         EntryItem *items;
1050         int r;
1051         uint64_t xor_hash = 0;
1052         struct dual_timestamp _ts;
1053
1054         assert(f);
1055         assert(iovec || n_iovec == 0);
1056
1057         if (!f->writable)
1058                 return -EPERM;
1059
1060         if (!ts) {
1061                 dual_timestamp_get(&_ts);
1062                 ts = &_ts;
1063         }
1064
1065         if (f->tail_entry_monotonic_valid &&
1066             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1067                 return -EINVAL;
1068
1069 #ifdef HAVE_GCRYPT
1070         r = journal_file_maybe_append_tag(f, ts->realtime);
1071         if (r < 0)
1072                 return r;
1073 #endif
1074
1075         /* alloca() can't take 0, hence let's allocate at least one */
1076         items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1077
1078         for (i = 0; i < n_iovec; i++) {
1079                 uint64_t p;
1080                 Object *o;
1081
1082                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1083                 if (r < 0)
1084                         return r;
1085
1086                 xor_hash ^= le64toh(o->data.hash);
1087                 items[i].object_offset = htole64(p);
1088                 items[i].hash = o->data.hash;
1089         }
1090
1091         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1092
1093         journal_file_post_change(f);
1094
1095         return r;
1096 }
1097
1098 static int generic_array_get(JournalFile *f,
1099                              uint64_t first,
1100                              uint64_t i,
1101                              Object **ret, uint64_t *offset) {
1102
1103         Object *o;
1104         uint64_t p = 0, a;
1105         int r;
1106
1107         assert(f);
1108
1109         a = first;
1110         while (a > 0) {
1111                 uint64_t n;
1112
1113                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1114                 if (r < 0)
1115                         return r;
1116
1117                 n = journal_file_entry_array_n_items(o);
1118                 if (i < n) {
1119                         p = le64toh(o->entry_array.items[i]);
1120                         break;
1121                 }
1122
1123                 i -= n;
1124                 a = le64toh(o->entry_array.next_entry_array_offset);
1125         }
1126
1127         if (a <= 0 || p <= 0)
1128                 return 0;
1129
1130         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1131         if (r < 0)
1132                 return r;
1133
1134         if (ret)
1135                 *ret = o;
1136
1137         if (offset)
1138                 *offset = p;
1139
1140         return 1;
1141 }
1142
1143 static int generic_array_get_plus_one(JournalFile *f,
1144                                       uint64_t extra,
1145                                       uint64_t first,
1146                                       uint64_t i,
1147                                       Object **ret, uint64_t *offset) {
1148
1149         Object *o;
1150
1151         assert(f);
1152
1153         if (i == 0) {
1154                 int r;
1155
1156                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1157                 if (r < 0)
1158                         return r;
1159
1160                 if (ret)
1161                         *ret = o;
1162
1163                 if (offset)
1164                         *offset = extra;
1165
1166                 return 1;
1167         }
1168
1169         return generic_array_get(f, first, i-1, ret, offset);
1170 }
1171
/* Results of the test_object callbacks used by the bisection code */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object compares lower than the needle */
        TEST_RIGHT = 2  /* the tested object compares higher than the needle */
};
1177
1178 static int generic_array_bisect(JournalFile *f,
1179                                 uint64_t first,
1180                                 uint64_t n,
1181                                 uint64_t needle,
1182                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1183                                 direction_t direction,
1184                                 Object **ret,
1185                                 uint64_t *offset,
1186                                 uint64_t *idx) {
1187
1188         uint64_t a, p, t = 0, i = 0, last_p = 0;
1189         bool subtract_one = false;
1190         Object *o, *array = NULL;
1191         int r;
1192
1193         assert(f);
1194         assert(test_object);
1195
1196         a = first;
1197         while (a > 0) {
1198                 uint64_t left, right, k, lp;
1199
1200                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1201                 if (r < 0)
1202                         return r;
1203
1204                 k = journal_file_entry_array_n_items(array);
1205                 right = MIN(k, n);
1206                 if (right <= 0)
1207                         return 0;
1208
1209                 i = right - 1;
1210                 lp = p = le64toh(array->entry_array.items[i]);
1211                 if (p <= 0)
1212                         return -EBADMSG;
1213
1214                 r = test_object(f, p, needle);
1215                 if (r < 0)
1216                         return r;
1217
1218                 if (r == TEST_FOUND)
1219                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1220
1221                 if (r == TEST_RIGHT) {
1222                         left = 0;
1223                         right -= 1;
1224                         for (;;) {
1225                                 if (left == right) {
1226                                         if (direction == DIRECTION_UP)
1227                                                 subtract_one = true;
1228
1229                                         i = left;
1230                                         goto found;
1231                                 }
1232
1233                                 assert(left < right);
1234
1235                                 i = (left + right) / 2;
1236                                 p = le64toh(array->entry_array.items[i]);
1237                                 if (p <= 0)
1238                                         return -EBADMSG;
1239
1240                                 r = test_object(f, p, needle);
1241                                 if (r < 0)
1242                                         return r;
1243
1244                                 if (r == TEST_FOUND)
1245                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1246
1247                                 if (r == TEST_RIGHT)
1248                                         right = i;
1249                                 else
1250                                         left = i + 1;
1251                         }
1252                 }
1253
1254                 if (k > n) {
1255                         if (direction == DIRECTION_UP) {
1256                                 i = n;
1257                                 subtract_one = true;
1258                                 goto found;
1259                         }
1260
1261                         return 0;
1262                 }
1263
1264                 last_p = lp;
1265
1266                 n -= k;
1267                 t += k;
1268                 a = le64toh(array->entry_array.next_entry_array_offset);
1269         }
1270
1271         return 0;
1272
1273 found:
1274         if (subtract_one && t == 0 && i == 0)
1275                 return 0;
1276
1277         if (subtract_one && i == 0)
1278                 p = last_p;
1279         else if (subtract_one)
1280                 p = le64toh(array->entry_array.items[i-1]);
1281         else
1282                 p = le64toh(array->entry_array.items[i]);
1283
1284         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1285         if (r < 0)
1286                 return r;
1287
1288         if (ret)
1289                 *ret = o;
1290
1291         if (offset)
1292                 *offset = p;
1293
1294         if (idx)
1295                 *idx = t + i + (subtract_one ? -1 : 0);
1296
1297         return 1;
1298 }
1299
1300 static int generic_array_bisect_plus_one(JournalFile *f,
1301                                          uint64_t extra,
1302                                          uint64_t first,
1303                                          uint64_t n,
1304                                          uint64_t needle,
1305                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1306                                          direction_t direction,
1307                                          Object **ret,
1308                                          uint64_t *offset,
1309                                          uint64_t *idx) {
1310
1311         int r;
1312         bool step_back = false;
1313         Object *o;
1314
1315         assert(f);
1316         assert(test_object);
1317
1318         if (n <= 0)
1319                 return 0;
1320
1321         /* This bisects the array in object 'first', but first checks
1322          * an extra  */
1323         r = test_object(f, extra, needle);
1324         if (r < 0)
1325                 return r;
1326
1327         if (r == TEST_FOUND)
1328                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1329
1330         /* if we are looking with DIRECTION_UP then we need to first
1331            see if in the actual array there is a matching entry, and
1332            return the last one of that. But if there isn't any we need
1333            to return this one. Hence remember this, and return it
1334            below. */
1335         if (r == TEST_LEFT)
1336                 step_back = direction == DIRECTION_UP;
1337
1338         if (r == TEST_RIGHT) {
1339                 if (direction == DIRECTION_DOWN)
1340                         goto found;
1341                 else
1342                         return 0;
1343         }
1344
1345         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1346
1347         if (r == 0 && step_back)
1348                 goto found;
1349
1350         if (r > 0 && idx)
1351                 (*idx) ++;
1352
1353         return r;
1354
1355 found:
1356         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1357         if (r < 0)
1358                 return r;
1359
1360         if (ret)
1361                 *ret = o;
1362
1363         if (offset)
1364                 *offset = extra;
1365
1366         if (idx)
1367                 *idx = 0;
1368
1369         return 1;
1370 }
1371
1372 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1373         assert(f);
1374         assert(p > 0);
1375
1376         if (p == needle)
1377                 return TEST_FOUND;
1378         else if (p < needle)
1379                 return TEST_LEFT;
1380         else
1381                 return TEST_RIGHT;
1382 }
1383
1384 int journal_file_move_to_entry_by_offset(
1385                 JournalFile *f,
1386                 uint64_t p,
1387                 direction_t direction,
1388                 Object **ret,
1389                 uint64_t *offset) {
1390
1391         return generic_array_bisect(f,
1392                                     le64toh(f->header->entry_array_offset),
1393                                     le64toh(f->header->n_entries),
1394                                     p,
1395                                     test_object_offset,
1396                                     direction,
1397                                     ret, offset, NULL);
1398 }
1399
1400
1401 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1402         Object *o;
1403         int r;
1404
1405         assert(f);
1406         assert(p > 0);
1407
1408         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1409         if (r < 0)
1410                 return r;
1411
1412         if (le64toh(o->entry.seqnum) == needle)
1413                 return TEST_FOUND;
1414         else if (le64toh(o->entry.seqnum) < needle)
1415                 return TEST_LEFT;
1416         else
1417                 return TEST_RIGHT;
1418 }
1419
1420 int journal_file_move_to_entry_by_seqnum(
1421                 JournalFile *f,
1422                 uint64_t seqnum,
1423                 direction_t direction,
1424                 Object **ret,
1425                 uint64_t *offset) {
1426
1427         return generic_array_bisect(f,
1428                                     le64toh(f->header->entry_array_offset),
1429                                     le64toh(f->header->n_entries),
1430                                     seqnum,
1431                                     test_object_seqnum,
1432                                     direction,
1433                                     ret, offset, NULL);
1434 }
1435
1436 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1437         Object *o;
1438         int r;
1439
1440         assert(f);
1441         assert(p > 0);
1442
1443         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1444         if (r < 0)
1445                 return r;
1446
1447         if (le64toh(o->entry.realtime) == needle)
1448                 return TEST_FOUND;
1449         else if (le64toh(o->entry.realtime) < needle)
1450                 return TEST_LEFT;
1451         else
1452                 return TEST_RIGHT;
1453 }
1454
1455 int journal_file_move_to_entry_by_realtime(
1456                 JournalFile *f,
1457                 uint64_t realtime,
1458                 direction_t direction,
1459                 Object **ret,
1460                 uint64_t *offset) {
1461
1462         return generic_array_bisect(f,
1463                                     le64toh(f->header->entry_array_offset),
1464                                     le64toh(f->header->n_entries),
1465                                     realtime,
1466                                     test_object_realtime,
1467                                     direction,
1468                                     ret, offset, NULL);
1469 }
1470
1471 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1472         Object *o;
1473         int r;
1474
1475         assert(f);
1476         assert(p > 0);
1477
1478         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1479         if (r < 0)
1480                 return r;
1481
1482         if (le64toh(o->entry.monotonic) == needle)
1483                 return TEST_FOUND;
1484         else if (le64toh(o->entry.monotonic) < needle)
1485                 return TEST_LEFT;
1486         else
1487                 return TEST_RIGHT;
1488 }
1489
1490 int journal_file_move_to_entry_by_monotonic(
1491                 JournalFile *f,
1492                 sd_id128_t boot_id,
1493                 uint64_t monotonic,
1494                 direction_t direction,
1495                 Object **ret,
1496                 uint64_t *offset) {
1497
1498         char t[9+32+1] = "_BOOT_ID=";
1499         Object *o;
1500         int r;
1501
1502         assert(f);
1503
1504         sd_id128_to_string(boot_id, t + 9);
1505         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1506         if (r < 0)
1507                 return r;
1508         if (r == 0)
1509                 return -ENOENT;
1510
1511         return generic_array_bisect_plus_one(f,
1512                                              le64toh(o->data.entry_offset),
1513                                              le64toh(o->data.entry_array_offset),
1514                                              le64toh(o->data.n_entries),
1515                                              monotonic,
1516                                              test_object_monotonic,
1517                                              direction,
1518                                              ret, offset, NULL);
1519 }
1520
1521 int journal_file_next_entry(
1522                 JournalFile *f,
1523                 Object *o, uint64_t p,
1524                 direction_t direction,
1525                 Object **ret, uint64_t *offset) {
1526
1527         uint64_t i, n;
1528         int r;
1529
1530         assert(f);
1531         assert(p > 0 || !o);
1532
1533         n = le64toh(f->header->n_entries);
1534         if (n <= 0)
1535                 return 0;
1536
1537         if (!o)
1538                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1539         else {
1540                 if (o->object.type != OBJECT_ENTRY)
1541                         return -EINVAL;
1542
1543                 r = generic_array_bisect(f,
1544                                          le64toh(f->header->entry_array_offset),
1545                                          le64toh(f->header->n_entries),
1546                                          p,
1547                                          test_object_offset,
1548                                          DIRECTION_DOWN,
1549                                          NULL, NULL,
1550                                          &i);
1551                 if (r <= 0)
1552                         return r;
1553
1554                 if (direction == DIRECTION_DOWN) {
1555                         if (i >= n - 1)
1556                                 return 0;
1557
1558                         i++;
1559                 } else {
1560                         if (i <= 0)
1561                                 return 0;
1562
1563                         i--;
1564                 }
1565         }
1566
1567         /* And jump to it */
1568         return generic_array_get(f,
1569                                  le64toh(f->header->entry_array_offset),
1570                                  i,
1571                                  ret, offset);
1572 }
1573
1574 int journal_file_skip_entry(
1575                 JournalFile *f,
1576                 Object *o, uint64_t p,
1577                 int64_t skip,
1578                 Object **ret, uint64_t *offset) {
1579
1580         uint64_t i, n;
1581         int r;
1582
1583         assert(f);
1584         assert(o);
1585         assert(p > 0);
1586
1587         if (o->object.type != OBJECT_ENTRY)
1588                 return -EINVAL;
1589
1590         r = generic_array_bisect(f,
1591                                  le64toh(f->header->entry_array_offset),
1592                                  le64toh(f->header->n_entries),
1593                                  p,
1594                                  test_object_offset,
1595                                  DIRECTION_DOWN,
1596                                  NULL, NULL,
1597                                  &i);
1598         if (r <= 0)
1599                 return r;
1600
1601         /* Calculate new index */
1602         if (skip < 0) {
1603                 if ((uint64_t) -skip >= i)
1604                         i = 0;
1605                 else
1606                         i = i - (uint64_t) -skip;
1607         } else
1608                 i  += (uint64_t) skip;
1609
1610         n = le64toh(f->header->n_entries);
1611         if (n <= 0)
1612                 return -EBADMSG;
1613
1614         if (i >= n)
1615                 i = n-1;
1616
1617         return generic_array_get(f,
1618                                  le64toh(f->header->entry_array_offset),
1619                                  i,
1620                                  ret, offset);
1621 }
1622
1623 int journal_file_next_entry_for_data(
1624                 JournalFile *f,
1625                 Object *o, uint64_t p,
1626                 uint64_t data_offset,
1627                 direction_t direction,
1628                 Object **ret, uint64_t *offset) {
1629
1630         uint64_t n, i;
1631         int r;
1632         Object *d;
1633
1634         assert(f);
1635         assert(p > 0 || !o);
1636
1637         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1638         if (r < 0)
1639                 return r;
1640
1641         n = le64toh(d->data.n_entries);
1642         if (n <= 0)
1643                 return n;
1644
1645         if (!o)
1646                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1647         else {
1648                 if (o->object.type != OBJECT_ENTRY)
1649                         return -EINVAL;
1650
1651                 r = generic_array_bisect_plus_one(f,
1652                                                   le64toh(d->data.entry_offset),
1653                                                   le64toh(d->data.entry_array_offset),
1654                                                   le64toh(d->data.n_entries),
1655                                                   p,
1656                                                   test_object_offset,
1657                                                   DIRECTION_DOWN,
1658                                                   NULL, NULL,
1659                                                   &i);
1660
1661                 if (r <= 0)
1662                         return r;
1663
1664                 if (direction == DIRECTION_DOWN) {
1665                         if (i >= n - 1)
1666                                 return 0;
1667
1668                         i++;
1669                 } else {
1670                         if (i <= 0)
1671                                 return 0;
1672
1673                         i--;
1674                 }
1675
1676         }
1677
1678         return generic_array_get_plus_one(f,
1679                                           le64toh(d->data.entry_offset),
1680                                           le64toh(d->data.entry_array_offset),
1681                                           i,
1682                                           ret, offset);
1683 }
1684
1685 int journal_file_move_to_entry_by_offset_for_data(
1686                 JournalFile *f,
1687                 uint64_t data_offset,
1688                 uint64_t p,
1689                 direction_t direction,
1690                 Object **ret, uint64_t *offset) {
1691
1692         int r;
1693         Object *d;
1694
1695         assert(f);
1696
1697         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1698         if (r < 0)
1699                 return r;
1700
1701         return generic_array_bisect_plus_one(f,
1702                                              le64toh(d->data.entry_offset),
1703                                              le64toh(d->data.entry_array_offset),
1704                                              le64toh(d->data.n_entries),
1705                                              p,
1706                                              test_object_offset,
1707                                              direction,
1708                                              ret, offset, NULL);
1709 }
1710
1711 int journal_file_move_to_entry_by_monotonic_for_data(
1712                 JournalFile *f,
1713                 uint64_t data_offset,
1714                 sd_id128_t boot_id,
1715                 uint64_t monotonic,
1716                 direction_t direction,
1717                 Object **ret, uint64_t *offset) {
1718
1719         char t[9+32+1] = "_BOOT_ID=";
1720         Object *o, *d;
1721         int r;
1722         uint64_t b, z;
1723
1724         assert(f);
1725
1726         /* First, seek by time */
1727         sd_id128_to_string(boot_id, t + 9);
1728         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1729         if (r < 0)
1730                 return r;
1731         if (r == 0)
1732                 return -ENOENT;
1733
1734         r = generic_array_bisect_plus_one(f,
1735                                           le64toh(o->data.entry_offset),
1736                                           le64toh(o->data.entry_array_offset),
1737                                           le64toh(o->data.n_entries),
1738                                           monotonic,
1739                                           test_object_monotonic,
1740                                           direction,
1741                                           NULL, &z, NULL);
1742         if (r <= 0)
1743                 return r;
1744
1745         /* And now, continue seeking until we find an entry that
1746          * exists in both bisection arrays */
1747
1748         for (;;) {
1749                 Object *qo;
1750                 uint64_t p, q;
1751
1752                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1753                 if (r < 0)
1754                         return r;
1755
1756                 r = generic_array_bisect_plus_one(f,
1757                                                   le64toh(d->data.entry_offset),
1758                                                   le64toh(d->data.entry_array_offset),
1759                                                   le64toh(d->data.n_entries),
1760                                                   z,
1761                                                   test_object_offset,
1762                                                   direction,
1763                                                   NULL, &p, NULL);
1764                 if (r <= 0)
1765                         return r;
1766
1767                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1768                 if (r < 0)
1769                         return r;
1770
1771                 r = generic_array_bisect_plus_one(f,
1772                                                   le64toh(o->data.entry_offset),
1773                                                   le64toh(o->data.entry_array_offset),
1774                                                   le64toh(o->data.n_entries),
1775                                                   p,
1776                                                   test_object_offset,
1777                                                   direction,
1778                                                   &qo, &q, NULL);
1779
1780                 if (r <= 0)
1781                         return r;
1782
1783                 if (p == q) {
1784                         if (ret)
1785                                 *ret = qo;
1786                         if (offset)
1787                                 *offset = q;
1788
1789                         return 1;
1790                 }
1791
1792                 z = q;
1793         }
1794
1795         return 0;
1796 }
1797
1798 int journal_file_move_to_entry_by_seqnum_for_data(
1799                 JournalFile *f,
1800                 uint64_t data_offset,
1801                 uint64_t seqnum,
1802                 direction_t direction,
1803                 Object **ret, uint64_t *offset) {
1804
1805         Object *d;
1806         int r;
1807
1808         assert(f);
1809
1810         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1811         if (r < 0)
1812                 return r;
1813
1814         return generic_array_bisect_plus_one(f,
1815                                              le64toh(d->data.entry_offset),
1816                                              le64toh(d->data.entry_array_offset),
1817                                              le64toh(d->data.n_entries),
1818                                              seqnum,
1819                                              test_object_seqnum,
1820                                              direction,
1821                                              ret, offset, NULL);
1822 }
1823
1824 int journal_file_move_to_entry_by_realtime_for_data(
1825                 JournalFile *f,
1826                 uint64_t data_offset,
1827                 uint64_t realtime,
1828                 direction_t direction,
1829                 Object **ret, uint64_t *offset) {
1830
1831         Object *d;
1832         int r;
1833
1834         assert(f);
1835
1836         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1837         if (r < 0)
1838                 return r;
1839
1840         return generic_array_bisect_plus_one(f,
1841                                              le64toh(d->data.entry_offset),
1842                                              le64toh(d->data.entry_array_offset),
1843                                              le64toh(d->data.n_entries),
1844                                              realtime,
1845                                              test_object_realtime,
1846                                              direction,
1847                                              ret, offset, NULL);
1848 }
1849
1850 void journal_file_dump(JournalFile *f) {
1851         Object *o;
1852         int r;
1853         uint64_t p;
1854
1855         assert(f);
1856
1857         journal_file_print_header(f);
1858
1859         p = le64toh(f->header->header_size);
1860         while (p != 0) {
1861                 r = journal_file_move_to_object(f, -1, p, &o);
1862                 if (r < 0)
1863                         goto fail;
1864
1865                 switch (o->object.type) {
1866
1867                 case OBJECT_UNUSED:
1868                         printf("Type: OBJECT_UNUSED\n");
1869                         break;
1870
1871                 case OBJECT_DATA:
1872                         printf("Type: OBJECT_DATA\n");
1873                         break;
1874
1875                 case OBJECT_ENTRY:
1876                         printf("Type: OBJECT_ENTRY seqnum=%llu monotonic=%llu realtime=%llu\n",
1877                                (unsigned long long) le64toh(o->entry.seqnum),
1878                                (unsigned long long) le64toh(o->entry.monotonic),
1879                                (unsigned long long) le64toh(o->entry.realtime));
1880                         break;
1881
1882                 case OBJECT_FIELD_HASH_TABLE:
1883                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1884                         break;
1885
1886                 case OBJECT_DATA_HASH_TABLE:
1887                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1888                         break;
1889
1890                 case OBJECT_ENTRY_ARRAY:
1891                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1892                         break;
1893
1894                 case OBJECT_TAG:
1895                         printf("Type: OBJECT_TAG seqnum=%llu epoch=%llu\n",
1896                                (unsigned long long) le64toh(o->tag.seqnum),
1897                                (unsigned long long) le64toh(o->tag.epoch));
1898                         break;
1899                 }
1900
1901                 if (o->object.flags & OBJECT_COMPRESSED)
1902                         printf("Flags: COMPRESSED\n");
1903
1904                 if (p == le64toh(f->header->tail_object_offset))
1905                         p = 0;
1906                 else
1907                         p = p + ALIGN64(le64toh(o->object.size));
1908         }
1909
1910         return;
1911 fail:
1912         log_error("File corrupt");
1913 }
1914
1915 void journal_file_print_header(JournalFile *f) {
1916         char a[33], b[33], c[33];
1917         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
1918         struct stat st;
1919         char bytes[FORMAT_BYTES_MAX];
1920
1921         assert(f);
1922
1923         printf("File Path: %s\n"
1924                "File ID: %s\n"
1925                "Machine ID: %s\n"
1926                "Boot ID: %s\n"
1927                "Sequential Number ID: %s\n"
1928                "State: %s\n"
1929                "Compatible Flags:%s%s\n"
1930                "Incompatible Flags:%s%s\n"
1931                "Header size: %llu\n"
1932                "Arena size: %llu\n"
1933                "Data Hash Table Size: %llu\n"
1934                "Field Hash Table Size: %llu\n"
1935                "Rotate Suggested: %s\n"
1936                "Head Sequential Number: %llu\n"
1937                "Tail Sequential Number: %llu\n"
1938                "Head Realtime Timestamp: %s\n"
1939                "Tail Realtime Timestamp: %s\n"
1940                "Objects: %llu\n"
1941                "Entry Objects: %llu\n",
1942                f->path,
1943                sd_id128_to_string(f->header->file_id, a),
1944                sd_id128_to_string(f->header->machine_id, b),
1945                sd_id128_to_string(f->header->boot_id, c),
1946                sd_id128_to_string(f->header->seqnum_id, c),
1947                f->header->state == STATE_OFFLINE ? "OFFLINE" :
1948                f->header->state == STATE_ONLINE ? "ONLINE" :
1949                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
1950                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
1951                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
1952                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
1953                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
1954                (unsigned long long) le64toh(f->header->header_size),
1955                (unsigned long long) le64toh(f->header->arena_size),
1956                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1957                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
1958                yes_no(journal_file_rotate_suggested(f)),
1959                (unsigned long long) le64toh(f->header->head_entry_seqnum),
1960                (unsigned long long) le64toh(f->header->tail_entry_seqnum),
1961                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
1962                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
1963                (unsigned long long) le64toh(f->header->n_objects),
1964                (unsigned long long) le64toh(f->header->n_entries));
1965
1966         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1967                 printf("Data Objects: %llu\n"
1968                        "Data Hash Table Fill: %.1f%%\n",
1969                        (unsigned long long) le64toh(f->header->n_data),
1970                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
1971
1972         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1973                 printf("Field Objects: %llu\n"
1974                        "Field Hash Table Fill: %.1f%%\n",
1975                        (unsigned long long) le64toh(f->header->n_fields),
1976                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
1977
1978         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
1979                 printf("Tag Objects: %llu\n",
1980                        (unsigned long long) le64toh(f->header->n_tags));
1981         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1982                 printf("Entry Array Objects: %llu\n",
1983                        (unsigned long long) le64toh(f->header->n_entry_arrays));
1984
1985         if (fstat(f->fd, &st) >= 0)
1986                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
1987 }
1988
1989 int journal_file_open(
1990                 const char *fname,
1991                 int flags,
1992                 mode_t mode,
1993                 bool compress,
1994                 bool seal,
1995                 JournalMetrics *metrics,
1996                 MMapCache *mmap_cache,
1997                 JournalFile *template,
1998                 JournalFile **ret) {
1999
2000         JournalFile *f;
2001         int r;
2002         bool newly_created = false;
2003
2004         assert(fname);
2005
2006         if ((flags & O_ACCMODE) != O_RDONLY &&
2007             (flags & O_ACCMODE) != O_RDWR)
2008                 return -EINVAL;
2009
2010         if (!endswith(fname, ".journal") &&
2011             !endswith(fname, ".journal~"))
2012                 return -EINVAL;
2013
2014         f = new0(JournalFile, 1);
2015         if (!f)
2016                 return -ENOMEM;
2017
2018         f->fd = -1;
2019         f->mode = mode;
2020
2021         f->flags = flags;
2022         f->prot = prot_from_flags(flags);
2023         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2024         f->compress = compress;
2025         f->seal = seal;
2026
2027         if (mmap_cache)
2028                 f->mmap = mmap_cache_ref(mmap_cache);
2029         else {
2030                 f->mmap = mmap_cache_new();
2031                 if (!f->mmap) {
2032                         r = -ENOMEM;
2033                         goto fail;
2034                 }
2035         }
2036
2037         f->path = strdup(fname);
2038         if (!f->path) {
2039                 r = -ENOMEM;
2040                 goto fail;
2041         }
2042
2043         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2044         if (f->fd < 0) {
2045                 r = -errno;
2046                 goto fail;
2047         }
2048
2049         if (fstat(f->fd, &f->last_stat) < 0) {
2050                 r = -errno;
2051                 goto fail;
2052         }
2053
2054         if (f->last_stat.st_size == 0 && f->writable) {
2055                 newly_created = true;
2056
2057 #ifdef HAVE_GCRYPT
2058                 /* Try to load the FSPRG state, and if we can't, then
2059                  * just don't do sealing */
2060                 r = journal_file_fss_load(f);
2061                 if (r < 0)
2062                         f->seal = false;
2063 #endif
2064
2065                 r = journal_file_init_header(f, template);
2066                 if (r < 0)
2067                         goto fail;
2068
2069                 if (fstat(f->fd, &f->last_stat) < 0) {
2070                         r = -errno;
2071                         goto fail;
2072                 }
2073         }
2074
2075         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2076                 r = -EIO;
2077                 goto fail;
2078         }
2079
2080         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2081         if (f->header == MAP_FAILED) {
2082                 f->header = NULL;
2083                 r = -errno;
2084                 goto fail;
2085         }
2086
2087         if (!newly_created) {
2088                 r = journal_file_verify_header(f);
2089                 if (r < 0)
2090                         goto fail;
2091         }
2092
2093 #ifdef HAVE_GCRYPT
2094         if (!newly_created && f->writable) {
2095                 r = journal_file_fss_load(f);
2096                 if (r < 0)
2097                         goto fail;
2098         }
2099 #endif
2100
2101         if (f->writable) {
2102                 if (metrics) {
2103                         journal_default_metrics(metrics, f->fd);
2104                         f->metrics = *metrics;
2105                 } else if (template)
2106                         f->metrics = template->metrics;
2107
2108                 r = journal_file_refresh_header(f);
2109                 if (r < 0)
2110                         goto fail;
2111         }
2112
2113 #ifdef HAVE_GCRYPT
2114         r = journal_file_hmac_setup(f);
2115         if (r < 0)
2116                 goto fail;
2117 #endif
2118
2119         if (newly_created) {
2120                 r = journal_file_setup_field_hash_table(f);
2121                 if (r < 0)
2122                         goto fail;
2123
2124                 r = journal_file_setup_data_hash_table(f);
2125                 if (r < 0)
2126                         goto fail;
2127
2128 #ifdef HAVE_GCRYPT
2129                 r = journal_file_append_first_tag(f);
2130                 if (r < 0)
2131                         goto fail;
2132 #endif
2133         }
2134
2135         r = journal_file_map_field_hash_table(f);
2136         if (r < 0)
2137                 goto fail;
2138
2139         r = journal_file_map_data_hash_table(f);
2140         if (r < 0)
2141                 goto fail;
2142
2143         if (ret)
2144                 *ret = f;
2145
2146         return 0;
2147
2148 fail:
2149         journal_file_close(f);
2150
2151         return r;
2152 }
2153
2154 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2155         char *p;
2156         size_t l;
2157         JournalFile *old_file, *new_file = NULL;
2158         int r;
2159
2160         assert(f);
2161         assert(*f);
2162
2163         old_file = *f;
2164
2165         if (!old_file->writable)
2166                 return -EINVAL;
2167
2168         if (!endswith(old_file->path, ".journal"))
2169                 return -EINVAL;
2170
2171         l = strlen(old_file->path);
2172
2173         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2174         if (!p)
2175                 return -ENOMEM;
2176
2177         memcpy(p, old_file->path, l - 8);
2178         p[l-8] = '@';
2179         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2180         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2181                  "-%016llx-%016llx.journal",
2182                  (unsigned long long) le64toh((*f)->header->tail_entry_seqnum),
2183                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2184
2185         r = rename(old_file->path, p);
2186         free(p);
2187
2188         if (r < 0)
2189                 return -errno;
2190
2191         old_file->header->state = STATE_ARCHIVED;
2192
2193         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2194         journal_file_close(old_file);
2195
2196         *f = new_file;
2197         return r;
2198 }
2199
2200 int journal_file_open_reliably(
2201                 const char *fname,
2202                 int flags,
2203                 mode_t mode,
2204                 bool compress,
2205                 bool seal,
2206                 JournalMetrics *metrics,
2207                 MMapCache *mmap_cache,
2208                 JournalFile *template,
2209                 JournalFile **ret) {
2210
2211         int r;
2212         size_t l;
2213         char *p;
2214
2215         r = journal_file_open(fname, flags, mode, compress, seal,
2216                               metrics, mmap_cache, template, ret);
2217         if (r != -EBADMSG && /* corrupted */
2218             r != -ENODATA && /* truncated */
2219             r != -EHOSTDOWN && /* other machine */
2220             r != -EPROTONOSUPPORT && /* incompatible feature */
2221             r != -EBUSY && /* unclean shutdown */
2222             r != -ESHUTDOWN /* already archived */)
2223                 return r;
2224
2225         if ((flags & O_ACCMODE) == O_RDONLY)
2226                 return r;
2227
2228         if (!(flags & O_CREAT))
2229                 return r;
2230
2231         if (!endswith(fname, ".journal"))
2232                 return r;
2233
2234         /* The file is corrupted. Rotate it away and try it again (but only once) */
2235
2236         l = strlen(fname);
2237         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2238                      (int) (l-8), fname,
2239                      (unsigned long long) now(CLOCK_REALTIME),
2240                      random_ull()) < 0)
2241                 return -ENOMEM;
2242
2243         r = rename(fname, p);
2244         free(p);
2245         if (r < 0)
2246                 return -errno;
2247
2248         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2249
2250         return journal_file_open(fname, flags, mode, compress, seal,
2251                                  metrics, mmap_cache, template, ret);
2252 }
2253
2254
2255 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2256         uint64_t i, n;
2257         uint64_t q, xor_hash = 0;
2258         int r;
2259         EntryItem *items;
2260         dual_timestamp ts;
2261
2262         assert(from);
2263         assert(to);
2264         assert(o);
2265         assert(p);
2266
2267         if (!to->writable)
2268                 return -EPERM;
2269
2270         ts.monotonic = le64toh(o->entry.monotonic);
2271         ts.realtime = le64toh(o->entry.realtime);
2272
2273         if (to->tail_entry_monotonic_valid &&
2274             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2275                 return -EINVAL;
2276
2277         n = journal_file_entry_n_items(o);
2278         items = alloca(sizeof(EntryItem) * n);
2279
2280         for (i = 0; i < n; i++) {
2281                 uint64_t l, h;
2282                 le64_t le_hash;
2283                 size_t t;
2284                 void *data;
2285                 Object *u;
2286
2287                 q = le64toh(o->entry.items[i].object_offset);
2288                 le_hash = o->entry.items[i].hash;
2289
2290                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2291                 if (r < 0)
2292                         return r;
2293
2294                 if (le_hash != o->data.hash)
2295                         return -EBADMSG;
2296
2297                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2298                 t = (size_t) l;
2299
2300                 /* We hit the limit on 32bit machines */
2301                 if ((uint64_t) t != l)
2302                         return -E2BIG;
2303
2304                 if (o->object.flags & OBJECT_COMPRESSED) {
2305 #ifdef HAVE_XZ
2306                         uint64_t rsize;
2307
2308                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2309                                 return -EBADMSG;
2310
2311                         data = from->compress_buffer;
2312                         l = rsize;
2313 #else
2314                         return -EPROTONOSUPPORT;
2315 #endif
2316                 } else
2317                         data = o->data.payload;
2318
2319                 r = journal_file_append_data(to, data, l, &u, &h);
2320                 if (r < 0)
2321                         return r;
2322
2323                 xor_hash ^= le64toh(u->data.hash);
2324                 items[i].object_offset = htole64(h);
2325                 items[i].hash = u->data.hash;
2326
2327                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2328                 if (r < 0)
2329                         return r;
2330         }
2331
2332         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2333 }
2334
2335 void journal_default_metrics(JournalMetrics *m, int fd) {
2336         uint64_t fs_size = 0;
2337         struct statvfs ss;
2338         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2339
2340         assert(m);
2341         assert(fd >= 0);
2342
2343         if (fstatvfs(fd, &ss) >= 0)
2344                 fs_size = ss.f_frsize * ss.f_blocks;
2345
2346         if (m->max_use == (uint64_t) -1) {
2347
2348                 if (fs_size > 0) {
2349                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2350
2351                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2352                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2353
2354                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2355                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2356                 } else
2357                         m->max_use = DEFAULT_MAX_USE_LOWER;
2358         } else {
2359                 m->max_use = PAGE_ALIGN(m->max_use);
2360
2361                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2362                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2363         }
2364
2365         if (m->max_size == (uint64_t) -1) {
2366                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2367
2368                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2369                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2370         } else
2371                 m->max_size = PAGE_ALIGN(m->max_size);
2372
2373         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2374                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2375
2376         if (m->max_size*2 > m->max_use)
2377                 m->max_use = m->max_size*2;
2378
2379         if (m->min_size == (uint64_t) -1)
2380                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2381         else {
2382                 m->min_size = PAGE_ALIGN(m->min_size);
2383
2384                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2385                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2386
2387                 if (m->min_size > m->max_size)
2388                         m->max_size = m->min_size;
2389         }
2390
2391         if (m->keep_free == (uint64_t) -1) {
2392
2393                 if (fs_size > 0) {
2394                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2395
2396                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2397                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2398
2399                 } else
2400                         m->keep_free = DEFAULT_KEEP_FREE;
2401         }
2402
2403         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2404                   format_bytes(a, sizeof(a), m->max_use),
2405                   format_bytes(b, sizeof(b), m->max_size),
2406                   format_bytes(c, sizeof(c), m->min_size),
2407                   format_bytes(d, sizeof(d), m->keep_free));
2408 }
2409
2410 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2411         assert(f);
2412         assert(from || to);
2413
2414         if (from) {
2415                 if (f->header->head_entry_realtime == 0)
2416                         return -ENOENT;
2417
2418                 *from = le64toh(f->header->head_entry_realtime);
2419         }
2420
2421         if (to) {
2422                 if (f->header->tail_entry_realtime == 0)
2423                         return -ENOENT;
2424
2425                 *to = le64toh(f->header->tail_entry_realtime);
2426         }
2427
2428         return 1;
2429 }
2430
2431 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2432         char t[9+32+1] = "_BOOT_ID=";
2433         Object *o;
2434         uint64_t p;
2435         int r;
2436
2437         assert(f);
2438         assert(from || to);
2439
2440         sd_id128_to_string(boot_id, t + 9);
2441
2442         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2443         if (r <= 0)
2444                 return r;
2445
2446         if (le64toh(o->data.n_entries) <= 0)
2447                 return 0;
2448
2449         if (from) {
2450                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2451                 if (r < 0)
2452                         return r;
2453
2454                 *from = le64toh(o->entry.monotonic);
2455         }
2456
2457         if (to) {
2458                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2459                 if (r < 0)
2460                         return r;
2461
2462                 r = generic_array_get_plus_one(f,
2463                                                le64toh(o->data.entry_offset),
2464                                                le64toh(o->data.entry_array_offset),
2465                                                le64toh(o->data.n_entries)-1,
2466                                                &o, NULL);
2467                 if (r <= 0)
2468                         return r;
2469
2470                 *to = le64toh(o->entry.monotonic);
2471         }
2472
2473         return 1;
2474 }
2475
2476 bool journal_file_rotate_suggested(JournalFile *f) {
2477         assert(f);
2478
2479         /* If we gained new header fields we gained new features,
2480          * hence suggest a rotation */
2481         if (le64toh(f->header->header_size) < sizeof(Header)) {
2482                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2483                 return true;
2484         }
2485
2486         /* Let's check if the hash tables grew over a certain fill
2487          * level (75%, borrowing this value from Java's hash table
2488          * implementation), and if so suggest a rotation. To calculate
2489          * the fill level we need the n_data field, which only exists
2490          * in newer versions. */
2491
2492         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2493                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2494                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2495                                   f->path,
2496                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2497                                   (unsigned long long) le64toh(f->header->n_data),
2498                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2499                                   (unsigned long long) (f->last_stat.st_size),
2500                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2501                         return true;
2502                 }
2503
2504         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2505                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2506                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2507                                   f->path,
2508                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2509                                   (unsigned long long) le64toh(f->header->n_fields),
2510                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2511                         return true;
2512                 }
2513
2514         return false;
2515 }