chiark / gitweb /
journal: add call to determine current journal file disk usage
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
33 #include "lookup3.h"
34 #include "compress.h"
35 #include "fsprg.h"
36
37 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
38 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
39
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
41
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */
44
45 /* These are the lower and upper bounds if we deduce the max_use value
46  * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
49
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
52
53 /* This is the upper bound if we deduce the keep_free value from the
54  * file system size */
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
56
/* This is the keep_free value when we can't determine the file system
 * size */
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
60
61 /* n_data was the first entry we added after the initial file format design */
62 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
63
64 void journal_file_close(JournalFile *f) {
65         assert(f);
66
67 #ifdef HAVE_GCRYPT
68         /* Write the final tag */
69         if (f->seal && f->writable)
70                 journal_file_append_tag(f);
71 #endif
72
73         /* Sync everything to disk, before we mark the file offline */
74         if (f->mmap && f->fd >= 0)
75                 mmap_cache_close_fd(f->mmap, f->fd);
76
77         if (f->writable && f->fd >= 0)
78                 fdatasync(f->fd);
79
80         if (f->header) {
81                 /* Mark the file offline. Don't override the archived state if it already is set */
82                 if (f->writable && f->header->state == STATE_ONLINE)
83                         f->header->state = STATE_OFFLINE;
84
85                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
86         }
87
88         if (f->fd >= 0)
89                 close_nointr_nofail(f->fd);
90
91         free(f->path);
92
93         if (f->mmap)
94                 mmap_cache_unref(f->mmap);
95
96 #ifdef HAVE_XZ
97         free(f->compress_buffer);
98 #endif
99
100 #ifdef HAVE_GCRYPT
101         if (f->fss_file)
102                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
103         else if (f->fsprg_state)
104                 free(f->fsprg_state);
105
106         free(f->fsprg_seed);
107
108         if (f->hmac)
109                 gcry_md_close(f->hmac);
110 #endif
111
112         free(f);
113 }
114
115 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
116         Header h;
117         ssize_t k;
118         int r;
119
120         assert(f);
121
122         zero(h);
123         memcpy(h.signature, HEADER_SIGNATURE, 8);
124         h.header_size = htole64(ALIGN64(sizeof(h)));
125
126         h.incompatible_flags =
127                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
128
129         h.compatible_flags =
130                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
131
132         r = sd_id128_randomize(&h.file_id);
133         if (r < 0)
134                 return r;
135
136         if (template) {
137                 h.seqnum_id = template->header->seqnum_id;
138                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
139         } else
140                 h.seqnum_id = h.file_id;
141
142         k = pwrite(f->fd, &h, sizeof(h), 0);
143         if (k < 0)
144                 return -errno;
145
146         if (k != sizeof(h))
147                 return -EIO;
148
149         return 0;
150 }
151
152 static int journal_file_refresh_header(JournalFile *f) {
153         int r;
154         sd_id128_t boot_id;
155
156         assert(f);
157
158         r = sd_id128_get_machine(&f->header->machine_id);
159         if (r < 0)
160                 return r;
161
162         r = sd_id128_get_boot(&boot_id);
163         if (r < 0)
164                 return r;
165
166         if (sd_id128_equal(boot_id, f->header->boot_id))
167                 f->tail_entry_monotonic_valid = true;
168
169         f->header->boot_id = boot_id;
170
171         f->header->state = STATE_ONLINE;
172
173         /* Sync the online state to disk */
174         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
175         fdatasync(f->fd);
176
177         return 0;
178 }
179
180 static int journal_file_verify_header(JournalFile *f) {
181         assert(f);
182
183         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
184                 return -EBADMSG;
185
186         /* In both read and write mode we refuse to open files with
187          * incompatible flags we don't know */
188 #ifdef HAVE_XZ
189         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
190                 return -EPROTONOSUPPORT;
191 #else
192         if (f->header->incompatible_flags != 0)
193                 return -EPROTONOSUPPORT;
194 #endif
195
196         /* When open for writing we refuse to open files with
197          * compatible flags, too */
198         if (f->writable) {
199 #ifdef HAVE_GCRYPT
200                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
201                         return -EPROTONOSUPPORT;
202 #else
203                 if (f->header->compatible_flags != 0)
204                         return -EPROTONOSUPPORT;
205 #endif
206         }
207
208         if (f->header->state >= _STATE_MAX)
209                 return -EBADMSG;
210
211         /* The first addition was n_data, so check that we are at least this large */
212         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
213                 return -EBADMSG;
214
215         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
216                 return -EBADMSG;
217
218         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
219                 return -ENODATA;
220
221         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
222                 return -ENODATA;
223
224         if (!VALID64(f->header->data_hash_table_offset) ||
225             !VALID64(f->header->field_hash_table_offset) ||
226             !VALID64(f->header->tail_object_offset) ||
227             !VALID64(f->header->entry_array_offset))
228                 return -ENODATA;
229
230         if (f->writable) {
231                 uint8_t state;
232                 sd_id128_t machine_id;
233                 int r;
234
235                 r = sd_id128_get_machine(&machine_id);
236                 if (r < 0)
237                         return r;
238
239                 if (!sd_id128_equal(machine_id, f->header->machine_id))
240                         return -EHOSTDOWN;
241
242                 state = f->header->state;
243
244                 if (state == STATE_ONLINE) {
245                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
246                         return -EBUSY;
247                 } else if (state == STATE_ARCHIVED)
248                         return -ESHUTDOWN;
249                 else if (state != STATE_OFFLINE) {
250                         log_debug("Journal file %s has unknown state %u.", f->path, state);
251                         return -EBUSY;
252                 }
253         }
254
255         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
256
257         if (f->writable)
258                 f->seal = JOURNAL_HEADER_SEALED(f->header);
259
260         return 0;
261 }
262
263 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
264         uint64_t old_size, new_size;
265         int r;
266
267         assert(f);
268
269         /* We assume that this file is not sparse, and we know that
270          * for sure, since we always call posix_fallocate()
271          * ourselves */
272
273         old_size =
274                 le64toh(f->header->header_size) +
275                 le64toh(f->header->arena_size);
276
277         new_size = PAGE_ALIGN(offset + size);
278         if (new_size < le64toh(f->header->header_size))
279                 new_size = le64toh(f->header->header_size);
280
281         if (new_size <= old_size)
282                 return 0;
283
284         if (f->metrics.max_size > 0 &&
285             new_size > f->metrics.max_size)
286                 return -E2BIG;
287
288         if (new_size > f->metrics.min_size &&
289             f->metrics.keep_free > 0) {
290                 struct statvfs svfs;
291
292                 if (fstatvfs(f->fd, &svfs) >= 0) {
293                         uint64_t available;
294
295                         available = svfs.f_bfree * svfs.f_bsize;
296
297                         if (available >= f->metrics.keep_free)
298                                 available -= f->metrics.keep_free;
299                         else
300                                 available = 0;
301
302                         if (new_size - old_size > available)
303                                 return -E2BIG;
304                 }
305         }
306
307         /* Note that the glibc fallocate() fallback is very
308            inefficient, hence we try to minimize the allocation area
309            as we can. */
310         r = posix_fallocate(f->fd, old_size, new_size - old_size);
311         if (r != 0)
312                 return -r;
313
314         if (fstat(f->fd, &f->last_stat) < 0)
315                 return -errno;
316
317         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
318
319         return 0;
320 }
321
322 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
323         assert(f);
324         assert(ret);
325
326         /* Avoid SIGBUS on invalid accesses */
327         if (offset + size > (uint64_t) f->last_stat.st_size) {
328                 /* Hmm, out of range? Let's refresh the fstat() data
329                  * first, before we trust that check. */
330
331                 if (fstat(f->fd, &f->last_stat) < 0 ||
332                     offset + size > (uint64_t) f->last_stat.st_size)
333                         return -EADDRNOTAVAIL;
334         }
335
336         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
337 }
338
339 static uint64_t minimum_header_size(Object *o) {
340
341         static uint64_t table[] = {
342                 [OBJECT_DATA] = sizeof(DataObject),
343                 [OBJECT_FIELD] = sizeof(FieldObject),
344                 [OBJECT_ENTRY] = sizeof(EntryObject),
345                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
346                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
347                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
348                 [OBJECT_TAG] = sizeof(TagObject),
349         };
350
351         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
352                 return sizeof(ObjectHeader);
353
354         return table[o->object.type];
355 }
356
357 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
358         int r;
359         void *t;
360         Object *o;
361         uint64_t s;
362         unsigned context;
363
364         assert(f);
365         assert(ret);
366
367         /* Objects may only be located at multiple of 64 bit */
368         if (!VALID64(offset))
369                 return -EFAULT;
370
371         /* One context for each type, plus one catch-all for the rest */
372         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
373
374         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
375         if (r < 0)
376                 return r;
377
378         o = (Object*) t;
379         s = le64toh(o->object.size);
380
381         if (s < sizeof(ObjectHeader))
382                 return -EBADMSG;
383
384         if (o->object.type <= OBJECT_UNUSED)
385                 return -EBADMSG;
386
387         if (s < minimum_header_size(o))
388                 return -EBADMSG;
389
390         if (type >= 0 && o->object.type != type)
391                 return -EBADMSG;
392
393         if (s > sizeof(ObjectHeader)) {
394                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
395                 if (r < 0)
396                         return r;
397
398                 o = (Object*) t;
399         }
400
401         *ret = o;
402         return 0;
403 }
404
405 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
406         uint64_t r;
407
408         assert(f);
409
410         r = le64toh(f->header->tail_entry_seqnum) + 1;
411
412         if (seqnum) {
413                 /* If an external seqnum counter was passed, we update
414                  * both the local and the external one, and set it to
415                  * the maximum of both */
416
417                 if (*seqnum + 1 > r)
418                         r = *seqnum + 1;
419
420                 *seqnum = r;
421         }
422
423         f->header->tail_entry_seqnum = htole64(r);
424
425         if (f->header->head_entry_seqnum == 0)
426                 f->header->head_entry_seqnum = htole64(r);
427
428         return r;
429 }
430
431 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
432         int r;
433         uint64_t p;
434         Object *tail, *o;
435         void *t;
436
437         assert(f);
438         assert(type > 0 && type < _OBJECT_TYPE_MAX);
439         assert(size >= sizeof(ObjectHeader));
440         assert(offset);
441         assert(ret);
442
443         p = le64toh(f->header->tail_object_offset);
444         if (p == 0)
445                 p = le64toh(f->header->header_size);
446         else {
447                 r = journal_file_move_to_object(f, -1, p, &tail);
448                 if (r < 0)
449                         return r;
450
451                 p += ALIGN64(le64toh(tail->object.size));
452         }
453
454         r = journal_file_allocate(f, p, size);
455         if (r < 0)
456                 return r;
457
458         r = journal_file_move_to(f, type, false, p, size, &t);
459         if (r < 0)
460                 return r;
461
462         o = (Object*) t;
463
464         zero(o->object);
465         o->object.type = type;
466         o->object.size = htole64(size);
467
468         f->header->tail_object_offset = htole64(p);
469         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
470
471         *ret = o;
472         *offset = p;
473
474         return 0;
475 }
476
477 static int journal_file_setup_data_hash_table(JournalFile *f) {
478         uint64_t s, p;
479         Object *o;
480         int r;
481
482         assert(f);
483
484         /* We estimate that we need 1 hash table entry per 768 of
485            journal file and we want to make sure we never get beyond
486            75% fill level. Calculate the hash table size for the
487            maximum file size based on these metrics. */
488
489         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
490         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
491                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
492
493         log_debug("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
494
495         r = journal_file_append_object(f,
496                                        OBJECT_DATA_HASH_TABLE,
497                                        offsetof(Object, hash_table.items) + s,
498                                        &o, &p);
499         if (r < 0)
500                 return r;
501
502         memset(o->hash_table.items, 0, s);
503
504         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
505         f->header->data_hash_table_size = htole64(s);
506
507         return 0;
508 }
509
510 static int journal_file_setup_field_hash_table(JournalFile *f) {
511         uint64_t s, p;
512         Object *o;
513         int r;
514
515         assert(f);
516
517         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
518         r = journal_file_append_object(f,
519                                        OBJECT_FIELD_HASH_TABLE,
520                                        offsetof(Object, hash_table.items) + s,
521                                        &o, &p);
522         if (r < 0)
523                 return r;
524
525         memset(o->hash_table.items, 0, s);
526
527         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
528         f->header->field_hash_table_size = htole64(s);
529
530         return 0;
531 }
532
533 static int journal_file_map_data_hash_table(JournalFile *f) {
534         uint64_t s, p;
535         void *t;
536         int r;
537
538         assert(f);
539
540         p = le64toh(f->header->data_hash_table_offset);
541         s = le64toh(f->header->data_hash_table_size);
542
543         r = journal_file_move_to(f,
544                                  OBJECT_DATA_HASH_TABLE,
545                                  true,
546                                  p, s,
547                                  &t);
548         if (r < 0)
549                 return r;
550
551         f->data_hash_table = t;
552         return 0;
553 }
554
555 static int journal_file_map_field_hash_table(JournalFile *f) {
556         uint64_t s, p;
557         void *t;
558         int r;
559
560         assert(f);
561
562         p = le64toh(f->header->field_hash_table_offset);
563         s = le64toh(f->header->field_hash_table_size);
564
565         r = journal_file_move_to(f,
566                                  OBJECT_FIELD_HASH_TABLE,
567                                  true,
568                                  p, s,
569                                  &t);
570         if (r < 0)
571                 return r;
572
573         f->field_hash_table = t;
574         return 0;
575 }
576
577 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
578         uint64_t p, h;
579         int r;
580
581         assert(f);
582         assert(o);
583         assert(offset > 0);
584         assert(o->object.type == OBJECT_DATA);
585
586         /* This might alter the window we are looking at */
587
588         o->data.next_hash_offset = o->data.next_field_offset = 0;
589         o->data.entry_offset = o->data.entry_array_offset = 0;
590         o->data.n_entries = 0;
591
592         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
593         p = le64toh(f->data_hash_table[h].tail_hash_offset);
594         if (p == 0) {
595                 /* Only entry in the hash table is easy */
596                 f->data_hash_table[h].head_hash_offset = htole64(offset);
597         } else {
598                 /* Move back to the previous data object, to patch in
599                  * pointer */
600
601                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
602                 if (r < 0)
603                         return r;
604
605                 o->data.next_hash_offset = htole64(offset);
606         }
607
608         f->data_hash_table[h].tail_hash_offset = htole64(offset);
609
610         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
611                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
612
613         return 0;
614 }
615
616 int journal_file_find_data_object_with_hash(
617                 JournalFile *f,
618                 const void *data, uint64_t size, uint64_t hash,
619                 Object **ret, uint64_t *offset) {
620
621         uint64_t p, osize, h;
622         int r;
623
624         assert(f);
625         assert(data || size == 0);
626
627         osize = offsetof(Object, data.payload) + size;
628
629         if (f->header->data_hash_table_size == 0)
630                 return -EBADMSG;
631
632         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
633         p = le64toh(f->data_hash_table[h].head_hash_offset);
634
635         while (p > 0) {
636                 Object *o;
637
638                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
639                 if (r < 0)
640                         return r;
641
642                 if (le64toh(o->data.hash) != hash)
643                         goto next;
644
645                 if (o->object.flags & OBJECT_COMPRESSED) {
646 #ifdef HAVE_XZ
647                         uint64_t l, rsize;
648
649                         l = le64toh(o->object.size);
650                         if (l <= offsetof(Object, data.payload))
651                                 return -EBADMSG;
652
653                         l -= offsetof(Object, data.payload);
654
655                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
656                                 return -EBADMSG;
657
658                         if (rsize == size &&
659                             memcmp(f->compress_buffer, data, size) == 0) {
660
661                                 if (ret)
662                                         *ret = o;
663
664                                 if (offset)
665                                         *offset = p;
666
667                                 return 1;
668                         }
669 #else
670                         return -EPROTONOSUPPORT;
671 #endif
672
673                 } else if (le64toh(o->object.size) == osize &&
674                            memcmp(o->data.payload, data, size) == 0) {
675
676                         if (ret)
677                                 *ret = o;
678
679                         if (offset)
680                                 *offset = p;
681
682                         return 1;
683                 }
684
685         next:
686                 p = le64toh(o->data.next_hash_offset);
687         }
688
689         return 0;
690 }
691
692 int journal_file_find_data_object(
693                 JournalFile *f,
694                 const void *data, uint64_t size,
695                 Object **ret, uint64_t *offset) {
696
697         uint64_t hash;
698
699         assert(f);
700         assert(data || size == 0);
701
702         hash = hash64(data, size);
703
704         return journal_file_find_data_object_with_hash(f,
705                                                        data, size, hash,
706                                                        ret, offset);
707 }
708
709 static int journal_file_append_data(
710                 JournalFile *f,
711                 const void *data, uint64_t size,
712                 Object **ret, uint64_t *offset) {
713
714         uint64_t hash, p;
715         uint64_t osize;
716         Object *o;
717         int r;
718         bool compressed = false;
719
720         assert(f);
721         assert(data || size == 0);
722
723         hash = hash64(data, size);
724
725         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
726         if (r < 0)
727                 return r;
728         else if (r > 0) {
729
730                 if (ret)
731                         *ret = o;
732
733                 if (offset)
734                         *offset = p;
735
736                 return 0;
737         }
738
739         osize = offsetof(Object, data.payload) + size;
740         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
741         if (r < 0)
742                 return r;
743
744         o->data.hash = htole64(hash);
745
746 #ifdef HAVE_XZ
747         if (f->compress &&
748             size >= COMPRESSION_SIZE_THRESHOLD) {
749                 uint64_t rsize;
750
751                 compressed = compress_blob(data, size, o->data.payload, &rsize);
752
753                 if (compressed) {
754                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
755                         o->object.flags |= OBJECT_COMPRESSED;
756
757                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
758                 }
759         }
760 #endif
761
762         if (!compressed && size > 0)
763                 memcpy(o->data.payload, data, size);
764
765         r = journal_file_link_data(f, o, p, hash);
766         if (r < 0)
767                 return r;
768
769 #ifdef HAVE_GCRYPT
770         r = journal_file_hmac_put_object(f, OBJECT_DATA, p);
771         if (r < 0)
772                 return r;
773 #endif
774
775         /* The linking might have altered the window, so let's
776          * refresh our pointer */
777         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
778         if (r < 0)
779                 return r;
780
781         if (ret)
782                 *ret = o;
783
784         if (offset)
785                 *offset = p;
786
787         return 0;
788 }
789
790 uint64_t journal_file_entry_n_items(Object *o) {
791         assert(o);
792         assert(o->object.type == OBJECT_ENTRY);
793
794         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
795 }
796
797 uint64_t journal_file_entry_array_n_items(Object *o) {
798         assert(o);
799         assert(o->object.type == OBJECT_ENTRY_ARRAY);
800
801         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
802 }
803
804 uint64_t journal_file_hash_table_n_items(Object *o) {
805         assert(o);
806         assert(o->object.type == OBJECT_DATA_HASH_TABLE ||
807                o->object.type == OBJECT_FIELD_HASH_TABLE);
808
809         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
810 }
811
812 static int link_entry_into_array(JournalFile *f,
813                                  le64_t *first,
814                                  le64_t *idx,
815                                  uint64_t p) {
816         int r;
817         uint64_t n = 0, ap = 0, q, i, a, hidx;
818         Object *o;
819
820         assert(f);
821         assert(first);
822         assert(idx);
823         assert(p > 0);
824
825         a = le64toh(*first);
826         i = hidx = le64toh(*idx);
827         while (a > 0) {
828
829                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
830                 if (r < 0)
831                         return r;
832
833                 n = journal_file_entry_array_n_items(o);
834                 if (i < n) {
835                         o->entry_array.items[i] = htole64(p);
836                         *idx = htole64(hidx + 1);
837                         return 0;
838                 }
839
840                 i -= n;
841                 ap = a;
842                 a = le64toh(o->entry_array.next_entry_array_offset);
843         }
844
845         if (hidx > n)
846                 n = (hidx+1) * 2;
847         else
848                 n = n * 2;
849
850         if (n < 4)
851                 n = 4;
852
853         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
854                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
855                                        &o, &q);
856         if (r < 0)
857                 return r;
858
859 #ifdef HAVE_GCRYPT
860         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, q);
861         if (r < 0)
862                 return r;
863 #endif
864
865         o->entry_array.items[i] = htole64(p);
866
867         if (ap == 0)
868                 *first = htole64(q);
869         else {
870                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
871                 if (r < 0)
872                         return r;
873
874                 o->entry_array.next_entry_array_offset = htole64(q);
875         }
876
877         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
878                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
879
880         *idx = htole64(hidx + 1);
881
882         return 0;
883 }
884
885 static int link_entry_into_array_plus_one(JournalFile *f,
886                                           le64_t *extra,
887                                           le64_t *first,
888                                           le64_t *idx,
889                                           uint64_t p) {
890
891         int r;
892
893         assert(f);
894         assert(extra);
895         assert(first);
896         assert(idx);
897         assert(p > 0);
898
899         if (*idx == 0)
900                 *extra = htole64(p);
901         else {
902                 le64_t i;
903
904                 i = htole64(le64toh(*idx) - 1);
905                 r = link_entry_into_array(f, first, &i, p);
906                 if (r < 0)
907                         return r;
908         }
909
910         *idx = htole64(le64toh(*idx) + 1);
911         return 0;
912 }
913
914 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
915         uint64_t p;
916         int r;
917         assert(f);
918         assert(o);
919         assert(offset > 0);
920
921         p = le64toh(o->entry.items[i].object_offset);
922         if (p == 0)
923                 return -EINVAL;
924
925         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
926         if (r < 0)
927                 return r;
928
929         return link_entry_into_array_plus_one(f,
930                                               &o->data.entry_offset,
931                                               &o->data.entry_array_offset,
932                                               &o->data.n_entries,
933                                               offset);
934 }
935
936 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
937         uint64_t n, i;
938         int r;
939
940         assert(f);
941         assert(o);
942         assert(offset > 0);
943         assert(o->object.type == OBJECT_ENTRY);
944
945         __sync_synchronize();
946
947         /* Link up the entry itself */
948         r = link_entry_into_array(f,
949                                   &f->header->entry_array_offset,
950                                   &f->header->n_entries,
951                                   offset);
952         if (r < 0)
953                 return r;
954
955         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
956
957         if (f->header->head_entry_realtime == 0)
958                 f->header->head_entry_realtime = o->entry.realtime;
959
960         f->header->tail_entry_realtime = o->entry.realtime;
961         f->header->tail_entry_monotonic = o->entry.monotonic;
962
963         f->tail_entry_monotonic_valid = true;
964
965         /* Link up the items */
966         n = journal_file_entry_n_items(o);
967         for (i = 0; i < n; i++) {
968                 r = journal_file_link_entry_item(f, o, offset, i);
969                 if (r < 0)
970                         return r;
971         }
972
973         return 0;
974 }
975
976 static int journal_file_append_entry_internal(
977                 JournalFile *f,
978                 const dual_timestamp *ts,
979                 uint64_t xor_hash,
980                 const EntryItem items[], unsigned n_items,
981                 uint64_t *seqnum,
982                 Object **ret, uint64_t *offset) {
983         uint64_t np;
984         uint64_t osize;
985         Object *o;
986         int r;
987
988         assert(f);
989         assert(items || n_items == 0);
990         assert(ts);
991
992         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
993
994         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
995         if (r < 0)
996                 return r;
997
998         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
999         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1000         o->entry.realtime = htole64(ts->realtime);
1001         o->entry.monotonic = htole64(ts->monotonic);
1002         o->entry.xor_hash = htole64(xor_hash);
1003         o->entry.boot_id = f->header->boot_id;
1004
1005 #ifdef HAVE_GCRYPT
1006         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, np);
1007         if (r < 0)
1008                 return r;
1009 #endif
1010
1011         r = journal_file_link_entry(f, o, np);
1012         if (r < 0)
1013                 return r;
1014
1015         if (ret)
1016                 *ret = o;
1017
1018         if (offset)
1019                 *offset = np;
1020
1021         return 0;
1022 }
1023
1024 void journal_file_post_change(JournalFile *f) {
1025         assert(f);
1026
1027         /* inotify() does not receive IN_MODIFY events from file
1028          * accesses done via mmap(). After each access we hence
1029          * trigger IN_MODIFY by truncating the journal file to its
1030          * current size which triggers IN_MODIFY. */
1031
1032         __sync_synchronize();
1033
1034         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1035                 log_error("Failed to to truncate file to its own size: %m");
1036 }
1037
1038 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1039         unsigned i;
1040         EntryItem *items;
1041         int r;
1042         uint64_t xor_hash = 0;
1043         struct dual_timestamp _ts;
1044
1045         assert(f);
1046         assert(iovec || n_iovec == 0);
1047
1048         if (!f->writable)
1049                 return -EPERM;
1050
1051         if (!ts) {
1052                 dual_timestamp_get(&_ts);
1053                 ts = &_ts;
1054         }
1055
1056         if (f->tail_entry_monotonic_valid &&
1057             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1058                 return -EINVAL;
1059
1060 #ifdef HAVE_GCRYPT
1061         r = journal_file_maybe_append_tag(f, ts->realtime);
1062         if (r < 0)
1063                 return r;
1064 #endif
1065
1066         /* alloca() can't take 0, hence let's allocate at least one */
1067         items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1068
1069         for (i = 0; i < n_iovec; i++) {
1070                 uint64_t p;
1071                 Object *o;
1072
1073                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1074                 if (r < 0)
1075                         return r;
1076
1077                 xor_hash ^= le64toh(o->data.hash);
1078                 items[i].object_offset = htole64(p);
1079                 items[i].hash = o->data.hash;
1080         }
1081
1082         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1083
1084         journal_file_post_change(f);
1085
1086         return r;
1087 }
1088
1089 static int generic_array_get(JournalFile *f,
1090                              uint64_t first,
1091                              uint64_t i,
1092                              Object **ret, uint64_t *offset) {
1093
1094         Object *o;
1095         uint64_t p = 0, a;
1096         int r;
1097
1098         assert(f);
1099
1100         a = first;
1101         while (a > 0) {
1102                 uint64_t n;
1103
1104                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1105                 if (r < 0)
1106                         return r;
1107
1108                 n = journal_file_entry_array_n_items(o);
1109                 if (i < n) {
1110                         p = le64toh(o->entry_array.items[i]);
1111                         break;
1112                 }
1113
1114                 i -= n;
1115                 a = le64toh(o->entry_array.next_entry_array_offset);
1116         }
1117
1118         if (a <= 0 || p <= 0)
1119                 return 0;
1120
1121         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1122         if (r < 0)
1123                 return r;
1124
1125         if (ret)
1126                 *ret = o;
1127
1128         if (offset)
1129                 *offset = p;
1130
1131         return 1;
1132 }
1133
1134 static int generic_array_get_plus_one(JournalFile *f,
1135                                       uint64_t extra,
1136                                       uint64_t first,
1137                                       uint64_t i,
1138                                       Object **ret, uint64_t *offset) {
1139
1140         Object *o;
1141
1142         assert(f);
1143
1144         if (i == 0) {
1145                 int r;
1146
1147                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1148                 if (r < 0)
1149                         return r;
1150
1151                 if (ret)
1152                         *ret = o;
1153
1154                 if (offset)
1155                         *offset = extra;
1156
1157                 return 1;
1158         }
1159
1160         return generic_array_get(f, first, i-1, ret, offset);
1161 }
1162
/* Results of the test_object callbacks used by the bisection helpers */
enum {
        TEST_FOUND = 0,  /* the tested entry matches the needle */
        TEST_LEFT  = 1,  /* the tested entry compares lower than the needle */
        TEST_RIGHT = 2   /* the tested entry compares higher than the needle */
};
1168
1169 static int generic_array_bisect(JournalFile *f,
1170                                 uint64_t first,
1171                                 uint64_t n,
1172                                 uint64_t needle,
1173                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1174                                 direction_t direction,
1175                                 Object **ret,
1176                                 uint64_t *offset,
1177                                 uint64_t *idx) {
1178
1179         uint64_t a, p, t = 0, i = 0, last_p = 0;
1180         bool subtract_one = false;
1181         Object *o, *array = NULL;
1182         int r;
1183
1184         assert(f);
1185         assert(test_object);
1186
1187         a = first;
1188         while (a > 0) {
1189                 uint64_t left, right, k, lp;
1190
1191                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1192                 if (r < 0)
1193                         return r;
1194
1195                 k = journal_file_entry_array_n_items(array);
1196                 right = MIN(k, n);
1197                 if (right <= 0)
1198                         return 0;
1199
1200                 i = right - 1;
1201                 lp = p = le64toh(array->entry_array.items[i]);
1202                 if (p <= 0)
1203                         return -EBADMSG;
1204
1205                 r = test_object(f, p, needle);
1206                 if (r < 0)
1207                         return r;
1208
1209                 if (r == TEST_FOUND)
1210                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1211
1212                 if (r == TEST_RIGHT) {
1213                         left = 0;
1214                         right -= 1;
1215                         for (;;) {
1216                                 if (left == right) {
1217                                         if (direction == DIRECTION_UP)
1218                                                 subtract_one = true;
1219
1220                                         i = left;
1221                                         goto found;
1222                                 }
1223
1224                                 assert(left < right);
1225
1226                                 i = (left + right) / 2;
1227                                 p = le64toh(array->entry_array.items[i]);
1228                                 if (p <= 0)
1229                                         return -EBADMSG;
1230
1231                                 r = test_object(f, p, needle);
1232                                 if (r < 0)
1233                                         return r;
1234
1235                                 if (r == TEST_FOUND)
1236                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1237
1238                                 if (r == TEST_RIGHT)
1239                                         right = i;
1240                                 else
1241                                         left = i + 1;
1242                         }
1243                 }
1244
1245                 if (k > n) {
1246                         if (direction == DIRECTION_UP) {
1247                                 i = n;
1248                                 subtract_one = true;
1249                                 goto found;
1250                         }
1251
1252                         return 0;
1253                 }
1254
1255                 last_p = lp;
1256
1257                 n -= k;
1258                 t += k;
1259                 a = le64toh(array->entry_array.next_entry_array_offset);
1260         }
1261
1262         return 0;
1263
1264 found:
1265         if (subtract_one && t == 0 && i == 0)
1266                 return 0;
1267
1268         if (subtract_one && i == 0)
1269                 p = last_p;
1270         else if (subtract_one)
1271                 p = le64toh(array->entry_array.items[i-1]);
1272         else
1273                 p = le64toh(array->entry_array.items[i]);
1274
1275         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1276         if (r < 0)
1277                 return r;
1278
1279         if (ret)
1280                 *ret = o;
1281
1282         if (offset)
1283                 *offset = p;
1284
1285         if (idx)
1286                 *idx = t + i + (subtract_one ? -1 : 0);
1287
1288         return 1;
1289 }
1290
1291 static int generic_array_bisect_plus_one(JournalFile *f,
1292                                          uint64_t extra,
1293                                          uint64_t first,
1294                                          uint64_t n,
1295                                          uint64_t needle,
1296                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1297                                          direction_t direction,
1298                                          Object **ret,
1299                                          uint64_t *offset,
1300                                          uint64_t *idx) {
1301
1302         int r;
1303         bool step_back = false;
1304         Object *o;
1305
1306         assert(f);
1307         assert(test_object);
1308
1309         if (n <= 0)
1310                 return 0;
1311
1312         /* This bisects the array in object 'first', but first checks
1313          * an extra  */
1314         r = test_object(f, extra, needle);
1315         if (r < 0)
1316                 return r;
1317
1318         if (r == TEST_FOUND)
1319                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1320
1321         /* if we are looking with DIRECTION_UP then we need to first
1322            see if in the actual array there is a matching entry, and
1323            return the last one of that. But if there isn't any we need
1324            to return this one. Hence remember this, and return it
1325            below. */
1326         if (r == TEST_LEFT)
1327                 step_back = direction == DIRECTION_UP;
1328
1329         if (r == TEST_RIGHT) {
1330                 if (direction == DIRECTION_DOWN)
1331                         goto found;
1332                 else
1333                         return 0;
1334         }
1335
1336         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1337
1338         if (r == 0 && step_back)
1339                 goto found;
1340
1341         if (r > 0 && idx)
1342                 (*idx) ++;
1343
1344         return r;
1345
1346 found:
1347         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1348         if (r < 0)
1349                 return r;
1350
1351         if (ret)
1352                 *ret = o;
1353
1354         if (offset)
1355                 *offset = extra;
1356
1357         if (idx)
1358                 *idx = 0;
1359
1360         return 1;
1361 }
1362
1363 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1364         assert(f);
1365         assert(p > 0);
1366
1367         if (p == needle)
1368                 return TEST_FOUND;
1369         else if (p < needle)
1370                 return TEST_LEFT;
1371         else
1372                 return TEST_RIGHT;
1373 }
1374
1375 int journal_file_move_to_entry_by_offset(
1376                 JournalFile *f,
1377                 uint64_t p,
1378                 direction_t direction,
1379                 Object **ret,
1380                 uint64_t *offset) {
1381
1382         return generic_array_bisect(f,
1383                                     le64toh(f->header->entry_array_offset),
1384                                     le64toh(f->header->n_entries),
1385                                     p,
1386                                     test_object_offset,
1387                                     direction,
1388                                     ret, offset, NULL);
1389 }
1390
1391
1392 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1393         Object *o;
1394         int r;
1395
1396         assert(f);
1397         assert(p > 0);
1398
1399         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1400         if (r < 0)
1401                 return r;
1402
1403         if (le64toh(o->entry.seqnum) == needle)
1404                 return TEST_FOUND;
1405         else if (le64toh(o->entry.seqnum) < needle)
1406                 return TEST_LEFT;
1407         else
1408                 return TEST_RIGHT;
1409 }
1410
1411 int journal_file_move_to_entry_by_seqnum(
1412                 JournalFile *f,
1413                 uint64_t seqnum,
1414                 direction_t direction,
1415                 Object **ret,
1416                 uint64_t *offset) {
1417
1418         return generic_array_bisect(f,
1419                                     le64toh(f->header->entry_array_offset),
1420                                     le64toh(f->header->n_entries),
1421                                     seqnum,
1422                                     test_object_seqnum,
1423                                     direction,
1424                                     ret, offset, NULL);
1425 }
1426
1427 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1428         Object *o;
1429         int r;
1430
1431         assert(f);
1432         assert(p > 0);
1433
1434         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1435         if (r < 0)
1436                 return r;
1437
1438         if (le64toh(o->entry.realtime) == needle)
1439                 return TEST_FOUND;
1440         else if (le64toh(o->entry.realtime) < needle)
1441                 return TEST_LEFT;
1442         else
1443                 return TEST_RIGHT;
1444 }
1445
1446 int journal_file_move_to_entry_by_realtime(
1447                 JournalFile *f,
1448                 uint64_t realtime,
1449                 direction_t direction,
1450                 Object **ret,
1451                 uint64_t *offset) {
1452
1453         return generic_array_bisect(f,
1454                                     le64toh(f->header->entry_array_offset),
1455                                     le64toh(f->header->n_entries),
1456                                     realtime,
1457                                     test_object_realtime,
1458                                     direction,
1459                                     ret, offset, NULL);
1460 }
1461
1462 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1463         Object *o;
1464         int r;
1465
1466         assert(f);
1467         assert(p > 0);
1468
1469         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1470         if (r < 0)
1471                 return r;
1472
1473         if (le64toh(o->entry.monotonic) == needle)
1474                 return TEST_FOUND;
1475         else if (le64toh(o->entry.monotonic) < needle)
1476                 return TEST_LEFT;
1477         else
1478                 return TEST_RIGHT;
1479 }
1480
1481 int journal_file_move_to_entry_by_monotonic(
1482                 JournalFile *f,
1483                 sd_id128_t boot_id,
1484                 uint64_t monotonic,
1485                 direction_t direction,
1486                 Object **ret,
1487                 uint64_t *offset) {
1488
1489         char t[9+32+1] = "_BOOT_ID=";
1490         Object *o;
1491         int r;
1492
1493         assert(f);
1494
1495         sd_id128_to_string(boot_id, t + 9);
1496         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1497         if (r < 0)
1498                 return r;
1499         if (r == 0)
1500                 return -ENOENT;
1501
1502         return generic_array_bisect_plus_one(f,
1503                                              le64toh(o->data.entry_offset),
1504                                              le64toh(o->data.entry_array_offset),
1505                                              le64toh(o->data.n_entries),
1506                                              monotonic,
1507                                              test_object_monotonic,
1508                                              direction,
1509                                              ret, offset, NULL);
1510 }
1511
1512 int journal_file_next_entry(
1513                 JournalFile *f,
1514                 Object *o, uint64_t p,
1515                 direction_t direction,
1516                 Object **ret, uint64_t *offset) {
1517
1518         uint64_t i, n;
1519         int r;
1520
1521         assert(f);
1522         assert(p > 0 || !o);
1523
1524         n = le64toh(f->header->n_entries);
1525         if (n <= 0)
1526                 return 0;
1527
1528         if (!o)
1529                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1530         else {
1531                 if (o->object.type != OBJECT_ENTRY)
1532                         return -EINVAL;
1533
1534                 r = generic_array_bisect(f,
1535                                          le64toh(f->header->entry_array_offset),
1536                                          le64toh(f->header->n_entries),
1537                                          p,
1538                                          test_object_offset,
1539                                          DIRECTION_DOWN,
1540                                          NULL, NULL,
1541                                          &i);
1542                 if (r <= 0)
1543                         return r;
1544
1545                 if (direction == DIRECTION_DOWN) {
1546                         if (i >= n - 1)
1547                                 return 0;
1548
1549                         i++;
1550                 } else {
1551                         if (i <= 0)
1552                                 return 0;
1553
1554                         i--;
1555                 }
1556         }
1557
1558         /* And jump to it */
1559         return generic_array_get(f,
1560                                  le64toh(f->header->entry_array_offset),
1561                                  i,
1562                                  ret, offset);
1563 }
1564
1565 int journal_file_skip_entry(
1566                 JournalFile *f,
1567                 Object *o, uint64_t p,
1568                 int64_t skip,
1569                 Object **ret, uint64_t *offset) {
1570
1571         uint64_t i, n;
1572         int r;
1573
1574         assert(f);
1575         assert(o);
1576         assert(p > 0);
1577
1578         if (o->object.type != OBJECT_ENTRY)
1579                 return -EINVAL;
1580
1581         r = generic_array_bisect(f,
1582                                  le64toh(f->header->entry_array_offset),
1583                                  le64toh(f->header->n_entries),
1584                                  p,
1585                                  test_object_offset,
1586                                  DIRECTION_DOWN,
1587                                  NULL, NULL,
1588                                  &i);
1589         if (r <= 0)
1590                 return r;
1591
1592         /* Calculate new index */
1593         if (skip < 0) {
1594                 if ((uint64_t) -skip >= i)
1595                         i = 0;
1596                 else
1597                         i = i - (uint64_t) -skip;
1598         } else
1599                 i  += (uint64_t) skip;
1600
1601         n = le64toh(f->header->n_entries);
1602         if (n <= 0)
1603                 return -EBADMSG;
1604
1605         if (i >= n)
1606                 i = n-1;
1607
1608         return generic_array_get(f,
1609                                  le64toh(f->header->entry_array_offset),
1610                                  i,
1611                                  ret, offset);
1612 }
1613
1614 int journal_file_next_entry_for_data(
1615                 JournalFile *f,
1616                 Object *o, uint64_t p,
1617                 uint64_t data_offset,
1618                 direction_t direction,
1619                 Object **ret, uint64_t *offset) {
1620
1621         uint64_t n, i;
1622         int r;
1623         Object *d;
1624
1625         assert(f);
1626         assert(p > 0 || !o);
1627
1628         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1629         if (r < 0)
1630                 return r;
1631
1632         n = le64toh(d->data.n_entries);
1633         if (n <= 0)
1634                 return n;
1635
1636         if (!o)
1637                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1638         else {
1639                 if (o->object.type != OBJECT_ENTRY)
1640                         return -EINVAL;
1641
1642                 r = generic_array_bisect_plus_one(f,
1643                                                   le64toh(d->data.entry_offset),
1644                                                   le64toh(d->data.entry_array_offset),
1645                                                   le64toh(d->data.n_entries),
1646                                                   p,
1647                                                   test_object_offset,
1648                                                   DIRECTION_DOWN,
1649                                                   NULL, NULL,
1650                                                   &i);
1651
1652                 if (r <= 0)
1653                         return r;
1654
1655                 if (direction == DIRECTION_DOWN) {
1656                         if (i >= n - 1)
1657                                 return 0;
1658
1659                         i++;
1660                 } else {
1661                         if (i <= 0)
1662                                 return 0;
1663
1664                         i--;
1665                 }
1666
1667         }
1668
1669         return generic_array_get_plus_one(f,
1670                                           le64toh(d->data.entry_offset),
1671                                           le64toh(d->data.entry_array_offset),
1672                                           i,
1673                                           ret, offset);
1674 }
1675
1676 int journal_file_move_to_entry_by_offset_for_data(
1677                 JournalFile *f,
1678                 uint64_t data_offset,
1679                 uint64_t p,
1680                 direction_t direction,
1681                 Object **ret, uint64_t *offset) {
1682
1683         int r;
1684         Object *d;
1685
1686         assert(f);
1687
1688         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1689         if (r < 0)
1690                 return r;
1691
1692         return generic_array_bisect_plus_one(f,
1693                                              le64toh(d->data.entry_offset),
1694                                              le64toh(d->data.entry_array_offset),
1695                                              le64toh(d->data.n_entries),
1696                                              p,
1697                                              test_object_offset,
1698                                              direction,
1699                                              ret, offset, NULL);
1700 }
1701
1702 int journal_file_move_to_entry_by_monotonic_for_data(
1703                 JournalFile *f,
1704                 uint64_t data_offset,
1705                 sd_id128_t boot_id,
1706                 uint64_t monotonic,
1707                 direction_t direction,
1708                 Object **ret, uint64_t *offset) {
1709
1710         char t[9+32+1] = "_BOOT_ID=";
1711         Object *o, *d;
1712         int r;
1713         uint64_t b, z;
1714
1715         assert(f);
1716
1717         /* First, seek by time */
1718         sd_id128_to_string(boot_id, t + 9);
1719         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1720         if (r < 0)
1721                 return r;
1722         if (r == 0)
1723                 return -ENOENT;
1724
1725         r = generic_array_bisect_plus_one(f,
1726                                           le64toh(o->data.entry_offset),
1727                                           le64toh(o->data.entry_array_offset),
1728                                           le64toh(o->data.n_entries),
1729                                           monotonic,
1730                                           test_object_monotonic,
1731                                           direction,
1732                                           NULL, &z, NULL);
1733         if (r <= 0)
1734                 return r;
1735
1736         /* And now, continue seeking until we find an entry that
1737          * exists in both bisection arrays */
1738
1739         for (;;) {
1740                 Object *qo;
1741                 uint64_t p, q;
1742
1743                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1744                 if (r < 0)
1745                         return r;
1746
1747                 r = generic_array_bisect_plus_one(f,
1748                                                   le64toh(d->data.entry_offset),
1749                                                   le64toh(d->data.entry_array_offset),
1750                                                   le64toh(d->data.n_entries),
1751                                                   z,
1752                                                   test_object_offset,
1753                                                   direction,
1754                                                   NULL, &p, NULL);
1755                 if (r <= 0)
1756                         return r;
1757
1758                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1759                 if (r < 0)
1760                         return r;
1761
1762                 r = generic_array_bisect_plus_one(f,
1763                                                   le64toh(o->data.entry_offset),
1764                                                   le64toh(o->data.entry_array_offset),
1765                                                   le64toh(o->data.n_entries),
1766                                                   p,
1767                                                   test_object_offset,
1768                                                   direction,
1769                                                   &qo, &q, NULL);
1770
1771                 if (r <= 0)
1772                         return r;
1773
1774                 if (p == q) {
1775                         if (ret)
1776                                 *ret = qo;
1777                         if (offset)
1778                                 *offset = q;
1779
1780                         return 1;
1781                 }
1782
1783                 z = q;
1784         }
1785
1786         return 0;
1787 }
1788
1789 int journal_file_move_to_entry_by_seqnum_for_data(
1790                 JournalFile *f,
1791                 uint64_t data_offset,
1792                 uint64_t seqnum,
1793                 direction_t direction,
1794                 Object **ret, uint64_t *offset) {
1795
1796         Object *d;
1797         int r;
1798
1799         assert(f);
1800
1801         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1802         if (r < 0)
1803                 return r;
1804
1805         return generic_array_bisect_plus_one(f,
1806                                              le64toh(d->data.entry_offset),
1807                                              le64toh(d->data.entry_array_offset),
1808                                              le64toh(d->data.n_entries),
1809                                              seqnum,
1810                                              test_object_seqnum,
1811                                              direction,
1812                                              ret, offset, NULL);
1813 }
1814
1815 int journal_file_move_to_entry_by_realtime_for_data(
1816                 JournalFile *f,
1817                 uint64_t data_offset,
1818                 uint64_t realtime,
1819                 direction_t direction,
1820                 Object **ret, uint64_t *offset) {
1821
1822         Object *d;
1823         int r;
1824
1825         assert(f);
1826
1827         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1828         if (r < 0)
1829                 return r;
1830
1831         return generic_array_bisect_plus_one(f,
1832                                              le64toh(d->data.entry_offset),
1833                                              le64toh(d->data.entry_array_offset),
1834                                              le64toh(d->data.n_entries),
1835                                              realtime,
1836                                              test_object_realtime,
1837                                              direction,
1838                                              ret, offset, NULL);
1839 }
1840
1841 void journal_file_dump(JournalFile *f) {
1842         Object *o;
1843         int r;
1844         uint64_t p;
1845
1846         assert(f);
1847
1848         journal_file_print_header(f);
1849
1850         p = le64toh(f->header->header_size);
1851         while (p != 0) {
1852                 r = journal_file_move_to_object(f, -1, p, &o);
1853                 if (r < 0)
1854                         goto fail;
1855
1856                 switch (o->object.type) {
1857
1858                 case OBJECT_UNUSED:
1859                         printf("Type: OBJECT_UNUSED\n");
1860                         break;
1861
1862                 case OBJECT_DATA:
1863                         printf("Type: OBJECT_DATA\n");
1864                         break;
1865
1866                 case OBJECT_ENTRY:
1867                         printf("Type: OBJECT_ENTRY seqnum=%llu monotonic=%llu realtime=%llu\n",
1868                                (unsigned long long) le64toh(o->entry.seqnum),
1869                                (unsigned long long) le64toh(o->entry.monotonic),
1870                                (unsigned long long) le64toh(o->entry.realtime));
1871                         break;
1872
1873                 case OBJECT_FIELD_HASH_TABLE:
1874                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1875                         break;
1876
1877                 case OBJECT_DATA_HASH_TABLE:
1878                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1879                         break;
1880
1881                 case OBJECT_ENTRY_ARRAY:
1882                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1883                         break;
1884
1885                 case OBJECT_TAG:
1886                         printf("Type: OBJECT_TAG seqnum=%llu epoch=%llu\n",
1887                                (unsigned long long) le64toh(o->tag.seqnum),
1888                                (unsigned long long) le64toh(o->tag.epoch));
1889                         break;
1890                 }
1891
1892                 if (o->object.flags & OBJECT_COMPRESSED)
1893                         printf("Flags: COMPRESSED\n");
1894
1895                 if (p == le64toh(f->header->tail_object_offset))
1896                         p = 0;
1897                 else
1898                         p = p + ALIGN64(le64toh(o->object.size));
1899         }
1900
1901         return;
1902 fail:
1903         log_error("File corrupt");
1904 }
1905
1906 void journal_file_print_header(JournalFile *f) {
1907         char a[33], b[33], c[33];
1908         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
1909         struct stat st;
1910         char bytes[FORMAT_BYTES_MAX];
1911
1912         assert(f);
1913
1914         printf("File Path: %s\n"
1915                "File ID: %s\n"
1916                "Machine ID: %s\n"
1917                "Boot ID: %s\n"
1918                "Sequential Number ID: %s\n"
1919                "State: %s\n"
1920                "Compatible Flags:%s%s\n"
1921                "Incompatible Flags:%s%s\n"
1922                "Header size: %llu\n"
1923                "Arena size: %llu\n"
1924                "Data Hash Table Size: %llu\n"
1925                "Field Hash Table Size: %llu\n"
1926                "Rotate Suggested: %s\n"
1927                "Head Sequential Number: %llu\n"
1928                "Tail Sequential Number: %llu\n"
1929                "Head Realtime Timestamp: %s\n"
1930                "Tail Realtime Timestamp: %s\n"
1931                "Objects: %llu\n"
1932                "Entry Objects: %llu\n",
1933                f->path,
1934                sd_id128_to_string(f->header->file_id, a),
1935                sd_id128_to_string(f->header->machine_id, b),
1936                sd_id128_to_string(f->header->boot_id, c),
1937                sd_id128_to_string(f->header->seqnum_id, c),
1938                f->header->state == STATE_OFFLINE ? "OFFLINE" :
1939                f->header->state == STATE_ONLINE ? "ONLINE" :
1940                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
1941                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
1942                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
1943                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
1944                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
1945                (unsigned long long) le64toh(f->header->header_size),
1946                (unsigned long long) le64toh(f->header->arena_size),
1947                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1948                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
1949                yes_no(journal_file_rotate_suggested(f)),
1950                (unsigned long long) le64toh(f->header->head_entry_seqnum),
1951                (unsigned long long) le64toh(f->header->tail_entry_seqnum),
1952                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
1953                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
1954                (unsigned long long) le64toh(f->header->n_objects),
1955                (unsigned long long) le64toh(f->header->n_entries));
1956
1957         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1958                 printf("Data Objects: %llu\n"
1959                        "Data Hash Table Fill: %.1f%%\n",
1960                        (unsigned long long) le64toh(f->header->n_data),
1961                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
1962
1963         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1964                 printf("Field Objects: %llu\n"
1965                        "Field Hash Table Fill: %.1f%%\n",
1966                        (unsigned long long) le64toh(f->header->n_fields),
1967                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
1968
1969         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
1970                 printf("Tag Objects: %llu\n",
1971                        (unsigned long long) le64toh(f->header->n_tags));
1972         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1973                 printf("Entry Array Objects: %llu\n",
1974                        (unsigned long long) le64toh(f->header->n_entry_arrays));
1975
1976         if (fstat(f->fd, &st) >= 0)
1977                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
1978 }
1979
1980 int journal_file_open(
1981                 const char *fname,
1982                 int flags,
1983                 mode_t mode,
1984                 bool compress,
1985                 bool seal,
1986                 JournalMetrics *metrics,
1987                 MMapCache *mmap_cache,
1988                 JournalFile *template,
1989                 JournalFile **ret) {
1990
1991         JournalFile *f;
1992         int r;
1993         bool newly_created = false;
1994
1995         assert(fname);
1996
1997         if ((flags & O_ACCMODE) != O_RDONLY &&
1998             (flags & O_ACCMODE) != O_RDWR)
1999                 return -EINVAL;
2000
2001         if (!endswith(fname, ".journal") &&
2002             !endswith(fname, ".journal~"))
2003                 return -EINVAL;
2004
2005         f = new0(JournalFile, 1);
2006         if (!f)
2007                 return -ENOMEM;
2008
2009         f->fd = -1;
2010         f->mode = mode;
2011
2012         f->flags = flags;
2013         f->prot = prot_from_flags(flags);
2014         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2015         f->compress = compress;
2016         f->seal = seal;
2017
2018         if (mmap_cache)
2019                 f->mmap = mmap_cache_ref(mmap_cache);
2020         else {
2021                 f->mmap = mmap_cache_new();
2022                 if (!f->mmap) {
2023                         r = -ENOMEM;
2024                         goto fail;
2025                 }
2026         }
2027
2028         f->path = strdup(fname);
2029         if (!f->path) {
2030                 r = -ENOMEM;
2031                 goto fail;
2032         }
2033
2034         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2035         if (f->fd < 0) {
2036                 r = -errno;
2037                 goto fail;
2038         }
2039
2040         if (fstat(f->fd, &f->last_stat) < 0) {
2041                 r = -errno;
2042                 goto fail;
2043         }
2044
2045         if (f->last_stat.st_size == 0 && f->writable) {
2046                 newly_created = true;
2047
2048 #ifdef HAVE_GCRYPT
2049                 /* Try to load the FSPRG state, and if we can't, then
2050                  * just don't do sealing */
2051                 r = journal_file_fss_load(f);
2052                 if (r < 0)
2053                         f->seal = false;
2054 #endif
2055
2056                 r = journal_file_init_header(f, template);
2057                 if (r < 0)
2058                         goto fail;
2059
2060                 if (fstat(f->fd, &f->last_stat) < 0) {
2061                         r = -errno;
2062                         goto fail;
2063                 }
2064         }
2065
2066         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2067                 r = -EIO;
2068                 goto fail;
2069         }
2070
2071         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2072         if (f->header == MAP_FAILED) {
2073                 f->header = NULL;
2074                 r = -errno;
2075                 goto fail;
2076         }
2077
2078         if (!newly_created) {
2079                 r = journal_file_verify_header(f);
2080                 if (r < 0)
2081                         goto fail;
2082         }
2083
2084 #ifdef HAVE_GCRYPT
2085         if (!newly_created && f->writable) {
2086                 r = journal_file_fss_load(f);
2087                 if (r < 0)
2088                         goto fail;
2089         }
2090 #endif
2091
2092         if (f->writable) {
2093                 if (metrics) {
2094                         journal_default_metrics(metrics, f->fd);
2095                         f->metrics = *metrics;
2096                 } else if (template)
2097                         f->metrics = template->metrics;
2098
2099                 r = journal_file_refresh_header(f);
2100                 if (r < 0)
2101                         goto fail;
2102         }
2103
2104 #ifdef HAVE_GCRYPT
2105         r = journal_file_hmac_setup(f);
2106         if (r < 0)
2107                 goto fail;
2108 #endif
2109
2110         if (newly_created) {
2111                 r = journal_file_setup_field_hash_table(f);
2112                 if (r < 0)
2113                         goto fail;
2114
2115                 r = journal_file_setup_data_hash_table(f);
2116                 if (r < 0)
2117                         goto fail;
2118
2119 #ifdef HAVE_GCRYPT
2120                 r = journal_file_append_first_tag(f);
2121                 if (r < 0)
2122                         goto fail;
2123 #endif
2124         }
2125
2126         r = journal_file_map_field_hash_table(f);
2127         if (r < 0)
2128                 goto fail;
2129
2130         r = journal_file_map_data_hash_table(f);
2131         if (r < 0)
2132                 goto fail;
2133
2134         if (ret)
2135                 *ret = f;
2136
2137         return 0;
2138
2139 fail:
2140         journal_file_close(f);
2141
2142         return r;
2143 }
2144
2145 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2146         char *p;
2147         size_t l;
2148         JournalFile *old_file, *new_file = NULL;
2149         int r;
2150
2151         assert(f);
2152         assert(*f);
2153
2154         old_file = *f;
2155
2156         if (!old_file->writable)
2157                 return -EINVAL;
2158
2159         if (!endswith(old_file->path, ".journal"))
2160                 return -EINVAL;
2161
2162         l = strlen(old_file->path);
2163
2164         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2165         if (!p)
2166                 return -ENOMEM;
2167
2168         memcpy(p, old_file->path, l - 8);
2169         p[l-8] = '@';
2170         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2171         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2172                  "-%016llx-%016llx.journal",
2173                  (unsigned long long) le64toh((*f)->header->tail_entry_seqnum),
2174                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2175
2176         r = rename(old_file->path, p);
2177         free(p);
2178
2179         if (r < 0)
2180                 return -errno;
2181
2182         old_file->header->state = STATE_ARCHIVED;
2183
2184         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2185         journal_file_close(old_file);
2186
2187         *f = new_file;
2188         return r;
2189 }
2190
2191 int journal_file_open_reliably(
2192                 const char *fname,
2193                 int flags,
2194                 mode_t mode,
2195                 bool compress,
2196                 bool seal,
2197                 JournalMetrics *metrics,
2198                 MMapCache *mmap_cache,
2199                 JournalFile *template,
2200                 JournalFile **ret) {
2201
2202         int r;
2203         size_t l;
2204         char *p;
2205
2206         r = journal_file_open(fname, flags, mode, compress, seal,
2207                               metrics, mmap_cache, template, ret);
2208         if (r != -EBADMSG && /* corrupted */
2209             r != -ENODATA && /* truncated */
2210             r != -EHOSTDOWN && /* other machine */
2211             r != -EPROTONOSUPPORT && /* incompatible feature */
2212             r != -EBUSY && /* unclean shutdown */
2213             r != -ESHUTDOWN /* already archived */)
2214                 return r;
2215
2216         if ((flags & O_ACCMODE) == O_RDONLY)
2217                 return r;
2218
2219         if (!(flags & O_CREAT))
2220                 return r;
2221
2222         if (!endswith(fname, ".journal"))
2223                 return r;
2224
2225         /* The file is corrupted. Rotate it away and try it again (but only once) */
2226
2227         l = strlen(fname);
2228         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2229                      (int) (l-8), fname,
2230                      (unsigned long long) now(CLOCK_REALTIME),
2231                      random_ull()) < 0)
2232                 return -ENOMEM;
2233
2234         r = rename(fname, p);
2235         free(p);
2236         if (r < 0)
2237                 return -errno;
2238
2239         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2240
2241         return journal_file_open(fname, flags, mode, compress, seal,
2242                                  metrics, mmap_cache, template, ret);
2243 }
2244
2245
2246 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2247         uint64_t i, n;
2248         uint64_t q, xor_hash = 0;
2249         int r;
2250         EntryItem *items;
2251         dual_timestamp ts;
2252
2253         assert(from);
2254         assert(to);
2255         assert(o);
2256         assert(p);
2257
2258         if (!to->writable)
2259                 return -EPERM;
2260
2261         ts.monotonic = le64toh(o->entry.monotonic);
2262         ts.realtime = le64toh(o->entry.realtime);
2263
2264         if (to->tail_entry_monotonic_valid &&
2265             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2266                 return -EINVAL;
2267
2268         n = journal_file_entry_n_items(o);
2269         items = alloca(sizeof(EntryItem) * n);
2270
2271         for (i = 0; i < n; i++) {
2272                 uint64_t l, h;
2273                 le64_t le_hash;
2274                 size_t t;
2275                 void *data;
2276                 Object *u;
2277
2278                 q = le64toh(o->entry.items[i].object_offset);
2279                 le_hash = o->entry.items[i].hash;
2280
2281                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2282                 if (r < 0)
2283                         return r;
2284
2285                 if (le_hash != o->data.hash)
2286                         return -EBADMSG;
2287
2288                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2289                 t = (size_t) l;
2290
2291                 /* We hit the limit on 32bit machines */
2292                 if ((uint64_t) t != l)
2293                         return -E2BIG;
2294
2295                 if (o->object.flags & OBJECT_COMPRESSED) {
2296 #ifdef HAVE_XZ
2297                         uint64_t rsize;
2298
2299                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2300                                 return -EBADMSG;
2301
2302                         data = from->compress_buffer;
2303                         l = rsize;
2304 #else
2305                         return -EPROTONOSUPPORT;
2306 #endif
2307                 } else
2308                         data = o->data.payload;
2309
2310                 r = journal_file_append_data(to, data, l, &u, &h);
2311                 if (r < 0)
2312                         return r;
2313
2314                 xor_hash ^= le64toh(u->data.hash);
2315                 items[i].object_offset = htole64(h);
2316                 items[i].hash = u->data.hash;
2317
2318                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2319                 if (r < 0)
2320                         return r;
2321         }
2322
2323         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2324 }
2325
2326 void journal_default_metrics(JournalMetrics *m, int fd) {
2327         uint64_t fs_size = 0;
2328         struct statvfs ss;
2329         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2330
2331         assert(m);
2332         assert(fd >= 0);
2333
2334         if (fstatvfs(fd, &ss) >= 0)
2335                 fs_size = ss.f_frsize * ss.f_blocks;
2336
2337         if (m->max_use == (uint64_t) -1) {
2338
2339                 if (fs_size > 0) {
2340                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2341
2342                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2343                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2344
2345                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2346                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2347                 } else
2348                         m->max_use = DEFAULT_MAX_USE_LOWER;
2349         } else {
2350                 m->max_use = PAGE_ALIGN(m->max_use);
2351
2352                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2353                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2354         }
2355
2356         if (m->max_size == (uint64_t) -1) {
2357                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2358
2359                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2360                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2361         } else
2362                 m->max_size = PAGE_ALIGN(m->max_size);
2363
2364         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2365                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2366
2367         if (m->max_size*2 > m->max_use)
2368                 m->max_use = m->max_size*2;
2369
2370         if (m->min_size == (uint64_t) -1)
2371                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2372         else {
2373                 m->min_size = PAGE_ALIGN(m->min_size);
2374
2375                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2376                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2377
2378                 if (m->min_size > m->max_size)
2379                         m->max_size = m->min_size;
2380         }
2381
2382         if (m->keep_free == (uint64_t) -1) {
2383
2384                 if (fs_size > 0) {
2385                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2386
2387                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2388                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2389
2390                 } else
2391                         m->keep_free = DEFAULT_KEEP_FREE;
2392         }
2393
2394         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2395                   format_bytes(a, sizeof(a), m->max_use),
2396                   format_bytes(b, sizeof(b), m->max_size),
2397                   format_bytes(c, sizeof(c), m->min_size),
2398                   format_bytes(d, sizeof(d), m->keep_free));
2399 }
2400
2401 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2402         assert(f);
2403         assert(from || to);
2404
2405         if (from) {
2406                 if (f->header->head_entry_realtime == 0)
2407                         return -ENOENT;
2408
2409                 *from = le64toh(f->header->head_entry_realtime);
2410         }
2411
2412         if (to) {
2413                 if (f->header->tail_entry_realtime == 0)
2414                         return -ENOENT;
2415
2416                 *to = le64toh(f->header->tail_entry_realtime);
2417         }
2418
2419         return 1;
2420 }
2421
2422 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2423         char t[9+32+1] = "_BOOT_ID=";
2424         Object *o;
2425         uint64_t p;
2426         int r;
2427
2428         assert(f);
2429         assert(from || to);
2430
2431         sd_id128_to_string(boot_id, t + 9);
2432
2433         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2434         if (r <= 0)
2435                 return r;
2436
2437         if (le64toh(o->data.n_entries) <= 0)
2438                 return 0;
2439
2440         if (from) {
2441                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2442                 if (r < 0)
2443                         return r;
2444
2445                 *from = le64toh(o->entry.monotonic);
2446         }
2447
2448         if (to) {
2449                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2450                 if (r < 0)
2451                         return r;
2452
2453                 r = generic_array_get_plus_one(f,
2454                                                le64toh(o->data.entry_offset),
2455                                                le64toh(o->data.entry_array_offset),
2456                                                le64toh(o->data.n_entries)-1,
2457                                                &o, NULL);
2458                 if (r <= 0)
2459                         return r;
2460
2461                 *to = le64toh(o->entry.monotonic);
2462         }
2463
2464         return 1;
2465 }
2466
2467 bool journal_file_rotate_suggested(JournalFile *f) {
2468         assert(f);
2469
2470         /* If we gained new header fields we gained new features,
2471          * hence suggest a rotation */
2472         if (le64toh(f->header->header_size) < sizeof(Header)) {
2473                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2474                 return true;
2475         }
2476
2477         /* Let's check if the hash tables grew over a certain fill
2478          * level (75%, borrowing this value from Java's hash table
2479          * implementation), and if so suggest a rotation. To calculate
2480          * the fill level we need the n_data field, which only exists
2481          * in newer versions. */
2482
2483         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2484                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2485                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2486                                   f->path,
2487                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2488                                   (unsigned long long) le64toh(f->header->n_data),
2489                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2490                                   (unsigned long long) (f->last_stat.st_size),
2491                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2492                         return true;
2493                 }
2494
2495         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2496                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2497                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2498                                   f->path,
2499                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2500                                   (unsigned long long) le64toh(f->header->n_fields),
2501                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2502                         return true;
2503                 }
2504
2505         return false;
2506 }