chiark / gitweb /
ba04d1667b11dbda2f1eac5163252eff15773efb
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
33 #include "lookup3.h"
34 #include "compress.h"
35 #include "fsprg.h"
36
/* Default byte sizes of the data and field hash tables, given as a
 * slot count times the size of one HashItem slot. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL * sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL * sizeof(HashItem))

/* Payloads shorter than this many bytes are not passed to the compressor */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (64ULL << 10)                    /* 64 KiB */

/* Clamp range applied when max_use is derived from the size of the
 * file system */
#define DEFAULT_MAX_USE_LOWER (1ULL << 20)                     /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL << 30)                     /* 4 GiB */

/* Ceiling applied when max_size is derived from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL << 20)                  /* 128 MiB */

/* Ceiling applied when keep_free is derived from the size of the
 * file system */
#define DEFAULT_KEEP_FREE_UPPER (4ULL << 30)                   /* 4 GiB */

/* keep_free fallback for when the file system size cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1ULL << 20)                         /* 1 MiB */

/* n_data was the first header field added after the initial file
 * format design, so everything before it is the mandatory part */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
63
64 void journal_file_close(JournalFile *f) {
65         assert(f);
66
67 #ifdef HAVE_GCRYPT
68         /* Write the final tag */
69         if (f->seal && f->writable)
70                 journal_file_append_tag(f);
71 #endif
72
73         /* Sync everything to disk, before we mark the file offline */
74         if (f->mmap && f->fd >= 0)
75                 mmap_cache_close_fd(f->mmap, f->fd);
76
77         if (f->writable && f->fd >= 0)
78                 fdatasync(f->fd);
79
80         if (f->header) {
81                 /* Mark the file offline. Don't override the archived state if it already is set */
82                 if (f->writable && f->header->state == STATE_ONLINE)
83                         f->header->state = STATE_OFFLINE;
84
85                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
86         }
87
88         if (f->fd >= 0)
89                 close_nointr_nofail(f->fd);
90
91         free(f->path);
92
93         if (f->mmap)
94                 mmap_cache_unref(f->mmap);
95
96 #ifdef HAVE_XZ
97         free(f->compress_buffer);
98 #endif
99
100 #ifdef HAVE_GCRYPT
101         if (f->fss_file)
102                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
103         else if (f->fsprg_state)
104                 free(f->fsprg_state);
105
106         free(f->fsprg_seed);
107
108         if (f->hmac)
109                 gcry_md_close(f->hmac);
110 #endif
111
112         free(f);
113 }
114
115 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
116         Header h;
117         ssize_t k;
118         int r;
119
120         assert(f);
121
122         zero(h);
123         memcpy(h.signature, HEADER_SIGNATURE, 8);
124         h.header_size = htole64(ALIGN64(sizeof(h)));
125
126         h.incompatible_flags =
127                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
128
129         h.compatible_flags =
130                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
131
132         r = sd_id128_randomize(&h.file_id);
133         if (r < 0)
134                 return r;
135
136         if (template) {
137                 h.seqnum_id = template->header->seqnum_id;
138                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
139         } else
140                 h.seqnum_id = h.file_id;
141
142         k = pwrite(f->fd, &h, sizeof(h), 0);
143         if (k < 0)
144                 return -errno;
145
146         if (k != sizeof(h))
147                 return -EIO;
148
149         return 0;
150 }
151
152 static int journal_file_refresh_header(JournalFile *f) {
153         int r;
154         sd_id128_t boot_id;
155
156         assert(f);
157
158         r = sd_id128_get_machine(&f->header->machine_id);
159         if (r < 0)
160                 return r;
161
162         r = sd_id128_get_boot(&boot_id);
163         if (r < 0)
164                 return r;
165
166         if (sd_id128_equal(boot_id, f->header->boot_id))
167                 f->tail_entry_monotonic_valid = true;
168
169         f->header->boot_id = boot_id;
170
171         f->header->state = STATE_ONLINE;
172
173         /* Sync the online state to disk */
174         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
175         fdatasync(f->fd);
176
177         return 0;
178 }
179
180 static int journal_file_verify_header(JournalFile *f) {
181         assert(f);
182
183         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
184                 return -EBADMSG;
185
186         /* In both read and write mode we refuse to open files with
187          * incompatible flags we don't know */
188 #ifdef HAVE_XZ
189         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
190                 return -EPROTONOSUPPORT;
191 #else
192         if (f->header->incompatible_flags != 0)
193                 return -EPROTONOSUPPORT;
194 #endif
195
196         /* When open for writing we refuse to open files with
197          * compatible flags, too */
198         if (f->writable) {
199 #ifdef HAVE_GCRYPT
200                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
201                         return -EPROTONOSUPPORT;
202 #else
203                 if (f->header->compatible_flags != 0)
204                         return -EPROTONOSUPPORT;
205 #endif
206         }
207
208         if (f->header->state >= _STATE_MAX)
209                 return -EBADMSG;
210
211         /* The first addition was n_data, so check that we are at least this large */
212         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
213                 return -EBADMSG;
214
215         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
216                 return -EBADMSG;
217
218         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
219                 return -ENODATA;
220
221         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
222                 return -ENODATA;
223
224         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
225             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
226             !VALID64(le64toh(f->header->tail_object_offset)) ||
227             !VALID64(le64toh(f->header->entry_array_offset)))
228                 return -ENODATA;
229
230         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
231             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
232             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
233             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
234                 return -ENODATA;
235
236         if (f->writable) {
237                 uint8_t state;
238                 sd_id128_t machine_id;
239                 int r;
240
241                 r = sd_id128_get_machine(&machine_id);
242                 if (r < 0)
243                         return r;
244
245                 if (!sd_id128_equal(machine_id, f->header->machine_id))
246                         return -EHOSTDOWN;
247
248                 state = f->header->state;
249
250                 if (state == STATE_ONLINE) {
251                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
252                         return -EBUSY;
253                 } else if (state == STATE_ARCHIVED)
254                         return -ESHUTDOWN;
255                 else if (state != STATE_OFFLINE) {
256                         log_debug("Journal file %s has unknown state %u.", f->path, state);
257                         return -EBUSY;
258                 }
259         }
260
261         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
262
263         f->seal = JOURNAL_HEADER_SEALED(f->header);
264
265         return 0;
266 }
267
268 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
269         uint64_t old_size, new_size;
270         int r;
271
272         assert(f);
273
274         /* We assume that this file is not sparse, and we know that
275          * for sure, since we always call posix_fallocate()
276          * ourselves */
277
278         old_size =
279                 le64toh(f->header->header_size) +
280                 le64toh(f->header->arena_size);
281
282         new_size = PAGE_ALIGN(offset + size);
283         if (new_size < le64toh(f->header->header_size))
284                 new_size = le64toh(f->header->header_size);
285
286         if (new_size <= old_size)
287                 return 0;
288
289         if (f->metrics.max_size > 0 &&
290             new_size > f->metrics.max_size)
291                 return -E2BIG;
292
293         if (new_size > f->metrics.min_size &&
294             f->metrics.keep_free > 0) {
295                 struct statvfs svfs;
296
297                 if (fstatvfs(f->fd, &svfs) >= 0) {
298                         uint64_t available;
299
300                         available = svfs.f_bfree * svfs.f_bsize;
301
302                         if (available >= f->metrics.keep_free)
303                                 available -= f->metrics.keep_free;
304                         else
305                                 available = 0;
306
307                         if (new_size - old_size > available)
308                                 return -E2BIG;
309                 }
310         }
311
312         /* Note that the glibc fallocate() fallback is very
313            inefficient, hence we try to minimize the allocation area
314            as we can. */
315         r = posix_fallocate(f->fd, old_size, new_size - old_size);
316         if (r != 0)
317                 return -r;
318
319         if (fstat(f->fd, &f->last_stat) < 0)
320                 return -errno;
321
322         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
323
324         return 0;
325 }
326
327 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
328         assert(f);
329         assert(ret);
330
331         if (size <= 0)
332                 return -EINVAL;
333
334         /* Avoid SIGBUS on invalid accesses */
335         if (offset + size > (uint64_t) f->last_stat.st_size) {
336                 /* Hmm, out of range? Let's refresh the fstat() data
337                  * first, before we trust that check. */
338
339                 if (fstat(f->fd, &f->last_stat) < 0 ||
340                     offset + size > (uint64_t) f->last_stat.st_size)
341                         return -EADDRNOTAVAIL;
342         }
343
344         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
345 }
346
347 static uint64_t minimum_header_size(Object *o) {
348
349         static uint64_t table[] = {
350                 [OBJECT_DATA] = sizeof(DataObject),
351                 [OBJECT_FIELD] = sizeof(FieldObject),
352                 [OBJECT_ENTRY] = sizeof(EntryObject),
353                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
354                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
355                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
356                 [OBJECT_TAG] = sizeof(TagObject),
357         };
358
359         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
360                 return sizeof(ObjectHeader);
361
362         return table[o->object.type];
363 }
364
365 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
366         int r;
367         void *t;
368         Object *o;
369         uint64_t s;
370         unsigned context;
371
372         assert(f);
373         assert(ret);
374
375         /* Objects may only be located at multiple of 64 bit */
376         if (!VALID64(offset))
377                 return -EFAULT;
378
379         /* One context for each type, plus one catch-all for the rest */
380         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
381
382         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
383         if (r < 0)
384                 return r;
385
386         o = (Object*) t;
387         s = le64toh(o->object.size);
388
389         if (s < sizeof(ObjectHeader))
390                 return -EBADMSG;
391
392         if (o->object.type <= OBJECT_UNUSED)
393                 return -EBADMSG;
394
395         if (s < minimum_header_size(o))
396                 return -EBADMSG;
397
398         if (type >= 0 && o->object.type != type)
399                 return -EBADMSG;
400
401         if (s > sizeof(ObjectHeader)) {
402                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
403                 if (r < 0)
404                         return r;
405
406                 o = (Object*) t;
407         }
408
409         *ret = o;
410         return 0;
411 }
412
413 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
414         uint64_t r;
415
416         assert(f);
417
418         r = le64toh(f->header->tail_entry_seqnum) + 1;
419
420         if (seqnum) {
421                 /* If an external seqnum counter was passed, we update
422                  * both the local and the external one, and set it to
423                  * the maximum of both */
424
425                 if (*seqnum + 1 > r)
426                         r = *seqnum + 1;
427
428                 *seqnum = r;
429         }
430
431         f->header->tail_entry_seqnum = htole64(r);
432
433         if (f->header->head_entry_seqnum == 0)
434                 f->header->head_entry_seqnum = htole64(r);
435
436         return r;
437 }
438
439 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
440         int r;
441         uint64_t p;
442         Object *tail, *o;
443         void *t;
444
445         assert(f);
446         assert(type > 0 && type < _OBJECT_TYPE_MAX);
447         assert(size >= sizeof(ObjectHeader));
448         assert(offset);
449         assert(ret);
450
451         p = le64toh(f->header->tail_object_offset);
452         if (p == 0)
453                 p = le64toh(f->header->header_size);
454         else {
455                 r = journal_file_move_to_object(f, -1, p, &tail);
456                 if (r < 0)
457                         return r;
458
459                 p += ALIGN64(le64toh(tail->object.size));
460         }
461
462         r = journal_file_allocate(f, p, size);
463         if (r < 0)
464                 return r;
465
466         r = journal_file_move_to(f, type, false, p, size, &t);
467         if (r < 0)
468                 return r;
469
470         o = (Object*) t;
471
472         zero(o->object);
473         o->object.type = type;
474         o->object.size = htole64(size);
475
476         f->header->tail_object_offset = htole64(p);
477         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
478
479         *ret = o;
480         *offset = p;
481
482         return 0;
483 }
484
485 static int journal_file_setup_data_hash_table(JournalFile *f) {
486         uint64_t s, p;
487         Object *o;
488         int r;
489
490         assert(f);
491
492         /* We estimate that we need 1 hash table entry per 768 of
493            journal file and we want to make sure we never get beyond
494            75% fill level. Calculate the hash table size for the
495            maximum file size based on these metrics. */
496
497         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
498         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
499                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
500
501         log_debug("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
502
503         r = journal_file_append_object(f,
504                                        OBJECT_DATA_HASH_TABLE,
505                                        offsetof(Object, hash_table.items) + s,
506                                        &o, &p);
507         if (r < 0)
508                 return r;
509
510         memset(o->hash_table.items, 0, s);
511
512         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
513         f->header->data_hash_table_size = htole64(s);
514
515         return 0;
516 }
517
518 static int journal_file_setup_field_hash_table(JournalFile *f) {
519         uint64_t s, p;
520         Object *o;
521         int r;
522
523         assert(f);
524
525         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
526         r = journal_file_append_object(f,
527                                        OBJECT_FIELD_HASH_TABLE,
528                                        offsetof(Object, hash_table.items) + s,
529                                        &o, &p);
530         if (r < 0)
531                 return r;
532
533         memset(o->hash_table.items, 0, s);
534
535         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
536         f->header->field_hash_table_size = htole64(s);
537
538         return 0;
539 }
540
541 static int journal_file_map_data_hash_table(JournalFile *f) {
542         uint64_t s, p;
543         void *t;
544         int r;
545
546         assert(f);
547
548         p = le64toh(f->header->data_hash_table_offset);
549         s = le64toh(f->header->data_hash_table_size);
550
551         r = journal_file_move_to(f,
552                                  OBJECT_DATA_HASH_TABLE,
553                                  true,
554                                  p, s,
555                                  &t);
556         if (r < 0)
557                 return r;
558
559         f->data_hash_table = t;
560         return 0;
561 }
562
563 static int journal_file_map_field_hash_table(JournalFile *f) {
564         uint64_t s, p;
565         void *t;
566         int r;
567
568         assert(f);
569
570         p = le64toh(f->header->field_hash_table_offset);
571         s = le64toh(f->header->field_hash_table_size);
572
573         r = journal_file_move_to(f,
574                                  OBJECT_FIELD_HASH_TABLE,
575                                  true,
576                                  p, s,
577                                  &t);
578         if (r < 0)
579                 return r;
580
581         f->field_hash_table = t;
582         return 0;
583 }
584
585 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
586         uint64_t p, h;
587         int r;
588
589         assert(f);
590         assert(o);
591         assert(offset > 0);
592
593         if (o->object.type != OBJECT_DATA)
594                 return -EINVAL;
595
596         /* This might alter the window we are looking at */
597
598         o->data.next_hash_offset = o->data.next_field_offset = 0;
599         o->data.entry_offset = o->data.entry_array_offset = 0;
600         o->data.n_entries = 0;
601
602         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
603         p = le64toh(f->data_hash_table[h].tail_hash_offset);
604         if (p == 0) {
605                 /* Only entry in the hash table is easy */
606                 f->data_hash_table[h].head_hash_offset = htole64(offset);
607         } else {
608                 /* Move back to the previous data object, to patch in
609                  * pointer */
610
611                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
612                 if (r < 0)
613                         return r;
614
615                 o->data.next_hash_offset = htole64(offset);
616         }
617
618         f->data_hash_table[h].tail_hash_offset = htole64(offset);
619
620         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
621                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
622
623         return 0;
624 }
625
626 int journal_file_find_data_object_with_hash(
627                 JournalFile *f,
628                 const void *data, uint64_t size, uint64_t hash,
629                 Object **ret, uint64_t *offset) {
630
631         uint64_t p, osize, h;
632         int r;
633
634         assert(f);
635         assert(data || size == 0);
636
637         osize = offsetof(Object, data.payload) + size;
638
639         if (f->header->data_hash_table_size == 0)
640                 return -EBADMSG;
641
642         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
643         p = le64toh(f->data_hash_table[h].head_hash_offset);
644
645         while (p > 0) {
646                 Object *o;
647
648                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
649                 if (r < 0)
650                         return r;
651
652                 if (le64toh(o->data.hash) != hash)
653                         goto next;
654
655                 if (o->object.flags & OBJECT_COMPRESSED) {
656 #ifdef HAVE_XZ
657                         uint64_t l, rsize;
658
659                         l = le64toh(o->object.size);
660                         if (l <= offsetof(Object, data.payload))
661                                 return -EBADMSG;
662
663                         l -= offsetof(Object, data.payload);
664
665                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
666                                 return -EBADMSG;
667
668                         if (rsize == size &&
669                             memcmp(f->compress_buffer, data, size) == 0) {
670
671                                 if (ret)
672                                         *ret = o;
673
674                                 if (offset)
675                                         *offset = p;
676
677                                 return 1;
678                         }
679 #else
680                         return -EPROTONOSUPPORT;
681 #endif
682
683                 } else if (le64toh(o->object.size) == osize &&
684                            memcmp(o->data.payload, data, size) == 0) {
685
686                         if (ret)
687                                 *ret = o;
688
689                         if (offset)
690                                 *offset = p;
691
692                         return 1;
693                 }
694
695         next:
696                 p = le64toh(o->data.next_hash_offset);
697         }
698
699         return 0;
700 }
701
702 int journal_file_find_data_object(
703                 JournalFile *f,
704                 const void *data, uint64_t size,
705                 Object **ret, uint64_t *offset) {
706
707         uint64_t hash;
708
709         assert(f);
710         assert(data || size == 0);
711
712         hash = hash64(data, size);
713
714         return journal_file_find_data_object_with_hash(f,
715                                                        data, size, hash,
716                                                        ret, offset);
717 }
718
719 static int journal_file_append_data(
720                 JournalFile *f,
721                 const void *data, uint64_t size,
722                 Object **ret, uint64_t *offset) {
723
724         uint64_t hash, p;
725         uint64_t osize;
726         Object *o;
727         int r;
728         bool compressed = false;
729
730         assert(f);
731         assert(data || size == 0);
732
733         hash = hash64(data, size);
734
735         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
736         if (r < 0)
737                 return r;
738         else if (r > 0) {
739
740                 if (ret)
741                         *ret = o;
742
743                 if (offset)
744                         *offset = p;
745
746                 return 0;
747         }
748
749         osize = offsetof(Object, data.payload) + size;
750         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
751         if (r < 0)
752                 return r;
753
754         o->data.hash = htole64(hash);
755
756 #ifdef HAVE_XZ
757         if (f->compress &&
758             size >= COMPRESSION_SIZE_THRESHOLD) {
759                 uint64_t rsize;
760
761                 compressed = compress_blob(data, size, o->data.payload, &rsize);
762
763                 if (compressed) {
764                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
765                         o->object.flags |= OBJECT_COMPRESSED;
766
767                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
768                 }
769         }
770 #endif
771
772         if (!compressed && size > 0)
773                 memcpy(o->data.payload, data, size);
774
775         r = journal_file_link_data(f, o, p, hash);
776         if (r < 0)
777                 return r;
778
779         /* The linking might have altered the window, so let's
780          * refresh our pointer */
781         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
782         if (r < 0)
783                 return r;
784
785 #ifdef HAVE_GCRYPT
786         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
787         if (r < 0)
788                 return r;
789 #endif
790
791         if (ret)
792                 *ret = o;
793
794         if (offset)
795                 *offset = p;
796
797         return 0;
798 }
799
800 uint64_t journal_file_entry_n_items(Object *o) {
801         assert(o);
802
803         if (o->object.type != OBJECT_ENTRY)
804                 return 0;
805
806         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
807 }
808
809 uint64_t journal_file_entry_array_n_items(Object *o) {
810         assert(o);
811
812         if (o->object.type != OBJECT_ENTRY_ARRAY)
813                 return 0;
814
815         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
816 }
817
818 uint64_t journal_file_hash_table_n_items(Object *o) {
819         assert(o);
820
821         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
822             o->object.type != OBJECT_FIELD_HASH_TABLE)
823                 return 0;
824
825         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
826 }
827
828 static int link_entry_into_array(JournalFile *f,
829                                  le64_t *first,
830                                  le64_t *idx,
831                                  uint64_t p) {
832         int r;
833         uint64_t n = 0, ap = 0, q, i, a, hidx;
834         Object *o;
835
836         assert(f);
837         assert(first);
838         assert(idx);
839         assert(p > 0);
840
841         a = le64toh(*first);
842         i = hidx = le64toh(*idx);
843         while (a > 0) {
844
845                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
846                 if (r < 0)
847                         return r;
848
849                 n = journal_file_entry_array_n_items(o);
850                 if (i < n) {
851                         o->entry_array.items[i] = htole64(p);
852                         *idx = htole64(hidx + 1);
853                         return 0;
854                 }
855
856                 i -= n;
857                 ap = a;
858                 a = le64toh(o->entry_array.next_entry_array_offset);
859         }
860
861         if (hidx > n)
862                 n = (hidx+1) * 2;
863         else
864                 n = n * 2;
865
866         if (n < 4)
867                 n = 4;
868
869         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
870                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
871                                        &o, &q);
872         if (r < 0)
873                 return r;
874
875 #ifdef HAVE_GCRYPT
876         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
877         if (r < 0)
878                 return r;
879 #endif
880
881         o->entry_array.items[i] = htole64(p);
882
883         if (ap == 0)
884                 *first = htole64(q);
885         else {
886                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
887                 if (r < 0)
888                         return r;
889
890                 o->entry_array.next_entry_array_offset = htole64(q);
891         }
892
893         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
894                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
895
896         *idx = htole64(hidx + 1);
897
898         return 0;
899 }
900
901 static int link_entry_into_array_plus_one(JournalFile *f,
902                                           le64_t *extra,
903                                           le64_t *first,
904                                           le64_t *idx,
905                                           uint64_t p) {
906
907         int r;
908
909         assert(f);
910         assert(extra);
911         assert(first);
912         assert(idx);
913         assert(p > 0);
914
915         if (*idx == 0)
916                 *extra = htole64(p);
917         else {
918                 le64_t i;
919
920                 i = htole64(le64toh(*idx) - 1);
921                 r = link_entry_into_array(f, first, &i, p);
922                 if (r < 0)
923                         return r;
924         }
925
926         *idx = htole64(le64toh(*idx) + 1);
927         return 0;
928 }
929
930 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
931         uint64_t p;
932         int r;
933         assert(f);
934         assert(o);
935         assert(offset > 0);
936
937         p = le64toh(o->entry.items[i].object_offset);
938         if (p == 0)
939                 return -EINVAL;
940
941         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
942         if (r < 0)
943                 return r;
944
945         return link_entry_into_array_plus_one(f,
946                                               &o->data.entry_offset,
947                                               &o->data.entry_array_offset,
948                                               &o->data.n_entries,
949                                               offset);
950 }
951
952 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
953         uint64_t n, i;
954         int r;
955
956         assert(f);
957         assert(o);
958         assert(offset > 0);
959
960         if (o->object.type != OBJECT_ENTRY)
961                 return -EINVAL;
962
963         __sync_synchronize();
964
965         /* Link up the entry itself */
966         r = link_entry_into_array(f,
967                                   &f->header->entry_array_offset,
968                                   &f->header->n_entries,
969                                   offset);
970         if (r < 0)
971                 return r;
972
973         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
974
975         if (f->header->head_entry_realtime == 0)
976                 f->header->head_entry_realtime = o->entry.realtime;
977
978         f->header->tail_entry_realtime = o->entry.realtime;
979         f->header->tail_entry_monotonic = o->entry.monotonic;
980
981         f->tail_entry_monotonic_valid = true;
982
983         /* Link up the items */
984         n = journal_file_entry_n_items(o);
985         for (i = 0; i < n; i++) {
986                 r = journal_file_link_entry_item(f, o, offset, i);
987                 if (r < 0)
988                         return r;
989         }
990
991         return 0;
992 }
993
994 static int journal_file_append_entry_internal(
995                 JournalFile *f,
996                 const dual_timestamp *ts,
997                 uint64_t xor_hash,
998                 const EntryItem items[], unsigned n_items,
999                 uint64_t *seqnum,
1000                 Object **ret, uint64_t *offset) {
1001         uint64_t np;
1002         uint64_t osize;
1003         Object *o;
1004         int r;
1005
1006         assert(f);
1007         assert(items || n_items == 0);
1008         assert(ts);
1009
1010         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1011
1012         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1013         if (r < 0)
1014                 return r;
1015
1016         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1017         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1018         o->entry.realtime = htole64(ts->realtime);
1019         o->entry.monotonic = htole64(ts->monotonic);
1020         o->entry.xor_hash = htole64(xor_hash);
1021         o->entry.boot_id = f->header->boot_id;
1022
1023 #ifdef HAVE_GCRYPT
1024         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1025         if (r < 0)
1026                 return r;
1027 #endif
1028
1029         r = journal_file_link_entry(f, o, np);
1030         if (r < 0)
1031                 return r;
1032
1033         if (ret)
1034                 *ret = o;
1035
1036         if (offset)
1037                 *offset = np;
1038
1039         return 0;
1040 }
1041
1042 void journal_file_post_change(JournalFile *f) {
1043         assert(f);
1044
1045         /* inotify() does not receive IN_MODIFY events from file
1046          * accesses done via mmap(). After each access we hence
1047          * trigger IN_MODIFY by truncating the journal file to its
1048          * current size which triggers IN_MODIFY. */
1049
1050         __sync_synchronize();
1051
1052         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1053                 log_error("Failed to truncate file to its own size: %m");
1054 }
1055
1056 static int entry_item_cmp(const void *_a, const void *_b) {
1057         const EntryItem *a = _a, *b = _b;
1058
1059         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1060                 return -1;
1061         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1062                 return 1;
1063         return 0;
1064 }
1065
1066 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1067         unsigned i;
1068         EntryItem *items;
1069         int r;
1070         uint64_t xor_hash = 0;
1071         struct dual_timestamp _ts;
1072
1073         assert(f);
1074         assert(iovec || n_iovec == 0);
1075
1076         if (!f->writable)
1077                 return -EPERM;
1078
1079         if (!ts) {
1080                 dual_timestamp_get(&_ts);
1081                 ts = &_ts;
1082         }
1083
1084         if (f->tail_entry_monotonic_valid &&
1085             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1086                 return -EINVAL;
1087
1088 #ifdef HAVE_GCRYPT
1089         r = journal_file_maybe_append_tag(f, ts->realtime);
1090         if (r < 0)
1091                 return r;
1092 #endif
1093
1094         /* alloca() can't take 0, hence let's allocate at least one */
1095         items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1096
1097         for (i = 0; i < n_iovec; i++) {
1098                 uint64_t p;
1099                 Object *o;
1100
1101                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1102                 if (r < 0)
1103                         return r;
1104
1105                 xor_hash ^= le64toh(o->data.hash);
1106                 items[i].object_offset = htole64(p);
1107                 items[i].hash = o->data.hash;
1108         }
1109
1110         /* Order by the position on disk, in order to improve seek
1111          * times for rotating media. */
1112         qsort(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1113
1114         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1115
1116         journal_file_post_change(f);
1117
1118         return r;
1119 }
1120
1121 static int generic_array_get(JournalFile *f,
1122                              uint64_t first,
1123                              uint64_t i,
1124                              Object **ret, uint64_t *offset) {
1125
1126         Object *o;
1127         uint64_t p = 0, a;
1128         int r;
1129
1130         assert(f);
1131
1132         a = first;
1133         while (a > 0) {
1134                 uint64_t n;
1135
1136                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1137                 if (r < 0)
1138                         return r;
1139
1140                 n = journal_file_entry_array_n_items(o);
1141                 if (i < n) {
1142                         p = le64toh(o->entry_array.items[i]);
1143                         break;
1144                 }
1145
1146                 i -= n;
1147                 a = le64toh(o->entry_array.next_entry_array_offset);
1148         }
1149
1150         if (a <= 0 || p <= 0)
1151                 return 0;
1152
1153         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1154         if (r < 0)
1155                 return r;
1156
1157         if (ret)
1158                 *ret = o;
1159
1160         if (offset)
1161                 *offset = p;
1162
1163         return 1;
1164 }
1165
1166 static int generic_array_get_plus_one(JournalFile *f,
1167                                       uint64_t extra,
1168                                       uint64_t first,
1169                                       uint64_t i,
1170                                       Object **ret, uint64_t *offset) {
1171
1172         Object *o;
1173
1174         assert(f);
1175
1176         if (i == 0) {
1177                 int r;
1178
1179                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1180                 if (r < 0)
1181                         return r;
1182
1183                 if (ret)
1184                         *ret = o;
1185
1186                 if (offset)
1187                         *offset = extra;
1188
1189                 return 1;
1190         }
1191
1192         return generic_array_get(f, first, i-1, ret, offset);
1193 }
1194
/* Results of the test_object() callbacks used by the bisection
 * functions: the tested entry either matches the needle, or is located
 * to the left (smaller) or to the right (larger) of it. */
enum {
        TEST_FOUND = 0,
        TEST_LEFT  = 1,
        TEST_RIGHT = 2
};
1200
1201 static int generic_array_bisect(JournalFile *f,
1202                                 uint64_t first,
1203                                 uint64_t n,
1204                                 uint64_t needle,
1205                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1206                                 direction_t direction,
1207                                 Object **ret,
1208                                 uint64_t *offset,
1209                                 uint64_t *idx) {
1210
1211         uint64_t a, p, t = 0, i = 0, last_p = 0;
1212         bool subtract_one = false;
1213         Object *o, *array = NULL;
1214         int r;
1215
1216         assert(f);
1217         assert(test_object);
1218
1219         a = first;
1220         while (a > 0) {
1221                 uint64_t left, right, k, lp;
1222
1223                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1224                 if (r < 0)
1225                         return r;
1226
1227                 k = journal_file_entry_array_n_items(array);
1228                 right = MIN(k, n);
1229                 if (right <= 0)
1230                         return 0;
1231
1232                 i = right - 1;
1233                 lp = p = le64toh(array->entry_array.items[i]);
1234                 if (p <= 0)
1235                         return -EBADMSG;
1236
1237                 r = test_object(f, p, needle);
1238                 if (r < 0)
1239                         return r;
1240
1241                 if (r == TEST_FOUND)
1242                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1243
1244                 if (r == TEST_RIGHT) {
1245                         left = 0;
1246                         right -= 1;
1247                         for (;;) {
1248                                 if (left == right) {
1249                                         if (direction == DIRECTION_UP)
1250                                                 subtract_one = true;
1251
1252                                         i = left;
1253                                         goto found;
1254                                 }
1255
1256                                 assert(left < right);
1257
1258                                 i = (left + right) / 2;
1259                                 p = le64toh(array->entry_array.items[i]);
1260                                 if (p <= 0)
1261                                         return -EBADMSG;
1262
1263                                 r = test_object(f, p, needle);
1264                                 if (r < 0)
1265                                         return r;
1266
1267                                 if (r == TEST_FOUND)
1268                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1269
1270                                 if (r == TEST_RIGHT)
1271                                         right = i;
1272                                 else
1273                                         left = i + 1;
1274                         }
1275                 }
1276
1277                 if (k > n) {
1278                         if (direction == DIRECTION_UP) {
1279                                 i = n;
1280                                 subtract_one = true;
1281                                 goto found;
1282                         }
1283
1284                         return 0;
1285                 }
1286
1287                 last_p = lp;
1288
1289                 n -= k;
1290                 t += k;
1291                 a = le64toh(array->entry_array.next_entry_array_offset);
1292         }
1293
1294         return 0;
1295
1296 found:
1297         if (subtract_one && t == 0 && i == 0)
1298                 return 0;
1299
1300         if (subtract_one && i == 0)
1301                 p = last_p;
1302         else if (subtract_one)
1303                 p = le64toh(array->entry_array.items[i-1]);
1304         else
1305                 p = le64toh(array->entry_array.items[i]);
1306
1307         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1308         if (r < 0)
1309                 return r;
1310
1311         if (ret)
1312                 *ret = o;
1313
1314         if (offset)
1315                 *offset = p;
1316
1317         if (idx)
1318                 *idx = t + i + (subtract_one ? -1 : 0);
1319
1320         return 1;
1321 }
1322
1323 static int generic_array_bisect_plus_one(JournalFile *f,
1324                                          uint64_t extra,
1325                                          uint64_t first,
1326                                          uint64_t n,
1327                                          uint64_t needle,
1328                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1329                                          direction_t direction,
1330                                          Object **ret,
1331                                          uint64_t *offset,
1332                                          uint64_t *idx) {
1333
1334         int r;
1335         bool step_back = false;
1336         Object *o;
1337
1338         assert(f);
1339         assert(test_object);
1340
1341         if (n <= 0)
1342                 return 0;
1343
1344         /* This bisects the array in object 'first', but first checks
1345          * an extra  */
1346         r = test_object(f, extra, needle);
1347         if (r < 0)
1348                 return r;
1349
1350         if (r == TEST_FOUND)
1351                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1352
1353         /* if we are looking with DIRECTION_UP then we need to first
1354            see if in the actual array there is a matching entry, and
1355            return the last one of that. But if there isn't any we need
1356            to return this one. Hence remember this, and return it
1357            below. */
1358         if (r == TEST_LEFT)
1359                 step_back = direction == DIRECTION_UP;
1360
1361         if (r == TEST_RIGHT) {
1362                 if (direction == DIRECTION_DOWN)
1363                         goto found;
1364                 else
1365                         return 0;
1366         }
1367
1368         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1369
1370         if (r == 0 && step_back)
1371                 goto found;
1372
1373         if (r > 0 && idx)
1374                 (*idx) ++;
1375
1376         return r;
1377
1378 found:
1379         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1380         if (r < 0)
1381                 return r;
1382
1383         if (ret)
1384                 *ret = o;
1385
1386         if (offset)
1387                 *offset = extra;
1388
1389         if (idx)
1390                 *idx = 0;
1391
1392         return 1;
1393 }
1394
1395 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1396         assert(f);
1397         assert(p > 0);
1398
1399         if (p == needle)
1400                 return TEST_FOUND;
1401         else if (p < needle)
1402                 return TEST_LEFT;
1403         else
1404                 return TEST_RIGHT;
1405 }
1406
1407 int journal_file_move_to_entry_by_offset(
1408                 JournalFile *f,
1409                 uint64_t p,
1410                 direction_t direction,
1411                 Object **ret,
1412                 uint64_t *offset) {
1413
1414         return generic_array_bisect(f,
1415                                     le64toh(f->header->entry_array_offset),
1416                                     le64toh(f->header->n_entries),
1417                                     p,
1418                                     test_object_offset,
1419                                     direction,
1420                                     ret, offset, NULL);
1421 }
1422
1423
1424 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1425         Object *o;
1426         int r;
1427
1428         assert(f);
1429         assert(p > 0);
1430
1431         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1432         if (r < 0)
1433                 return r;
1434
1435         if (le64toh(o->entry.seqnum) == needle)
1436                 return TEST_FOUND;
1437         else if (le64toh(o->entry.seqnum) < needle)
1438                 return TEST_LEFT;
1439         else
1440                 return TEST_RIGHT;
1441 }
1442
1443 int journal_file_move_to_entry_by_seqnum(
1444                 JournalFile *f,
1445                 uint64_t seqnum,
1446                 direction_t direction,
1447                 Object **ret,
1448                 uint64_t *offset) {
1449
1450         return generic_array_bisect(f,
1451                                     le64toh(f->header->entry_array_offset),
1452                                     le64toh(f->header->n_entries),
1453                                     seqnum,
1454                                     test_object_seqnum,
1455                                     direction,
1456                                     ret, offset, NULL);
1457 }
1458
1459 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1460         Object *o;
1461         int r;
1462
1463         assert(f);
1464         assert(p > 0);
1465
1466         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1467         if (r < 0)
1468                 return r;
1469
1470         if (le64toh(o->entry.realtime) == needle)
1471                 return TEST_FOUND;
1472         else if (le64toh(o->entry.realtime) < needle)
1473                 return TEST_LEFT;
1474         else
1475                 return TEST_RIGHT;
1476 }
1477
1478 int journal_file_move_to_entry_by_realtime(
1479                 JournalFile *f,
1480                 uint64_t realtime,
1481                 direction_t direction,
1482                 Object **ret,
1483                 uint64_t *offset) {
1484
1485         return generic_array_bisect(f,
1486                                     le64toh(f->header->entry_array_offset),
1487                                     le64toh(f->header->n_entries),
1488                                     realtime,
1489                                     test_object_realtime,
1490                                     direction,
1491                                     ret, offset, NULL);
1492 }
1493
1494 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1495         Object *o;
1496         int r;
1497
1498         assert(f);
1499         assert(p > 0);
1500
1501         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1502         if (r < 0)
1503                 return r;
1504
1505         if (le64toh(o->entry.monotonic) == needle)
1506                 return TEST_FOUND;
1507         else if (le64toh(o->entry.monotonic) < needle)
1508                 return TEST_LEFT;
1509         else
1510                 return TEST_RIGHT;
1511 }
1512
1513 int journal_file_move_to_entry_by_monotonic(
1514                 JournalFile *f,
1515                 sd_id128_t boot_id,
1516                 uint64_t monotonic,
1517                 direction_t direction,
1518                 Object **ret,
1519                 uint64_t *offset) {
1520
1521         char t[9+32+1] = "_BOOT_ID=";
1522         Object *o;
1523         int r;
1524
1525         assert(f);
1526
1527         sd_id128_to_string(boot_id, t + 9);
1528         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1529         if (r < 0)
1530                 return r;
1531         if (r == 0)
1532                 return -ENOENT;
1533
1534         return generic_array_bisect_plus_one(f,
1535                                              le64toh(o->data.entry_offset),
1536                                              le64toh(o->data.entry_array_offset),
1537                                              le64toh(o->data.n_entries),
1538                                              monotonic,
1539                                              test_object_monotonic,
1540                                              direction,
1541                                              ret, offset, NULL);
1542 }
1543
1544 int journal_file_next_entry(
1545                 JournalFile *f,
1546                 Object *o, uint64_t p,
1547                 direction_t direction,
1548                 Object **ret, uint64_t *offset) {
1549
1550         uint64_t i, n;
1551         int r;
1552
1553         assert(f);
1554         assert(p > 0 || !o);
1555
1556         n = le64toh(f->header->n_entries);
1557         if (n <= 0)
1558                 return 0;
1559
1560         if (!o)
1561                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1562         else {
1563                 if (o->object.type != OBJECT_ENTRY)
1564                         return -EINVAL;
1565
1566                 r = generic_array_bisect(f,
1567                                          le64toh(f->header->entry_array_offset),
1568                                          le64toh(f->header->n_entries),
1569                                          p,
1570                                          test_object_offset,
1571                                          DIRECTION_DOWN,
1572                                          NULL, NULL,
1573                                          &i);
1574                 if (r <= 0)
1575                         return r;
1576
1577                 if (direction == DIRECTION_DOWN) {
1578                         if (i >= n - 1)
1579                                 return 0;
1580
1581                         i++;
1582                 } else {
1583                         if (i <= 0)
1584                                 return 0;
1585
1586                         i--;
1587                 }
1588         }
1589
1590         /* And jump to it */
1591         return generic_array_get(f,
1592                                  le64toh(f->header->entry_array_offset),
1593                                  i,
1594                                  ret, offset);
1595 }
1596
1597 int journal_file_skip_entry(
1598                 JournalFile *f,
1599                 Object *o, uint64_t p,
1600                 int64_t skip,
1601                 Object **ret, uint64_t *offset) {
1602
1603         uint64_t i, n;
1604         int r;
1605
1606         assert(f);
1607         assert(o);
1608         assert(p > 0);
1609
1610         if (o->object.type != OBJECT_ENTRY)
1611                 return -EINVAL;
1612
1613         r = generic_array_bisect(f,
1614                                  le64toh(f->header->entry_array_offset),
1615                                  le64toh(f->header->n_entries),
1616                                  p,
1617                                  test_object_offset,
1618                                  DIRECTION_DOWN,
1619                                  NULL, NULL,
1620                                  &i);
1621         if (r <= 0)
1622                 return r;
1623
1624         /* Calculate new index */
1625         if (skip < 0) {
1626                 if ((uint64_t) -skip >= i)
1627                         i = 0;
1628                 else
1629                         i = i - (uint64_t) -skip;
1630         } else
1631                 i  += (uint64_t) skip;
1632
1633         n = le64toh(f->header->n_entries);
1634         if (n <= 0)
1635                 return -EBADMSG;
1636
1637         if (i >= n)
1638                 i = n-1;
1639
1640         return generic_array_get(f,
1641                                  le64toh(f->header->entry_array_offset),
1642                                  i,
1643                                  ret, offset);
1644 }
1645
1646 int journal_file_next_entry_for_data(
1647                 JournalFile *f,
1648                 Object *o, uint64_t p,
1649                 uint64_t data_offset,
1650                 direction_t direction,
1651                 Object **ret, uint64_t *offset) {
1652
1653         uint64_t n, i;
1654         int r;
1655         Object *d;
1656
1657         assert(f);
1658         assert(p > 0 || !o);
1659
1660         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1661         if (r < 0)
1662                 return r;
1663
1664         n = le64toh(d->data.n_entries);
1665         if (n <= 0)
1666                 return n;
1667
1668         if (!o)
1669                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1670         else {
1671                 if (o->object.type != OBJECT_ENTRY)
1672                         return -EINVAL;
1673
1674                 r = generic_array_bisect_plus_one(f,
1675                                                   le64toh(d->data.entry_offset),
1676                                                   le64toh(d->data.entry_array_offset),
1677                                                   le64toh(d->data.n_entries),
1678                                                   p,
1679                                                   test_object_offset,
1680                                                   DIRECTION_DOWN,
1681                                                   NULL, NULL,
1682                                                   &i);
1683
1684                 if (r <= 0)
1685                         return r;
1686
1687                 if (direction == DIRECTION_DOWN) {
1688                         if (i >= n - 1)
1689                                 return 0;
1690
1691                         i++;
1692                 } else {
1693                         if (i <= 0)
1694                                 return 0;
1695
1696                         i--;
1697                 }
1698
1699         }
1700
1701         return generic_array_get_plus_one(f,
1702                                           le64toh(d->data.entry_offset),
1703                                           le64toh(d->data.entry_array_offset),
1704                                           i,
1705                                           ret, offset);
1706 }
1707
1708 int journal_file_move_to_entry_by_offset_for_data(
1709                 JournalFile *f,
1710                 uint64_t data_offset,
1711                 uint64_t p,
1712                 direction_t direction,
1713                 Object **ret, uint64_t *offset) {
1714
1715         int r;
1716         Object *d;
1717
1718         assert(f);
1719
1720         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1721         if (r < 0)
1722                 return r;
1723
1724         return generic_array_bisect_plus_one(f,
1725                                              le64toh(d->data.entry_offset),
1726                                              le64toh(d->data.entry_array_offset),
1727                                              le64toh(d->data.n_entries),
1728                                              p,
1729                                              test_object_offset,
1730                                              direction,
1731                                              ret, offset, NULL);
1732 }
1733
1734 int journal_file_move_to_entry_by_monotonic_for_data(
1735                 JournalFile *f,
1736                 uint64_t data_offset,
1737                 sd_id128_t boot_id,
1738                 uint64_t monotonic,
1739                 direction_t direction,
1740                 Object **ret, uint64_t *offset) {
1741
1742         char t[9+32+1] = "_BOOT_ID=";
1743         Object *o, *d;
1744         int r;
1745         uint64_t b, z;
1746
1747         assert(f);
1748
1749         /* First, seek by time */
1750         sd_id128_to_string(boot_id, t + 9);
1751         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1752         if (r < 0)
1753                 return r;
1754         if (r == 0)
1755                 return -ENOENT;
1756
1757         r = generic_array_bisect_plus_one(f,
1758                                           le64toh(o->data.entry_offset),
1759                                           le64toh(o->data.entry_array_offset),
1760                                           le64toh(o->data.n_entries),
1761                                           monotonic,
1762                                           test_object_monotonic,
1763                                           direction,
1764                                           NULL, &z, NULL);
1765         if (r <= 0)
1766                 return r;
1767
1768         /* And now, continue seeking until we find an entry that
1769          * exists in both bisection arrays */
1770
1771         for (;;) {
1772                 Object *qo;
1773                 uint64_t p, q;
1774
1775                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1776                 if (r < 0)
1777                         return r;
1778
1779                 r = generic_array_bisect_plus_one(f,
1780                                                   le64toh(d->data.entry_offset),
1781                                                   le64toh(d->data.entry_array_offset),
1782                                                   le64toh(d->data.n_entries),
1783                                                   z,
1784                                                   test_object_offset,
1785                                                   direction,
1786                                                   NULL, &p, NULL);
1787                 if (r <= 0)
1788                         return r;
1789
1790                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1791                 if (r < 0)
1792                         return r;
1793
1794                 r = generic_array_bisect_plus_one(f,
1795                                                   le64toh(o->data.entry_offset),
1796                                                   le64toh(o->data.entry_array_offset),
1797                                                   le64toh(o->data.n_entries),
1798                                                   p,
1799                                                   test_object_offset,
1800                                                   direction,
1801                                                   &qo, &q, NULL);
1802
1803                 if (r <= 0)
1804                         return r;
1805
1806                 if (p == q) {
1807                         if (ret)
1808                                 *ret = qo;
1809                         if (offset)
1810                                 *offset = q;
1811
1812                         return 1;
1813                 }
1814
1815                 z = q;
1816         }
1817
1818         return 0;
1819 }
1820
1821 int journal_file_move_to_entry_by_seqnum_for_data(
1822                 JournalFile *f,
1823                 uint64_t data_offset,
1824                 uint64_t seqnum,
1825                 direction_t direction,
1826                 Object **ret, uint64_t *offset) {
1827
1828         Object *d;
1829         int r;
1830
1831         assert(f);
1832
1833         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1834         if (r < 0)
1835                 return r;
1836
1837         return generic_array_bisect_plus_one(f,
1838                                              le64toh(d->data.entry_offset),
1839                                              le64toh(d->data.entry_array_offset),
1840                                              le64toh(d->data.n_entries),
1841                                              seqnum,
1842                                              test_object_seqnum,
1843                                              direction,
1844                                              ret, offset, NULL);
1845 }
1846
1847 int journal_file_move_to_entry_by_realtime_for_data(
1848                 JournalFile *f,
1849                 uint64_t data_offset,
1850                 uint64_t realtime,
1851                 direction_t direction,
1852                 Object **ret, uint64_t *offset) {
1853
1854         Object *d;
1855         int r;
1856
1857         assert(f);
1858
1859         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1860         if (r < 0)
1861                 return r;
1862
1863         return generic_array_bisect_plus_one(f,
1864                                              le64toh(d->data.entry_offset),
1865                                              le64toh(d->data.entry_array_offset),
1866                                              le64toh(d->data.n_entries),
1867                                              realtime,
1868                                              test_object_realtime,
1869                                              direction,
1870                                              ret, offset, NULL);
1871 }
1872
1873 void journal_file_dump(JournalFile *f) {
1874         Object *o;
1875         int r;
1876         uint64_t p;
1877
1878         assert(f);
1879
1880         journal_file_print_header(f);
1881
1882         p = le64toh(f->header->header_size);
1883         while (p != 0) {
1884                 r = journal_file_move_to_object(f, -1, p, &o);
1885                 if (r < 0)
1886                         goto fail;
1887
1888                 switch (o->object.type) {
1889
1890                 case OBJECT_UNUSED:
1891                         printf("Type: OBJECT_UNUSED\n");
1892                         break;
1893
1894                 case OBJECT_DATA:
1895                         printf("Type: OBJECT_DATA\n");
1896                         break;
1897
1898                 case OBJECT_ENTRY:
1899                         printf("Type: OBJECT_ENTRY seqnum=%llu monotonic=%llu realtime=%llu\n",
1900                                (unsigned long long) le64toh(o->entry.seqnum),
1901                                (unsigned long long) le64toh(o->entry.monotonic),
1902                                (unsigned long long) le64toh(o->entry.realtime));
1903                         break;
1904
1905                 case OBJECT_FIELD_HASH_TABLE:
1906                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1907                         break;
1908
1909                 case OBJECT_DATA_HASH_TABLE:
1910                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1911                         break;
1912
1913                 case OBJECT_ENTRY_ARRAY:
1914                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1915                         break;
1916
1917                 case OBJECT_TAG:
1918                         printf("Type: OBJECT_TAG seqnum=%llu epoch=%llu\n",
1919                                (unsigned long long) le64toh(o->tag.seqnum),
1920                                (unsigned long long) le64toh(o->tag.epoch));
1921                         break;
1922                 }
1923
1924                 if (o->object.flags & OBJECT_COMPRESSED)
1925                         printf("Flags: COMPRESSED\n");
1926
1927                 if (p == le64toh(f->header->tail_object_offset))
1928                         p = 0;
1929                 else
1930                         p = p + ALIGN64(le64toh(o->object.size));
1931         }
1932
1933         return;
1934 fail:
1935         log_error("File corrupt");
1936 }
1937
1938 void journal_file_print_header(JournalFile *f) {
1939         char a[33], b[33], c[33];
1940         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
1941         struct stat st;
1942         char bytes[FORMAT_BYTES_MAX];
1943
1944         assert(f);
1945
1946         printf("File Path: %s\n"
1947                "File ID: %s\n"
1948                "Machine ID: %s\n"
1949                "Boot ID: %s\n"
1950                "Sequential Number ID: %s\n"
1951                "State: %s\n"
1952                "Compatible Flags:%s%s\n"
1953                "Incompatible Flags:%s%s\n"
1954                "Header size: %llu\n"
1955                "Arena size: %llu\n"
1956                "Data Hash Table Size: %llu\n"
1957                "Field Hash Table Size: %llu\n"
1958                "Rotate Suggested: %s\n"
1959                "Head Sequential Number: %llu\n"
1960                "Tail Sequential Number: %llu\n"
1961                "Head Realtime Timestamp: %s\n"
1962                "Tail Realtime Timestamp: %s\n"
1963                "Objects: %llu\n"
1964                "Entry Objects: %llu\n",
1965                f->path,
1966                sd_id128_to_string(f->header->file_id, a),
1967                sd_id128_to_string(f->header->machine_id, b),
1968                sd_id128_to_string(f->header->boot_id, c),
1969                sd_id128_to_string(f->header->seqnum_id, c),
1970                f->header->state == STATE_OFFLINE ? "OFFLINE" :
1971                f->header->state == STATE_ONLINE ? "ONLINE" :
1972                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
1973                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
1974                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
1975                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
1976                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
1977                (unsigned long long) le64toh(f->header->header_size),
1978                (unsigned long long) le64toh(f->header->arena_size),
1979                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1980                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
1981                yes_no(journal_file_rotate_suggested(f)),
1982                (unsigned long long) le64toh(f->header->head_entry_seqnum),
1983                (unsigned long long) le64toh(f->header->tail_entry_seqnum),
1984                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
1985                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
1986                (unsigned long long) le64toh(f->header->n_objects),
1987                (unsigned long long) le64toh(f->header->n_entries));
1988
1989         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1990                 printf("Data Objects: %llu\n"
1991                        "Data Hash Table Fill: %.1f%%\n",
1992                        (unsigned long long) le64toh(f->header->n_data),
1993                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
1994
1995         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1996                 printf("Field Objects: %llu\n"
1997                        "Field Hash Table Fill: %.1f%%\n",
1998                        (unsigned long long) le64toh(f->header->n_fields),
1999                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2000
2001         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2002                 printf("Tag Objects: %llu\n",
2003                        (unsigned long long) le64toh(f->header->n_tags));
2004         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2005                 printf("Entry Array Objects: %llu\n",
2006                        (unsigned long long) le64toh(f->header->n_entry_arrays));
2007
2008         if (fstat(f->fd, &st) >= 0)
2009                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2010 }
2011
2012 int journal_file_open(
2013                 const char *fname,
2014                 int flags,
2015                 mode_t mode,
2016                 bool compress,
2017                 bool seal,
2018                 JournalMetrics *metrics,
2019                 MMapCache *mmap_cache,
2020                 JournalFile *template,
2021                 JournalFile **ret) {
2022
2023         JournalFile *f;
2024         int r;
2025         bool newly_created = false;
2026
2027         assert(fname);
2028         assert(ret);
2029
2030         if ((flags & O_ACCMODE) != O_RDONLY &&
2031             (flags & O_ACCMODE) != O_RDWR)
2032                 return -EINVAL;
2033
2034         if (!endswith(fname, ".journal") &&
2035             !endswith(fname, ".journal~"))
2036                 return -EINVAL;
2037
2038         f = new0(JournalFile, 1);
2039         if (!f)
2040                 return -ENOMEM;
2041
2042         f->fd = -1;
2043         f->mode = mode;
2044
2045         f->flags = flags;
2046         f->prot = prot_from_flags(flags);
2047         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2048 #ifdef HAVE_XZ
2049         f->compress = compress;
2050 #endif
2051 #ifdef HAVE_GCRYPT
2052         f->seal = seal;
2053 #endif
2054
2055         if (mmap_cache)
2056                 f->mmap = mmap_cache_ref(mmap_cache);
2057         else {
2058                 f->mmap = mmap_cache_new();
2059                 if (!f->mmap) {
2060                         r = -ENOMEM;
2061                         goto fail;
2062                 }
2063         }
2064
2065         f->path = strdup(fname);
2066         if (!f->path) {
2067                 r = -ENOMEM;
2068                 goto fail;
2069         }
2070
2071         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2072         if (f->fd < 0) {
2073                 r = -errno;
2074                 goto fail;
2075         }
2076
2077         if (fstat(f->fd, &f->last_stat) < 0) {
2078                 r = -errno;
2079                 goto fail;
2080         }
2081
2082         if (f->last_stat.st_size == 0 && f->writable) {
2083                 newly_created = true;
2084
2085 #ifdef HAVE_GCRYPT
2086                 /* Try to load the FSPRG state, and if we can't, then
2087                  * just don't do sealing */
2088                 if (f->seal) {
2089                         r = journal_file_fss_load(f);
2090                         if (r < 0)
2091                                 f->seal = false;
2092                 }
2093 #endif
2094
2095                 r = journal_file_init_header(f, template);
2096                 if (r < 0)
2097                         goto fail;
2098
2099                 if (fstat(f->fd, &f->last_stat) < 0) {
2100                         r = -errno;
2101                         goto fail;
2102                 }
2103         }
2104
2105         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2106                 r = -EIO;
2107                 goto fail;
2108         }
2109
2110         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2111         if (f->header == MAP_FAILED) {
2112                 f->header = NULL;
2113                 r = -errno;
2114                 goto fail;
2115         }
2116
2117         if (!newly_created) {
2118                 r = journal_file_verify_header(f);
2119                 if (r < 0)
2120                         goto fail;
2121         }
2122
2123 #ifdef HAVE_GCRYPT
2124         if (!newly_created && f->writable) {
2125                 r = journal_file_fss_load(f);
2126                 if (r < 0)
2127                         goto fail;
2128         }
2129 #endif
2130
2131         if (f->writable) {
2132                 if (metrics) {
2133                         journal_default_metrics(metrics, f->fd);
2134                         f->metrics = *metrics;
2135                 } else if (template)
2136                         f->metrics = template->metrics;
2137
2138                 r = journal_file_refresh_header(f);
2139                 if (r < 0)
2140                         goto fail;
2141         }
2142
2143 #ifdef HAVE_GCRYPT
2144         r = journal_file_hmac_setup(f);
2145         if (r < 0)
2146                 goto fail;
2147 #endif
2148
2149         if (newly_created) {
2150                 r = journal_file_setup_field_hash_table(f);
2151                 if (r < 0)
2152                         goto fail;
2153
2154                 r = journal_file_setup_data_hash_table(f);
2155                 if (r < 0)
2156                         goto fail;
2157
2158 #ifdef HAVE_GCRYPT
2159                 r = journal_file_append_first_tag(f);
2160                 if (r < 0)
2161                         goto fail;
2162 #endif
2163         }
2164
2165         r = journal_file_map_field_hash_table(f);
2166         if (r < 0)
2167                 goto fail;
2168
2169         r = journal_file_map_data_hash_table(f);
2170         if (r < 0)
2171                 goto fail;
2172
2173         *ret = f;
2174         return 0;
2175
2176 fail:
2177         journal_file_close(f);
2178
2179         return r;
2180 }
2181
2182 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2183         char *p;
2184         size_t l;
2185         JournalFile *old_file, *new_file = NULL;
2186         int r;
2187
2188         assert(f);
2189         assert(*f);
2190
2191         old_file = *f;
2192
2193         if (!old_file->writable)
2194                 return -EINVAL;
2195
2196         if (!endswith(old_file->path, ".journal"))
2197                 return -EINVAL;
2198
2199         l = strlen(old_file->path);
2200
2201         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2202         if (!p)
2203                 return -ENOMEM;
2204
2205         memcpy(p, old_file->path, l - 8);
2206         p[l-8] = '@';
2207         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2208         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2209                  "-%016llx-%016llx.journal",
2210                  (unsigned long long) le64toh((*f)->header->tail_entry_seqnum),
2211                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2212
2213         r = rename(old_file->path, p);
2214         free(p);
2215
2216         if (r < 0)
2217                 return -errno;
2218
2219         old_file->header->state = STATE_ARCHIVED;
2220
2221         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2222         journal_file_close(old_file);
2223
2224         *f = new_file;
2225         return r;
2226 }
2227
2228 int journal_file_open_reliably(
2229                 const char *fname,
2230                 int flags,
2231                 mode_t mode,
2232                 bool compress,
2233                 bool seal,
2234                 JournalMetrics *metrics,
2235                 MMapCache *mmap_cache,
2236                 JournalFile *template,
2237                 JournalFile **ret) {
2238
2239         int r;
2240         size_t l;
2241         char *p;
2242
2243         r = journal_file_open(fname, flags, mode, compress, seal,
2244                               metrics, mmap_cache, template, ret);
2245         if (r != -EBADMSG && /* corrupted */
2246             r != -ENODATA && /* truncated */
2247             r != -EHOSTDOWN && /* other machine */
2248             r != -EPROTONOSUPPORT && /* incompatible feature */
2249             r != -EBUSY && /* unclean shutdown */
2250             r != -ESHUTDOWN /* already archived */)
2251                 return r;
2252
2253         if ((flags & O_ACCMODE) == O_RDONLY)
2254                 return r;
2255
2256         if (!(flags & O_CREAT))
2257                 return r;
2258
2259         if (!endswith(fname, ".journal"))
2260                 return r;
2261
2262         /* The file is corrupted. Rotate it away and try it again (but only once) */
2263
2264         l = strlen(fname);
2265         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2266                      (int) (l-8), fname,
2267                      (unsigned long long) now(CLOCK_REALTIME),
2268                      random_ull()) < 0)
2269                 return -ENOMEM;
2270
2271         r = rename(fname, p);
2272         free(p);
2273         if (r < 0)
2274                 return -errno;
2275
2276         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2277
2278         return journal_file_open(fname, flags, mode, compress, seal,
2279                                  metrics, mmap_cache, template, ret);
2280 }
2281
2282
2283 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2284         uint64_t i, n;
2285         uint64_t q, xor_hash = 0;
2286         int r;
2287         EntryItem *items;
2288         dual_timestamp ts;
2289
2290         assert(from);
2291         assert(to);
2292         assert(o);
2293         assert(p);
2294
2295         if (!to->writable)
2296                 return -EPERM;
2297
2298         ts.monotonic = le64toh(o->entry.monotonic);
2299         ts.realtime = le64toh(o->entry.realtime);
2300
2301         if (to->tail_entry_monotonic_valid &&
2302             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2303                 return -EINVAL;
2304
2305         n = journal_file_entry_n_items(o);
2306         items = alloca(sizeof(EntryItem) * n);
2307
2308         for (i = 0; i < n; i++) {
2309                 uint64_t l, h;
2310                 le64_t le_hash;
2311                 size_t t;
2312                 void *data;
2313                 Object *u;
2314
2315                 q = le64toh(o->entry.items[i].object_offset);
2316                 le_hash = o->entry.items[i].hash;
2317
2318                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2319                 if (r < 0)
2320                         return r;
2321
2322                 if (le_hash != o->data.hash)
2323                         return -EBADMSG;
2324
2325                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2326                 t = (size_t) l;
2327
2328                 /* We hit the limit on 32bit machines */
2329                 if ((uint64_t) t != l)
2330                         return -E2BIG;
2331
2332                 if (o->object.flags & OBJECT_COMPRESSED) {
2333 #ifdef HAVE_XZ
2334                         uint64_t rsize;
2335
2336                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2337                                 return -EBADMSG;
2338
2339                         data = from->compress_buffer;
2340                         l = rsize;
2341 #else
2342                         return -EPROTONOSUPPORT;
2343 #endif
2344                 } else
2345                         data = o->data.payload;
2346
2347                 r = journal_file_append_data(to, data, l, &u, &h);
2348                 if (r < 0)
2349                         return r;
2350
2351                 xor_hash ^= le64toh(u->data.hash);
2352                 items[i].object_offset = htole64(h);
2353                 items[i].hash = u->data.hash;
2354
2355                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2356                 if (r < 0)
2357                         return r;
2358         }
2359
2360         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2361 }
2362
2363 void journal_default_metrics(JournalMetrics *m, int fd) {
2364         uint64_t fs_size = 0;
2365         struct statvfs ss;
2366         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2367
2368         assert(m);
2369         assert(fd >= 0);
2370
2371         if (fstatvfs(fd, &ss) >= 0)
2372                 fs_size = ss.f_frsize * ss.f_blocks;
2373
2374         if (m->max_use == (uint64_t) -1) {
2375
2376                 if (fs_size > 0) {
2377                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2378
2379                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2380                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2381
2382                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2383                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2384                 } else
2385                         m->max_use = DEFAULT_MAX_USE_LOWER;
2386         } else {
2387                 m->max_use = PAGE_ALIGN(m->max_use);
2388
2389                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2390                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2391         }
2392
2393         if (m->max_size == (uint64_t) -1) {
2394                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2395
2396                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2397                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2398         } else
2399                 m->max_size = PAGE_ALIGN(m->max_size);
2400
2401         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2402                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2403
2404         if (m->max_size*2 > m->max_use)
2405                 m->max_use = m->max_size*2;
2406
2407         if (m->min_size == (uint64_t) -1)
2408                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2409         else {
2410                 m->min_size = PAGE_ALIGN(m->min_size);
2411
2412                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2413                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2414
2415                 if (m->min_size > m->max_size)
2416                         m->max_size = m->min_size;
2417         }
2418
2419         if (m->keep_free == (uint64_t) -1) {
2420
2421                 if (fs_size > 0) {
2422                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2423
2424                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2425                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2426
2427                 } else
2428                         m->keep_free = DEFAULT_KEEP_FREE;
2429         }
2430
2431         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2432                   format_bytes(a, sizeof(a), m->max_use),
2433                   format_bytes(b, sizeof(b), m->max_size),
2434                   format_bytes(c, sizeof(c), m->min_size),
2435                   format_bytes(d, sizeof(d), m->keep_free));
2436 }
2437
2438 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2439         assert(f);
2440         assert(from || to);
2441
2442         if (from) {
2443                 if (f->header->head_entry_realtime == 0)
2444                         return -ENOENT;
2445
2446                 *from = le64toh(f->header->head_entry_realtime);
2447         }
2448
2449         if (to) {
2450                 if (f->header->tail_entry_realtime == 0)
2451                         return -ENOENT;
2452
2453                 *to = le64toh(f->header->tail_entry_realtime);
2454         }
2455
2456         return 1;
2457 }
2458
2459 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2460         char t[9+32+1] = "_BOOT_ID=";
2461         Object *o;
2462         uint64_t p;
2463         int r;
2464
2465         assert(f);
2466         assert(from || to);
2467
2468         sd_id128_to_string(boot_id, t + 9);
2469
2470         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2471         if (r <= 0)
2472                 return r;
2473
2474         if (le64toh(o->data.n_entries) <= 0)
2475                 return 0;
2476
2477         if (from) {
2478                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2479                 if (r < 0)
2480                         return r;
2481
2482                 *from = le64toh(o->entry.monotonic);
2483         }
2484
2485         if (to) {
2486                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2487                 if (r < 0)
2488                         return r;
2489
2490                 r = generic_array_get_plus_one(f,
2491                                                le64toh(o->data.entry_offset),
2492                                                le64toh(o->data.entry_array_offset),
2493                                                le64toh(o->data.n_entries)-1,
2494                                                &o, NULL);
2495                 if (r <= 0)
2496                         return r;
2497
2498                 *to = le64toh(o->entry.monotonic);
2499         }
2500
2501         return 1;
2502 }
2503
2504 bool journal_file_rotate_suggested(JournalFile *f) {
2505         assert(f);
2506
2507         /* If we gained new header fields we gained new features,
2508          * hence suggest a rotation */
2509         if (le64toh(f->header->header_size) < sizeof(Header)) {
2510                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2511                 return true;
2512         }
2513
2514         /* Let's check if the hash tables grew over a certain fill
2515          * level (75%, borrowing this value from Java's hash table
2516          * implementation), and if so suggest a rotation. To calculate
2517          * the fill level we need the n_data field, which only exists
2518          * in newer versions. */
2519
2520         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2521                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2522                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2523                                   f->path,
2524                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2525                                   (unsigned long long) le64toh(f->header->n_data),
2526                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2527                                   (unsigned long long) (f->last_stat.st_size),
2528                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2529                         return true;
2530                 }
2531
2532         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2533                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2534                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2535                                   f->path,
2536                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2537                                   (unsigned long long) le64toh(f->header->n_fields),
2538                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2539                         return true;
2540                 }
2541
2542         return false;
2543 }