chiark / gitweb /
journald: when we detect the journal file we are about to write to has been deleted...
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
33 #include "lookup3.h"
34 #include "compress.h"
35 #include "fsprg.h"
36
/* Sizes in bytes of the data and field hash tables, expressed as a
 * number of HashItem slots. The data value serves as the lower bound
 * when the data hash table is sized, the field value is used as is. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* NOTE(review): presumably the payload size in bytes from which on
 * compression is attempted; not used in this part of the file - confirm. */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the file system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* n_data was the first entry we added after the initial file format
 * design, hence a valid header is at least large enough to reach up to
 * (but not including) that field, rounded up to 64 bit alignment. */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8 MiB */

/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header; we pick the one right above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
75
76 static int journal_file_set_online(JournalFile *f) {
77         assert(f);
78
79         if (!f->writable)
80                 return -EPERM;
81
82         if (!(f->fd >= 0 && f->header))
83                 return -EINVAL;
84
85         if (mmap_cache_got_sigbus(f->mmap, f->fd))
86                 return -EIO;
87
88         switch(f->header->state) {
89                 case STATE_ONLINE:
90                         return 0;
91
92                 case STATE_OFFLINE:
93                         f->header->state = STATE_ONLINE;
94                         fsync(f->fd);
95                         return 0;
96
97                 default:
98                         return -EINVAL;
99         }
100 }
101
102 int journal_file_set_offline(JournalFile *f) {
103         assert(f);
104
105         if (!f->writable)
106                 return -EPERM;
107
108         if (!(f->fd >= 0 && f->header))
109                 return -EINVAL;
110
111         if (f->header->state != STATE_ONLINE)
112                 return 0;
113
114         fsync(f->fd);
115
116         if (mmap_cache_got_sigbus(f->mmap, f->fd))
117                 return -EIO;
118
119         f->header->state = STATE_OFFLINE;
120
121         if (mmap_cache_got_sigbus(f->mmap, f->fd))
122                 return -EIO;
123
124         fsync(f->fd);
125
126         return 0;
127 }
128
129 void journal_file_close(JournalFile *f) {
130         assert(f);
131
132 #ifdef HAVE_GCRYPT
133         /* Write the final tag */
134         if (f->seal && f->writable)
135                 journal_file_append_tag(f);
136 #endif
137
138         journal_file_set_offline(f);
139
140         if (f->mmap && f->fd >= 0)
141                 mmap_cache_close_fd(f->mmap, f->fd);
142
143         safe_close(f->fd);
144         free(f->path);
145
146         if (f->mmap)
147                 mmap_cache_unref(f->mmap);
148
149         ordered_hashmap_free_free(f->chain_cache);
150
151 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
152         free(f->compress_buffer);
153 #endif
154
155 #ifdef HAVE_GCRYPT
156         if (f->fss_file)
157                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
158         else if (f->fsprg_state)
159                 free(f->fsprg_state);
160
161         free(f->fsprg_seed);
162
163         if (f->hmac)
164                 gcry_md_close(f->hmac);
165 #endif
166
167         free(f);
168 }
169
170 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
171         Header h = {};
172         ssize_t k;
173         int r;
174
175         assert(f);
176
177         memcpy(h.signature, HEADER_SIGNATURE, 8);
178         h.header_size = htole64(ALIGN64(sizeof(h)));
179
180         h.incompatible_flags |= htole32(
181                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
182                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
183
184         h.compatible_flags = htole32(
185                 f->seal * HEADER_COMPATIBLE_SEALED);
186
187         r = sd_id128_randomize(&h.file_id);
188         if (r < 0)
189                 return r;
190
191         if (template) {
192                 h.seqnum_id = template->header->seqnum_id;
193                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
194         } else
195                 h.seqnum_id = h.file_id;
196
197         k = pwrite(f->fd, &h, sizeof(h), 0);
198         if (k < 0)
199                 return -errno;
200
201         if (k != sizeof(h))
202                 return -EIO;
203
204         return 0;
205 }
206
207 static int journal_file_refresh_header(JournalFile *f) {
208         sd_id128_t boot_id;
209         int r;
210
211         assert(f);
212
213         r = sd_id128_get_machine(&f->header->machine_id);
214         if (r < 0)
215                 return r;
216
217         r = sd_id128_get_boot(&boot_id);
218         if (r < 0)
219                 return r;
220
221         if (sd_id128_equal(boot_id, f->header->boot_id))
222                 f->tail_entry_monotonic_valid = true;
223
224         f->header->boot_id = boot_id;
225
226         r = journal_file_set_online(f);
227
228         /* Sync the online state to disk */
229         fsync(f->fd);
230
231         return r;
232 }
233
234 static int journal_file_verify_header(JournalFile *f) {
235         uint32_t flags;
236
237         assert(f);
238
239         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
240                 return -EBADMSG;
241
242         /* In both read and write mode we refuse to open files with
243          * incompatible flags we don't know */
244         flags = le32toh(f->header->incompatible_flags);
245         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
246                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
247                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
248                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
249                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
250                 if (flags)
251                         log_debug("Journal file %s uses incompatible flags %"PRIx32
252                                   " disabled at compilation time.", f->path, flags);
253                 return -EPROTONOSUPPORT;
254         }
255
256         /* When open for writing we refuse to open files with
257          * compatible flags, too */
258         flags = le32toh(f->header->compatible_flags);
259         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
260                 if (flags & ~HEADER_COMPATIBLE_ANY)
261                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
262                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
263                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
264                 if (flags)
265                         log_debug("Journal file %s uses compatible flags %"PRIx32
266                                   " disabled at compilation time.", f->path, flags);
267                 return -EPROTONOSUPPORT;
268         }
269
270         if (f->header->state >= _STATE_MAX)
271                 return -EBADMSG;
272
273         /* The first addition was n_data, so check that we are at least this large */
274         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
275                 return -EBADMSG;
276
277         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
278                 return -EBADMSG;
279
280         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
281                 return -ENODATA;
282
283         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
284                 return -ENODATA;
285
286         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
287             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
288             !VALID64(le64toh(f->header->tail_object_offset)) ||
289             !VALID64(le64toh(f->header->entry_array_offset)))
290                 return -ENODATA;
291
292         if (f->writable) {
293                 uint8_t state;
294                 sd_id128_t machine_id;
295                 int r;
296
297                 r = sd_id128_get_machine(&machine_id);
298                 if (r < 0)
299                         return r;
300
301                 if (!sd_id128_equal(machine_id, f->header->machine_id))
302                         return -EHOSTDOWN;
303
304                 state = f->header->state;
305
306                 if (state == STATE_ONLINE) {
307                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
308                         return -EBUSY;
309                 } else if (state == STATE_ARCHIVED)
310                         return -ESHUTDOWN;
311                 else if (state != STATE_OFFLINE) {
312                         log_debug("Journal file %s has unknown state %u.", f->path, state);
313                         return -EBUSY;
314                 }
315         }
316
317         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
318         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
319
320         f->seal = JOURNAL_HEADER_SEALED(f->header);
321
322         return 0;
323 }
324
325 static int journal_file_fstat(JournalFile *f) {
326         assert(f);
327         assert(f->fd >= 0);
328
329         if (fstat(f->fd, &f->last_stat) < 0)
330                 return -errno;
331
332         f->last_stat_usec = now(CLOCK_MONOTONIC);
333
334         /* Refuse appending to files that are already deleted */
335         if (f->last_stat.st_nlink <= 0)
336                 return -EIDRM;
337
338         return 0;
339 }
340
341 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
342         uint64_t old_size, new_size;
343         int r;
344
345         assert(f);
346
347         /* We assume that this file is not sparse, and we know that
348          * for sure, since we always call posix_fallocate()
349          * ourselves */
350
351         if (mmap_cache_got_sigbus(f->mmap, f->fd))
352                 return -EIO;
353
354         old_size =
355                 le64toh(f->header->header_size) +
356                 le64toh(f->header->arena_size);
357
358         new_size = PAGE_ALIGN(offset + size);
359         if (new_size < le64toh(f->header->header_size))
360                 new_size = le64toh(f->header->header_size);
361
362         if (new_size <= old_size) {
363
364                 /* We already pre-allocated enough space, but before
365                  * we write to it, let's check with fstat() if the
366                  * file got deleted, in order make sure we don't throw
367                  * away the data immediately. Don't check fstat() for
368                  * all writes though, but only once ever 10s. */
369
370                 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
371                         return 0;
372
373                 return journal_file_fstat(f);
374         }
375
376         /* Allocate more space. */
377
378         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
379                 return -E2BIG;
380
381         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
382                 struct statvfs svfs;
383
384                 if (fstatvfs(f->fd, &svfs) >= 0) {
385                         uint64_t available;
386
387                         available = svfs.f_bfree * svfs.f_bsize;
388
389                         if (available >= f->metrics.keep_free)
390                                 available -= f->metrics.keep_free;
391                         else
392                                 available = 0;
393
394                         if (new_size - old_size > available)
395                                 return -E2BIG;
396                 }
397         }
398
399         /* Increase by larger blocks at once */
400         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
401         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
402                 new_size = f->metrics.max_size;
403
404         /* Note that the glibc fallocate() fallback is very
405            inefficient, hence we try to minimize the allocation area
406            as we can. */
407         r = posix_fallocate(f->fd, old_size, new_size - old_size);
408         if (r != 0)
409                 return -r;
410
411         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
412
413         return journal_file_fstat(f);
414 }
415
416 static unsigned type_to_context(ObjectType type) {
417         /* One context for each type, plus one catch-all for the rest */
418         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
419         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
420         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
421 }
422
423 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
424         int r;
425
426         assert(f);
427         assert(ret);
428
429         if (size <= 0)
430                 return -EINVAL;
431
432         /* Avoid SIGBUS on invalid accesses */
433         if (offset + size > (uint64_t) f->last_stat.st_size) {
434                 /* Hmm, out of range? Let's refresh the fstat() data
435                  * first, before we trust that check. */
436
437                 r = journal_file_fstat(f);
438                 if (r < 0)
439                         return r;
440
441                 if (offset + size > (uint64_t) f->last_stat.st_size)
442                         return -EADDRNOTAVAIL;
443         }
444
445         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
446 }
447
448 static uint64_t minimum_header_size(Object *o) {
449
450         static const uint64_t table[] = {
451                 [OBJECT_DATA] = sizeof(DataObject),
452                 [OBJECT_FIELD] = sizeof(FieldObject),
453                 [OBJECT_ENTRY] = sizeof(EntryObject),
454                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
455                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
456                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
457                 [OBJECT_TAG] = sizeof(TagObject),
458         };
459
460         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
461                 return sizeof(ObjectHeader);
462
463         return table[o->object.type];
464 }
465
466 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
467         int r;
468         void *t;
469         Object *o;
470         uint64_t s;
471
472         assert(f);
473         assert(ret);
474
475         /* Objects may only be located at multiple of 64 bit */
476         if (!VALID64(offset))
477                 return -EFAULT;
478
479         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
480         if (r < 0)
481                 return r;
482
483         o = (Object*) t;
484         s = le64toh(o->object.size);
485
486         if (s < sizeof(ObjectHeader))
487                 return -EBADMSG;
488
489         if (o->object.type <= OBJECT_UNUSED)
490                 return -EBADMSG;
491
492         if (s < minimum_header_size(o))
493                 return -EBADMSG;
494
495         if (type > OBJECT_UNUSED && o->object.type != type)
496                 return -EBADMSG;
497
498         if (s > sizeof(ObjectHeader)) {
499                 r = journal_file_move_to(f, type, false, offset, s, &t);
500                 if (r < 0)
501                         return r;
502
503                 o = (Object*) t;
504         }
505
506         *ret = o;
507         return 0;
508 }
509
510 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
511         uint64_t r;
512
513         assert(f);
514
515         r = le64toh(f->header->tail_entry_seqnum) + 1;
516
517         if (seqnum) {
518                 /* If an external seqnum counter was passed, we update
519                  * both the local and the external one, and set it to
520                  * the maximum of both */
521
522                 if (*seqnum + 1 > r)
523                         r = *seqnum + 1;
524
525                 *seqnum = r;
526         }
527
528         f->header->tail_entry_seqnum = htole64(r);
529
530         if (f->header->head_entry_seqnum == 0)
531                 f->header->head_entry_seqnum = htole64(r);
532
533         return r;
534 }
535
536 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
537         int r;
538         uint64_t p;
539         Object *tail, *o;
540         void *t;
541
542         assert(f);
543         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
544         assert(size >= sizeof(ObjectHeader));
545         assert(offset);
546         assert(ret);
547
548         r = journal_file_set_online(f);
549         if (r < 0)
550                 return r;
551
552         p = le64toh(f->header->tail_object_offset);
553         if (p == 0)
554                 p = le64toh(f->header->header_size);
555         else {
556                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
557                 if (r < 0)
558                         return r;
559
560                 p += ALIGN64(le64toh(tail->object.size));
561         }
562
563         r = journal_file_allocate(f, p, size);
564         if (r < 0)
565                 return r;
566
567         r = journal_file_move_to(f, type, false, p, size, &t);
568         if (r < 0)
569                 return r;
570
571         o = (Object*) t;
572
573         zero(o->object);
574         o->object.type = type;
575         o->object.size = htole64(size);
576
577         f->header->tail_object_offset = htole64(p);
578         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
579
580         *ret = o;
581         *offset = p;
582
583         return 0;
584 }
585
586 static int journal_file_setup_data_hash_table(JournalFile *f) {
587         uint64_t s, p;
588         Object *o;
589         int r;
590
591         assert(f);
592
593         /* We estimate that we need 1 hash table entry per 768 of
594            journal file and we want to make sure we never get beyond
595            75% fill level. Calculate the hash table size for the
596            maximum file size based on these metrics. */
597
598         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
599         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
600                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
601
602         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
603
604         r = journal_file_append_object(f,
605                                        OBJECT_DATA_HASH_TABLE,
606                                        offsetof(Object, hash_table.items) + s,
607                                        &o, &p);
608         if (r < 0)
609                 return r;
610
611         memzero(o->hash_table.items, s);
612
613         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
614         f->header->data_hash_table_size = htole64(s);
615
616         return 0;
617 }
618
619 static int journal_file_setup_field_hash_table(JournalFile *f) {
620         uint64_t s, p;
621         Object *o;
622         int r;
623
624         assert(f);
625
626         /* We use a fixed size hash table for the fields as this
627          * number should grow very slowly only */
628
629         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
630         r = journal_file_append_object(f,
631                                        OBJECT_FIELD_HASH_TABLE,
632                                        offsetof(Object, hash_table.items) + s,
633                                        &o, &p);
634         if (r < 0)
635                 return r;
636
637         memzero(o->hash_table.items, s);
638
639         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
640         f->header->field_hash_table_size = htole64(s);
641
642         return 0;
643 }
644
645 static int journal_file_map_data_hash_table(JournalFile *f) {
646         uint64_t s, p;
647         void *t;
648         int r;
649
650         assert(f);
651
652         p = le64toh(f->header->data_hash_table_offset);
653         s = le64toh(f->header->data_hash_table_size);
654
655         r = journal_file_move_to(f,
656                                  OBJECT_DATA_HASH_TABLE,
657                                  true,
658                                  p, s,
659                                  &t);
660         if (r < 0)
661                 return r;
662
663         f->data_hash_table = t;
664         return 0;
665 }
666
667 static int journal_file_map_field_hash_table(JournalFile *f) {
668         uint64_t s, p;
669         void *t;
670         int r;
671
672         assert(f);
673
674         p = le64toh(f->header->field_hash_table_offset);
675         s = le64toh(f->header->field_hash_table_size);
676
677         r = journal_file_move_to(f,
678                                  OBJECT_FIELD_HASH_TABLE,
679                                  true,
680                                  p, s,
681                                  &t);
682         if (r < 0)
683                 return r;
684
685         f->field_hash_table = t;
686         return 0;
687 }
688
689 static int journal_file_link_field(
690                 JournalFile *f,
691                 Object *o,
692                 uint64_t offset,
693                 uint64_t hash) {
694
695         uint64_t p, h, m;
696         int r;
697
698         assert(f);
699         assert(o);
700         assert(offset > 0);
701
702         if (o->object.type != OBJECT_FIELD)
703                 return -EINVAL;
704
705         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
706         if (m <= 0)
707                 return -EBADMSG;
708
709         /* This might alter the window we are looking at */
710         o->field.next_hash_offset = o->field.head_data_offset = 0;
711
712         h = hash % m;
713         p = le64toh(f->field_hash_table[h].tail_hash_offset);
714         if (p == 0)
715                 f->field_hash_table[h].head_hash_offset = htole64(offset);
716         else {
717                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
718                 if (r < 0)
719                         return r;
720
721                 o->field.next_hash_offset = htole64(offset);
722         }
723
724         f->field_hash_table[h].tail_hash_offset = htole64(offset);
725
726         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
727                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
728
729         return 0;
730 }
731
732 static int journal_file_link_data(
733                 JournalFile *f,
734                 Object *o,
735                 uint64_t offset,
736                 uint64_t hash) {
737
738         uint64_t p, h, m;
739         int r;
740
741         assert(f);
742         assert(o);
743         assert(offset > 0);
744
745         if (o->object.type != OBJECT_DATA)
746                 return -EINVAL;
747
748         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
749         if (m <= 0)
750                 return -EBADMSG;
751
752         /* This might alter the window we are looking at */
753         o->data.next_hash_offset = o->data.next_field_offset = 0;
754         o->data.entry_offset = o->data.entry_array_offset = 0;
755         o->data.n_entries = 0;
756
757         h = hash % m;
758         p = le64toh(f->data_hash_table[h].tail_hash_offset);
759         if (p == 0)
760                 /* Only entry in the hash table is easy */
761                 f->data_hash_table[h].head_hash_offset = htole64(offset);
762         else {
763                 /* Move back to the previous data object, to patch in
764                  * pointer */
765
766                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
767                 if (r < 0)
768                         return r;
769
770                 o->data.next_hash_offset = htole64(offset);
771         }
772
773         f->data_hash_table[h].tail_hash_offset = htole64(offset);
774
775         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
776                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
777
778         return 0;
779 }
780
781 int journal_file_find_field_object_with_hash(
782                 JournalFile *f,
783                 const void *field, uint64_t size, uint64_t hash,
784                 Object **ret, uint64_t *offset) {
785
786         uint64_t p, osize, h, m;
787         int r;
788
789         assert(f);
790         assert(field && size > 0);
791
792         osize = offsetof(Object, field.payload) + size;
793
794         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
795
796         if (m <= 0)
797                 return -EBADMSG;
798
799         h = hash % m;
800         p = le64toh(f->field_hash_table[h].head_hash_offset);
801
802         while (p > 0) {
803                 Object *o;
804
805                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
806                 if (r < 0)
807                         return r;
808
809                 if (le64toh(o->field.hash) == hash &&
810                     le64toh(o->object.size) == osize &&
811                     memcmp(o->field.payload, field, size) == 0) {
812
813                         if (ret)
814                                 *ret = o;
815                         if (offset)
816                                 *offset = p;
817
818                         return 1;
819                 }
820
821                 p = le64toh(o->field.next_hash_offset);
822         }
823
824         return 0;
825 }
826
827 int journal_file_find_field_object(
828                 JournalFile *f,
829                 const void *field, uint64_t size,
830                 Object **ret, uint64_t *offset) {
831
832         uint64_t hash;
833
834         assert(f);
835         assert(field && size > 0);
836
837         hash = hash64(field, size);
838
839         return journal_file_find_field_object_with_hash(f,
840                                                         field, size, hash,
841                                                         ret, offset);
842 }
843
844 int journal_file_find_data_object_with_hash(
845                 JournalFile *f,
846                 const void *data, uint64_t size, uint64_t hash,
847                 Object **ret, uint64_t *offset) {
848
849         uint64_t p, osize, h, m;
850         int r;
851
852         assert(f);
853         assert(data || size == 0);
854
855         osize = offsetof(Object, data.payload) + size;
856
857         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
858         if (m <= 0)
859                 return -EBADMSG;
860
861         h = hash % m;
862         p = le64toh(f->data_hash_table[h].head_hash_offset);
863
864         while (p > 0) {
865                 Object *o;
866
867                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
868                 if (r < 0)
869                         return r;
870
871                 if (le64toh(o->data.hash) != hash)
872                         goto next;
873
874                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
875 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
876                         uint64_t l;
877                         size_t rsize;
878
879                         l = le64toh(o->object.size);
880                         if (l <= offsetof(Object, data.payload))
881                                 return -EBADMSG;
882
883                         l -= offsetof(Object, data.payload);
884
885                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
886                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
887                         if (r < 0)
888                                 return r;
889
890                         if (rsize == size &&
891                             memcmp(f->compress_buffer, data, size) == 0) {
892
893                                 if (ret)
894                                         *ret = o;
895
896                                 if (offset)
897                                         *offset = p;
898
899                                 return 1;
900                         }
901 #else
902                         return -EPROTONOSUPPORT;
903 #endif
904                 } else if (le64toh(o->object.size) == osize &&
905                            memcmp(o->data.payload, data, size) == 0) {
906
907                         if (ret)
908                                 *ret = o;
909
910                         if (offset)
911                                 *offset = p;
912
913                         return 1;
914                 }
915
916         next:
917                 p = le64toh(o->data.next_hash_offset);
918         }
919
920         return 0;
921 }
922
923 int journal_file_find_data_object(
924                 JournalFile *f,
925                 const void *data, uint64_t size,
926                 Object **ret, uint64_t *offset) {
927
928         uint64_t hash;
929
930         assert(f);
931         assert(data || size == 0);
932
933         hash = hash64(data, size);
934
935         return journal_file_find_data_object_with_hash(f,
936                                                        data, size, hash,
937                                                        ret, offset);
938 }
939
940 static int journal_file_append_field(
941                 JournalFile *f,
942                 const void *field, uint64_t size,
943                 Object **ret, uint64_t *offset) {
944
945         uint64_t hash, p;
946         uint64_t osize;
947         Object *o;
948         int r;
949
950         assert(f);
951         assert(field && size > 0);
952
953         hash = hash64(field, size);
954
955         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
956         if (r < 0)
957                 return r;
958         else if (r > 0) {
959
960                 if (ret)
961                         *ret = o;
962
963                 if (offset)
964                         *offset = p;
965
966                 return 0;
967         }
968
969         osize = offsetof(Object, field.payload) + size;
970         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
971         if (r < 0)
972                 return r;
973
974         o->field.hash = htole64(hash);
975         memcpy(o->field.payload, field, size);
976
977         r = journal_file_link_field(f, o, p, hash);
978         if (r < 0)
979                 return r;
980
981         /* The linking might have altered the window, so let's
982          * refresh our pointer */
983         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
984         if (r < 0)
985                 return r;
986
987 #ifdef HAVE_GCRYPT
988         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
989         if (r < 0)
990                 return r;
991 #endif
992
993         if (ret)
994                 *ret = o;
995
996         if (offset)
997                 *offset = p;
998
999         return 0;
1000 }
1001
1002 static int journal_file_append_data(
1003                 JournalFile *f,
1004                 const void *data, uint64_t size,
1005                 Object **ret, uint64_t *offset) {
1006
1007         uint64_t hash, p;
1008         uint64_t osize;
1009         Object *o;
1010         int r, compression = 0;
1011         const void *eq;
1012
1013         assert(f);
1014         assert(data || size == 0);
1015
1016         hash = hash64(data, size);
1017
1018         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1019         if (r < 0)
1020                 return r;
1021         else if (r > 0) {
1022
1023                 if (ret)
1024                         *ret = o;
1025
1026                 if (offset)
1027                         *offset = p;
1028
1029                 return 0;
1030         }
1031
1032         osize = offsetof(Object, data.payload) + size;
1033         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1034         if (r < 0)
1035                 return r;
1036
1037         o->data.hash = htole64(hash);
1038
1039 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1040         if (f->compress_xz &&
1041             size >= COMPRESSION_SIZE_THRESHOLD) {
1042                 size_t rsize;
1043
1044                 compression = compress_blob(data, size, o->data.payload, &rsize);
1045
1046                 if (compression) {
1047                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1048                         o->object.flags |= compression;
1049
1050                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1051                                   size, rsize, object_compressed_to_string(compression));
1052                 }
1053         }
1054 #endif
1055
1056         if (!compression && size > 0)
1057                 memcpy(o->data.payload, data, size);
1058
1059         r = journal_file_link_data(f, o, p, hash);
1060         if (r < 0)
1061                 return r;
1062
1063         /* The linking might have altered the window, so let's
1064          * refresh our pointer */
1065         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1066         if (r < 0)
1067                 return r;
1068
1069         if (!data)
1070                 eq = NULL;
1071         else
1072                 eq = memchr(data, '=', size);
1073         if (eq && eq > data) {
1074                 Object *fo = NULL;
1075                 uint64_t fp;
1076
1077                 /* Create field object ... */
1078                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1079                 if (r < 0)
1080                         return r;
1081
1082                 /* ... and link it in. */
1083                 o->data.next_field_offset = fo->field.head_data_offset;
1084                 fo->field.head_data_offset = le64toh(p);
1085         }
1086
1087 #ifdef HAVE_GCRYPT
1088         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1089         if (r < 0)
1090                 return r;
1091 #endif
1092
1093         if (ret)
1094                 *ret = o;
1095
1096         if (offset)
1097                 *offset = p;
1098
1099         return 0;
1100 }
1101
1102 uint64_t journal_file_entry_n_items(Object *o) {
1103         assert(o);
1104
1105         if (o->object.type != OBJECT_ENTRY)
1106                 return 0;
1107
1108         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1109 }
1110
1111 uint64_t journal_file_entry_array_n_items(Object *o) {
1112         assert(o);
1113
1114         if (o->object.type != OBJECT_ENTRY_ARRAY)
1115                 return 0;
1116
1117         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1118 }
1119
1120 uint64_t journal_file_hash_table_n_items(Object *o) {
1121         assert(o);
1122
1123         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1124             o->object.type != OBJECT_FIELD_HASH_TABLE)
1125                 return 0;
1126
1127         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1128 }
1129
1130 static int link_entry_into_array(JournalFile *f,
1131                                  le64_t *first,
1132                                  le64_t *idx,
1133                                  uint64_t p) {
1134         int r;
1135         uint64_t n = 0, ap = 0, q, i, a, hidx;
1136         Object *o;
1137
1138         assert(f);
1139         assert(first);
1140         assert(idx);
1141         assert(p > 0);
1142
1143         a = le64toh(*first);
1144         i = hidx = le64toh(*idx);
1145         while (a > 0) {
1146
1147                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1148                 if (r < 0)
1149                         return r;
1150
1151                 n = journal_file_entry_array_n_items(o);
1152                 if (i < n) {
1153                         o->entry_array.items[i] = htole64(p);
1154                         *idx = htole64(hidx + 1);
1155                         return 0;
1156                 }
1157
1158                 i -= n;
1159                 ap = a;
1160                 a = le64toh(o->entry_array.next_entry_array_offset);
1161         }
1162
1163         if (hidx > n)
1164                 n = (hidx+1) * 2;
1165         else
1166                 n = n * 2;
1167
1168         if (n < 4)
1169                 n = 4;
1170
1171         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1172                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1173                                        &o, &q);
1174         if (r < 0)
1175                 return r;
1176
1177 #ifdef HAVE_GCRYPT
1178         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1179         if (r < 0)
1180                 return r;
1181 #endif
1182
1183         o->entry_array.items[i] = htole64(p);
1184
1185         if (ap == 0)
1186                 *first = htole64(q);
1187         else {
1188                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1189                 if (r < 0)
1190                         return r;
1191
1192                 o->entry_array.next_entry_array_offset = htole64(q);
1193         }
1194
1195         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1196                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1197
1198         *idx = htole64(hidx + 1);
1199
1200         return 0;
1201 }
1202
1203 static int link_entry_into_array_plus_one(JournalFile *f,
1204                                           le64_t *extra,
1205                                           le64_t *first,
1206                                           le64_t *idx,
1207                                           uint64_t p) {
1208
1209         int r;
1210
1211         assert(f);
1212         assert(extra);
1213         assert(first);
1214         assert(idx);
1215         assert(p > 0);
1216
1217         if (*idx == 0)
1218                 *extra = htole64(p);
1219         else {
1220                 le64_t i;
1221
1222                 i = htole64(le64toh(*idx) - 1);
1223                 r = link_entry_into_array(f, first, &i, p);
1224                 if (r < 0)
1225                         return r;
1226         }
1227
1228         *idx = htole64(le64toh(*idx) + 1);
1229         return 0;
1230 }
1231
1232 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1233         uint64_t p;
1234         int r;
1235         assert(f);
1236         assert(o);
1237         assert(offset > 0);
1238
1239         p = le64toh(o->entry.items[i].object_offset);
1240         if (p == 0)
1241                 return -EINVAL;
1242
1243         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1244         if (r < 0)
1245                 return r;
1246
1247         return link_entry_into_array_plus_one(f,
1248                                               &o->data.entry_offset,
1249                                               &o->data.entry_array_offset,
1250                                               &o->data.n_entries,
1251                                               offset);
1252 }
1253
1254 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1255         uint64_t n, i;
1256         int r;
1257
1258         assert(f);
1259         assert(o);
1260         assert(offset > 0);
1261
1262         if (o->object.type != OBJECT_ENTRY)
1263                 return -EINVAL;
1264
1265         __sync_synchronize();
1266
1267         /* Link up the entry itself */
1268         r = link_entry_into_array(f,
1269                                   &f->header->entry_array_offset,
1270                                   &f->header->n_entries,
1271                                   offset);
1272         if (r < 0)
1273                 return r;
1274
1275         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1276
1277         if (f->header->head_entry_realtime == 0)
1278                 f->header->head_entry_realtime = o->entry.realtime;
1279
1280         f->header->tail_entry_realtime = o->entry.realtime;
1281         f->header->tail_entry_monotonic = o->entry.monotonic;
1282
1283         f->tail_entry_monotonic_valid = true;
1284
1285         /* Link up the items */
1286         n = journal_file_entry_n_items(o);
1287         for (i = 0; i < n; i++) {
1288                 r = journal_file_link_entry_item(f, o, offset, i);
1289                 if (r < 0)
1290                         return r;
1291         }
1292
1293         return 0;
1294 }
1295
1296 static int journal_file_append_entry_internal(
1297                 JournalFile *f,
1298                 const dual_timestamp *ts,
1299                 uint64_t xor_hash,
1300                 const EntryItem items[], unsigned n_items,
1301                 uint64_t *seqnum,
1302                 Object **ret, uint64_t *offset) {
1303         uint64_t np;
1304         uint64_t osize;
1305         Object *o;
1306         int r;
1307
1308         assert(f);
1309         assert(items || n_items == 0);
1310         assert(ts);
1311
1312         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1313
1314         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1315         if (r < 0)
1316                 return r;
1317
1318         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1319         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1320         o->entry.realtime = htole64(ts->realtime);
1321         o->entry.monotonic = htole64(ts->monotonic);
1322         o->entry.xor_hash = htole64(xor_hash);
1323         o->entry.boot_id = f->header->boot_id;
1324
1325 #ifdef HAVE_GCRYPT
1326         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1327         if (r < 0)
1328                 return r;
1329 #endif
1330
1331         r = journal_file_link_entry(f, o, np);
1332         if (r < 0)
1333                 return r;
1334
1335         if (ret)
1336                 *ret = o;
1337
1338         if (offset)
1339                 *offset = np;
1340
1341         return 0;
1342 }
1343
1344 void journal_file_post_change(JournalFile *f) {
1345         assert(f);
1346
1347         /* inotify() does not receive IN_MODIFY events from file
1348          * accesses done via mmap(). After each access we hence
1349          * trigger IN_MODIFY by truncating the journal file to its
1350          * current size which triggers IN_MODIFY. */
1351
1352         __sync_synchronize();
1353
1354         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1355                 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1356 }
1357
1358 static int entry_item_cmp(const void *_a, const void *_b) {
1359         const EntryItem *a = _a, *b = _b;
1360
1361         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1362                 return -1;
1363         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1364                 return 1;
1365         return 0;
1366 }
1367
1368 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1369         unsigned i;
1370         EntryItem *items;
1371         int r;
1372         uint64_t xor_hash = 0;
1373         struct dual_timestamp _ts;
1374
1375         assert(f);
1376         assert(iovec || n_iovec == 0);
1377
1378         if (!ts) {
1379                 dual_timestamp_get(&_ts);
1380                 ts = &_ts;
1381         }
1382
1383         if (f->tail_entry_monotonic_valid &&
1384             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1385                 return -EINVAL;
1386
1387 #ifdef HAVE_GCRYPT
1388         r = journal_file_maybe_append_tag(f, ts->realtime);
1389         if (r < 0)
1390                 return r;
1391 #endif
1392
1393         /* alloca() can't take 0, hence let's allocate at least one */
1394         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1395
1396         for (i = 0; i < n_iovec; i++) {
1397                 uint64_t p;
1398                 Object *o;
1399
1400                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1401                 if (r < 0)
1402                         return r;
1403
1404                 xor_hash ^= le64toh(o->data.hash);
1405                 items[i].object_offset = htole64(p);
1406                 items[i].hash = o->data.hash;
1407         }
1408
1409         /* Order by the position on disk, in order to improve seek
1410          * times for rotating media. */
1411         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1412
1413         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1414
1415         /* If the memory mapping triggered a SIGBUS then we return an
1416          * IO error and ignore the error code passed down to us, since
1417          * it is very likely just an effect of a nullified replacement
1418          * mapping page */
1419
1420         if (mmap_cache_got_sigbus(f->mmap, f->fd))
1421                 r = -EIO;
1422
1423         journal_file_post_change(f);
1424
1425         return r;
1426 }
1427
/* Remembers how far into a chain of entry arrays a previous lookup
 * got, so that a later lookup in the same chain can skip ahead. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in
         * the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when
         * bisecting */
        uint64_t last_index;
} ChainCacheItem;
1435
1436 static void chain_cache_put(
1437                 OrderedHashmap *h,
1438                 ChainCacheItem *ci,
1439                 uint64_t first,
1440                 uint64_t array,
1441                 uint64_t begin,
1442                 uint64_t total,
1443                 uint64_t last_index) {
1444
1445         if (!ci) {
1446                 /* If the chain item to cache for this chain is the
1447                  * first one it's not worth caching anything */
1448                 if (array == first)
1449                         return;
1450
1451                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1452                         ci = ordered_hashmap_steal_first(h);
1453                         assert(ci);
1454                 } else {
1455                         ci = new(ChainCacheItem, 1);
1456                         if (!ci)
1457                                 return;
1458                 }
1459
1460                 ci->first = first;
1461
1462                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1463                         free(ci);
1464                         return;
1465                 }
1466         } else
1467                 assert(ci->first == first);
1468
1469         ci->array = array;
1470         ci->begin = begin;
1471         ci->total = total;
1472         ci->last_index = last_index;
1473 }
1474
1475 static int generic_array_get(
1476                 JournalFile *f,
1477                 uint64_t first,
1478                 uint64_t i,
1479                 Object **ret, uint64_t *offset) {
1480
1481         Object *o;
1482         uint64_t p = 0, a, t = 0;
1483         int r;
1484         ChainCacheItem *ci;
1485
1486         assert(f);
1487
1488         a = first;
1489
1490         /* Try the chain cache first */
1491         ci = ordered_hashmap_get(f->chain_cache, &first);
1492         if (ci && i > ci->total) {
1493                 a = ci->array;
1494                 i -= ci->total;
1495                 t = ci->total;
1496         }
1497
1498         while (a > 0) {
1499                 uint64_t k;
1500
1501                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1502                 if (r < 0)
1503                         return r;
1504
1505                 k = journal_file_entry_array_n_items(o);
1506                 if (i < k) {
1507                         p = le64toh(o->entry_array.items[i]);
1508                         goto found;
1509                 }
1510
1511                 i -= k;
1512                 t += k;
1513                 a = le64toh(o->entry_array.next_entry_array_offset);
1514         }
1515
1516         return 0;
1517
1518 found:
1519         /* Let's cache this item for the next invocation */
1520         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1521
1522         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1523         if (r < 0)
1524                 return r;
1525
1526         if (ret)
1527                 *ret = o;
1528
1529         if (offset)
1530                 *offset = p;
1531
1532         return 1;
1533 }
1534
1535 static int generic_array_get_plus_one(
1536                 JournalFile *f,
1537                 uint64_t extra,
1538                 uint64_t first,
1539                 uint64_t i,
1540                 Object **ret, uint64_t *offset) {
1541
1542         Object *o;
1543
1544         assert(f);
1545
1546         if (i == 0) {
1547                 int r;
1548
1549                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1550                 if (r < 0)
1551                         return r;
1552
1553                 if (ret)
1554                         *ret = o;
1555
1556                 if (offset)
1557                         *offset = extra;
1558
1559                 return 1;
1560         }
1561
1562         return generic_array_get(f, first, i-1, ret, offset);
1563 }
1564
/* Results of the test_object() callbacks used when bisecting entry
 * arrays. */
enum {
        TEST_FOUND = 0, /* the tested entry matches the needle */
        TEST_LEFT  = 1, /* the tested entry sorts before the needle */
        TEST_RIGHT = 2  /* the tested entry sorts after the needle */
};
1570
1571 static int generic_array_bisect(
1572                 JournalFile *f,
1573                 uint64_t first,
1574                 uint64_t n,
1575                 uint64_t needle,
1576                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1577                 direction_t direction,
1578                 Object **ret,
1579                 uint64_t *offset,
1580                 uint64_t *idx) {
1581
1582         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1583         bool subtract_one = false;
1584         Object *o, *array = NULL;
1585         int r;
1586         ChainCacheItem *ci;
1587
1588         assert(f);
1589         assert(test_object);
1590
1591         /* Start with the first array in the chain */
1592         a = first;
1593
1594         ci = ordered_hashmap_get(f->chain_cache, &first);
1595         if (ci && n > ci->total) {
1596                 /* Ah, we have iterated this bisection array chain
1597                  * previously! Let's see if we can skip ahead in the
1598                  * chain, as far as the last time. But we can't jump
1599                  * backwards in the chain, so let's check that
1600                  * first. */
1601
1602                 r = test_object(f, ci->begin, needle);
1603                 if (r < 0)
1604                         return r;
1605
1606                 if (r == TEST_LEFT) {
1607                         /* OK, what we are looking for is right of the
1608                          * begin of this EntryArray, so let's jump
1609                          * straight to previously cached array in the
1610                          * chain */
1611
1612                         a = ci->array;
1613                         n -= ci->total;
1614                         t = ci->total;
1615                         last_index = ci->last_index;
1616                 }
1617         }
1618
1619         while (a > 0) {
1620                 uint64_t left, right, k, lp;
1621
1622                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1623                 if (r < 0)
1624                         return r;
1625
1626                 k = journal_file_entry_array_n_items(array);
1627                 right = MIN(k, n);
1628                 if (right <= 0)
1629                         return 0;
1630
1631                 i = right - 1;
1632                 lp = p = le64toh(array->entry_array.items[i]);
1633                 if (p <= 0)
1634                         return -EBADMSG;
1635
1636                 r = test_object(f, p, needle);
1637                 if (r < 0)
1638                         return r;
1639
1640                 if (r == TEST_FOUND)
1641                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1642
1643                 if (r == TEST_RIGHT) {
1644                         left = 0;
1645                         right -= 1;
1646
1647                         if (last_index != (uint64_t) -1) {
1648                                 assert(last_index <= right);
1649
1650                                 /* If we cached the last index we
1651                                  * looked at, let's try to not to jump
1652                                  * too wildly around and see if we can
1653                                  * limit the range to look at early to
1654                                  * the immediate neighbors of the last
1655                                  * index we looked at. */
1656
1657                                 if (last_index > 0) {
1658                                         uint64_t x = last_index - 1;
1659
1660                                         p = le64toh(array->entry_array.items[x]);
1661                                         if (p <= 0)
1662                                                 return -EBADMSG;
1663
1664                                         r = test_object(f, p, needle);
1665                                         if (r < 0)
1666                                                 return r;
1667
1668                                         if (r == TEST_FOUND)
1669                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1670
1671                                         if (r == TEST_RIGHT)
1672                                                 right = x;
1673                                         else
1674                                                 left = x + 1;
1675                                 }
1676
1677                                 if (last_index < right) {
1678                                         uint64_t y = last_index + 1;
1679
1680                                         p = le64toh(array->entry_array.items[y]);
1681                                         if (p <= 0)
1682                                                 return -EBADMSG;
1683
1684                                         r = test_object(f, p, needle);
1685                                         if (r < 0)
1686                                                 return r;
1687
1688                                         if (r == TEST_FOUND)
1689                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1690
1691                                         if (r == TEST_RIGHT)
1692                                                 right = y;
1693                                         else
1694                                                 left = y + 1;
1695                                 }
1696                         }
1697
1698                         for (;;) {
1699                                 if (left == right) {
1700                                         if (direction == DIRECTION_UP)
1701                                                 subtract_one = true;
1702
1703                                         i = left;
1704                                         goto found;
1705                                 }
1706
1707                                 assert(left < right);
1708                                 i = (left + right) / 2;
1709
1710                                 p = le64toh(array->entry_array.items[i]);
1711                                 if (p <= 0)
1712                                         return -EBADMSG;
1713
1714                                 r = test_object(f, p, needle);
1715                                 if (r < 0)
1716                                         return r;
1717
1718                                 if (r == TEST_FOUND)
1719                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1720
1721                                 if (r == TEST_RIGHT)
1722                                         right = i;
1723                                 else
1724                                         left = i + 1;
1725                         }
1726                 }
1727
1728                 if (k >= n) {
1729                         if (direction == DIRECTION_UP) {
1730                                 i = n;
1731                                 subtract_one = true;
1732                                 goto found;
1733                         }
1734
1735                         return 0;
1736                 }
1737
1738                 last_p = lp;
1739
1740                 n -= k;
1741                 t += k;
1742                 last_index = (uint64_t) -1;
1743                 a = le64toh(array->entry_array.next_entry_array_offset);
1744         }
1745
1746         return 0;
1747
1748 found:
1749         if (subtract_one && t == 0 && i == 0)
1750                 return 0;
1751
1752         /* Let's cache this item for the next invocation */
1753         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1754
1755         if (subtract_one && i == 0)
1756                 p = last_p;
1757         else if (subtract_one)
1758                 p = le64toh(array->entry_array.items[i-1]);
1759         else
1760                 p = le64toh(array->entry_array.items[i]);
1761
1762         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1763         if (r < 0)
1764                 return r;
1765
1766         if (ret)
1767                 *ret = o;
1768
1769         if (offset)
1770                 *offset = p;
1771
1772         if (idx)
1773                 *idx = t + i + (subtract_one ? -1 : 0);
1774
1775         return 1;
1776 }
1777
1778 static int generic_array_bisect_plus_one(
1779                 JournalFile *f,
1780                 uint64_t extra,
1781                 uint64_t first,
1782                 uint64_t n,
1783                 uint64_t needle,
1784                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1785                 direction_t direction,
1786                 Object **ret,
1787                 uint64_t *offset,
1788                 uint64_t *idx) {
1789
1790         int r;
1791         bool step_back = false;
1792         Object *o;
1793
1794         assert(f);
1795         assert(test_object);
1796
1797         if (n <= 0)
1798                 return 0;
1799
1800         /* This bisects the array in object 'first', but first checks
1801          * an extra  */
1802         r = test_object(f, extra, needle);
1803         if (r < 0)
1804                 return r;
1805
1806         if (r == TEST_FOUND)
1807                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1808
1809         /* if we are looking with DIRECTION_UP then we need to first
1810            see if in the actual array there is a matching entry, and
1811            return the last one of that. But if there isn't any we need
1812            to return this one. Hence remember this, and return it
1813            below. */
1814         if (r == TEST_LEFT)
1815                 step_back = direction == DIRECTION_UP;
1816
1817         if (r == TEST_RIGHT) {
1818                 if (direction == DIRECTION_DOWN)
1819                         goto found;
1820                 else
1821                         return 0;
1822         }
1823
1824         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1825
1826         if (r == 0 && step_back)
1827                 goto found;
1828
1829         if (r > 0 && idx)
1830                 (*idx) ++;
1831
1832         return r;
1833
1834 found:
1835         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1836         if (r < 0)
1837                 return r;
1838
1839         if (ret)
1840                 *ret = o;
1841
1842         if (offset)
1843                 *offset = extra;
1844
1845         if (idx)
1846                 *idx = 0;
1847
1848         return 1;
1849 }
1850
1851 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1852         assert(f);
1853         assert(p > 0);
1854
1855         if (p == needle)
1856                 return TEST_FOUND;
1857         else if (p < needle)
1858                 return TEST_LEFT;
1859         else
1860                 return TEST_RIGHT;
1861 }
1862
1863 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1864         Object *o;
1865         int r;
1866
1867         assert(f);
1868         assert(p > 0);
1869
1870         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1871         if (r < 0)
1872                 return r;
1873
1874         if (le64toh(o->entry.seqnum) == needle)
1875                 return TEST_FOUND;
1876         else if (le64toh(o->entry.seqnum) < needle)
1877                 return TEST_LEFT;
1878         else
1879                 return TEST_RIGHT;
1880 }
1881
1882 int journal_file_move_to_entry_by_seqnum(
1883                 JournalFile *f,
1884                 uint64_t seqnum,
1885                 direction_t direction,
1886                 Object **ret,
1887                 uint64_t *offset) {
1888
1889         return generic_array_bisect(f,
1890                                     le64toh(f->header->entry_array_offset),
1891                                     le64toh(f->header->n_entries),
1892                                     seqnum,
1893                                     test_object_seqnum,
1894                                     direction,
1895                                     ret, offset, NULL);
1896 }
1897
1898 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1899         Object *o;
1900         int r;
1901
1902         assert(f);
1903         assert(p > 0);
1904
1905         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1906         if (r < 0)
1907                 return r;
1908
1909         if (le64toh(o->entry.realtime) == needle)
1910                 return TEST_FOUND;
1911         else if (le64toh(o->entry.realtime) < needle)
1912                 return TEST_LEFT;
1913         else
1914                 return TEST_RIGHT;
1915 }
1916
1917 int journal_file_move_to_entry_by_realtime(
1918                 JournalFile *f,
1919                 uint64_t realtime,
1920                 direction_t direction,
1921                 Object **ret,
1922                 uint64_t *offset) {
1923
1924         return generic_array_bisect(f,
1925                                     le64toh(f->header->entry_array_offset),
1926                                     le64toh(f->header->n_entries),
1927                                     realtime,
1928                                     test_object_realtime,
1929                                     direction,
1930                                     ret, offset, NULL);
1931 }
1932
1933 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1934         Object *o;
1935         int r;
1936
1937         assert(f);
1938         assert(p > 0);
1939
1940         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1941         if (r < 0)
1942                 return r;
1943
1944         if (le64toh(o->entry.monotonic) == needle)
1945                 return TEST_FOUND;
1946         else if (le64toh(o->entry.monotonic) < needle)
1947                 return TEST_LEFT;
1948         else
1949                 return TEST_RIGHT;
1950 }
1951
1952 static inline int find_data_object_by_boot_id(
1953                 JournalFile *f,
1954                 sd_id128_t boot_id,
1955                 Object **o,
1956                 uint64_t *b) {
1957         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1958
1959         sd_id128_to_string(boot_id, t + 9);
1960         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1961 }
1962
1963 int journal_file_move_to_entry_by_monotonic(
1964                 JournalFile *f,
1965                 sd_id128_t boot_id,
1966                 uint64_t monotonic,
1967                 direction_t direction,
1968                 Object **ret,
1969                 uint64_t *offset) {
1970
1971         Object *o;
1972         int r;
1973
1974         assert(f);
1975
1976         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1977         if (r < 0)
1978                 return r;
1979         if (r == 0)
1980                 return -ENOENT;
1981
1982         return generic_array_bisect_plus_one(f,
1983                                              le64toh(o->data.entry_offset),
1984                                              le64toh(o->data.entry_array_offset),
1985                                              le64toh(o->data.n_entries),
1986                                              monotonic,
1987                                              test_object_monotonic,
1988                                              direction,
1989                                              ret, offset, NULL);
1990 }
1991
1992 void journal_file_reset_location(JournalFile *f) {
1993         f->location_type = LOCATION_HEAD;
1994         f->current_offset = 0;
1995         f->current_seqnum = 0;
1996         f->current_realtime = 0;
1997         f->current_monotonic = 0;
1998         zero(f->current_boot_id);
1999         f->current_xor_hash = 0;
2000 }
2001
2002 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
2003         f->last_direction = direction;
2004         f->location_type = LOCATION_SEEK;
2005         f->current_offset = offset;
2006         f->current_seqnum = le64toh(o->entry.seqnum);
2007         f->current_realtime = le64toh(o->entry.realtime);
2008         f->current_monotonic = le64toh(o->entry.monotonic);
2009         f->current_boot_id = o->entry.boot_id;
2010         f->current_xor_hash = le64toh(o->entry.xor_hash);
2011 }
2012
2013 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2014         assert(af);
2015         assert(bf);
2016         assert(af->location_type == LOCATION_SEEK);
2017         assert(bf->location_type == LOCATION_SEEK);
2018
2019         /* If contents and timestamps match, these entries are
2020          * identical, even if the seqnum does not match */
2021         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2022             af->current_monotonic == bf->current_monotonic &&
2023             af->current_realtime == bf->current_realtime &&
2024             af->current_xor_hash == bf->current_xor_hash)
2025                 return 0;
2026
2027         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2028
2029                 /* If this is from the same seqnum source, compare
2030                  * seqnums */
2031                 if (af->current_seqnum < bf->current_seqnum)
2032                         return -1;
2033                 if (af->current_seqnum > bf->current_seqnum)
2034                         return 1;
2035
2036                 /* Wow! This is weird, different data but the same
2037                  * seqnums? Something is borked, but let's make the
2038                  * best of it and compare by time. */
2039         }
2040
2041         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2042
2043                 /* If the boot id matches, compare monotonic time */
2044                 if (af->current_monotonic < bf->current_monotonic)
2045                         return -1;
2046                 if (af->current_monotonic > bf->current_monotonic)
2047                         return 1;
2048         }
2049
2050         /* Otherwise, compare UTC time */
2051         if (af->current_realtime < bf->current_realtime)
2052                 return -1;
2053         if (af->current_realtime > bf->current_realtime)
2054                 return 1;
2055
2056         /* Finally, compare by contents */
2057         if (af->current_xor_hash < bf->current_xor_hash)
2058                 return -1;
2059         if (af->current_xor_hash > bf->current_xor_hash)
2060                 return 1;
2061
2062         return 0;
2063 }
2064
2065 int journal_file_next_entry(
2066                 JournalFile *f,
2067                 uint64_t p,
2068                 direction_t direction,
2069                 Object **ret, uint64_t *offset) {
2070
2071         uint64_t i, n, ofs;
2072         int r;
2073
2074         assert(f);
2075
2076         n = le64toh(f->header->n_entries);
2077         if (n <= 0)
2078                 return 0;
2079
2080         if (p == 0)
2081                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2082         else {
2083                 r = generic_array_bisect(f,
2084                                          le64toh(f->header->entry_array_offset),
2085                                          le64toh(f->header->n_entries),
2086                                          p,
2087                                          test_object_offset,
2088                                          DIRECTION_DOWN,
2089                                          NULL, NULL,
2090                                          &i);
2091                 if (r <= 0)
2092                         return r;
2093
2094                 if (direction == DIRECTION_DOWN) {
2095                         if (i >= n - 1)
2096                                 return 0;
2097
2098                         i++;
2099                 } else {
2100                         if (i <= 0)
2101                                 return 0;
2102
2103                         i--;
2104                 }
2105         }
2106
2107         /* And jump to it */
2108         r = generic_array_get(f,
2109                               le64toh(f->header->entry_array_offset),
2110                               i,
2111                               ret, &ofs);
2112         if (r <= 0)
2113                 return r;
2114
2115         if (p > 0 &&
2116             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2117                 log_debug("%s: entry array corrupted at entry %"PRIu64,
2118                           f->path, i);
2119                 return -EBADMSG;
2120         }
2121
2122         if (offset)
2123                 *offset = ofs;
2124
2125         return 1;
2126 }
2127
2128 int journal_file_next_entry_for_data(
2129                 JournalFile *f,
2130                 Object *o, uint64_t p,
2131                 uint64_t data_offset,
2132                 direction_t direction,
2133                 Object **ret, uint64_t *offset) {
2134
2135         uint64_t n, i;
2136         int r;
2137         Object *d;
2138
2139         assert(f);
2140         assert(p > 0 || !o);
2141
2142         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2143         if (r < 0)
2144                 return r;
2145
2146         n = le64toh(d->data.n_entries);
2147         if (n <= 0)
2148                 return n;
2149
2150         if (!o)
2151                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2152         else {
2153                 if (o->object.type != OBJECT_ENTRY)
2154                         return -EINVAL;
2155
2156                 r = generic_array_bisect_plus_one(f,
2157                                                   le64toh(d->data.entry_offset),
2158                                                   le64toh(d->data.entry_array_offset),
2159                                                   le64toh(d->data.n_entries),
2160                                                   p,
2161                                                   test_object_offset,
2162                                                   DIRECTION_DOWN,
2163                                                   NULL, NULL,
2164                                                   &i);
2165
2166                 if (r <= 0)
2167                         return r;
2168
2169                 if (direction == DIRECTION_DOWN) {
2170                         if (i >= n - 1)
2171                                 return 0;
2172
2173                         i++;
2174                 } else {
2175                         if (i <= 0)
2176                                 return 0;
2177
2178                         i--;
2179                 }
2180
2181         }
2182
2183         return generic_array_get_plus_one(f,
2184                                           le64toh(d->data.entry_offset),
2185                                           le64toh(d->data.entry_array_offset),
2186                                           i,
2187                                           ret, offset);
2188 }
2189
2190 int journal_file_move_to_entry_by_offset_for_data(
2191                 JournalFile *f,
2192                 uint64_t data_offset,
2193                 uint64_t p,
2194                 direction_t direction,
2195                 Object **ret, uint64_t *offset) {
2196
2197         int r;
2198         Object *d;
2199
2200         assert(f);
2201
2202         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2203         if (r < 0)
2204                 return r;
2205
2206         return generic_array_bisect_plus_one(f,
2207                                              le64toh(d->data.entry_offset),
2208                                              le64toh(d->data.entry_array_offset),
2209                                              le64toh(d->data.n_entries),
2210                                              p,
2211                                              test_object_offset,
2212                                              direction,
2213                                              ret, offset, NULL);
2214 }
2215
2216 int journal_file_move_to_entry_by_monotonic_for_data(
2217                 JournalFile *f,
2218                 uint64_t data_offset,
2219                 sd_id128_t boot_id,
2220                 uint64_t monotonic,
2221                 direction_t direction,
2222                 Object **ret, uint64_t *offset) {
2223
2224         Object *o, *d;
2225         int r;
2226         uint64_t b, z;
2227
2228         assert(f);
2229
2230         /* First, seek by time */
2231         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2232         if (r < 0)
2233                 return r;
2234         if (r == 0)
2235                 return -ENOENT;
2236
2237         r = generic_array_bisect_plus_one(f,
2238                                           le64toh(o->data.entry_offset),
2239                                           le64toh(o->data.entry_array_offset),
2240                                           le64toh(o->data.n_entries),
2241                                           monotonic,
2242                                           test_object_monotonic,
2243                                           direction,
2244                                           NULL, &z, NULL);
2245         if (r <= 0)
2246                 return r;
2247
2248         /* And now, continue seeking until we find an entry that
2249          * exists in both bisection arrays */
2250
2251         for (;;) {
2252                 Object *qo;
2253                 uint64_t p, q;
2254
2255                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2256                 if (r < 0)
2257                         return r;
2258
2259                 r = generic_array_bisect_plus_one(f,
2260                                                   le64toh(d->data.entry_offset),
2261                                                   le64toh(d->data.entry_array_offset),
2262                                                   le64toh(d->data.n_entries),
2263                                                   z,
2264                                                   test_object_offset,
2265                                                   direction,
2266                                                   NULL, &p, NULL);
2267                 if (r <= 0)
2268                         return r;
2269
2270                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2271                 if (r < 0)
2272                         return r;
2273
2274                 r = generic_array_bisect_plus_one(f,
2275                                                   le64toh(o->data.entry_offset),
2276                                                   le64toh(o->data.entry_array_offset),
2277                                                   le64toh(o->data.n_entries),
2278                                                   p,
2279                                                   test_object_offset,
2280                                                   direction,
2281                                                   &qo, &q, NULL);
2282
2283                 if (r <= 0)
2284                         return r;
2285
2286                 if (p == q) {
2287                         if (ret)
2288                                 *ret = qo;
2289                         if (offset)
2290                                 *offset = q;
2291
2292                         return 1;
2293                 }
2294
2295                 z = q;
2296         }
2297 }
2298
2299 int journal_file_move_to_entry_by_seqnum_for_data(
2300                 JournalFile *f,
2301                 uint64_t data_offset,
2302                 uint64_t seqnum,
2303                 direction_t direction,
2304                 Object **ret, uint64_t *offset) {
2305
2306         Object *d;
2307         int r;
2308
2309         assert(f);
2310
2311         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2312         if (r < 0)
2313                 return r;
2314
2315         return generic_array_bisect_plus_one(f,
2316                                              le64toh(d->data.entry_offset),
2317                                              le64toh(d->data.entry_array_offset),
2318                                              le64toh(d->data.n_entries),
2319                                              seqnum,
2320                                              test_object_seqnum,
2321                                              direction,
2322                                              ret, offset, NULL);
2323 }
2324
2325 int journal_file_move_to_entry_by_realtime_for_data(
2326                 JournalFile *f,
2327                 uint64_t data_offset,
2328                 uint64_t realtime,
2329                 direction_t direction,
2330                 Object **ret, uint64_t *offset) {
2331
2332         Object *d;
2333         int r;
2334
2335         assert(f);
2336
2337         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2338         if (r < 0)
2339                 return r;
2340
2341         return generic_array_bisect_plus_one(f,
2342                                              le64toh(d->data.entry_offset),
2343                                              le64toh(d->data.entry_array_offset),
2344                                              le64toh(d->data.n_entries),
2345                                              realtime,
2346                                              test_object_realtime,
2347                                              direction,
2348                                              ret, offset, NULL);
2349 }
2350
2351 void journal_file_dump(JournalFile *f) {
2352         Object *o;
2353         int r;
2354         uint64_t p;
2355
2356         assert(f);
2357
2358         journal_file_print_header(f);
2359
2360         p = le64toh(f->header->header_size);
2361         while (p != 0) {
2362                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2363                 if (r < 0)
2364                         goto fail;
2365
2366                 switch (o->object.type) {
2367
2368                 case OBJECT_UNUSED:
2369                         printf("Type: OBJECT_UNUSED\n");
2370                         break;
2371
2372                 case OBJECT_DATA:
2373                         printf("Type: OBJECT_DATA\n");
2374                         break;
2375
2376                 case OBJECT_FIELD:
2377                         printf("Type: OBJECT_FIELD\n");
2378                         break;
2379
2380                 case OBJECT_ENTRY:
2381                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2382                                le64toh(o->entry.seqnum),
2383                                le64toh(o->entry.monotonic),
2384                                le64toh(o->entry.realtime));
2385                         break;
2386
2387                 case OBJECT_FIELD_HASH_TABLE:
2388                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2389                         break;
2390
2391                 case OBJECT_DATA_HASH_TABLE:
2392                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2393                         break;
2394
2395                 case OBJECT_ENTRY_ARRAY:
2396                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2397                         break;
2398
2399                 case OBJECT_TAG:
2400                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2401                                le64toh(o->tag.seqnum),
2402                                le64toh(o->tag.epoch));
2403                         break;
2404
2405                 default:
2406                         printf("Type: unknown (%u)\n", o->object.type);
2407                         break;
2408                 }
2409
2410                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2411                         printf("Flags: %s\n",
2412                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2413
2414                 if (p == le64toh(f->header->tail_object_offset))
2415                         p = 0;
2416                 else
2417                         p = p + ALIGN64(le64toh(o->object.size));
2418         }
2419
2420         return;
2421 fail:
2422         log_error("File corrupt");
2423 }
2424
2425 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2426         const char *x;
2427
2428         x = format_timestamp(buf, l, t);
2429         if (x)
2430                 return x;
2431         return " --- ";
2432 }
2433
2434 void journal_file_print_header(JournalFile *f) {
2435         char a[33], b[33], c[33], d[33];
2436         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2437         struct stat st;
2438         char bytes[FORMAT_BYTES_MAX];
2439
2440         assert(f);
2441
2442         printf("File Path: %s\n"
2443                "File ID: %s\n"
2444                "Machine ID: %s\n"
2445                "Boot ID: %s\n"
2446                "Sequential Number ID: %s\n"
2447                "State: %s\n"
2448                "Compatible Flags:%s%s\n"
2449                "Incompatible Flags:%s%s%s\n"
2450                "Header size: %"PRIu64"\n"
2451                "Arena size: %"PRIu64"\n"
2452                "Data Hash Table Size: %"PRIu64"\n"
2453                "Field Hash Table Size: %"PRIu64"\n"
2454                "Rotate Suggested: %s\n"
2455                "Head Sequential Number: %"PRIu64"\n"
2456                "Tail Sequential Number: %"PRIu64"\n"
2457                "Head Realtime Timestamp: %s\n"
2458                "Tail Realtime Timestamp: %s\n"
2459                "Tail Monotonic Timestamp: %s\n"
2460                "Objects: %"PRIu64"\n"
2461                "Entry Objects: %"PRIu64"\n",
2462                f->path,
2463                sd_id128_to_string(f->header->file_id, a),
2464                sd_id128_to_string(f->header->machine_id, b),
2465                sd_id128_to_string(f->header->boot_id, c),
2466                sd_id128_to_string(f->header->seqnum_id, d),
2467                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2468                f->header->state == STATE_ONLINE ? "ONLINE" :
2469                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2470                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2471                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2472                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2473                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2474                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2475                le64toh(f->header->header_size),
2476                le64toh(f->header->arena_size),
2477                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2478                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2479                yes_no(journal_file_rotate_suggested(f, 0)),
2480                le64toh(f->header->head_entry_seqnum),
2481                le64toh(f->header->tail_entry_seqnum),
2482                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2483                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2484                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2485                le64toh(f->header->n_objects),
2486                le64toh(f->header->n_entries));
2487
2488         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2489                 printf("Data Objects: %"PRIu64"\n"
2490                        "Data Hash Table Fill: %.1f%%\n",
2491                        le64toh(f->header->n_data),
2492                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2493
2494         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2495                 printf("Field Objects: %"PRIu64"\n"
2496                        "Field Hash Table Fill: %.1f%%\n",
2497                        le64toh(f->header->n_fields),
2498                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2499
2500         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2501                 printf("Tag Objects: %"PRIu64"\n",
2502                        le64toh(f->header->n_tags));
2503         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2504                 printf("Entry Array Objects: %"PRIu64"\n",
2505                        le64toh(f->header->n_entry_arrays));
2506
2507         if (fstat(f->fd, &st) >= 0)
2508                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2509 }
2510
2511 int journal_file_open(
2512                 const char *fname,
2513                 int flags,
2514                 mode_t mode,
2515                 bool compress,
2516                 bool seal,
2517                 JournalMetrics *metrics,
2518                 MMapCache *mmap_cache,
2519                 JournalFile *template,
2520                 JournalFile **ret) {
2521
2522         bool newly_created = false;
2523         JournalFile *f;
2524         void *h;
2525         int r;
2526
2527         assert(fname);
2528         assert(ret);
2529
2530         if ((flags & O_ACCMODE) != O_RDONLY &&
2531             (flags & O_ACCMODE) != O_RDWR)
2532                 return -EINVAL;
2533
2534         if (!endswith(fname, ".journal") &&
2535             !endswith(fname, ".journal~"))
2536                 return -EINVAL;
2537
2538         f = new0(JournalFile, 1);
2539         if (!f)
2540                 return -ENOMEM;
2541
2542         f->fd = -1;
2543         f->mode = mode;
2544
2545         f->flags = flags;
2546         f->prot = prot_from_flags(flags);
2547         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2548 #if defined(HAVE_LZ4)
2549         f->compress_lz4 = compress;
2550 #elif defined(HAVE_XZ)
2551         f->compress_xz = compress;
2552 #endif
2553 #ifdef HAVE_GCRYPT
2554         f->seal = seal;
2555 #endif
2556
2557         if (mmap_cache)
2558                 f->mmap = mmap_cache_ref(mmap_cache);
2559         else {
2560                 f->mmap = mmap_cache_new();
2561                 if (!f->mmap) {
2562                         r = -ENOMEM;
2563                         goto fail;
2564                 }
2565         }
2566
2567         f->path = strdup(fname);
2568         if (!f->path) {
2569                 r = -ENOMEM;
2570                 goto fail;
2571         }
2572
2573         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2574         if (!f->chain_cache) {
2575                 r = -ENOMEM;
2576                 goto fail;
2577         }
2578
2579         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2580         if (f->fd < 0) {
2581                 r = -errno;
2582                 goto fail;
2583         }
2584
2585         r = journal_file_fstat(f);
2586         if (r < 0)
2587                 goto fail;
2588
2589         if (f->last_stat.st_size == 0 && f->writable) {
2590                 /* Let's attach the creation time to the journal file,
2591                  * so that the vacuuming code knows the age of this
2592                  * file even if the file might end up corrupted one
2593                  * day... Ideally we'd just use the creation time many
2594                  * file systems maintain for each file, but there is
2595                  * currently no usable API to query this, hence let's
2596                  * emulate this via extended attributes. If extended
2597                  * attributes are not supported we'll just skip this,
2598                  * and rely solely on mtime/atime/ctime of the file. */
2599
2600                 fd_setcrtime(f->fd, now(CLOCK_REALTIME));
2601
2602 #ifdef HAVE_GCRYPT
2603                 /* Try to load the FSPRG state, and if we can't, then
2604                  * just don't do sealing */
2605                 if (f->seal) {
2606                         r = journal_file_fss_load(f);
2607                         if (r < 0)
2608                                 f->seal = false;
2609                 }
2610 #endif
2611
2612                 r = journal_file_init_header(f, template);
2613                 if (r < 0)
2614                         goto fail;
2615
2616                 r = journal_file_fstat(f);
2617                 if (r < 0)
2618                         goto fail;
2619
2620                 newly_created = true;
2621         }
2622
2623         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2624                 r = -EIO;
2625                 goto fail;
2626         }
2627
2628         r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2629         if (r < 0) {
2630                 r = -errno;
2631                 goto fail;
2632         }
2633
2634         f->header = h;
2635
2636         if (!newly_created) {
2637                 r = journal_file_verify_header(f);
2638                 if (r < 0)
2639                         goto fail;
2640         }
2641
2642 #ifdef HAVE_GCRYPT
2643         if (!newly_created && f->writable) {
2644                 r = journal_file_fss_load(f);
2645                 if (r < 0)
2646                         goto fail;
2647         }
2648 #endif
2649
2650         if (f->writable) {
2651                 if (metrics) {
2652                         journal_default_metrics(metrics, f->fd);
2653                         f->metrics = *metrics;
2654                 } else if (template)
2655                         f->metrics = template->metrics;
2656
2657                 r = journal_file_refresh_header(f);
2658                 if (r < 0)
2659                         goto fail;
2660         }
2661
2662 #ifdef HAVE_GCRYPT
2663         r = journal_file_hmac_setup(f);
2664         if (r < 0)
2665                 goto fail;
2666 #endif
2667
2668         if (newly_created) {
2669                 r = journal_file_setup_field_hash_table(f);
2670                 if (r < 0)
2671                         goto fail;
2672
2673                 r = journal_file_setup_data_hash_table(f);
2674                 if (r < 0)
2675                         goto fail;
2676
2677 #ifdef HAVE_GCRYPT
2678                 r = journal_file_append_first_tag(f);
2679                 if (r < 0)
2680                         goto fail;
2681 #endif
2682         }
2683
2684         r = journal_file_map_field_hash_table(f);
2685         if (r < 0)
2686                 goto fail;
2687
2688         r = journal_file_map_data_hash_table(f);
2689         if (r < 0)
2690                 goto fail;
2691
2692         if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2693                 r = -EIO;
2694                 goto fail;
2695         }
2696
2697         *ret = f;
2698         return 0;
2699
2700 fail:
2701         if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2702                 r = -EIO;
2703
2704         journal_file_close(f);
2705
2706         return r;
2707 }
2708
2709 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2710         _cleanup_free_ char *p = NULL;
2711         size_t l;
2712         JournalFile *old_file, *new_file = NULL;
2713         int r;
2714
2715         assert(f);
2716         assert(*f);
2717
2718         old_file = *f;
2719
2720         if (!old_file->writable)
2721                 return -EINVAL;
2722
2723         if (!endswith(old_file->path, ".journal"))
2724                 return -EINVAL;
2725
2726         l = strlen(old_file->path);
2727         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2728                      (int) l - 8, old_file->path,
2729                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2730                      le64toh((*f)->header->head_entry_seqnum),
2731                      le64toh((*f)->header->head_entry_realtime));
2732         if (r < 0)
2733                 return -ENOMEM;
2734
2735         /* Try to rename the file to the archived version. If the file
2736          * already was deleted, we'll get ENOENT, let's ignore that
2737          * case. */
2738         r = rename(old_file->path, p);
2739         if (r < 0 && errno != ENOENT)
2740                 return -errno;
2741
2742         old_file->header->state = STATE_ARCHIVED;
2743
2744         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2745         journal_file_close(old_file);
2746
2747         *f = new_file;
2748         return r;
2749 }
2750
2751 int journal_file_open_reliably(
2752                 const char *fname,
2753                 int flags,
2754                 mode_t mode,
2755                 bool compress,
2756                 bool seal,
2757                 JournalMetrics *metrics,
2758                 MMapCache *mmap_cache,
2759                 JournalFile *template,
2760                 JournalFile **ret) {
2761
2762         int r;
2763         size_t l;
2764         _cleanup_free_ char *p = NULL;
2765
2766         r = journal_file_open(fname, flags, mode, compress, seal,
2767                               metrics, mmap_cache, template, ret);
2768         if (r != -EBADMSG && /* corrupted */
2769             r != -ENODATA && /* truncated */
2770             r != -EHOSTDOWN && /* other machine */
2771             r != -EPROTONOSUPPORT && /* incompatible feature */
2772             r != -EBUSY && /* unclean shutdown */
2773             r != -ESHUTDOWN && /* already archived */
2774             r != -EIO /* IO error, including SIGBUS on mmap */)
2775                 return r;
2776
2777         if ((flags & O_ACCMODE) == O_RDONLY)
2778                 return r;
2779
2780         if (!(flags & O_CREAT))
2781                 return r;
2782
2783         if (!endswith(fname, ".journal"))
2784                 return r;
2785
2786         /* The file is corrupted. Rotate it away and try it again (but only once) */
2787
2788         l = strlen(fname);
2789         if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2790                      (int) l - 8, fname,
2791                      (unsigned long long) now(CLOCK_REALTIME),
2792                      random_u64()) < 0)
2793                 return -ENOMEM;
2794
2795         r = rename(fname, p);
2796         if (r < 0)
2797                 return -errno;
2798
2799         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2800
2801         return journal_file_open(fname, flags, mode, compress, seal,
2802                                  metrics, mmap_cache, template, ret);
2803 }
2804
2805 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2806         uint64_t i, n;
2807         uint64_t q, xor_hash = 0;
2808         int r;
2809         EntryItem *items;
2810         dual_timestamp ts;
2811
2812         assert(from);
2813         assert(to);
2814         assert(o);
2815         assert(p);
2816
2817         if (!to->writable)
2818                 return -EPERM;
2819
2820         ts.monotonic = le64toh(o->entry.monotonic);
2821         ts.realtime = le64toh(o->entry.realtime);
2822
2823         n = journal_file_entry_n_items(o);
2824         /* alloca() can't take 0, hence let's allocate at least one */
2825         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2826
2827         for (i = 0; i < n; i++) {
2828                 uint64_t l, h;
2829                 le64_t le_hash;
2830                 size_t t;
2831                 void *data;
2832                 Object *u;
2833
2834                 q = le64toh(o->entry.items[i].object_offset);
2835                 le_hash = o->entry.items[i].hash;
2836
2837                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2838                 if (r < 0)
2839                         return r;
2840
2841                 if (le_hash != o->data.hash)
2842                         return -EBADMSG;
2843
2844                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2845                 t = (size_t) l;
2846
2847                 /* We hit the limit on 32bit machines */
2848                 if ((uint64_t) t != l)
2849                         return -E2BIG;
2850
2851                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2852 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2853                         size_t rsize;
2854
2855                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2856                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2857                         if (r < 0)
2858                                 return r;
2859
2860                         data = from->compress_buffer;
2861                         l = rsize;
2862 #else
2863                         return -EPROTONOSUPPORT;
2864 #endif
2865                 } else
2866                         data = o->data.payload;
2867
2868                 r = journal_file_append_data(to, data, l, &u, &h);
2869                 if (r < 0)
2870                         return r;
2871
2872                 xor_hash ^= le64toh(u->data.hash);
2873                 items[i].object_offset = htole64(h);
2874                 items[i].hash = u->data.hash;
2875
2876                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2877                 if (r < 0)
2878                         return r;
2879         }
2880
2881         r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2882
2883         if (mmap_cache_got_sigbus(to->mmap, to->fd))
2884                 return -EIO;
2885
2886         return r;
2887 }
2888
2889 void journal_default_metrics(JournalMetrics *m, int fd) {
2890         uint64_t fs_size = 0;
2891         struct statvfs ss;
2892         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2893
2894         assert(m);
2895         assert(fd >= 0);
2896
2897         if (fstatvfs(fd, &ss) >= 0)
2898                 fs_size = ss.f_frsize * ss.f_blocks;
2899
2900         if (m->max_use == (uint64_t) -1) {
2901
2902                 if (fs_size > 0) {
2903                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2904
2905                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2906                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2907
2908                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2909                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2910                 } else
2911                         m->max_use = DEFAULT_MAX_USE_LOWER;
2912         } else {
2913                 m->max_use = PAGE_ALIGN(m->max_use);
2914
2915                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2916                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2917         }
2918
2919         if (m->max_size == (uint64_t) -1) {
2920                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2921
2922                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2923                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2924         } else
2925                 m->max_size = PAGE_ALIGN(m->max_size);
2926
2927         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2928                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2929
2930         if (m->max_size*2 > m->max_use)
2931                 m->max_use = m->max_size*2;
2932
2933         if (m->min_size == (uint64_t) -1)
2934                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2935         else {
2936                 m->min_size = PAGE_ALIGN(m->min_size);
2937
2938                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2939                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2940
2941                 if (m->min_size > m->max_size)
2942                         m->max_size = m->min_size;
2943         }
2944
2945         if (m->keep_free == (uint64_t) -1) {
2946
2947                 if (fs_size > 0) {
2948                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2949
2950                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2951                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2952
2953                 } else
2954                         m->keep_free = DEFAULT_KEEP_FREE;
2955         }
2956
2957         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2958                   format_bytes(a, sizeof(a), m->max_use),
2959                   format_bytes(b, sizeof(b), m->max_size),
2960                   format_bytes(c, sizeof(c), m->min_size),
2961                   format_bytes(d, sizeof(d), m->keep_free));
2962 }
2963
2964 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2965         assert(f);
2966         assert(from || to);
2967
2968         if (from) {
2969                 if (f->header->head_entry_realtime == 0)
2970                         return -ENOENT;
2971
2972                 *from = le64toh(f->header->head_entry_realtime);
2973         }
2974
2975         if (to) {
2976                 if (f->header->tail_entry_realtime == 0)
2977                         return -ENOENT;
2978
2979                 *to = le64toh(f->header->tail_entry_realtime);
2980         }
2981
2982         return 1;
2983 }
2984
2985 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2986         Object *o;
2987         uint64_t p;
2988         int r;
2989
2990         assert(f);
2991         assert(from || to);
2992
2993         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2994         if (r <= 0)
2995                 return r;
2996
2997         if (le64toh(o->data.n_entries) <= 0)
2998                 return 0;
2999
3000         if (from) {
3001                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3002                 if (r < 0)
3003                         return r;
3004
3005                 *from = le64toh(o->entry.monotonic);
3006         }
3007
3008         if (to) {
3009                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3010                 if (r < 0)
3011                         return r;
3012
3013                 r = generic_array_get_plus_one(f,
3014                                                le64toh(o->data.entry_offset),
3015                                                le64toh(o->data.entry_array_offset),
3016                                                le64toh(o->data.n_entries)-1,
3017                                                &o, NULL);
3018                 if (r <= 0)
3019                         return r;
3020
3021                 *to = le64toh(o->entry.monotonic);
3022         }
3023
3024         return 1;
3025 }
3026
3027 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3028         assert(f);
3029
3030         /* If we gained new header fields we gained new features,
3031          * hence suggest a rotation */
3032         if (le64toh(f->header->header_size) < sizeof(Header)) {
3033                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3034                 return true;
3035         }
3036
3037         /* Let's check if the hash tables grew over a certain fill
3038          * level (75%, borrowing this value from Java's hash table
3039          * implementation), and if so suggest a rotation. To calculate
3040          * the fill level we need the n_data field, which only exists
3041          * in newer versions. */
3042
3043         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3044                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3045                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3046                                   f->path,
3047                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3048                                   le64toh(f->header->n_data),
3049                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3050                                   (unsigned long long) f->last_stat.st_size,
3051                                   f->last_stat.st_size / le64toh(f->header->n_data));
3052                         return true;
3053                 }
3054
3055         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3056                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3057                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3058                                   f->path,
3059                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3060                                   le64toh(f->header->n_fields),
3061                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3062                         return true;
3063                 }
3064
3065         /* Are the data objects properly indexed by field objects? */
3066         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3067             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3068             le64toh(f->header->n_data) > 0 &&
3069             le64toh(f->header->n_fields) == 0)
3070                 return true;
3071
3072         if (max_file_usec > 0) {
3073                 usec_t t, h;
3074
3075                 h = le64toh(f->header->head_entry_realtime);
3076                 t = now(CLOCK_REALTIME);
3077
3078                 if (h > 0 && t > h + max_file_usec)
3079                         return true;
3080         }
3081
3082         return false;
3083 }