chiark / gitweb /
Add __attribute__((const, pure, format)) in various places
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
43
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
45
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */
48
49 /* These are the lower and upper bounds if we deduce the max_use value
50  * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
53
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
56
57 /* This is the upper bound if we deduce the keep_free value from the
58  * file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
60
61 /* This is the keep_free value when we can't determine the system
62  * size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
64
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
67
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
70
71 int journal_file_set_online(JournalFile *f) {
72         assert(f);
73
74         if (!f->writable)
75                 return -EPERM;
76
77         if (!(f->fd >= 0 && f->header))
78                 return -EINVAL;
79
80         switch(f->header->state) {
81                 case STATE_ONLINE:
82                         return 0;
83
84                 case STATE_OFFLINE:
85                         f->header->state = STATE_ONLINE;
86                         fsync(f->fd);
87                         return 0;
88
89                 default:
90                         return -EINVAL;
91         }
92 }
93
94 int journal_file_set_offline(JournalFile *f) {
95         assert(f);
96
97         if (!f->writable)
98                 return -EPERM;
99
100         if (!(f->fd >= 0 && f->header))
101                 return -EINVAL;
102
103         if (f->header->state != STATE_ONLINE)
104                 return 0;
105
106         fsync(f->fd);
107
108         f->header->state = STATE_OFFLINE;
109
110         fsync(f->fd);
111
112         return 0;
113 }
114
115 void journal_file_close(JournalFile *f) {
116         assert(f);
117
118 #ifdef HAVE_GCRYPT
119         /* Write the final tag */
120         if (f->seal && f->writable)
121                 journal_file_append_tag(f);
122 #endif
123
124         /* Sync everything to disk, before we mark the file offline */
125         if (f->mmap && f->fd >= 0)
126                 mmap_cache_close_fd(f->mmap, f->fd);
127
128         journal_file_set_offline(f);
129
130         if (f->header)
131                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
132
133         if (f->fd >= 0)
134                 close_nointr_nofail(f->fd);
135
136         free(f->path);
137
138         if (f->mmap)
139                 mmap_cache_unref(f->mmap);
140
141         hashmap_free_free(f->chain_cache);
142
143 #ifdef HAVE_XZ
144         free(f->compress_buffer);
145 #endif
146
147 #ifdef HAVE_GCRYPT
148         if (f->fss_file)
149                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
150         else if (f->fsprg_state)
151                 free(f->fsprg_state);
152
153         free(f->fsprg_seed);
154
155         if (f->hmac)
156                 gcry_md_close(f->hmac);
157 #endif
158
159         free(f);
160 }
161
162 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
163         Header h;
164         ssize_t k;
165         int r;
166
167         assert(f);
168
169         zero(h);
170         memcpy(h.signature, HEADER_SIGNATURE, 8);
171         h.header_size = htole64(ALIGN64(sizeof(h)));
172
173         h.incompatible_flags =
174                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
175
176         h.compatible_flags =
177                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
178
179         r = sd_id128_randomize(&h.file_id);
180         if (r < 0)
181                 return r;
182
183         if (template) {
184                 h.seqnum_id = template->header->seqnum_id;
185                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
186         } else
187                 h.seqnum_id = h.file_id;
188
189         k = pwrite(f->fd, &h, sizeof(h), 0);
190         if (k < 0)
191                 return -errno;
192
193         if (k != sizeof(h))
194                 return -EIO;
195
196         return 0;
197 }
198
199 static int journal_file_refresh_header(JournalFile *f) {
200         int r;
201         sd_id128_t boot_id;
202
203         assert(f);
204
205         r = sd_id128_get_machine(&f->header->machine_id);
206         if (r < 0)
207                 return r;
208
209         r = sd_id128_get_boot(&boot_id);
210         if (r < 0)
211                 return r;
212
213         if (sd_id128_equal(boot_id, f->header->boot_id))
214                 f->tail_entry_monotonic_valid = true;
215
216         f->header->boot_id = boot_id;
217
218         journal_file_set_online(f);
219
220         /* Sync the online state to disk */
221         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
222         fdatasync(f->fd);
223
224         return 0;
225 }
226
227 static int journal_file_verify_header(JournalFile *f) {
228         assert(f);
229
230         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
231                 return -EBADMSG;
232
233         /* In both read and write mode we refuse to open files with
234          * incompatible flags we don't know */
235 #ifdef HAVE_XZ
236         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
237                 return -EPROTONOSUPPORT;
238 #else
239         if (f->header->incompatible_flags != 0)
240                 return -EPROTONOSUPPORT;
241 #endif
242
243         /* When open for writing we refuse to open files with
244          * compatible flags, too */
245         if (f->writable) {
246 #ifdef HAVE_GCRYPT
247                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
248                         return -EPROTONOSUPPORT;
249 #else
250                 if (f->header->compatible_flags != 0)
251                         return -EPROTONOSUPPORT;
252 #endif
253         }
254
255         if (f->header->state >= _STATE_MAX)
256                 return -EBADMSG;
257
258         /* The first addition was n_data, so check that we are at least this large */
259         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
260                 return -EBADMSG;
261
262         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
263                 return -EBADMSG;
264
265         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
266                 return -ENODATA;
267
268         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
269                 return -ENODATA;
270
271         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
272             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
273             !VALID64(le64toh(f->header->tail_object_offset)) ||
274             !VALID64(le64toh(f->header->entry_array_offset)))
275                 return -ENODATA;
276
277         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
278             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
279             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
280             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
281                 return -ENODATA;
282
283         if (f->writable) {
284                 uint8_t state;
285                 sd_id128_t machine_id;
286                 int r;
287
288                 r = sd_id128_get_machine(&machine_id);
289                 if (r < 0)
290                         return r;
291
292                 if (!sd_id128_equal(machine_id, f->header->machine_id))
293                         return -EHOSTDOWN;
294
295                 state = f->header->state;
296
297                 if (state == STATE_ONLINE) {
298                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
299                         return -EBUSY;
300                 } else if (state == STATE_ARCHIVED)
301                         return -ESHUTDOWN;
302                 else if (state != STATE_OFFLINE) {
303                         log_debug("Journal file %s has unknown state %u.", f->path, state);
304                         return -EBUSY;
305                 }
306         }
307
308         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
309
310         f->seal = JOURNAL_HEADER_SEALED(f->header);
311
312         return 0;
313 }
314
315 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
316         uint64_t old_size, new_size;
317         int r;
318
319         assert(f);
320
321         /* We assume that this file is not sparse, and we know that
322          * for sure, since we always call posix_fallocate()
323          * ourselves */
324
325         old_size =
326                 le64toh(f->header->header_size) +
327                 le64toh(f->header->arena_size);
328
329         new_size = PAGE_ALIGN(offset + size);
330         if (new_size < le64toh(f->header->header_size))
331                 new_size = le64toh(f->header->header_size);
332
333         if (new_size <= old_size)
334                 return 0;
335
336         if (f->metrics.max_size > 0 &&
337             new_size > f->metrics.max_size)
338                 return -E2BIG;
339
340         if (new_size > f->metrics.min_size &&
341             f->metrics.keep_free > 0) {
342                 struct statvfs svfs;
343
344                 if (fstatvfs(f->fd, &svfs) >= 0) {
345                         uint64_t available;
346
347                         available = svfs.f_bfree * svfs.f_bsize;
348
349                         if (available >= f->metrics.keep_free)
350                                 available -= f->metrics.keep_free;
351                         else
352                                 available = 0;
353
354                         if (new_size - old_size > available)
355                                 return -E2BIG;
356                 }
357         }
358
359         /* Note that the glibc fallocate() fallback is very
360            inefficient, hence we try to minimize the allocation area
361            as we can. */
362         r = posix_fallocate(f->fd, old_size, new_size - old_size);
363         if (r != 0)
364                 return -r;
365
366         if (fstat(f->fd, &f->last_stat) < 0)
367                 return -errno;
368
369         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
370
371         return 0;
372 }
373
374 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
375         assert(f);
376         assert(ret);
377
378         if (size <= 0)
379                 return -EINVAL;
380
381         /* Avoid SIGBUS on invalid accesses */
382         if (offset + size > (uint64_t) f->last_stat.st_size) {
383                 /* Hmm, out of range? Let's refresh the fstat() data
384                  * first, before we trust that check. */
385
386                 if (fstat(f->fd, &f->last_stat) < 0 ||
387                     offset + size > (uint64_t) f->last_stat.st_size)
388                         return -EADDRNOTAVAIL;
389         }
390
391         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
392 }
393
394 static uint64_t minimum_header_size(Object *o) {
395
396         static uint64_t table[] = {
397                 [OBJECT_DATA] = sizeof(DataObject),
398                 [OBJECT_FIELD] = sizeof(FieldObject),
399                 [OBJECT_ENTRY] = sizeof(EntryObject),
400                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
401                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
402                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
403                 [OBJECT_TAG] = sizeof(TagObject),
404         };
405
406         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
407                 return sizeof(ObjectHeader);
408
409         return table[o->object.type];
410 }
411
412 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
413         int r;
414         void *t;
415         Object *o;
416         uint64_t s;
417         unsigned context;
418
419         assert(f);
420         assert(ret);
421
422         /* Objects may only be located at multiple of 64 bit */
423         if (!VALID64(offset))
424                 return -EFAULT;
425
426         /* One context for each type, plus one catch-all for the rest */
427         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
428
429         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
430         if (r < 0)
431                 return r;
432
433         o = (Object*) t;
434         s = le64toh(o->object.size);
435
436         if (s < sizeof(ObjectHeader))
437                 return -EBADMSG;
438
439         if (o->object.type <= OBJECT_UNUSED)
440                 return -EBADMSG;
441
442         if (s < minimum_header_size(o))
443                 return -EBADMSG;
444
445         if (type > 0 && o->object.type != type)
446                 return -EBADMSG;
447
448         if (s > sizeof(ObjectHeader)) {
449                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
450                 if (r < 0)
451                         return r;
452
453                 o = (Object*) t;
454         }
455
456         *ret = o;
457         return 0;
458 }
459
460 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
461         uint64_t r;
462
463         assert(f);
464
465         r = le64toh(f->header->tail_entry_seqnum) + 1;
466
467         if (seqnum) {
468                 /* If an external seqnum counter was passed, we update
469                  * both the local and the external one, and set it to
470                  * the maximum of both */
471
472                 if (*seqnum + 1 > r)
473                         r = *seqnum + 1;
474
475                 *seqnum = r;
476         }
477
478         f->header->tail_entry_seqnum = htole64(r);
479
480         if (f->header->head_entry_seqnum == 0)
481                 f->header->head_entry_seqnum = htole64(r);
482
483         return r;
484 }
485
486 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
487         int r;
488         uint64_t p;
489         Object *tail, *o;
490         void *t;
491
492         assert(f);
493         assert(type > 0 && type < _OBJECT_TYPE_MAX);
494         assert(size >= sizeof(ObjectHeader));
495         assert(offset);
496         assert(ret);
497
498         r = journal_file_set_online(f);
499         if (r < 0)
500                 return r;
501
502         p = le64toh(f->header->tail_object_offset);
503         if (p == 0)
504                 p = le64toh(f->header->header_size);
505         else {
506                 r = journal_file_move_to_object(f, -1, p, &tail);
507                 if (r < 0)
508                         return r;
509
510                 p += ALIGN64(le64toh(tail->object.size));
511         }
512
513         r = journal_file_allocate(f, p, size);
514         if (r < 0)
515                 return r;
516
517         r = journal_file_move_to(f, type, false, p, size, &t);
518         if (r < 0)
519                 return r;
520
521         o = (Object*) t;
522
523         zero(o->object);
524         o->object.type = type;
525         o->object.size = htole64(size);
526
527         f->header->tail_object_offset = htole64(p);
528         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
529
530         *ret = o;
531         *offset = p;
532
533         return 0;
534 }
535
536 static int journal_file_setup_data_hash_table(JournalFile *f) {
537         uint64_t s, p;
538         Object *o;
539         int r;
540
541         assert(f);
542
543         /* We estimate that we need 1 hash table entry per 768 of
544            journal file and we want to make sure we never get beyond
545            75% fill level. Calculate the hash table size for the
546            maximum file size based on these metrics. */
547
548         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
549         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
550                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
551
552         log_debug("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
553
554         r = journal_file_append_object(f,
555                                        OBJECT_DATA_HASH_TABLE,
556                                        offsetof(Object, hash_table.items) + s,
557                                        &o, &p);
558         if (r < 0)
559                 return r;
560
561         memset(o->hash_table.items, 0, s);
562
563         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
564         f->header->data_hash_table_size = htole64(s);
565
566         return 0;
567 }
568
569 static int journal_file_setup_field_hash_table(JournalFile *f) {
570         uint64_t s, p;
571         Object *o;
572         int r;
573
574         assert(f);
575
576         /* We use a fixed size hash table for the fields as this
577          * number should grow very slowly only */
578
579         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
580         r = journal_file_append_object(f,
581                                        OBJECT_FIELD_HASH_TABLE,
582                                        offsetof(Object, hash_table.items) + s,
583                                        &o, &p);
584         if (r < 0)
585                 return r;
586
587         memset(o->hash_table.items, 0, s);
588
589         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
590         f->header->field_hash_table_size = htole64(s);
591
592         return 0;
593 }
594
595 static int journal_file_map_data_hash_table(JournalFile *f) {
596         uint64_t s, p;
597         void *t;
598         int r;
599
600         assert(f);
601
602         p = le64toh(f->header->data_hash_table_offset);
603         s = le64toh(f->header->data_hash_table_size);
604
605         r = journal_file_move_to(f,
606                                  OBJECT_DATA_HASH_TABLE,
607                                  true,
608                                  p, s,
609                                  &t);
610         if (r < 0)
611                 return r;
612
613         f->data_hash_table = t;
614         return 0;
615 }
616
617 static int journal_file_map_field_hash_table(JournalFile *f) {
618         uint64_t s, p;
619         void *t;
620         int r;
621
622         assert(f);
623
624         p = le64toh(f->header->field_hash_table_offset);
625         s = le64toh(f->header->field_hash_table_size);
626
627         r = journal_file_move_to(f,
628                                  OBJECT_FIELD_HASH_TABLE,
629                                  true,
630                                  p, s,
631                                  &t);
632         if (r < 0)
633                 return r;
634
635         f->field_hash_table = t;
636         return 0;
637 }
638
639 static int journal_file_link_field(
640                 JournalFile *f,
641                 Object *o,
642                 uint64_t offset,
643                 uint64_t hash) {
644
645         uint64_t p, h;
646         int r;
647
648         assert(f);
649         assert(o);
650         assert(offset > 0);
651
652         if (o->object.type != OBJECT_FIELD)
653                 return -EINVAL;
654
655         /* This might alter the window we are looking at */
656
657         o->field.next_hash_offset = o->field.head_data_offset = 0;
658
659         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
660         p = le64toh(f->field_hash_table[h].tail_hash_offset);
661         if (p == 0)
662                 f->field_hash_table[h].head_hash_offset = htole64(offset);
663         else {
664                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
665                 if (r < 0)
666                         return r;
667
668                 o->field.next_hash_offset = htole64(offset);
669         }
670
671         f->field_hash_table[h].tail_hash_offset = htole64(offset);
672
673         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
674                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
675
676         return 0;
677 }
678
679 static int journal_file_link_data(
680                 JournalFile *f,
681                 Object *o,
682                 uint64_t offset,
683                 uint64_t hash) {
684
685         uint64_t p, h;
686         int r;
687
688         assert(f);
689         assert(o);
690         assert(offset > 0);
691
692         if (o->object.type != OBJECT_DATA)
693                 return -EINVAL;
694
695         /* This might alter the window we are looking at */
696
697         o->data.next_hash_offset = o->data.next_field_offset = 0;
698         o->data.entry_offset = o->data.entry_array_offset = 0;
699         o->data.n_entries = 0;
700
701         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
702         p = le64toh(f->data_hash_table[h].tail_hash_offset);
703         if (p == 0)
704                 /* Only entry in the hash table is easy */
705                 f->data_hash_table[h].head_hash_offset = htole64(offset);
706         else {
707                 /* Move back to the previous data object, to patch in
708                  * pointer */
709
710                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
711                 if (r < 0)
712                         return r;
713
714                 o->data.next_hash_offset = htole64(offset);
715         }
716
717         f->data_hash_table[h].tail_hash_offset = htole64(offset);
718
719         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
720                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
721
722         return 0;
723 }
724
725 int journal_file_find_field_object_with_hash(
726                 JournalFile *f,
727                 const void *field, uint64_t size, uint64_t hash,
728                 Object **ret, uint64_t *offset) {
729
730         uint64_t p, osize, h;
731         int r;
732
733         assert(f);
734         assert(field && size > 0);
735
736         osize = offsetof(Object, field.payload) + size;
737
738         if (f->header->field_hash_table_size == 0)
739                 return -EBADMSG;
740
741         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
742         p = le64toh(f->field_hash_table[h].head_hash_offset);
743
744         while (p > 0) {
745                 Object *o;
746
747                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
748                 if (r < 0)
749                         return r;
750
751                 if (le64toh(o->field.hash) == hash &&
752                     le64toh(o->object.size) == osize &&
753                     memcmp(o->field.payload, field, size) == 0) {
754
755                         if (ret)
756                                 *ret = o;
757                         if (offset)
758                                 *offset = p;
759
760                         return 1;
761                 }
762
763                 p = le64toh(o->field.next_hash_offset);
764         }
765
766         return 0;
767 }
768
769 int journal_file_find_field_object(
770                 JournalFile *f,
771                 const void *field, uint64_t size,
772                 Object **ret, uint64_t *offset) {
773
774         uint64_t hash;
775
776         assert(f);
777         assert(field && size > 0);
778
779         hash = hash64(field, size);
780
781         return journal_file_find_field_object_with_hash(f,
782                                                         field, size, hash,
783                                                         ret, offset);
784 }
785
786 int journal_file_find_data_object_with_hash(
787                 JournalFile *f,
788                 const void *data, uint64_t size, uint64_t hash,
789                 Object **ret, uint64_t *offset) {
790
791         uint64_t p, osize, h;
792         int r;
793
794         assert(f);
795         assert(data || size == 0);
796
797         osize = offsetof(Object, data.payload) + size;
798
799         if (f->header->data_hash_table_size == 0)
800                 return -EBADMSG;
801
802         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
803         p = le64toh(f->data_hash_table[h].head_hash_offset);
804
805         while (p > 0) {
806                 Object *o;
807
808                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
809                 if (r < 0)
810                         return r;
811
812                 if (le64toh(o->data.hash) != hash)
813                         goto next;
814
815                 if (o->object.flags & OBJECT_COMPRESSED) {
816 #ifdef HAVE_XZ
817                         uint64_t l, rsize;
818
819                         l = le64toh(o->object.size);
820                         if (l <= offsetof(Object, data.payload))
821                                 return -EBADMSG;
822
823                         l -= offsetof(Object, data.payload);
824
825                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
826                                 return -EBADMSG;
827
828                         if (rsize == size &&
829                             memcmp(f->compress_buffer, data, size) == 0) {
830
831                                 if (ret)
832                                         *ret = o;
833
834                                 if (offset)
835                                         *offset = p;
836
837                                 return 1;
838                         }
839 #else
840                         return -EPROTONOSUPPORT;
841 #endif
842
843                 } else if (le64toh(o->object.size) == osize &&
844                            memcmp(o->data.payload, data, size) == 0) {
845
846                         if (ret)
847                                 *ret = o;
848
849                         if (offset)
850                                 *offset = p;
851
852                         return 1;
853                 }
854
855         next:
856                 p = le64toh(o->data.next_hash_offset);
857         }
858
859         return 0;
860 }
861
862 int journal_file_find_data_object(
863                 JournalFile *f,
864                 const void *data, uint64_t size,
865                 Object **ret, uint64_t *offset) {
866
867         uint64_t hash;
868
869         assert(f);
870         assert(data || size == 0);
871
872         hash = hash64(data, size);
873
874         return journal_file_find_data_object_with_hash(f,
875                                                        data, size, hash,
876                                                        ret, offset);
877 }
878
879 static int journal_file_append_field(
880                 JournalFile *f,
881                 const void *field, uint64_t size,
882                 Object **ret, uint64_t *offset) {
883
884         uint64_t hash, p;
885         uint64_t osize;
886         Object *o;
887         int r;
888
889         assert(f);
890         assert(field && size > 0);
891
892         hash = hash64(field, size);
893
894         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
895         if (r < 0)
896                 return r;
897         else if (r > 0) {
898
899                 if (ret)
900                         *ret = o;
901
902                 if (offset)
903                         *offset = p;
904
905                 return 0;
906         }
907
908         osize = offsetof(Object, field.payload) + size;
909         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
910
911         o->field.hash = htole64(hash);
912         memcpy(o->field.payload, field, size);
913
914         r = journal_file_link_field(f, o, p, hash);
915         if (r < 0)
916                 return r;
917
918         /* The linking might have altered the window, so let's
919          * refresh our pointer */
920         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
921         if (r < 0)
922                 return r;
923
924 #ifdef HAVE_GCRYPT
925         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
926         if (r < 0)
927                 return r;
928 #endif
929
930         if (ret)
931                 *ret = o;
932
933         if (offset)
934                 *offset = p;
935
936         return 0;
937 }
938
939 static int journal_file_append_data(
940                 JournalFile *f,
941                 const void *data, uint64_t size,
942                 Object **ret, uint64_t *offset) {
943
944         uint64_t hash, p;
945         uint64_t osize;
946         Object *o;
947         int r;
948         bool compressed = false;
949         const void *eq;
950
951         assert(f);
952         assert(data || size == 0);
953
954         hash = hash64(data, size);
955
956         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
957         if (r < 0)
958                 return r;
959         else if (r > 0) {
960
961                 if (ret)
962                         *ret = o;
963
964                 if (offset)
965                         *offset = p;
966
967                 return 0;
968         }
969
970         osize = offsetof(Object, data.payload) + size;
971         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
972         if (r < 0)
973                 return r;
974
975         o->data.hash = htole64(hash);
976
977 #ifdef HAVE_XZ
978         if (f->compress &&
979             size >= COMPRESSION_SIZE_THRESHOLD) {
980                 uint64_t rsize;
981
982                 compressed = compress_blob(data, size, o->data.payload, &rsize);
983
984                 if (compressed) {
985                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
986                         o->object.flags |= OBJECT_COMPRESSED;
987
988                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
989                 }
990         }
991 #endif
992
993         if (!compressed && size > 0)
994                 memcpy(o->data.payload, data, size);
995
996         r = journal_file_link_data(f, o, p, hash);
997         if (r < 0)
998                 return r;
999
1000         /* The linking might have altered the window, so let's
1001          * refresh our pointer */
1002         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1003         if (r < 0)
1004                 return r;
1005
1006         eq = memchr(data, '=', size);
1007         if (eq && eq > data) {
1008                 uint64_t fp;
1009                 Object *fo;
1010
1011                 /* Create field object ... */
1012                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1013                 if (r < 0)
1014                         return r;
1015
1016                 /* ... and link it in. */
1017                 o->data.next_field_offset = fo->field.head_data_offset;
1018                 fo->field.head_data_offset = le64toh(p);
1019         }
1020
1021 #ifdef HAVE_GCRYPT
1022         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1023         if (r < 0)
1024                 return r;
1025 #endif
1026
1027         if (ret)
1028                 *ret = o;
1029
1030         if (offset)
1031                 *offset = p;
1032
1033         return 0;
1034 }
1035
1036 uint64_t journal_file_entry_n_items(Object *o) {
1037         assert(o);
1038
1039         if (o->object.type != OBJECT_ENTRY)
1040                 return 0;
1041
1042         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1043 }
1044
1045 uint64_t journal_file_entry_array_n_items(Object *o) {
1046         assert(o);
1047
1048         if (o->object.type != OBJECT_ENTRY_ARRAY)
1049                 return 0;
1050
1051         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1052 }
1053
1054 uint64_t journal_file_hash_table_n_items(Object *o) {
1055         assert(o);
1056
1057         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1058             o->object.type != OBJECT_FIELD_HASH_TABLE)
1059                 return 0;
1060
1061         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1062 }
1063
1064 static int link_entry_into_array(JournalFile *f,
1065                                  le64_t *first,
1066                                  le64_t *idx,
1067                                  uint64_t p) {
1068         int r;
1069         uint64_t n = 0, ap = 0, q, i, a, hidx;
1070         Object *o;
1071
1072         assert(f);
1073         assert(first);
1074         assert(idx);
1075         assert(p > 0);
1076
1077         a = le64toh(*first);
1078         i = hidx = le64toh(*idx);
1079         while (a > 0) {
1080
1081                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1082                 if (r < 0)
1083                         return r;
1084
1085                 n = journal_file_entry_array_n_items(o);
1086                 if (i < n) {
1087                         o->entry_array.items[i] = htole64(p);
1088                         *idx = htole64(hidx + 1);
1089                         return 0;
1090                 }
1091
1092                 i -= n;
1093                 ap = a;
1094                 a = le64toh(o->entry_array.next_entry_array_offset);
1095         }
1096
1097         if (hidx > n)
1098                 n = (hidx+1) * 2;
1099         else
1100                 n = n * 2;
1101
1102         if (n < 4)
1103                 n = 4;
1104
1105         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1106                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1107                                        &o, &q);
1108         if (r < 0)
1109                 return r;
1110
1111 #ifdef HAVE_GCRYPT
1112         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1113         if (r < 0)
1114                 return r;
1115 #endif
1116
1117         o->entry_array.items[i] = htole64(p);
1118
1119         if (ap == 0)
1120                 *first = htole64(q);
1121         else {
1122                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1123                 if (r < 0)
1124                         return r;
1125
1126                 o->entry_array.next_entry_array_offset = htole64(q);
1127         }
1128
1129         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1130                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1131
1132         *idx = htole64(hidx + 1);
1133
1134         return 0;
1135 }
1136
1137 static int link_entry_into_array_plus_one(JournalFile *f,
1138                                           le64_t *extra,
1139                                           le64_t *first,
1140                                           le64_t *idx,
1141                                           uint64_t p) {
1142
1143         int r;
1144
1145         assert(f);
1146         assert(extra);
1147         assert(first);
1148         assert(idx);
1149         assert(p > 0);
1150
1151         if (*idx == 0)
1152                 *extra = htole64(p);
1153         else {
1154                 le64_t i;
1155
1156                 i = htole64(le64toh(*idx) - 1);
1157                 r = link_entry_into_array(f, first, &i, p);
1158                 if (r < 0)
1159                         return r;
1160         }
1161
1162         *idx = htole64(le64toh(*idx) + 1);
1163         return 0;
1164 }
1165
1166 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1167         uint64_t p;
1168         int r;
1169         assert(f);
1170         assert(o);
1171         assert(offset > 0);
1172
1173         p = le64toh(o->entry.items[i].object_offset);
1174         if (p == 0)
1175                 return -EINVAL;
1176
1177         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1178         if (r < 0)
1179                 return r;
1180
1181         return link_entry_into_array_plus_one(f,
1182                                               &o->data.entry_offset,
1183                                               &o->data.entry_array_offset,
1184                                               &o->data.n_entries,
1185                                               offset);
1186 }
1187
1188 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1189         uint64_t n, i;
1190         int r;
1191
1192         assert(f);
1193         assert(o);
1194         assert(offset > 0);
1195
1196         if (o->object.type != OBJECT_ENTRY)
1197                 return -EINVAL;
1198
1199         __sync_synchronize();
1200
1201         /* Link up the entry itself */
1202         r = link_entry_into_array(f,
1203                                   &f->header->entry_array_offset,
1204                                   &f->header->n_entries,
1205                                   offset);
1206         if (r < 0)
1207                 return r;
1208
1209         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
1210
1211         if (f->header->head_entry_realtime == 0)
1212                 f->header->head_entry_realtime = o->entry.realtime;
1213
1214         f->header->tail_entry_realtime = o->entry.realtime;
1215         f->header->tail_entry_monotonic = o->entry.monotonic;
1216
1217         f->tail_entry_monotonic_valid = true;
1218
1219         /* Link up the items */
1220         n = journal_file_entry_n_items(o);
1221         for (i = 0; i < n; i++) {
1222                 r = journal_file_link_entry_item(f, o, offset, i);
1223                 if (r < 0)
1224                         return r;
1225         }
1226
1227         return 0;
1228 }
1229
1230 static int journal_file_append_entry_internal(
1231                 JournalFile *f,
1232                 const dual_timestamp *ts,
1233                 uint64_t xor_hash,
1234                 const EntryItem items[], unsigned n_items,
1235                 uint64_t *seqnum,
1236                 Object **ret, uint64_t *offset) {
1237         uint64_t np;
1238         uint64_t osize;
1239         Object *o;
1240         int r;
1241
1242         assert(f);
1243         assert(items || n_items == 0);
1244         assert(ts);
1245
1246         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1247
1248         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1249         if (r < 0)
1250                 return r;
1251
1252         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1253         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1254         o->entry.realtime = htole64(ts->realtime);
1255         o->entry.monotonic = htole64(ts->monotonic);
1256         o->entry.xor_hash = htole64(xor_hash);
1257         o->entry.boot_id = f->header->boot_id;
1258
1259 #ifdef HAVE_GCRYPT
1260         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1261         if (r < 0)
1262                 return r;
1263 #endif
1264
1265         r = journal_file_link_entry(f, o, np);
1266         if (r < 0)
1267                 return r;
1268
1269         if (ret)
1270                 *ret = o;
1271
1272         if (offset)
1273                 *offset = np;
1274
1275         return 0;
1276 }
1277
1278 void journal_file_post_change(JournalFile *f) {
1279         assert(f);
1280
1281         /* inotify() does not receive IN_MODIFY events from file
1282          * accesses done via mmap(). After each access we hence
1283          * trigger IN_MODIFY by truncating the journal file to its
1284          * current size which triggers IN_MODIFY. */
1285
1286         __sync_synchronize();
1287
1288         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1289                 log_error("Failed to truncate file to its own size: %m");
1290 }
1291
1292 static int entry_item_cmp(const void *_a, const void *_b) {
1293         const EntryItem *a = _a, *b = _b;
1294
1295         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1296                 return -1;
1297         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1298                 return 1;
1299         return 0;
1300 }
1301
1302 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1303         unsigned i;
1304         EntryItem *items;
1305         int r;
1306         uint64_t xor_hash = 0;
1307         struct dual_timestamp _ts;
1308
1309         assert(f);
1310         assert(iovec || n_iovec == 0);
1311
1312         if (!ts) {
1313                 dual_timestamp_get(&_ts);
1314                 ts = &_ts;
1315         }
1316
1317         if (f->tail_entry_monotonic_valid &&
1318             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1319                 return -EINVAL;
1320
1321 #ifdef HAVE_GCRYPT
1322         r = journal_file_maybe_append_tag(f, ts->realtime);
1323         if (r < 0)
1324                 return r;
1325 #endif
1326
1327         /* alloca() can't take 0, hence let's allocate at least one */
1328         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1329
1330         for (i = 0; i < n_iovec; i++) {
1331                 uint64_t p;
1332                 Object *o;
1333
1334                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1335                 if (r < 0)
1336                         return r;
1337
1338                 xor_hash ^= le64toh(o->data.hash);
1339                 items[i].object_offset = htole64(p);
1340                 items[i].hash = o->data.hash;
1341         }
1342
1343         /* Order by the position on disk, in order to improve seek
1344          * times for rotating media. */
1345         qsort(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1346
1347         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1348
1349         journal_file_post_change(f);
1350
1351         return r;
1352 }
1353
typedef struct ChainCacheItem ChainCacheItem;

/* Remembers, for one chain of entry arrays, an array further down the
 * chain, so that later lookups need not start at the head again. */
struct ChainCacheItem {
        /* Offset of the array at the beginning of the chain */
        uint64_t first;

        /* Offset of the cached array */
        uint64_t array;

        /* The first item in the cached array */
        uint64_t begin;

        /* The total number of items in all arrays before this one in
         * the chain */
        uint64_t total;
};
1360
1361 static void chain_cache_put(
1362                 Hashmap *h,
1363                 ChainCacheItem *ci,
1364                 uint64_t first,
1365                 uint64_t array,
1366                 uint64_t begin,
1367                 uint64_t total) {
1368
1369         if (!ci) {
1370                 /* If the chain item to cache for this chain is the
1371                  * first one it's not worth caching anything */
1372                 if (array == first)
1373                         return;
1374
1375                 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1376                         ci = hashmap_steal_first(h);
1377                 else {
1378                         ci = new(ChainCacheItem, 1);
1379                         if (!ci)
1380                                 return;
1381                 }
1382
1383                 ci->first = first;
1384
1385                 if (hashmap_put(h, &ci->first, ci) < 0) {
1386                         free(ci);
1387                         return;
1388                 }
1389         } else
1390                 assert(ci->first == first);
1391
1392         ci->array = array;
1393         ci->begin = begin;
1394         ci->total = total;
1395 }
1396
1397 static int generic_array_get(JournalFile *f,
1398                              uint64_t first,
1399                              uint64_t i,
1400                              Object **ret, uint64_t *offset) {
1401
1402         Object *o;
1403         uint64_t p = 0, a, t = 0;
1404         int r;
1405         ChainCacheItem *ci;
1406
1407         assert(f);
1408
1409         a = first;
1410
1411         /* Try the chain cache first */
1412         ci = hashmap_get(f->chain_cache, &first);
1413         if (ci && i > ci->total) {
1414                 a = ci->array;
1415                 i -= ci->total;
1416                 t = ci->total;
1417         }
1418
1419         while (a > 0) {
1420                 uint64_t k;
1421
1422                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1423                 if (r < 0)
1424                         return r;
1425
1426                 k = journal_file_entry_array_n_items(o);
1427                 if (i < k) {
1428                         p = le64toh(o->entry_array.items[i]);
1429                         goto found;
1430                 }
1431
1432                 i -= k;
1433                 t += k;
1434                 a = le64toh(o->entry_array.next_entry_array_offset);
1435         }
1436
1437         return 0;
1438
1439 found:
1440         /* Let's cache this item for the next invocation */
1441         chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t);
1442
1443         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1444         if (r < 0)
1445                 return r;
1446
1447         if (ret)
1448                 *ret = o;
1449
1450         if (offset)
1451                 *offset = p;
1452
1453         return 1;
1454 }
1455
1456 static int generic_array_get_plus_one(JournalFile *f,
1457                                       uint64_t extra,
1458                                       uint64_t first,
1459                                       uint64_t i,
1460                                       Object **ret, uint64_t *offset) {
1461
1462         Object *o;
1463
1464         assert(f);
1465
1466         if (i == 0) {
1467                 int r;
1468
1469                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1470                 if (r < 0)
1471                         return r;
1472
1473                 if (ret)
1474                         *ret = o;
1475
1476                 if (offset)
1477                         *offset = extra;
1478
1479                 return 1;
1480         }
1481
1482         return generic_array_get(f, first, i-1, ret, offset);
1483 }
1484
/* Verdicts the test_object() callbacks hand to the bisection code */
enum {
        TEST_FOUND = 0, /* the tested entry matches the needle */
        TEST_LEFT  = 1, /* the tested entry is smaller than the needle */
        TEST_RIGHT = 2  /* the tested entry is larger than the needle */
};
1490
1491 static int generic_array_bisect(JournalFile *f,
1492                                 uint64_t first,
1493                                 uint64_t n,
1494                                 uint64_t needle,
1495                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1496                                 direction_t direction,
1497                                 Object **ret,
1498                                 uint64_t *offset,
1499                                 uint64_t *idx) {
1500
1501         uint64_t a, p, t = 0, i = 0, last_p = 0;
1502         bool subtract_one = false;
1503         Object *o, *array = NULL;
1504         int r;
1505         ChainCacheItem *ci;
1506
1507         assert(f);
1508         assert(test_object);
1509
1510         /* Start with the first array in the chain */
1511         a = first;
1512
1513         ci = hashmap_get(f->chain_cache, &first);
1514         if (ci && n > ci->total) {
1515                 /* Ah, we have iterated this bisection array chain
1516                  * previously! Let's see if we can skip ahead in the
1517                  * chain, as far as the last time. But we can't jump
1518                  * backwards in the chain, so let's check that
1519                  * first. */
1520
1521                 r = test_object(f, ci->begin, needle);
1522                 if (r < 0)
1523                         return r;
1524
1525                 if (r == TEST_LEFT) {
1526                         /* OK, what we are looking for is right of th
1527                          * begin of this EntryArray, so let's jump
1528                          * straight to previously cached array in the
1529                          * chain */
1530
1531                         a = ci->array;
1532                         n -= ci->total;
1533                         t = ci->total;
1534                 }
1535         }
1536
1537         while (a > 0) {
1538                 uint64_t left, right, k, lp;
1539
1540                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1541                 if (r < 0)
1542                         return r;
1543
1544                 k = journal_file_entry_array_n_items(array);
1545                 right = MIN(k, n);
1546                 if (right <= 0)
1547                         return 0;
1548
1549                 i = right - 1;
1550                 lp = p = le64toh(array->entry_array.items[i]);
1551                 if (p <= 0)
1552                         return -EBADMSG;
1553
1554                 r = test_object(f, p, needle);
1555                 if (r < 0)
1556                         return r;
1557
1558                 if (r == TEST_FOUND)
1559                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1560
1561                 if (r == TEST_RIGHT) {
1562                         left = 0;
1563                         right -= 1;
1564                         for (;;) {
1565                                 if (left == right) {
1566                                         if (direction == DIRECTION_UP)
1567                                                 subtract_one = true;
1568
1569                                         i = left;
1570                                         goto found;
1571                                 }
1572
1573                                 assert(left < right);
1574
1575                                 i = (left + right) / 2;
1576                                 p = le64toh(array->entry_array.items[i]);
1577                                 if (p <= 0)
1578                                         return -EBADMSG;
1579
1580                                 r = test_object(f, p, needle);
1581                                 if (r < 0)
1582                                         return r;
1583
1584                                 if (r == TEST_FOUND)
1585                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1586
1587                                 if (r == TEST_RIGHT)
1588                                         right = i;
1589                                 else
1590                                         left = i + 1;
1591                         }
1592                 }
1593
1594                 if (k > n) {
1595                         if (direction == DIRECTION_UP) {
1596                                 i = n;
1597                                 subtract_one = true;
1598                                 goto found;
1599                         }
1600
1601                         return 0;
1602                 }
1603
1604                 last_p = lp;
1605
1606                 n -= k;
1607                 t += k;
1608                 a = le64toh(array->entry_array.next_entry_array_offset);
1609         }
1610
1611         return 0;
1612
1613 found:
1614         if (subtract_one && t == 0 && i == 0)
1615                 return 0;
1616
1617         /* Let's cache this item for the next invocation */
1618         chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t);
1619
1620         if (subtract_one && i == 0)
1621                 p = last_p;
1622         else if (subtract_one)
1623                 p = le64toh(array->entry_array.items[i-1]);
1624         else
1625                 p = le64toh(array->entry_array.items[i]);
1626
1627         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1628         if (r < 0)
1629                 return r;
1630
1631         if (ret)
1632                 *ret = o;
1633
1634         if (offset)
1635                 *offset = p;
1636
1637         if (idx)
1638                 *idx = t + i + (subtract_one ? -1 : 0);
1639
1640         return 1;
1641 }
1642
1643 static int generic_array_bisect_plus_one(JournalFile *f,
1644                                          uint64_t extra,
1645                                          uint64_t first,
1646                                          uint64_t n,
1647                                          uint64_t needle,
1648                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1649                                          direction_t direction,
1650                                          Object **ret,
1651                                          uint64_t *offset,
1652                                          uint64_t *idx) {
1653
1654         int r;
1655         bool step_back = false;
1656         Object *o;
1657
1658         assert(f);
1659         assert(test_object);
1660
1661         if (n <= 0)
1662                 return 0;
1663
1664         /* This bisects the array in object 'first', but first checks
1665          * an extra  */
1666         r = test_object(f, extra, needle);
1667         if (r < 0)
1668                 return r;
1669
1670         if (r == TEST_FOUND)
1671                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1672
1673         /* if we are looking with DIRECTION_UP then we need to first
1674            see if in the actual array there is a matching entry, and
1675            return the last one of that. But if there isn't any we need
1676            to return this one. Hence remember this, and return it
1677            below. */
1678         if (r == TEST_LEFT)
1679                 step_back = direction == DIRECTION_UP;
1680
1681         if (r == TEST_RIGHT) {
1682                 if (direction == DIRECTION_DOWN)
1683                         goto found;
1684                 else
1685                         return 0;
1686         }
1687
1688         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1689
1690         if (r == 0 && step_back)
1691                 goto found;
1692
1693         if (r > 0 && idx)
1694                 (*idx) ++;
1695
1696         return r;
1697
1698 found:
1699         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1700         if (r < 0)
1701                 return r;
1702
1703         if (ret)
1704                 *ret = o;
1705
1706         if (offset)
1707                 *offset = extra;
1708
1709         if (idx)
1710                 *idx = 0;
1711
1712         return 1;
1713 }
1714
1715 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1716         assert(f);
1717         assert(p > 0);
1718
1719         if (p == needle)
1720                 return TEST_FOUND;
1721         else if (p < needle)
1722                 return TEST_LEFT;
1723         else
1724                 return TEST_RIGHT;
1725 }
1726
1727 int journal_file_move_to_entry_by_offset(
1728                 JournalFile *f,
1729                 uint64_t p,
1730                 direction_t direction,
1731                 Object **ret,
1732                 uint64_t *offset) {
1733
1734         return generic_array_bisect(f,
1735                                     le64toh(f->header->entry_array_offset),
1736                                     le64toh(f->header->n_entries),
1737                                     p,
1738                                     test_object_offset,
1739                                     direction,
1740                                     ret, offset, NULL);
1741 }
1742
1743
1744 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1745         Object *o;
1746         int r;
1747
1748         assert(f);
1749         assert(p > 0);
1750
1751         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1752         if (r < 0)
1753                 return r;
1754
1755         if (le64toh(o->entry.seqnum) == needle)
1756                 return TEST_FOUND;
1757         else if (le64toh(o->entry.seqnum) < needle)
1758                 return TEST_LEFT;
1759         else
1760                 return TEST_RIGHT;
1761 }
1762
1763 int journal_file_move_to_entry_by_seqnum(
1764                 JournalFile *f,
1765                 uint64_t seqnum,
1766                 direction_t direction,
1767                 Object **ret,
1768                 uint64_t *offset) {
1769
1770         return generic_array_bisect(f,
1771                                     le64toh(f->header->entry_array_offset),
1772                                     le64toh(f->header->n_entries),
1773                                     seqnum,
1774                                     test_object_seqnum,
1775                                     direction,
1776                                     ret, offset, NULL);
1777 }
1778
1779 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1780         Object *o;
1781         int r;
1782
1783         assert(f);
1784         assert(p > 0);
1785
1786         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1787         if (r < 0)
1788                 return r;
1789
1790         if (le64toh(o->entry.realtime) == needle)
1791                 return TEST_FOUND;
1792         else if (le64toh(o->entry.realtime) < needle)
1793                 return TEST_LEFT;
1794         else
1795                 return TEST_RIGHT;
1796 }
1797
1798 int journal_file_move_to_entry_by_realtime(
1799                 JournalFile *f,
1800                 uint64_t realtime,
1801                 direction_t direction,
1802                 Object **ret,
1803                 uint64_t *offset) {
1804
1805         return generic_array_bisect(f,
1806                                     le64toh(f->header->entry_array_offset),
1807                                     le64toh(f->header->n_entries),
1808                                     realtime,
1809                                     test_object_realtime,
1810                                     direction,
1811                                     ret, offset, NULL);
1812 }
1813
1814 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1815         Object *o;
1816         int r;
1817
1818         assert(f);
1819         assert(p > 0);
1820
1821         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1822         if (r < 0)
1823                 return r;
1824
1825         if (le64toh(o->entry.monotonic) == needle)
1826                 return TEST_FOUND;
1827         else if (le64toh(o->entry.monotonic) < needle)
1828                 return TEST_LEFT;
1829         else
1830                 return TEST_RIGHT;
1831 }
1832
1833 int journal_file_move_to_entry_by_monotonic(
1834                 JournalFile *f,
1835                 sd_id128_t boot_id,
1836                 uint64_t monotonic,
1837                 direction_t direction,
1838                 Object **ret,
1839                 uint64_t *offset) {
1840
1841         char t[9+32+1] = "_BOOT_ID=";
1842         Object *o;
1843         int r;
1844
1845         assert(f);
1846
1847         sd_id128_to_string(boot_id, t + 9);
1848         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1849         if (r < 0)
1850                 return r;
1851         if (r == 0)
1852                 return -ENOENT;
1853
1854         return generic_array_bisect_plus_one(f,
1855                                              le64toh(o->data.entry_offset),
1856                                              le64toh(o->data.entry_array_offset),
1857                                              le64toh(o->data.n_entries),
1858                                              monotonic,
1859                                              test_object_monotonic,
1860                                              direction,
1861                                              ret, offset, NULL);
1862 }
1863
1864 int journal_file_next_entry(
1865                 JournalFile *f,
1866                 Object *o, uint64_t p,
1867                 direction_t direction,
1868                 Object **ret, uint64_t *offset) {
1869
1870         uint64_t i, n;
1871         int r;
1872
1873         assert(f);
1874         assert(p > 0 || !o);
1875
1876         n = le64toh(f->header->n_entries);
1877         if (n <= 0)
1878                 return 0;
1879
1880         if (!o)
1881                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1882         else {
1883                 if (o->object.type != OBJECT_ENTRY)
1884                         return -EINVAL;
1885
1886                 r = generic_array_bisect(f,
1887                                          le64toh(f->header->entry_array_offset),
1888                                          le64toh(f->header->n_entries),
1889                                          p,
1890                                          test_object_offset,
1891                                          DIRECTION_DOWN,
1892                                          NULL, NULL,
1893                                          &i);
1894                 if (r <= 0)
1895                         return r;
1896
1897                 if (direction == DIRECTION_DOWN) {
1898                         if (i >= n - 1)
1899                                 return 0;
1900
1901                         i++;
1902                 } else {
1903                         if (i <= 0)
1904                                 return 0;
1905
1906                         i--;
1907                 }
1908         }
1909
1910         /* And jump to it */
1911         return generic_array_get(f,
1912                                  le64toh(f->header->entry_array_offset),
1913                                  i,
1914                                  ret, offset);
1915 }
1916
1917 int journal_file_skip_entry(
1918                 JournalFile *f,
1919                 Object *o, uint64_t p,
1920                 int64_t skip,
1921                 Object **ret, uint64_t *offset) {
1922
1923         uint64_t i, n;
1924         int r;
1925
1926         assert(f);
1927         assert(o);
1928         assert(p > 0);
1929
1930         if (o->object.type != OBJECT_ENTRY)
1931                 return -EINVAL;
1932
1933         r = generic_array_bisect(f,
1934                                  le64toh(f->header->entry_array_offset),
1935                                  le64toh(f->header->n_entries),
1936                                  p,
1937                                  test_object_offset,
1938                                  DIRECTION_DOWN,
1939                                  NULL, NULL,
1940                                  &i);
1941         if (r <= 0)
1942                 return r;
1943
1944         /* Calculate new index */
1945         if (skip < 0) {
1946                 if ((uint64_t) -skip >= i)
1947                         i = 0;
1948                 else
1949                         i = i - (uint64_t) -skip;
1950         } else
1951                 i  += (uint64_t) skip;
1952
1953         n = le64toh(f->header->n_entries);
1954         if (n <= 0)
1955                 return -EBADMSG;
1956
1957         if (i >= n)
1958                 i = n-1;
1959
1960         return generic_array_get(f,
1961                                  le64toh(f->header->entry_array_offset),
1962                                  i,
1963                                  ret, offset);
1964 }
1965
1966 int journal_file_next_entry_for_data(
1967                 JournalFile *f,
1968                 Object *o, uint64_t p,
1969                 uint64_t data_offset,
1970                 direction_t direction,
1971                 Object **ret, uint64_t *offset) {
1972
1973         uint64_t n, i;
1974         int r;
1975         Object *d;
1976
1977         assert(f);
1978         assert(p > 0 || !o);
1979
1980         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1981         if (r < 0)
1982                 return r;
1983
1984         n = le64toh(d->data.n_entries);
1985         if (n <= 0)
1986                 return n;
1987
1988         if (!o)
1989                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1990         else {
1991                 if (o->object.type != OBJECT_ENTRY)
1992                         return -EINVAL;
1993
1994                 r = generic_array_bisect_plus_one(f,
1995                                                   le64toh(d->data.entry_offset),
1996                                                   le64toh(d->data.entry_array_offset),
1997                                                   le64toh(d->data.n_entries),
1998                                                   p,
1999                                                   test_object_offset,
2000                                                   DIRECTION_DOWN,
2001                                                   NULL, NULL,
2002                                                   &i);
2003
2004                 if (r <= 0)
2005                         return r;
2006
2007                 if (direction == DIRECTION_DOWN) {
2008                         if (i >= n - 1)
2009                                 return 0;
2010
2011                         i++;
2012                 } else {
2013                         if (i <= 0)
2014                                 return 0;
2015
2016                         i--;
2017                 }
2018
2019         }
2020
2021         return generic_array_get_plus_one(f,
2022                                           le64toh(d->data.entry_offset),
2023                                           le64toh(d->data.entry_array_offset),
2024                                           i,
2025                                           ret, offset);
2026 }
2027
2028 int journal_file_move_to_entry_by_offset_for_data(
2029                 JournalFile *f,
2030                 uint64_t data_offset,
2031                 uint64_t p,
2032                 direction_t direction,
2033                 Object **ret, uint64_t *offset) {
2034
2035         int r;
2036         Object *d;
2037
2038         assert(f);
2039
2040         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2041         if (r < 0)
2042                 return r;
2043
2044         return generic_array_bisect_plus_one(f,
2045                                              le64toh(d->data.entry_offset),
2046                                              le64toh(d->data.entry_array_offset),
2047                                              le64toh(d->data.n_entries),
2048                                              p,
2049                                              test_object_offset,
2050                                              direction,
2051                                              ret, offset, NULL);
2052 }
2053
2054 int journal_file_move_to_entry_by_monotonic_for_data(
2055                 JournalFile *f,
2056                 uint64_t data_offset,
2057                 sd_id128_t boot_id,
2058                 uint64_t monotonic,
2059                 direction_t direction,
2060                 Object **ret, uint64_t *offset) {
2061
2062         char t[9+32+1] = "_BOOT_ID=";
2063         Object *o, *d;
2064         int r;
2065         uint64_t b, z;
2066
2067         assert(f);
2068
2069         /* First, seek by time */
2070         sd_id128_to_string(boot_id, t + 9);
2071         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
2072         if (r < 0)
2073                 return r;
2074         if (r == 0)
2075                 return -ENOENT;
2076
2077         r = generic_array_bisect_plus_one(f,
2078                                           le64toh(o->data.entry_offset),
2079                                           le64toh(o->data.entry_array_offset),
2080                                           le64toh(o->data.n_entries),
2081                                           monotonic,
2082                                           test_object_monotonic,
2083                                           direction,
2084                                           NULL, &z, NULL);
2085         if (r <= 0)
2086                 return r;
2087
2088         /* And now, continue seeking until we find an entry that
2089          * exists in both bisection arrays */
2090
2091         for (;;) {
2092                 Object *qo;
2093                 uint64_t p, q;
2094
2095                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2096                 if (r < 0)
2097                         return r;
2098
2099                 r = generic_array_bisect_plus_one(f,
2100                                                   le64toh(d->data.entry_offset),
2101                                                   le64toh(d->data.entry_array_offset),
2102                                                   le64toh(d->data.n_entries),
2103                                                   z,
2104                                                   test_object_offset,
2105                                                   direction,
2106                                                   NULL, &p, NULL);
2107                 if (r <= 0)
2108                         return r;
2109
2110                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2111                 if (r < 0)
2112                         return r;
2113
2114                 r = generic_array_bisect_plus_one(f,
2115                                                   le64toh(o->data.entry_offset),
2116                                                   le64toh(o->data.entry_array_offset),
2117                                                   le64toh(o->data.n_entries),
2118                                                   p,
2119                                                   test_object_offset,
2120                                                   direction,
2121                                                   &qo, &q, NULL);
2122
2123                 if (r <= 0)
2124                         return r;
2125
2126                 if (p == q) {
2127                         if (ret)
2128                                 *ret = qo;
2129                         if (offset)
2130                                 *offset = q;
2131
2132                         return 1;
2133                 }
2134
2135                 z = q;
2136         }
2137
2138         return 0;
2139 }
2140
2141 int journal_file_move_to_entry_by_seqnum_for_data(
2142                 JournalFile *f,
2143                 uint64_t data_offset,
2144                 uint64_t seqnum,
2145                 direction_t direction,
2146                 Object **ret, uint64_t *offset) {
2147
2148         Object *d;
2149         int r;
2150
2151         assert(f);
2152
2153         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2154         if (r < 0)
2155                 return r;
2156
2157         return generic_array_bisect_plus_one(f,
2158                                              le64toh(d->data.entry_offset),
2159                                              le64toh(d->data.entry_array_offset),
2160                                              le64toh(d->data.n_entries),
2161                                              seqnum,
2162                                              test_object_seqnum,
2163                                              direction,
2164                                              ret, offset, NULL);
2165 }
2166
2167 int journal_file_move_to_entry_by_realtime_for_data(
2168                 JournalFile *f,
2169                 uint64_t data_offset,
2170                 uint64_t realtime,
2171                 direction_t direction,
2172                 Object **ret, uint64_t *offset) {
2173
2174         Object *d;
2175         int r;
2176
2177         assert(f);
2178
2179         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2180         if (r < 0)
2181                 return r;
2182
2183         return generic_array_bisect_plus_one(f,
2184                                              le64toh(d->data.entry_offset),
2185                                              le64toh(d->data.entry_array_offset),
2186                                              le64toh(d->data.n_entries),
2187                                              realtime,
2188                                              test_object_realtime,
2189                                              direction,
2190                                              ret, offset, NULL);
2191 }
2192
2193 void journal_file_dump(JournalFile *f) {
2194         Object *o;
2195         int r;
2196         uint64_t p;
2197
2198         assert(f);
2199
2200         journal_file_print_header(f);
2201
2202         p = le64toh(f->header->header_size);
2203         while (p != 0) {
2204                 r = journal_file_move_to_object(f, -1, p, &o);
2205                 if (r < 0)
2206                         goto fail;
2207
2208                 switch (o->object.type) {
2209
2210                 case OBJECT_UNUSED:
2211                         printf("Type: OBJECT_UNUSED\n");
2212                         break;
2213
2214                 case OBJECT_DATA:
2215                         printf("Type: OBJECT_DATA\n");
2216                         break;
2217
2218                 case OBJECT_FIELD:
2219                         printf("Type: OBJECT_FIELD\n");
2220                         break;
2221
2222                 case OBJECT_ENTRY:
2223                         printf("Type: OBJECT_ENTRY seqnum=%llu monotonic=%llu realtime=%llu\n",
2224                                (unsigned long long) le64toh(o->entry.seqnum),
2225                                (unsigned long long) le64toh(o->entry.monotonic),
2226                                (unsigned long long) le64toh(o->entry.realtime));
2227                         break;
2228
2229                 case OBJECT_FIELD_HASH_TABLE:
2230                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2231                         break;
2232
2233                 case OBJECT_DATA_HASH_TABLE:
2234                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2235                         break;
2236
2237                 case OBJECT_ENTRY_ARRAY:
2238                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2239                         break;
2240
2241                 case OBJECT_TAG:
2242                         printf("Type: OBJECT_TAG seqnum=%llu epoch=%llu\n",
2243                                (unsigned long long) le64toh(o->tag.seqnum),
2244                                (unsigned long long) le64toh(o->tag.epoch));
2245                         break;
2246
2247                 default:
2248                         printf("Type: unknown (%u)\n", o->object.type);
2249                         break;
2250                 }
2251
2252                 if (o->object.flags & OBJECT_COMPRESSED)
2253                         printf("Flags: COMPRESSED\n");
2254
2255                 if (p == le64toh(f->header->tail_object_offset))
2256                         p = 0;
2257                 else
2258                         p = p + ALIGN64(le64toh(o->object.size));
2259         }
2260
2261         return;
2262 fail:
2263         log_error("File corrupt");
2264 }
2265
2266 void journal_file_print_header(JournalFile *f) {
2267         char a[33], b[33], c[33];
2268         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
2269         struct stat st;
2270         char bytes[FORMAT_BYTES_MAX];
2271
2272         assert(f);
2273
2274         printf("File Path: %s\n"
2275                "File ID: %s\n"
2276                "Machine ID: %s\n"
2277                "Boot ID: %s\n"
2278                "Sequential Number ID: %s\n"
2279                "State: %s\n"
2280                "Compatible Flags:%s%s\n"
2281                "Incompatible Flags:%s%s\n"
2282                "Header size: %llu\n"
2283                "Arena size: %llu\n"
2284                "Data Hash Table Size: %llu\n"
2285                "Field Hash Table Size: %llu\n"
2286                "Rotate Suggested: %s\n"
2287                "Head Sequential Number: %llu\n"
2288                "Tail Sequential Number: %llu\n"
2289                "Head Realtime Timestamp: %s\n"
2290                "Tail Realtime Timestamp: %s\n"
2291                "Objects: %llu\n"
2292                "Entry Objects: %llu\n",
2293                f->path,
2294                sd_id128_to_string(f->header->file_id, a),
2295                sd_id128_to_string(f->header->machine_id, b),
2296                sd_id128_to_string(f->header->boot_id, c),
2297                sd_id128_to_string(f->header->seqnum_id, c),
2298                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2299                f->header->state == STATE_ONLINE ? "ONLINE" :
2300                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2301                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2302                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2303                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2304                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2305                (unsigned long long) le64toh(f->header->header_size),
2306                (unsigned long long) le64toh(f->header->arena_size),
2307                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2308                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2309                yes_no(journal_file_rotate_suggested(f, 0)),
2310                (unsigned long long) le64toh(f->header->head_entry_seqnum),
2311                (unsigned long long) le64toh(f->header->tail_entry_seqnum),
2312                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2313                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2314                (unsigned long long) le64toh(f->header->n_objects),
2315                (unsigned long long) le64toh(f->header->n_entries));
2316
2317         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2318                 printf("Data Objects: %llu\n"
2319                        "Data Hash Table Fill: %.1f%%\n",
2320                        (unsigned long long) le64toh(f->header->n_data),
2321                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2322
2323         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2324                 printf("Field Objects: %llu\n"
2325                        "Field Hash Table Fill: %.1f%%\n",
2326                        (unsigned long long) le64toh(f->header->n_fields),
2327                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2328
2329         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2330                 printf("Tag Objects: %llu\n",
2331                        (unsigned long long) le64toh(f->header->n_tags));
2332         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2333                 printf("Entry Array Objects: %llu\n",
2334                        (unsigned long long) le64toh(f->header->n_entry_arrays));
2335
2336         if (fstat(f->fd, &st) >= 0)
2337                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2338 }
2339
2340 int journal_file_open(
2341                 const char *fname,
2342                 int flags,
2343                 mode_t mode,
2344                 bool compress,
2345                 bool seal,
2346                 JournalMetrics *metrics,
2347                 MMapCache *mmap_cache,
2348                 JournalFile *template,
2349                 JournalFile **ret) {
2350
2351         JournalFile *f;
2352         int r;
2353         bool newly_created = false;
2354
2355         assert(fname);
2356         assert(ret);
2357
2358         if ((flags & O_ACCMODE) != O_RDONLY &&
2359             (flags & O_ACCMODE) != O_RDWR)
2360                 return -EINVAL;
2361
2362         if (!endswith(fname, ".journal") &&
2363             !endswith(fname, ".journal~"))
2364                 return -EINVAL;
2365
2366         f = new0(JournalFile, 1);
2367         if (!f)
2368                 return -ENOMEM;
2369
2370         f->fd = -1;
2371         f->mode = mode;
2372
2373         f->flags = flags;
2374         f->prot = prot_from_flags(flags);
2375         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2376 #ifdef HAVE_XZ
2377         f->compress = compress;
2378 #endif
2379 #ifdef HAVE_GCRYPT
2380         f->seal = seal;
2381 #endif
2382
2383         if (mmap_cache)
2384                 f->mmap = mmap_cache_ref(mmap_cache);
2385         else {
2386                 f->mmap = mmap_cache_new();
2387                 if (!f->mmap) {
2388                         r = -ENOMEM;
2389                         goto fail;
2390                 }
2391         }
2392
2393         f->path = strdup(fname);
2394         if (!f->path) {
2395                 r = -ENOMEM;
2396                 goto fail;
2397         }
2398
2399         f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2400         if (!f->chain_cache) {
2401                 r = -ENOMEM;
2402                 goto fail;
2403         }
2404
2405         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2406         if (f->fd < 0) {
2407                 r = -errno;
2408                 goto fail;
2409         }
2410
2411         if (fstat(f->fd, &f->last_stat) < 0) {
2412                 r = -errno;
2413                 goto fail;
2414         }
2415
2416         if (f->last_stat.st_size == 0 && f->writable) {
2417 #ifdef HAVE_XATTR
2418                 uint64_t crtime;
2419
2420                 /* Let's attach the creation time to the journal file,
2421                  * so that the vacuuming code knows the age of this
2422                  * file even if the file might end up corrupted one
2423                  * day... Ideally we'd just use the creation time many
2424                  * file systems maintain for each file, but there is
2425                  * currently no usable API to query this, hence let's
2426                  * emulate this via extended attributes. If extended
2427                  * attributes are not supported we'll just skip this,
2428                  * and rely solely on mtime/atime/ctime of the file.*/
2429
2430                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2431                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2432 #endif
2433
2434 #ifdef HAVE_GCRYPT
2435                 /* Try to load the FSPRG state, and if we can't, then
2436                  * just don't do sealing */
2437                 if (f->seal) {
2438                         r = journal_file_fss_load(f);
2439                         if (r < 0)
2440                                 f->seal = false;
2441                 }
2442 #endif
2443
2444                 r = journal_file_init_header(f, template);
2445                 if (r < 0)
2446                         goto fail;
2447
2448                 if (fstat(f->fd, &f->last_stat) < 0) {
2449                         r = -errno;
2450                         goto fail;
2451                 }
2452
2453                 newly_created = true;
2454         }
2455
2456         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2457                 r = -EIO;
2458                 goto fail;
2459         }
2460
2461         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2462         if (f->header == MAP_FAILED) {
2463                 f->header = NULL;
2464                 r = -errno;
2465                 goto fail;
2466         }
2467
2468         if (!newly_created) {
2469                 r = journal_file_verify_header(f);
2470                 if (r < 0)
2471                         goto fail;
2472         }
2473
2474 #ifdef HAVE_GCRYPT
2475         if (!newly_created && f->writable) {
2476                 r = journal_file_fss_load(f);
2477                 if (r < 0)
2478                         goto fail;
2479         }
2480 #endif
2481
2482         if (f->writable) {
2483                 if (metrics) {
2484                         journal_default_metrics(metrics, f->fd);
2485                         f->metrics = *metrics;
2486                 } else if (template)
2487                         f->metrics = template->metrics;
2488
2489                 r = journal_file_refresh_header(f);
2490                 if (r < 0)
2491                         goto fail;
2492         }
2493
2494 #ifdef HAVE_GCRYPT
2495         r = journal_file_hmac_setup(f);
2496         if (r < 0)
2497                 goto fail;
2498 #endif
2499
2500         if (newly_created) {
2501                 r = journal_file_setup_field_hash_table(f);
2502                 if (r < 0)
2503                         goto fail;
2504
2505                 r = journal_file_setup_data_hash_table(f);
2506                 if (r < 0)
2507                         goto fail;
2508
2509 #ifdef HAVE_GCRYPT
2510                 r = journal_file_append_first_tag(f);
2511                 if (r < 0)
2512                         goto fail;
2513 #endif
2514         }
2515
2516         r = journal_file_map_field_hash_table(f);
2517         if (r < 0)
2518                 goto fail;
2519
2520         r = journal_file_map_data_hash_table(f);
2521         if (r < 0)
2522                 goto fail;
2523
2524         *ret = f;
2525         return 0;
2526
2527 fail:
2528         journal_file_close(f);
2529
2530         return r;
2531 }
2532
2533 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2534         char *p;
2535         size_t l;
2536         JournalFile *old_file, *new_file = NULL;
2537         int r;
2538
2539         assert(f);
2540         assert(*f);
2541
2542         old_file = *f;
2543
2544         if (!old_file->writable)
2545                 return -EINVAL;
2546
2547         if (!endswith(old_file->path, ".journal"))
2548                 return -EINVAL;
2549
2550         l = strlen(old_file->path);
2551
2552         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2553         if (!p)
2554                 return -ENOMEM;
2555
2556         memcpy(p, old_file->path, l - 8);
2557         p[l-8] = '@';
2558         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2559         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2560                  "-%016llx-%016llx.journal",
2561                  (unsigned long long) le64toh((*f)->header->head_entry_seqnum),
2562                  (unsigned long long) le64toh((*f)->header->head_entry_realtime));
2563
2564         r = rename(old_file->path, p);
2565         free(p);
2566
2567         if (r < 0)
2568                 return -errno;
2569
2570         old_file->header->state = STATE_ARCHIVED;
2571
2572         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2573         journal_file_close(old_file);
2574
2575         *f = new_file;
2576         return r;
2577 }
2578
2579 int journal_file_open_reliably(
2580                 const char *fname,
2581                 int flags,
2582                 mode_t mode,
2583                 bool compress,
2584                 bool seal,
2585                 JournalMetrics *metrics,
2586                 MMapCache *mmap_cache,
2587                 JournalFile *template,
2588                 JournalFile **ret) {
2589
2590         int r;
2591         size_t l;
2592         char *p;
2593
2594         r = journal_file_open(fname, flags, mode, compress, seal,
2595                               metrics, mmap_cache, template, ret);
2596         if (r != -EBADMSG && /* corrupted */
2597             r != -ENODATA && /* truncated */
2598             r != -EHOSTDOWN && /* other machine */
2599             r != -EPROTONOSUPPORT && /* incompatible feature */
2600             r != -EBUSY && /* unclean shutdown */
2601             r != -ESHUTDOWN /* already archived */)
2602                 return r;
2603
2604         if ((flags & O_ACCMODE) == O_RDONLY)
2605                 return r;
2606
2607         if (!(flags & O_CREAT))
2608                 return r;
2609
2610         if (!endswith(fname, ".journal"))
2611                 return r;
2612
2613         /* The file is corrupted. Rotate it away and try it again (but only once) */
2614
2615         l = strlen(fname);
2616         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2617                      (int) (l-8), fname,
2618                      (unsigned long long) now(CLOCK_REALTIME),
2619                      random_ull()) < 0)
2620                 return -ENOMEM;
2621
2622         r = rename(fname, p);
2623         free(p);
2624         if (r < 0)
2625                 return -errno;
2626
2627         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2628
2629         return journal_file_open(fname, flags, mode, compress, seal,
2630                                  metrics, mmap_cache, template, ret);
2631 }
2632
2633 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2634         uint64_t i, n;
2635         uint64_t q, xor_hash = 0;
2636         int r;
2637         EntryItem *items;
2638         dual_timestamp ts;
2639
2640         assert(from);
2641         assert(to);
2642         assert(o);
2643         assert(p);
2644
2645         if (!to->writable)
2646                 return -EPERM;
2647
2648         ts.monotonic = le64toh(o->entry.monotonic);
2649         ts.realtime = le64toh(o->entry.realtime);
2650
2651         if (to->tail_entry_monotonic_valid &&
2652             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2653                 return -EINVAL;
2654
2655         n = journal_file_entry_n_items(o);
2656         items = alloca(sizeof(EntryItem) * n);
2657
2658         for (i = 0; i < n; i++) {
2659                 uint64_t l, h;
2660                 le64_t le_hash;
2661                 size_t t;
2662                 void *data;
2663                 Object *u;
2664
2665                 q = le64toh(o->entry.items[i].object_offset);
2666                 le_hash = o->entry.items[i].hash;
2667
2668                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2669                 if (r < 0)
2670                         return r;
2671
2672                 if (le_hash != o->data.hash)
2673                         return -EBADMSG;
2674
2675                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2676                 t = (size_t) l;
2677
2678                 /* We hit the limit on 32bit machines */
2679                 if ((uint64_t) t != l)
2680                         return -E2BIG;
2681
2682                 if (o->object.flags & OBJECT_COMPRESSED) {
2683 #ifdef HAVE_XZ
2684                         uint64_t rsize;
2685
2686                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2687                                 return -EBADMSG;
2688
2689                         data = from->compress_buffer;
2690                         l = rsize;
2691 #else
2692                         return -EPROTONOSUPPORT;
2693 #endif
2694                 } else
2695                         data = o->data.payload;
2696
2697                 r = journal_file_append_data(to, data, l, &u, &h);
2698                 if (r < 0)
2699                         return r;
2700
2701                 xor_hash ^= le64toh(u->data.hash);
2702                 items[i].object_offset = htole64(h);
2703                 items[i].hash = u->data.hash;
2704
2705                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2706                 if (r < 0)
2707                         return r;
2708         }
2709
2710         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2711 }
2712
2713 void journal_default_metrics(JournalMetrics *m, int fd) {
2714         uint64_t fs_size = 0;
2715         struct statvfs ss;
2716         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2717
2718         assert(m);
2719         assert(fd >= 0);
2720
2721         if (fstatvfs(fd, &ss) >= 0)
2722                 fs_size = ss.f_frsize * ss.f_blocks;
2723
2724         if (m->max_use == (uint64_t) -1) {
2725
2726                 if (fs_size > 0) {
2727                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2728
2729                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2730                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2731
2732                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2733                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2734                 } else
2735                         m->max_use = DEFAULT_MAX_USE_LOWER;
2736         } else {
2737                 m->max_use = PAGE_ALIGN(m->max_use);
2738
2739                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2740                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2741         }
2742
2743         if (m->max_size == (uint64_t) -1) {
2744                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2745
2746                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2747                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2748         } else
2749                 m->max_size = PAGE_ALIGN(m->max_size);
2750
2751         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2752                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2753
2754         if (m->max_size*2 > m->max_use)
2755                 m->max_use = m->max_size*2;
2756
2757         if (m->min_size == (uint64_t) -1)
2758                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2759         else {
2760                 m->min_size = PAGE_ALIGN(m->min_size);
2761
2762                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2763                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2764
2765                 if (m->min_size > m->max_size)
2766                         m->max_size = m->min_size;
2767         }
2768
2769         if (m->keep_free == (uint64_t) -1) {
2770
2771                 if (fs_size > 0) {
2772                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2773
2774                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2775                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2776
2777                 } else
2778                         m->keep_free = DEFAULT_KEEP_FREE;
2779         }
2780
2781         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2782                   format_bytes(a, sizeof(a), m->max_use),
2783                   format_bytes(b, sizeof(b), m->max_size),
2784                   format_bytes(c, sizeof(c), m->min_size),
2785                   format_bytes(d, sizeof(d), m->keep_free));
2786 }
2787
2788 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2789         assert(f);
2790         assert(from || to);
2791
2792         if (from) {
2793                 if (f->header->head_entry_realtime == 0)
2794                         return -ENOENT;
2795
2796                 *from = le64toh(f->header->head_entry_realtime);
2797         }
2798
2799         if (to) {
2800                 if (f->header->tail_entry_realtime == 0)
2801                         return -ENOENT;
2802
2803                 *to = le64toh(f->header->tail_entry_realtime);
2804         }
2805
2806         return 1;
2807 }
2808
2809 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2810         char t[9+32+1] = "_BOOT_ID=";
2811         Object *o;
2812         uint64_t p;
2813         int r;
2814
2815         assert(f);
2816         assert(from || to);
2817
2818         sd_id128_to_string(boot_id, t + 9);
2819
2820         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2821         if (r <= 0)
2822                 return r;
2823
2824         if (le64toh(o->data.n_entries) <= 0)
2825                 return 0;
2826
2827         if (from) {
2828                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2829                 if (r < 0)
2830                         return r;
2831
2832                 *from = le64toh(o->entry.monotonic);
2833         }
2834
2835         if (to) {
2836                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2837                 if (r < 0)
2838                         return r;
2839
2840                 r = generic_array_get_plus_one(f,
2841                                                le64toh(o->data.entry_offset),
2842                                                le64toh(o->data.entry_array_offset),
2843                                                le64toh(o->data.n_entries)-1,
2844                                                &o, NULL);
2845                 if (r <= 0)
2846                         return r;
2847
2848                 *to = le64toh(o->entry.monotonic);
2849         }
2850
2851         return 1;
2852 }
2853
2854 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2855         assert(f);
2856
2857         /* If we gained new header fields we gained new features,
2858          * hence suggest a rotation */
2859         if (le64toh(f->header->header_size) < sizeof(Header)) {
2860                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2861                 return true;
2862         }
2863
2864         /* Let's check if the hash tables grew over a certain fill
2865          * level (75%, borrowing this value from Java's hash table
2866          * implementation), and if so suggest a rotation. To calculate
2867          * the fill level we need the n_data field, which only exists
2868          * in newer versions. */
2869
2870         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2871                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2872                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2873                                   f->path,
2874                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2875                                   (unsigned long long) le64toh(f->header->n_data),
2876                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2877                                   (unsigned long long) (f->last_stat.st_size),
2878                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2879                         return true;
2880                 }
2881
2882         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2883                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2884                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2885                                   f->path,
2886                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2887                                   (unsigned long long) le64toh(f->header->n_fields),
2888                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2889                         return true;
2890                 }
2891
2892         /* Are the data objects properly indexed by field objects? */
2893         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2894             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2895             le64toh(f->header->n_data) > 0 &&
2896             le64toh(f->header->n_fields) == 0)
2897                 return true;
2898
2899         if (max_file_usec > 0) {
2900                 usec_t t, h;
2901
2902                 h = le64toh(f->header->head_entry_realtime);
2903                 t = now(CLOCK_REALTIME);
2904
2905                 if (h > 0 && t > h + max_file_usec)
2906                         return true;
2907         }
2908
2909         return false;
2910 }