chiark / gitweb /
journal: size journal data hash table based on maximum file size metrics
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
/* Default sizes, in bytes, of the data and field hash tables */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Mapping requests smaller than this are widened to this size */
#define DEFAULT_WINDOW_SIZE (8ULL << 20)                       /* 8 MiB */

/* Data payloads below this size are always stored uncompressed */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (64ULL << 10)                    /* 64 KiB */

/* Lower and upper clamp applied when the max_use value is deduced
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL << 20)                     /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL << 30)                     /* 4 GiB */

/* Upper clamp applied when max_size is deduced from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL << 20)                  /* 128 MiB */

/* Upper clamp applied when the keep_free value is deduced from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL << 30)                   /* 4 GiB */

/* keep_free value used when the file system size cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1ULL << 20)                         /* 1 MiB */

/* Rounds x up to the next multiple of 8 bytes (64 bit) */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)

/* n_data was the first field appended to the header after the initial
 * file format design, hence a valid header reaches at least up to it */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* True if the header of the file is large enough to carry the given field */
#define JOURNAL_HEADER_CONTAINS(h, field) \
        (le64toh((h)->header_size) >= offsetof(Header, field) + sizeof((h)->field))

/* The eight magic bytes stored at the very beginning of the header */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
70
71 void journal_file_close(JournalFile *f) {
72         int t;
73
74         assert(f);
75
76         if (f->header) {
77                 if (f->writable)
78                         f->header->state = STATE_OFFLINE;
79
80                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
81         }
82
83         for (t = 0; t < _WINDOW_MAX; t++)
84                 if (f->windows[t].ptr)
85                         munmap(f->windows[t].ptr, f->windows[t].size);
86
87         if (f->fd >= 0)
88                 close_nointr_nofail(f->fd);
89
90         free(f->path);
91
92 #ifdef HAVE_XZ
93         free(f->compress_buffer);
94 #endif
95
96         free(f);
97 }
98
99 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
100         Header h;
101         ssize_t k;
102         int r;
103
104         assert(f);
105
106         zero(h);
107         memcpy(h.signature, signature, 8);
108         h.header_size = htole64(ALIGN64(sizeof(h)));
109
110         r = sd_id128_randomize(&h.file_id);
111         if (r < 0)
112                 return r;
113
114         if (template) {
115                 h.seqnum_id = template->header->seqnum_id;
116                 h.tail_seqnum = template->header->tail_seqnum;
117         } else
118                 h.seqnum_id = h.file_id;
119
120         k = pwrite(f->fd, &h, sizeof(h), 0);
121         if (k < 0)
122                 return -errno;
123
124         if (k != sizeof(h))
125                 return -EIO;
126
127         return 0;
128 }
129
130 static int journal_file_refresh_header(JournalFile *f) {
131         int r;
132         sd_id128_t boot_id;
133
134         assert(f);
135
136         r = sd_id128_get_machine(&f->header->machine_id);
137         if (r < 0)
138                 return r;
139
140         r = sd_id128_get_boot(&boot_id);
141         if (r < 0)
142                 return r;
143
144         if (sd_id128_equal(boot_id, f->header->boot_id))
145                 f->tail_entry_monotonic_valid = true;
146
147         f->header->boot_id = boot_id;
148
149         f->header->state = STATE_ONLINE;
150
151         __sync_synchronize();
152
153         return 0;
154 }
155
156 static int journal_file_verify_header(JournalFile *f) {
157         assert(f);
158
159         if (memcmp(f->header, signature, 8))
160                 return -EBADMSG;
161
162 #ifdef HAVE_XZ
163         if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
164                 return -EPROTONOSUPPORT;
165 #else
166         if (f->header->incompatible_flags != 0)
167                 return -EPROTONOSUPPORT;
168 #endif
169
170         /* The first addition was n_data, so check that we are at least this large */
171         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
172                 return -EBADMSG;
173
174         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
175                 return -ENODATA;
176
177         if (f->writable) {
178                 uint8_t state;
179                 sd_id128_t machine_id;
180                 int r;
181
182                 r = sd_id128_get_machine(&machine_id);
183                 if (r < 0)
184                         return r;
185
186                 if (!sd_id128_equal(machine_id, f->header->machine_id))
187                         return -EHOSTDOWN;
188
189                 state = f->header->state;
190
191                 if (state == STATE_ONLINE) {
192                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
193                         return -EBUSY;
194                 } else if (state == STATE_ARCHIVED)
195                         return -ESHUTDOWN;
196                 else if (state != STATE_OFFLINE) {
197                         log_debug("Journal file %s has unknown state %u.", f->path, state);
198                         return -EBUSY;
199                 }
200         }
201
202         return 0;
203 }
204
205 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
206         uint64_t old_size, new_size;
207         int r;
208
209         assert(f);
210
211         /* We assume that this file is not sparse, and we know that
212          * for sure, since we always call posix_fallocate()
213          * ourselves */
214
215         old_size =
216                 le64toh(f->header->header_size) +
217                 le64toh(f->header->arena_size);
218
219         new_size = PAGE_ALIGN(offset + size);
220         if (new_size < le64toh(f->header->header_size))
221                 new_size = le64toh(f->header->header_size);
222
223         if (new_size <= old_size)
224                 return 0;
225
226         if (f->metrics.max_size > 0 &&
227             new_size > f->metrics.max_size)
228                 return -E2BIG;
229
230         if (new_size > f->metrics.min_size &&
231             f->metrics.keep_free > 0) {
232                 struct statvfs svfs;
233
234                 if (fstatvfs(f->fd, &svfs) >= 0) {
235                         uint64_t available;
236
237                         available = svfs.f_bfree * svfs.f_bsize;
238
239                         if (available >= f->metrics.keep_free)
240                                 available -= f->metrics.keep_free;
241                         else
242                                 available = 0;
243
244                         if (new_size - old_size > available)
245                                 return -E2BIG;
246                 }
247         }
248
249         /* Note that the glibc fallocate() fallback is very
250            inefficient, hence we try to minimize the allocation area
251            as we can. */
252         r = posix_fallocate(f->fd, old_size, new_size - old_size);
253         if (r != 0)
254                 return -r;
255
256         if (fstat(f->fd, &f->last_stat) < 0)
257                 return -errno;
258
259         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
260
261         return 0;
262 }
263
264 static int journal_file_map(
265                 JournalFile *f,
266                 uint64_t offset,
267                 uint64_t size,
268                 void **_window,
269                 uint64_t *_woffset,
270                 uint64_t *_wsize,
271                 void **ret) {
272
273         uint64_t woffset, wsize;
274         void *window;
275
276         assert(f);
277         assert(size > 0);
278         assert(ret);
279
280         woffset = offset & ~((uint64_t) page_size() - 1ULL);
281         wsize = size + (offset - woffset);
282         wsize = PAGE_ALIGN(wsize);
283
284         /* Avoid SIGBUS on invalid accesses */
285         if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
286                 return -EADDRNOTAVAIL;
287
288         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
289         if (window == MAP_FAILED)
290                 return -errno;
291
292         if (_window)
293                 *_window = window;
294
295         if (_woffset)
296                 *_woffset = woffset;
297
298         if (_wsize)
299                 *_wsize = wsize;
300
301         *ret = (uint8_t*) window + (offset - woffset);
302
303         return 0;
304 }
305
306 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
307         void *p = NULL;
308         uint64_t delta;
309         int r;
310         Window *w;
311
312         assert(f);
313         assert(ret);
314         assert(wt >= 0);
315         assert(wt < _WINDOW_MAX);
316
317         if (offset + size > (uint64_t) f->last_stat.st_size) {
318                 /* Hmm, out of range? Let's refresh the fstat() data
319                  * first, before we trust that check. */
320
321                 if (fstat(f->fd, &f->last_stat) < 0 ||
322                     offset + size > (uint64_t) f->last_stat.st_size)
323                         return -EADDRNOTAVAIL;
324         }
325
326         w = f->windows + wt;
327
328         if (_likely_(w->ptr &&
329                      w->offset <= offset &&
330                      w->offset + w->size >= offset + size)) {
331
332                 *ret = (uint8_t*) w->ptr + (offset - w->offset);
333                 return 0;
334         }
335
336         if (w->ptr) {
337                 if (munmap(w->ptr, w->size) < 0)
338                         return -errno;
339
340                 w->ptr = NULL;
341                 w->size = w->offset = 0;
342         }
343
344         if (size < DEFAULT_WINDOW_SIZE) {
345                 /* If the default window size is larger then what was
346                  * asked for extend the mapping a bit in the hope to
347                  * minimize needed remappings later on. We add half
348                  * the window space before and half behind the
349                  * requested mapping */
350
351                 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
352
353                 if (delta > offset)
354                         delta = offset;
355
356                 offset -= delta;
357                 size = DEFAULT_WINDOW_SIZE;
358         } else
359                 delta = 0;
360
361         if (offset + size > (uint64_t) f->last_stat.st_size)
362                 size = (uint64_t) f->last_stat.st_size - offset;
363
364         if (size <= 0)
365                 return -EADDRNOTAVAIL;
366
367         r = journal_file_map(f,
368                              offset, size,
369                              &w->ptr, &w->offset, &w->size,
370                              &p);
371
372         if (r < 0)
373                 return r;
374
375         *ret = (uint8_t*) p + delta;
376         return 0;
377 }
378
379 static bool verify_hash(Object *o) {
380         uint64_t h1, h2;
381
382         assert(o);
383
384         if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
385                 h1 = le64toh(o->data.hash);
386                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
387         } else if (o->object.type == OBJECT_FIELD) {
388                 h1 = le64toh(o->field.hash);
389                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
390         } else
391                 return true;
392
393         return h1 == h2;
394 }
395
396 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
397         int r;
398         void *t;
399         Object *o;
400         uint64_t s;
401
402         assert(f);
403         assert(ret);
404         assert(type < _OBJECT_TYPE_MAX);
405
406         r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
407         if (r < 0)
408                 return r;
409
410         o = (Object*) t;
411         s = le64toh(o->object.size);
412
413         if (s < sizeof(ObjectHeader))
414                 return -EBADMSG;
415
416         if (type >= 0 && o->object.type != type)
417                 return -EBADMSG;
418
419         if (s > sizeof(ObjectHeader)) {
420                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
421                 if (r < 0)
422                         return r;
423
424                 o = (Object*) t;
425         }
426
427         if (!verify_hash(o))
428                 return -EBADMSG;
429
430         *ret = o;
431         return 0;
432 }
433
434 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
435         uint64_t r;
436
437         assert(f);
438
439         r = le64toh(f->header->tail_seqnum) + 1;
440
441         if (seqnum) {
442                 /* If an external seqnum counter was passed, we update
443                  * both the local and the external one, and set it to
444                  * the maximum of both */
445
446                 if (*seqnum + 1 > r)
447                         r = *seqnum + 1;
448
449                 *seqnum = r;
450         }
451
452         f->header->tail_seqnum = htole64(r);
453
454         if (f->header->head_seqnum == 0)
455                 f->header->head_seqnum = htole64(r);
456
457         return r;
458 }
459
460 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
461         int r;
462         uint64_t p;
463         Object *tail, *o;
464         void *t;
465
466         assert(f);
467         assert(size >= sizeof(ObjectHeader));
468         assert(offset);
469         assert(ret);
470
471         p = le64toh(f->header->tail_object_offset);
472         if (p == 0)
473                 p = le64toh(f->header->header_size);
474         else {
475                 r = journal_file_move_to_object(f, -1, p, &tail);
476                 if (r < 0)
477                         return r;
478
479                 p += ALIGN64(le64toh(tail->object.size));
480         }
481
482         r = journal_file_allocate(f, p, size);
483         if (r < 0)
484                 return r;
485
486         r = journal_file_move_to(f, type, p, size, &t);
487         if (r < 0)
488                 return r;
489
490         o = (Object*) t;
491
492         zero(o->object);
493         o->object.type = type;
494         o->object.size = htole64(size);
495
496         f->header->tail_object_offset = htole64(p);
497         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
498
499         *ret = o;
500         *offset = p;
501
502         return 0;
503 }
504
505 static int journal_file_setup_data_hash_table(JournalFile *f) {
506         uint64_t s, p;
507         Object *o;
508         int r;
509
510         assert(f);
511
512         /* We estimate that we need 1 hash table entry per 2K of
513            journal file and we want to make sure we never get beyond
514            75% fill level. Calculate the hash table size for the
515            maximum file size based on these metrics. */
516
517         s = (f->metrics.max_size * 4 / 2048 / 3) * sizeof(HashItem);
518         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
519                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
520
521         log_info("Reserving %llu entries in hash table.", (unsigned long long) s);
522
523         r = journal_file_append_object(f,
524                                        OBJECT_DATA_HASH_TABLE,
525                                        offsetof(Object, hash_table.items) + s,
526                                        &o, &p);
527         if (r < 0)
528                 return r;
529
530         memset(o->hash_table.items, 0, s);
531
532         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
533         f->header->data_hash_table_size = htole64(s);
534
535         return 0;
536 }
537
538 static int journal_file_setup_field_hash_table(JournalFile *f) {
539         uint64_t s, p;
540         Object *o;
541         int r;
542
543         assert(f);
544
545         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
546         r = journal_file_append_object(f,
547                                        OBJECT_FIELD_HASH_TABLE,
548                                        offsetof(Object, hash_table.items) + s,
549                                        &o, &p);
550         if (r < 0)
551                 return r;
552
553         memset(o->hash_table.items, 0, s);
554
555         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
556         f->header->field_hash_table_size = htole64(s);
557
558         return 0;
559 }
560
561 static int journal_file_map_data_hash_table(JournalFile *f) {
562         uint64_t s, p;
563         void *t;
564         int r;
565
566         assert(f);
567
568         p = le64toh(f->header->data_hash_table_offset);
569         s = le64toh(f->header->data_hash_table_size);
570
571         r = journal_file_move_to(f,
572                                  WINDOW_DATA_HASH_TABLE,
573                                  p, s,
574                                  &t);
575         if (r < 0)
576                 return r;
577
578         f->data_hash_table = t;
579         return 0;
580 }
581
582 static int journal_file_map_field_hash_table(JournalFile *f) {
583         uint64_t s, p;
584         void *t;
585         int r;
586
587         assert(f);
588
589         p = le64toh(f->header->field_hash_table_offset);
590         s = le64toh(f->header->field_hash_table_size);
591
592         r = journal_file_move_to(f,
593                                  WINDOW_FIELD_HASH_TABLE,
594                                  p, s,
595                                  &t);
596         if (r < 0)
597                 return r;
598
599         f->field_hash_table = t;
600         return 0;
601 }
602
603 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
604         uint64_t p, h;
605         int r;
606
607         assert(f);
608         assert(o);
609         assert(offset > 0);
610         assert(o->object.type == OBJECT_DATA);
611
612         /* This might alter the window we are looking at */
613
614         o->data.next_hash_offset = o->data.next_field_offset = 0;
615         o->data.entry_offset = o->data.entry_array_offset = 0;
616         o->data.n_entries = 0;
617
618         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
619         p = le64toh(f->data_hash_table[h].tail_hash_offset);
620         if (p == 0) {
621                 /* Only entry in the hash table is easy */
622                 f->data_hash_table[h].head_hash_offset = htole64(offset);
623         } else {
624                 /* Move back to the previous data object, to patch in
625                  * pointer */
626
627                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
628                 if (r < 0)
629                         return r;
630
631                 o->data.next_hash_offset = htole64(offset);
632         }
633
634         f->data_hash_table[h].tail_hash_offset = htole64(offset);
635
636         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
637                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
638
639         return 0;
640 }
641
642 int journal_file_find_data_object_with_hash(
643                 JournalFile *f,
644                 const void *data, uint64_t size, uint64_t hash,
645                 Object **ret, uint64_t *offset) {
646
647         uint64_t p, osize, h;
648         int r;
649
650         assert(f);
651         assert(data || size == 0);
652
653         osize = offsetof(Object, data.payload) + size;
654
655         if (f->header->data_hash_table_size == 0)
656                 return -EBADMSG;
657
658         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
659         p = le64toh(f->data_hash_table[h].head_hash_offset);
660
661         while (p > 0) {
662                 Object *o;
663
664                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
665                 if (r < 0)
666                         return r;
667
668                 if (le64toh(o->data.hash) != hash)
669                         goto next;
670
671                 if (o->object.flags & OBJECT_COMPRESSED) {
672 #ifdef HAVE_XZ
673                         uint64_t l, rsize;
674
675                         l = le64toh(o->object.size);
676                         if (l <= offsetof(Object, data.payload))
677                                 return -EBADMSG;
678
679                         l -= offsetof(Object, data.payload);
680
681                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
682                                 return -EBADMSG;
683
684                         if (rsize == size &&
685                             memcmp(f->compress_buffer, data, size) == 0) {
686
687                                 if (ret)
688                                         *ret = o;
689
690                                 if (offset)
691                                         *offset = p;
692
693                                 return 1;
694                         }
695 #else
696                         return -EPROTONOSUPPORT;
697 #endif
698
699                 } else if (le64toh(o->object.size) == osize &&
700                            memcmp(o->data.payload, data, size) == 0) {
701
702                         if (ret)
703                                 *ret = o;
704
705                         if (offset)
706                                 *offset = p;
707
708                         return 1;
709                 }
710
711         next:
712                 p = le64toh(o->data.next_hash_offset);
713         }
714
715         return 0;
716 }
717
718 int journal_file_find_data_object(
719                 JournalFile *f,
720                 const void *data, uint64_t size,
721                 Object **ret, uint64_t *offset) {
722
723         uint64_t hash;
724
725         assert(f);
726         assert(data || size == 0);
727
728         hash = hash64(data, size);
729
730         return journal_file_find_data_object_with_hash(f,
731                                                        data, size, hash,
732                                                        ret, offset);
733 }
734
735 static int journal_file_append_data(
736                 JournalFile *f,
737                 const void *data, uint64_t size,
738                 Object **ret, uint64_t *offset) {
739
740         uint64_t hash, p;
741         uint64_t osize;
742         Object *o;
743         int r;
744         bool compressed = false;
745
746         assert(f);
747         assert(data || size == 0);
748
749         hash = hash64(data, size);
750
751         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
752         if (r < 0)
753                 return r;
754         else if (r > 0) {
755
756                 if (ret)
757                         *ret = o;
758
759                 if (offset)
760                         *offset = p;
761
762                 return 0;
763         }
764
765         osize = offsetof(Object, data.payload) + size;
766         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
767         if (r < 0)
768                 return r;
769
770         o->data.hash = htole64(hash);
771
772 #ifdef HAVE_XZ
773         if (f->compress &&
774             size >= COMPRESSION_SIZE_THRESHOLD) {
775                 uint64_t rsize;
776
777                 compressed = compress_blob(data, size, o->data.payload, &rsize);
778
779                 if (compressed) {
780                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
781                         o->object.flags |= OBJECT_COMPRESSED;
782
783                         f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
784
785                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
786                 }
787         }
788 #endif
789
790         if (!compressed)
791                 memcpy(o->data.payload, data, size);
792
793         r = journal_file_link_data(f, o, p, hash);
794         if (r < 0)
795                 return r;
796
797         /* The linking might have altered the window, so let's
798          * refresh our pointer */
799         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
800         if (r < 0)
801                 return r;
802
803         if (ret)
804                 *ret = o;
805
806         if (offset)
807                 *offset = p;
808
809         return 0;
810 }
811
812 uint64_t journal_file_entry_n_items(Object *o) {
813         assert(o);
814         assert(o->object.type == OBJECT_ENTRY);
815
816         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
817 }
818
819 static uint64_t journal_file_entry_array_n_items(Object *o) {
820         assert(o);
821         assert(o->object.type == OBJECT_ENTRY_ARRAY);
822
823         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
824 }
825
826 static int link_entry_into_array(JournalFile *f,
827                                  le64_t *first,
828                                  le64_t *idx,
829                                  uint64_t p) {
830         int r;
831         uint64_t n = 0, ap = 0, q, i, a, hidx;
832         Object *o;
833
834         assert(f);
835         assert(first);
836         assert(idx);
837         assert(p > 0);
838
839         a = le64toh(*first);
840         i = hidx = le64toh(*idx);
841         while (a > 0) {
842
843                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
844                 if (r < 0)
845                         return r;
846
847                 n = journal_file_entry_array_n_items(o);
848                 if (i < n) {
849                         o->entry_array.items[i] = htole64(p);
850                         *idx = htole64(hidx + 1);
851                         return 0;
852                 }
853
854                 i -= n;
855                 ap = a;
856                 a = le64toh(o->entry_array.next_entry_array_offset);
857         }
858
859         if (hidx > n)
860                 n = (hidx+1) * 2;
861         else
862                 n = n * 2;
863
864         if (n < 4)
865                 n = 4;
866
867         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
868                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
869                                        &o, &q);
870         if (r < 0)
871                 return r;
872
873         o->entry_array.items[i] = htole64(p);
874
875         if (ap == 0)
876                 *first = htole64(q);
877         else {
878                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
879                 if (r < 0)
880                         return r;
881
882                 o->entry_array.next_entry_array_offset = htole64(q);
883         }
884
885         *idx = htole64(hidx + 1);
886
887         return 0;
888 }
889
890 static int link_entry_into_array_plus_one(JournalFile *f,
891                                           le64_t *extra,
892                                           le64_t *first,
893                                           le64_t *idx,
894                                           uint64_t p) {
895
896         int r;
897
898         assert(f);
899         assert(extra);
900         assert(first);
901         assert(idx);
902         assert(p > 0);
903
904         if (*idx == 0)
905                 *extra = htole64(p);
906         else {
907                 le64_t i;
908
909                 i = htole64(le64toh(*idx) - 1);
910                 r = link_entry_into_array(f, first, &i, p);
911                 if (r < 0)
912                         return r;
913         }
914
915         *idx = htole64(le64toh(*idx) + 1);
916         return 0;
917 }
918
919 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
920         uint64_t p;
921         int r;
922         assert(f);
923         assert(o);
924         assert(offset > 0);
925
926         p = le64toh(o->entry.items[i].object_offset);
927         if (p == 0)
928                 return -EINVAL;
929
930         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
931         if (r < 0)
932                 return r;
933
934         return link_entry_into_array_plus_one(f,
935                                               &o->data.entry_offset,
936                                               &o->data.entry_array_offset,
937                                               &o->data.n_entries,
938                                               offset);
939 }
940
941 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
942         uint64_t n, i;
943         int r;
944
945         assert(f);
946         assert(o);
947         assert(offset > 0);
948         assert(o->object.type == OBJECT_ENTRY);
949
950         __sync_synchronize();
951
952         /* Link up the entry itself */
953         r = link_entry_into_array(f,
954                                   &f->header->entry_array_offset,
955                                   &f->header->n_entries,
956                                   offset);
957         if (r < 0)
958                 return r;
959
960         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
961
962         if (f->header->head_entry_realtime == 0)
963                 f->header->head_entry_realtime = o->entry.realtime;
964
965         f->header->tail_entry_realtime = o->entry.realtime;
966         f->header->tail_entry_monotonic = o->entry.monotonic;
967
968         f->tail_entry_monotonic_valid = true;
969
970         /* Link up the items */
971         n = journal_file_entry_n_items(o);
972         for (i = 0; i < n; i++) {
973                 r = journal_file_link_entry_item(f, o, offset, i);
974                 if (r < 0)
975                         return r;
976         }
977
978         return 0;
979 }
980
981 static int journal_file_append_entry_internal(
982                 JournalFile *f,
983                 const dual_timestamp *ts,
984                 uint64_t xor_hash,
985                 const EntryItem items[], unsigned n_items,
986                 uint64_t *seqnum,
987                 Object **ret, uint64_t *offset) {
988         uint64_t np;
989         uint64_t osize;
990         Object *o;
991         int r;
992
993         assert(f);
994         assert(items || n_items == 0);
995         assert(ts);
996
997         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
998
999         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1000         if (r < 0)
1001                 return r;
1002
1003         o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
1004         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1005         o->entry.realtime = htole64(ts->realtime);
1006         o->entry.monotonic = htole64(ts->monotonic);
1007         o->entry.xor_hash = htole64(xor_hash);
1008         o->entry.boot_id = f->header->boot_id;
1009
1010         r = journal_file_link_entry(f, o, np);
1011         if (r < 0)
1012                 return r;
1013
1014         if (ret)
1015                 *ret = o;
1016
1017         if (offset)
1018                 *offset = np;
1019
1020         return 0;
1021 }
1022
1023 void journal_file_post_change(JournalFile *f) {
1024         assert(f);
1025
1026         /* inotify() does not receive IN_MODIFY events from file
1027          * accesses done via mmap(). After each access we hence
1028          * trigger IN_MODIFY by truncating the journal file to its
1029          * current size which triggers IN_MODIFY. */
1030
1031         __sync_synchronize();
1032
1033         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1034                 log_error("Failed to to truncate file to its own size: %m");
1035 }
1036
1037 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1038         unsigned i;
1039         EntryItem *items;
1040         int r;
1041         uint64_t xor_hash = 0;
1042         struct dual_timestamp _ts;
1043
1044         assert(f);
1045         assert(iovec || n_iovec == 0);
1046
1047         if (!f->writable)
1048                 return -EPERM;
1049
1050         if (!ts) {
1051                 dual_timestamp_get(&_ts);
1052                 ts = &_ts;
1053         }
1054
1055         if (f->tail_entry_monotonic_valid &&
1056             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1057                 return -EINVAL;
1058
1059         items = alloca(sizeof(EntryItem) * n_iovec);
1060
1061         for (i = 0; i < n_iovec; i++) {
1062                 uint64_t p;
1063                 Object *o;
1064
1065                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1066                 if (r < 0)
1067                         return r;
1068
1069                 xor_hash ^= le64toh(o->data.hash);
1070                 items[i].object_offset = htole64(p);
1071                 items[i].hash = o->data.hash;
1072         }
1073
1074         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1075
1076         journal_file_post_change(f);
1077
1078         return r;
1079 }
1080
1081 static int generic_array_get(JournalFile *f,
1082                              uint64_t first,
1083                              uint64_t i,
1084                              Object **ret, uint64_t *offset) {
1085
1086         Object *o;
1087         uint64_t p = 0, a;
1088         int r;
1089
1090         assert(f);
1091
1092         a = first;
1093         while (a > 0) {
1094                 uint64_t n;
1095
1096                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1097                 if (r < 0)
1098                         return r;
1099
1100                 n = journal_file_entry_array_n_items(o);
1101                 if (i < n) {
1102                         p = le64toh(o->entry_array.items[i]);
1103                         break;
1104                 }
1105
1106                 i -= n;
1107                 a = le64toh(o->entry_array.next_entry_array_offset);
1108         }
1109
1110         if (a <= 0 || p <= 0)
1111                 return 0;
1112
1113         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1114         if (r < 0)
1115                 return r;
1116
1117         if (ret)
1118                 *ret = o;
1119
1120         if (offset)
1121                 *offset = p;
1122
1123         return 1;
1124 }
1125
1126 static int generic_array_get_plus_one(JournalFile *f,
1127                                       uint64_t extra,
1128                                       uint64_t first,
1129                                       uint64_t i,
1130                                       Object **ret, uint64_t *offset) {
1131
1132         Object *o;
1133
1134         assert(f);
1135
1136         if (i == 0) {
1137                 int r;
1138
1139                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1140                 if (r < 0)
1141                         return r;
1142
1143                 if (ret)
1144                         *ret = o;
1145
1146                 if (offset)
1147                         *offset = extra;
1148
1149                 return 1;
1150         }
1151
1152         return generic_array_get(f, first, i-1, ret, offset);
1153 }
1154
/* Verdicts returned by the test_object callbacks of the bisection
 * helpers; negative return values are reserved for errors. */
enum {
        TEST_FOUND = 0, /* the tested entry matches the needle */
        TEST_LEFT  = 1, /* the tested entry is smaller than the needle */
        TEST_RIGHT = 2  /* the tested entry is larger than the needle */
};
1160
1161 static int generic_array_bisect(JournalFile *f,
1162                                 uint64_t first,
1163                                 uint64_t n,
1164                                 uint64_t needle,
1165                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1166                                 direction_t direction,
1167                                 Object **ret,
1168                                 uint64_t *offset,
1169                                 uint64_t *idx) {
1170
1171         uint64_t a, p, t = 0, i = 0, last_p = 0;
1172         bool subtract_one = false;
1173         Object *o, *array = NULL;
1174         int r;
1175
1176         assert(f);
1177         assert(test_object);
1178
1179         a = first;
1180         while (a > 0) {
1181                 uint64_t left, right, k, lp;
1182
1183                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1184                 if (r < 0)
1185                         return r;
1186
1187                 k = journal_file_entry_array_n_items(array);
1188                 right = MIN(k, n);
1189                 if (right <= 0)
1190                         return 0;
1191
1192                 i = right - 1;
1193                 lp = p = le64toh(array->entry_array.items[i]);
1194                 if (p <= 0)
1195                         return -EBADMSG;
1196
1197                 r = test_object(f, p, needle);
1198                 if (r < 0)
1199                         return r;
1200
1201                 if (r == TEST_FOUND)
1202                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1203
1204                 if (r == TEST_RIGHT) {
1205                         left = 0;
1206                         right -= 1;
1207                         for (;;) {
1208                                 if (left == right) {
1209                                         if (direction == DIRECTION_UP)
1210                                                 subtract_one = true;
1211
1212                                         i = left;
1213                                         goto found;
1214                                 }
1215
1216                                 assert(left < right);
1217
1218                                 i = (left + right) / 2;
1219                                 p = le64toh(array->entry_array.items[i]);
1220                                 if (p <= 0)
1221                                         return -EBADMSG;
1222
1223                                 r = test_object(f, p, needle);
1224                                 if (r < 0)
1225                                         return r;
1226
1227                                 if (r == TEST_FOUND)
1228                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1229
1230                                 if (r == TEST_RIGHT)
1231                                         right = i;
1232                                 else
1233                                         left = i + 1;
1234                         }
1235                 }
1236
1237                 if (k > n) {
1238                         if (direction == DIRECTION_UP) {
1239                                 i = n;
1240                                 subtract_one = true;
1241                                 goto found;
1242                         }
1243
1244                         return 0;
1245                 }
1246
1247                 last_p = lp;
1248
1249                 n -= k;
1250                 t += k;
1251                 a = le64toh(array->entry_array.next_entry_array_offset);
1252         }
1253
1254         return 0;
1255
1256 found:
1257         if (subtract_one && t == 0 && i == 0)
1258                 return 0;
1259
1260         if (subtract_one && i == 0)
1261                 p = last_p;
1262         else if (subtract_one)
1263                 p = le64toh(array->entry_array.items[i-1]);
1264         else
1265                 p = le64toh(array->entry_array.items[i]);
1266
1267         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1268         if (r < 0)
1269                 return r;
1270
1271         if (ret)
1272                 *ret = o;
1273
1274         if (offset)
1275                 *offset = p;
1276
1277         if (idx)
1278                 *idx = t + i + (subtract_one ? -1 : 0);
1279
1280         return 1;
1281 }
1282
1283 static int generic_array_bisect_plus_one(JournalFile *f,
1284                                          uint64_t extra,
1285                                          uint64_t first,
1286                                          uint64_t n,
1287                                          uint64_t needle,
1288                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1289                                          direction_t direction,
1290                                          Object **ret,
1291                                          uint64_t *offset,
1292                                          uint64_t *idx) {
1293
1294         int r;
1295         bool step_back = false;
1296         Object *o;
1297
1298         assert(f);
1299         assert(test_object);
1300
1301         if (n <= 0)
1302                 return 0;
1303
1304         /* This bisects the array in object 'first', but first checks
1305          * an extra  */
1306         r = test_object(f, extra, needle);
1307         if (r < 0)
1308                 return r;
1309
1310         if (r == TEST_FOUND)
1311                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1312
1313         /* if we are looking with DIRECTION_UP then we need to first
1314            see if in the actual array there is a matching entry, and
1315            return the last one of that. But if there isn't any we need
1316            to return this one. Hence remember this, and return it
1317            below. */
1318         if (r == TEST_LEFT)
1319                 step_back = direction == DIRECTION_UP;
1320
1321         if (r == TEST_RIGHT) {
1322                 if (direction == DIRECTION_DOWN)
1323                         goto found;
1324                 else
1325                         return 0;
1326         }
1327
1328         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1329
1330         if (r == 0 && step_back)
1331                 goto found;
1332
1333         if (r > 0 && idx)
1334                 (*idx) ++;
1335
1336         return r;
1337
1338 found:
1339         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1340         if (r < 0)
1341                 return r;
1342
1343         if (ret)
1344                 *ret = o;
1345
1346         if (offset)
1347                 *offset = extra;
1348
1349         if (idx)
1350                 *idx = 0;
1351
1352         return 1;
1353 }
1354
1355 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1356         assert(f);
1357         assert(p > 0);
1358
1359         if (p == needle)
1360                 return TEST_FOUND;
1361         else if (p < needle)
1362                 return TEST_LEFT;
1363         else
1364                 return TEST_RIGHT;
1365 }
1366
1367 int journal_file_move_to_entry_by_offset(
1368                 JournalFile *f,
1369                 uint64_t p,
1370                 direction_t direction,
1371                 Object **ret,
1372                 uint64_t *offset) {
1373
1374         return generic_array_bisect(f,
1375                                     le64toh(f->header->entry_array_offset),
1376                                     le64toh(f->header->n_entries),
1377                                     p,
1378                                     test_object_offset,
1379                                     direction,
1380                                     ret, offset, NULL);
1381 }
1382
1383
1384 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1385         Object *o;
1386         int r;
1387
1388         assert(f);
1389         assert(p > 0);
1390
1391         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1392         if (r < 0)
1393                 return r;
1394
1395         if (le64toh(o->entry.seqnum) == needle)
1396                 return TEST_FOUND;
1397         else if (le64toh(o->entry.seqnum) < needle)
1398                 return TEST_LEFT;
1399         else
1400                 return TEST_RIGHT;
1401 }
1402
1403 int journal_file_move_to_entry_by_seqnum(
1404                 JournalFile *f,
1405                 uint64_t seqnum,
1406                 direction_t direction,
1407                 Object **ret,
1408                 uint64_t *offset) {
1409
1410         return generic_array_bisect(f,
1411                                     le64toh(f->header->entry_array_offset),
1412                                     le64toh(f->header->n_entries),
1413                                     seqnum,
1414                                     test_object_seqnum,
1415                                     direction,
1416                                     ret, offset, NULL);
1417 }
1418
1419 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1420         Object *o;
1421         int r;
1422
1423         assert(f);
1424         assert(p > 0);
1425
1426         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1427         if (r < 0)
1428                 return r;
1429
1430         if (le64toh(o->entry.realtime) == needle)
1431                 return TEST_FOUND;
1432         else if (le64toh(o->entry.realtime) < needle)
1433                 return TEST_LEFT;
1434         else
1435                 return TEST_RIGHT;
1436 }
1437
1438 int journal_file_move_to_entry_by_realtime(
1439                 JournalFile *f,
1440                 uint64_t realtime,
1441                 direction_t direction,
1442                 Object **ret,
1443                 uint64_t *offset) {
1444
1445         return generic_array_bisect(f,
1446                                     le64toh(f->header->entry_array_offset),
1447                                     le64toh(f->header->n_entries),
1448                                     realtime,
1449                                     test_object_realtime,
1450                                     direction,
1451                                     ret, offset, NULL);
1452 }
1453
1454 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1455         Object *o;
1456         int r;
1457
1458         assert(f);
1459         assert(p > 0);
1460
1461         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1462         if (r < 0)
1463                 return r;
1464
1465         if (le64toh(o->entry.monotonic) == needle)
1466                 return TEST_FOUND;
1467         else if (le64toh(o->entry.monotonic) < needle)
1468                 return TEST_LEFT;
1469         else
1470                 return TEST_RIGHT;
1471 }
1472
1473 int journal_file_move_to_entry_by_monotonic(
1474                 JournalFile *f,
1475                 sd_id128_t boot_id,
1476                 uint64_t monotonic,
1477                 direction_t direction,
1478                 Object **ret,
1479                 uint64_t *offset) {
1480
1481         char t[9+32+1] = "_BOOT_ID=";
1482         Object *o;
1483         int r;
1484
1485         assert(f);
1486
1487         sd_id128_to_string(boot_id, t + 9);
1488         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1489         if (r < 0)
1490                 return r;
1491         if (r == 0)
1492                 return -ENOENT;
1493
1494         return generic_array_bisect_plus_one(f,
1495                                              le64toh(o->data.entry_offset),
1496                                              le64toh(o->data.entry_array_offset),
1497                                              le64toh(o->data.n_entries),
1498                                              monotonic,
1499                                              test_object_monotonic,
1500                                              direction,
1501                                              ret, offset, NULL);
1502 }
1503
1504 int journal_file_next_entry(
1505                 JournalFile *f,
1506                 Object *o, uint64_t p,
1507                 direction_t direction,
1508                 Object **ret, uint64_t *offset) {
1509
1510         uint64_t i, n;
1511         int r;
1512
1513         assert(f);
1514         assert(p > 0 || !o);
1515
1516         n = le64toh(f->header->n_entries);
1517         if (n <= 0)
1518                 return 0;
1519
1520         if (!o)
1521                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1522         else {
1523                 if (o->object.type != OBJECT_ENTRY)
1524                         return -EINVAL;
1525
1526                 r = generic_array_bisect(f,
1527                                          le64toh(f->header->entry_array_offset),
1528                                          le64toh(f->header->n_entries),
1529                                          p,
1530                                          test_object_offset,
1531                                          DIRECTION_DOWN,
1532                                          NULL, NULL,
1533                                          &i);
1534                 if (r <= 0)
1535                         return r;
1536
1537                 if (direction == DIRECTION_DOWN) {
1538                         if (i >= n - 1)
1539                                 return 0;
1540
1541                         i++;
1542                 } else {
1543                         if (i <= 0)
1544                                 return 0;
1545
1546                         i--;
1547                 }
1548         }
1549
1550         /* And jump to it */
1551         return generic_array_get(f,
1552                                  le64toh(f->header->entry_array_offset),
1553                                  i,
1554                                  ret, offset);
1555 }
1556
1557 int journal_file_skip_entry(
1558                 JournalFile *f,
1559                 Object *o, uint64_t p,
1560                 int64_t skip,
1561                 Object **ret, uint64_t *offset) {
1562
1563         uint64_t i, n;
1564         int r;
1565
1566         assert(f);
1567         assert(o);
1568         assert(p > 0);
1569
1570         if (o->object.type != OBJECT_ENTRY)
1571                 return -EINVAL;
1572
1573         r = generic_array_bisect(f,
1574                                  le64toh(f->header->entry_array_offset),
1575                                  le64toh(f->header->n_entries),
1576                                  p,
1577                                  test_object_offset,
1578                                  DIRECTION_DOWN,
1579                                  NULL, NULL,
1580                                  &i);
1581         if (r <= 0)
1582                 return r;
1583
1584         /* Calculate new index */
1585         if (skip < 0) {
1586                 if ((uint64_t) -skip >= i)
1587                         i = 0;
1588                 else
1589                         i = i - (uint64_t) -skip;
1590         } else
1591                 i  += (uint64_t) skip;
1592
1593         n = le64toh(f->header->n_entries);
1594         if (n <= 0)
1595                 return -EBADMSG;
1596
1597         if (i >= n)
1598                 i = n-1;
1599
1600         return generic_array_get(f,
1601                                  le64toh(f->header->entry_array_offset),
1602                                  i,
1603                                  ret, offset);
1604 }
1605
1606 int journal_file_next_entry_for_data(
1607                 JournalFile *f,
1608                 Object *o, uint64_t p,
1609                 uint64_t data_offset,
1610                 direction_t direction,
1611                 Object **ret, uint64_t *offset) {
1612
1613         uint64_t n, i;
1614         int r;
1615         Object *d;
1616
1617         assert(f);
1618         assert(p > 0 || !o);
1619
1620         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1621         if (r < 0)
1622                 return r;
1623
1624         n = le64toh(d->data.n_entries);
1625         if (n <= 0)
1626                 return n;
1627
1628         if (!o)
1629                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1630         else {
1631                 if (o->object.type != OBJECT_ENTRY)
1632                         return -EINVAL;
1633
1634                 r = generic_array_bisect_plus_one(f,
1635                                                   le64toh(d->data.entry_offset),
1636                                                   le64toh(d->data.entry_array_offset),
1637                                                   le64toh(d->data.n_entries),
1638                                                   p,
1639                                                   test_object_offset,
1640                                                   DIRECTION_DOWN,
1641                                                   NULL, NULL,
1642                                                   &i);
1643
1644                 if (r <= 0)
1645                         return r;
1646
1647                 if (direction == DIRECTION_DOWN) {
1648                         if (i >= n - 1)
1649                                 return 0;
1650
1651                         i++;
1652                 } else {
1653                         if (i <= 0)
1654                                 return 0;
1655
1656                         i--;
1657                 }
1658
1659         }
1660
1661         return generic_array_get_plus_one(f,
1662                                           le64toh(d->data.entry_offset),
1663                                           le64toh(d->data.entry_array_offset),
1664                                           i,
1665                                           ret, offset);
1666 }
1667
1668 int journal_file_move_to_entry_by_offset_for_data(
1669                 JournalFile *f,
1670                 uint64_t data_offset,
1671                 uint64_t p,
1672                 direction_t direction,
1673                 Object **ret, uint64_t *offset) {
1674
1675         int r;
1676         Object *d;
1677
1678         assert(f);
1679
1680         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1681         if (r < 0)
1682                 return r;
1683
1684         return generic_array_bisect_plus_one(f,
1685                                              le64toh(d->data.entry_offset),
1686                                              le64toh(d->data.entry_array_offset),
1687                                              le64toh(d->data.n_entries),
1688                                              p,
1689                                              test_object_offset,
1690                                              direction,
1691                                              ret, offset, NULL);
1692 }
1693
1694 int journal_file_move_to_entry_by_monotonic_for_data(
1695                 JournalFile *f,
1696                 uint64_t data_offset,
1697                 sd_id128_t boot_id,
1698                 uint64_t monotonic,
1699                 direction_t direction,
1700                 Object **ret, uint64_t *offset) {
1701
1702         char t[9+32+1] = "_BOOT_ID=";
1703         Object *o, *d;
1704         int r;
1705         uint64_t b, z;
1706
1707         assert(f);
1708
1709         /* First, seek by time */
1710         sd_id128_to_string(boot_id, t + 9);
1711         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1712         if (r < 0)
1713                 return r;
1714         if (r == 0)
1715                 return -ENOENT;
1716
1717         r = generic_array_bisect_plus_one(f,
1718                                           le64toh(o->data.entry_offset),
1719                                           le64toh(o->data.entry_array_offset),
1720                                           le64toh(o->data.n_entries),
1721                                           monotonic,
1722                                           test_object_monotonic,
1723                                           direction,
1724                                           NULL, &z, NULL);
1725         if (r <= 0)
1726                 return r;
1727
1728         /* And now, continue seeking until we find an entry that
1729          * exists in both bisection arrays */
1730
1731         for (;;) {
1732                 Object *qo;
1733                 uint64_t p, q;
1734
1735                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1736                 if (r < 0)
1737                         return r;
1738
1739                 r = generic_array_bisect_plus_one(f,
1740                                                   le64toh(d->data.entry_offset),
1741                                                   le64toh(d->data.entry_array_offset),
1742                                                   le64toh(d->data.n_entries),
1743                                                   z,
1744                                                   test_object_offset,
1745                                                   direction,
1746                                                   NULL, &p, NULL);
1747                 if (r <= 0)
1748                         return r;
1749
1750                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1751                 if (r < 0)
1752                         return r;
1753
1754                 r = generic_array_bisect_plus_one(f,
1755                                                   le64toh(o->data.entry_offset),
1756                                                   le64toh(o->data.entry_array_offset),
1757                                                   le64toh(o->data.n_entries),
1758                                                   p,
1759                                                   test_object_offset,
1760                                                   direction,
1761                                                   &qo, &q, NULL);
1762
1763                 if (r <= 0)
1764                         return r;
1765
1766                 if (p == q) {
1767                         if (ret)
1768                                 *ret = qo;
1769                         if (offset)
1770                                 *offset = q;
1771
1772                         return 1;
1773                 }
1774
1775                 z = q;
1776         }
1777
1778         return 0;
1779 }
1780
1781 int journal_file_move_to_entry_by_seqnum_for_data(
1782                 JournalFile *f,
1783                 uint64_t data_offset,
1784                 uint64_t seqnum,
1785                 direction_t direction,
1786                 Object **ret, uint64_t *offset) {
1787
1788         Object *d;
1789         int r;
1790
1791         assert(f);
1792
1793         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1794         if (r < 0)
1795                 return r;
1796
1797         return generic_array_bisect_plus_one(f,
1798                                              le64toh(d->data.entry_offset),
1799                                              le64toh(d->data.entry_array_offset),
1800                                              le64toh(d->data.n_entries),
1801                                              seqnum,
1802                                              test_object_seqnum,
1803                                              direction,
1804                                              ret, offset, NULL);
1805 }
1806
1807 int journal_file_move_to_entry_by_realtime_for_data(
1808                 JournalFile *f,
1809                 uint64_t data_offset,
1810                 uint64_t realtime,
1811                 direction_t direction,
1812                 Object **ret, uint64_t *offset) {
1813
1814         Object *d;
1815         int r;
1816
1817         assert(f);
1818
1819         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1820         if (r < 0)
1821                 return r;
1822
1823         return generic_array_bisect_plus_one(f,
1824                                              le64toh(d->data.entry_offset),
1825                                              le64toh(d->data.entry_array_offset),
1826                                              le64toh(d->data.n_entries),
1827                                              realtime,
1828                                              test_object_realtime,
1829                                              direction,
1830                                              ret, offset, NULL);
1831 }
1832
1833 void journal_file_dump(JournalFile *f) {
1834         Object *o;
1835         int r;
1836         uint64_t p;
1837
1838         assert(f);
1839
1840         journal_file_print_header(f);
1841
1842         p = le64toh(f->header->header_size);
1843         while (p != 0) {
1844                 r = journal_file_move_to_object(f, -1, p, &o);
1845                 if (r < 0)
1846                         goto fail;
1847
1848                 switch (o->object.type) {
1849
1850                 case OBJECT_UNUSED:
1851                         printf("Type: OBJECT_UNUSED\n");
1852                         break;
1853
1854                 case OBJECT_DATA:
1855                         printf("Type: OBJECT_DATA\n");
1856                         break;
1857
1858                 case OBJECT_ENTRY:
1859                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1860                                (unsigned long long) le64toh(o->entry.seqnum),
1861                                (unsigned long long) le64toh(o->entry.monotonic),
1862                                (unsigned long long) le64toh(o->entry.realtime));
1863                         break;
1864
1865                 case OBJECT_FIELD_HASH_TABLE:
1866                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1867                         break;
1868
1869                 case OBJECT_DATA_HASH_TABLE:
1870                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1871                         break;
1872
1873                 case OBJECT_ENTRY_ARRAY:
1874                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1875                         break;
1876
1877                 case OBJECT_SIGNATURE:
1878                         printf("Type: OBJECT_SIGNATURE\n");
1879                         break;
1880                 }
1881
1882                 if (o->object.flags & OBJECT_COMPRESSED)
1883                         printf("Flags: COMPRESSED\n");
1884
1885                 if (p == le64toh(f->header->tail_object_offset))
1886                         p = 0;
1887                 else
1888                         p = p + ALIGN64(le64toh(o->object.size));
1889         }
1890
1891         return;
1892 fail:
1893         log_error("File corrupt");
1894 }
1895
1896 void journal_file_print_header(JournalFile *f) {
1897         char a[33], b[33], c[33];
1898         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
1899
1900         assert(f);
1901
1902         printf("File Path: %s\n"
1903                "File ID: %s\n"
1904                "Machine ID: %s\n"
1905                "Boot ID: %s\n"
1906                "Sequential Number ID: %s\n"
1907                "Header size: %llu\n"
1908                "Arena size: %llu\n"
1909                "Data Hash Table Size: %llu\n"
1910                "Field Hash Table Size: %llu\n"
1911                "Objects: %llu\n"
1912                "Entry Objects: %llu\n"
1913                "Rotate Suggested: %s\n"
1914                "Head Sequential Number: %llu\n"
1915                "Tail Sequential Number: %llu\n"
1916                "Head Realtime Timestamp: %s\n"
1917                "Tail Realtime Timestamp: %s\n",
1918                f->path,
1919                sd_id128_to_string(f->header->file_id, a),
1920                sd_id128_to_string(f->header->machine_id, b),
1921                sd_id128_to_string(f->header->boot_id, c),
1922                sd_id128_to_string(f->header->seqnum_id, c),
1923                (unsigned long long) le64toh(f->header->header_size),
1924                (unsigned long long) le64toh(f->header->arena_size),
1925                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1926                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
1927                (unsigned long long) le64toh(f->header->n_objects),
1928                (unsigned long long) le64toh(f->header->n_entries),
1929                yes_no(journal_file_rotate_suggested(f)),
1930                (unsigned long long) le64toh(f->header->head_seqnum),
1931                (unsigned long long) le64toh(f->header->tail_seqnum),
1932                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
1933                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)));
1934
1935         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1936                 printf("Data Objects: %llu\n"
1937                        "Data Hash Table Fill: %.1f%%\n",
1938                        (unsigned long long) le64toh(f->header->n_data),
1939                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
1940
1941         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1942                 printf("Field Objects: %llu\n"
1943                        "Field Hash Table Fill: %.1f%%\n",
1944                        (unsigned long long) le64toh(f->header->n_fields),
1945                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
1946 }
1947
1948 int journal_file_open(
1949                 const char *fname,
1950                 int flags,
1951                 mode_t mode,
1952                 JournalMetrics *metrics,
1953                 JournalFile *template,
1954                 JournalFile **ret) {
1955
1956         JournalFile *f;
1957         int r;
1958         bool newly_created = false;
1959
1960         assert(fname);
1961
1962         if ((flags & O_ACCMODE) != O_RDONLY &&
1963             (flags & O_ACCMODE) != O_RDWR)
1964                 return -EINVAL;
1965
1966         if (!endswith(fname, ".journal"))
1967                 return -EINVAL;
1968
1969         f = new0(JournalFile, 1);
1970         if (!f)
1971                 return -ENOMEM;
1972
1973         f->fd = -1;
1974         f->flags = flags;
1975         f->mode = mode;
1976         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1977         f->prot = prot_from_flags(flags);
1978
1979         if (template)
1980                 f->compress = template->compress;
1981
1982         f->path = strdup(fname);
1983         if (!f->path) {
1984                 r = -ENOMEM;
1985                 goto fail;
1986         }
1987
1988         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1989         if (f->fd < 0) {
1990                 r = -errno;
1991                 goto fail;
1992         }
1993
1994         if (fstat(f->fd, &f->last_stat) < 0) {
1995                 r = -errno;
1996                 goto fail;
1997         }
1998
1999         if (f->last_stat.st_size == 0 && f->writable) {
2000                 newly_created = true;
2001
2002                 r = journal_file_init_header(f, template);
2003                 if (r < 0)
2004                         goto fail;
2005
2006                 if (fstat(f->fd, &f->last_stat) < 0) {
2007                         r = -errno;
2008                         goto fail;
2009                 }
2010         }
2011
2012         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2013                 r = -EIO;
2014                 goto fail;
2015         }
2016
2017         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2018         if (f->header == MAP_FAILED) {
2019                 f->header = NULL;
2020                 r = -errno;
2021                 goto fail;
2022         }
2023
2024         if (!newly_created) {
2025                 r = journal_file_verify_header(f);
2026                 if (r < 0)
2027                         goto fail;
2028         }
2029
2030         if (f->writable) {
2031                 if (metrics) {
2032                         journal_default_metrics(metrics, f->fd);
2033                         f->metrics = *metrics;
2034                 } else if (template)
2035                         f->metrics = template->metrics;
2036
2037                 r = journal_file_refresh_header(f);
2038                 if (r < 0)
2039                         goto fail;
2040         }
2041
2042         if (newly_created) {
2043
2044                 r = journal_file_setup_field_hash_table(f);
2045                 if (r < 0)
2046                         goto fail;
2047
2048                 r = journal_file_setup_data_hash_table(f);
2049                 if (r < 0)
2050                         goto fail;
2051         }
2052
2053         r = journal_file_map_field_hash_table(f);
2054         if (r < 0)
2055                 goto fail;
2056
2057         r = journal_file_map_data_hash_table(f);
2058         if (r < 0)
2059                 goto fail;
2060
2061         if (ret)
2062                 *ret = f;
2063
2064         return 0;
2065
2066 fail:
2067         journal_file_close(f);
2068
2069         return r;
2070 }
2071
2072 int journal_file_rotate(JournalFile **f) {
2073         char *p;
2074         size_t l;
2075         JournalFile *old_file, *new_file = NULL;
2076         int r;
2077
2078         assert(f);
2079         assert(*f);
2080
2081         old_file = *f;
2082
2083         if (!old_file->writable)
2084                 return -EINVAL;
2085
2086         if (!endswith(old_file->path, ".journal"))
2087                 return -EINVAL;
2088
2089         l = strlen(old_file->path);
2090
2091         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2092         if (!p)
2093                 return -ENOMEM;
2094
2095         memcpy(p, old_file->path, l - 8);
2096         p[l-8] = '@';
2097         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2098         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2099                  "-%016llx-%016llx.journal",
2100                  (unsigned long long) le64toh((*f)->header->tail_seqnum),
2101                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2102
2103         r = rename(old_file->path, p);
2104         free(p);
2105
2106         if (r < 0)
2107                 return -errno;
2108
2109         old_file->header->state = STATE_ARCHIVED;
2110
2111         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, NULL, old_file, &new_file);
2112         journal_file_close(old_file);
2113
2114         *f = new_file;
2115         return r;
2116 }
2117
2118 int journal_file_open_reliably(
2119                 const char *fname,
2120                 int flags,
2121                 mode_t mode,
2122                 JournalMetrics *metrics,
2123                 JournalFile *template,
2124                 JournalFile **ret) {
2125
2126         int r;
2127         size_t l;
2128         char *p;
2129
2130         r = journal_file_open(fname, flags, mode, metrics, template, ret);
2131         if (r != -EBADMSG && /* corrupted */
2132             r != -ENODATA && /* truncated */
2133             r != -EHOSTDOWN && /* other machine */
2134             r != -EPROTONOSUPPORT) /* incompatible feature */
2135                 return r;
2136
2137         if ((flags & O_ACCMODE) == O_RDONLY)
2138                 return r;
2139
2140         if (!(flags & O_CREAT))
2141                 return r;
2142
2143         /* The file is corrupted. Rotate it away and try it again (but only once) */
2144
2145         l = strlen(fname);
2146         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2147                      (int) (l-8), fname,
2148                      (unsigned long long) now(CLOCK_REALTIME),
2149                      random_ull()) < 0)
2150                 return -ENOMEM;
2151
2152         r = rename(fname, p);
2153         free(p);
2154         if (r < 0)
2155                 return -errno;
2156
2157         log_warning("File %s corrupted, renaming and replacing.", fname);
2158
2159         return journal_file_open(fname, flags, mode, metrics, template, ret);
2160 }
2161
2162 struct vacuum_info {
2163         off_t usage;
2164         char *filename;
2165
2166         uint64_t realtime;
2167         sd_id128_t seqnum_id;
2168         uint64_t seqnum;
2169
2170         bool have_seqnum;
2171 };
2172
2173 static int vacuum_compare(const void *_a, const void *_b) {
2174         const struct vacuum_info *a, *b;
2175
2176         a = _a;
2177         b = _b;
2178
2179         if (a->have_seqnum && b->have_seqnum &&
2180             sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
2181                 if (a->seqnum < b->seqnum)
2182                         return -1;
2183                 else if (a->seqnum > b->seqnum)
2184                         return 1;
2185                 else
2186                         return 0;
2187         }
2188
2189         if (a->realtime < b->realtime)
2190                 return -1;
2191         else if (a->realtime > b->realtime)
2192                 return 1;
2193         else if (a->have_seqnum && b->have_seqnum)
2194                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
2195         else
2196                 return strcmp(a->filename, b->filename);
2197 }
2198
2199 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
2200         DIR *d;
2201         int r = 0;
2202         struct vacuum_info *list = NULL;
2203         unsigned n_list = 0, n_allocated = 0, i;
2204         uint64_t sum = 0;
2205
2206         assert(directory);
2207
2208         if (max_use <= 0)
2209                 return 0;
2210
2211         d = opendir(directory);
2212         if (!d)
2213                 return -errno;
2214
2215         for (;;) {
2216                 int k;
2217                 struct dirent buf, *de;
2218                 size_t q;
2219                 struct stat st;
2220                 char *p;
2221                 unsigned long long seqnum = 0, realtime;
2222                 sd_id128_t seqnum_id;
2223                 bool have_seqnum;
2224
2225                 k = readdir_r(d, &buf, &de);
2226                 if (k != 0) {
2227                         r = -k;
2228                         goto finish;
2229                 }
2230
2231                 if (!de)
2232                         break;
2233
2234                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2235                         continue;
2236
2237                 if (!S_ISREG(st.st_mode))
2238                         continue;
2239
2240                 q = strlen(de->d_name);
2241
2242                 if (endswith(de->d_name, ".journal")) {
2243
2244                         /* Vacuum archived files */
2245
2246                         if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2247                                 continue;
2248
2249                         if (de->d_name[q-8-16-1] != '-' ||
2250                             de->d_name[q-8-16-1-16-1] != '-' ||
2251                             de->d_name[q-8-16-1-16-1-32-1] != '@')
2252                                 continue;
2253
2254                         p = strdup(de->d_name);
2255                         if (!p) {
2256                                 r = -ENOMEM;
2257                                 goto finish;
2258                         }
2259
2260                         de->d_name[q-8-16-1-16-1] = 0;
2261                         if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2262                                 free(p);
2263                                 continue;
2264                         }
2265
2266                         if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2267                                 free(p);
2268                                 continue;
2269                         }
2270
2271                         have_seqnum = true;
2272
2273                 } else if (endswith(de->d_name, ".journal~")) {
2274                         unsigned long long tmp;
2275
2276                         /* Vacuum corrupted files */
2277
2278                         if (q < 1 + 16 + 1 + 16 + 8 + 1)
2279                                 continue;
2280
2281                         if (de->d_name[q-1-8-16-1] != '-' ||
2282                             de->d_name[q-1-8-16-1-16-1] != '@')
2283                                 continue;
2284
2285                         p = strdup(de->d_name);
2286                         if (!p) {
2287                                 r = -ENOMEM;
2288                                 goto finish;
2289                         }
2290
2291                         if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2292                                 free(p);
2293                                 continue;
2294                         }
2295
2296                         have_seqnum = false;
2297                 } else
2298                         continue;
2299
2300                 if (n_list >= n_allocated) {
2301                         struct vacuum_info *j;
2302
2303                         n_allocated = MAX(n_allocated * 2U, 8U);
2304                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2305                         if (!j) {
2306                                 free(p);
2307                                 r = -ENOMEM;
2308                                 goto finish;
2309                         }
2310
2311                         list = j;
2312                 }
2313
2314                 list[n_list].filename = p;
2315                 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2316                 list[n_list].seqnum = seqnum;
2317                 list[n_list].realtime = realtime;
2318                 list[n_list].seqnum_id = seqnum_id;
2319                 list[n_list].have_seqnum = have_seqnum;
2320
2321                 sum += list[n_list].usage;
2322
2323                 n_list ++;
2324         }
2325
2326         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2327
2328         for(i = 0; i < n_list; i++) {
2329                 struct statvfs ss;
2330
2331                 if (fstatvfs(dirfd(d), &ss) < 0) {
2332                         r = -errno;
2333                         goto finish;
2334                 }
2335
2336                 if (sum <= max_use &&
2337                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2338                         break;
2339
2340                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2341                         log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2342                         sum -= list[i].usage;
2343                 } else if (errno != ENOENT)
2344                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2345         }
2346
2347 finish:
2348         for (i = 0; i < n_list; i++)
2349                 free(list[i].filename);
2350
2351         free(list);
2352
2353         if (d)
2354                 closedir(d);
2355
2356         return r;
2357 }
2358
2359 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2360         uint64_t i, n;
2361         uint64_t q, xor_hash = 0;
2362         int r;
2363         EntryItem *items;
2364         dual_timestamp ts;
2365
2366         assert(from);
2367         assert(to);
2368         assert(o);
2369         assert(p);
2370
2371         if (!to->writable)
2372                 return -EPERM;
2373
2374         ts.monotonic = le64toh(o->entry.monotonic);
2375         ts.realtime = le64toh(o->entry.realtime);
2376
2377         if (to->tail_entry_monotonic_valid &&
2378             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2379                 return -EINVAL;
2380
2381         n = journal_file_entry_n_items(o);
2382         items = alloca(sizeof(EntryItem) * n);
2383
2384         for (i = 0; i < n; i++) {
2385                 uint64_t l, h;
2386                 le64_t le_hash;
2387                 size_t t;
2388                 void *data;
2389                 Object *u;
2390
2391                 q = le64toh(o->entry.items[i].object_offset);
2392                 le_hash = o->entry.items[i].hash;
2393
2394                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2395                 if (r < 0)
2396                         return r;
2397
2398                 if (le_hash != o->data.hash)
2399                         return -EBADMSG;
2400
2401                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2402                 t = (size_t) l;
2403
2404                 /* We hit the limit on 32bit machines */
2405                 if ((uint64_t) t != l)
2406                         return -E2BIG;
2407
2408                 if (o->object.flags & OBJECT_COMPRESSED) {
2409 #ifdef HAVE_XZ
2410                         uint64_t rsize;
2411
2412                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2413                                 return -EBADMSG;
2414
2415                         data = from->compress_buffer;
2416                         l = rsize;
2417 #else
2418                         return -EPROTONOSUPPORT;
2419 #endif
2420                 } else
2421                         data = o->data.payload;
2422
2423                 r = journal_file_append_data(to, data, l, &u, &h);
2424                 if (r < 0)
2425                         return r;
2426
2427                 xor_hash ^= le64toh(u->data.hash);
2428                 items[i].object_offset = htole64(h);
2429                 items[i].hash = u->data.hash;
2430
2431                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2432                 if (r < 0)
2433                         return r;
2434         }
2435
2436         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2437 }
2438
2439 void journal_default_metrics(JournalMetrics *m, int fd) {
2440         uint64_t fs_size = 0;
2441         struct statvfs ss;
2442         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2443
2444         assert(m);
2445         assert(fd >= 0);
2446
2447         if (fstatvfs(fd, &ss) >= 0)
2448                 fs_size = ss.f_frsize * ss.f_blocks;
2449
2450         if (m->max_use == (uint64_t) -1) {
2451
2452                 if (fs_size > 0) {
2453                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2454
2455                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2456                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2457
2458                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2459                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2460                 } else
2461                         m->max_use = DEFAULT_MAX_USE_LOWER;
2462         } else {
2463                 m->max_use = PAGE_ALIGN(m->max_use);
2464
2465                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2466                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2467         }
2468
2469         if (m->max_size == (uint64_t) -1) {
2470                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2471
2472                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2473                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2474         } else
2475                 m->max_size = PAGE_ALIGN(m->max_size);
2476
2477         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2478                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2479
2480         if (m->max_size*2 > m->max_use)
2481                 m->max_use = m->max_size*2;
2482
2483         if (m->min_size == (uint64_t) -1)
2484                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2485         else {
2486                 m->min_size = PAGE_ALIGN(m->min_size);
2487
2488                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2489                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2490
2491                 if (m->min_size > m->max_size)
2492                         m->max_size = m->min_size;
2493         }
2494
2495         if (m->keep_free == (uint64_t) -1) {
2496
2497                 if (fs_size > 0) {
2498                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2499
2500                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2501                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2502
2503                 } else
2504                         m->keep_free = DEFAULT_KEEP_FREE;
2505         }
2506
2507         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2508                  format_bytes(a, sizeof(a), m->max_use),
2509                  format_bytes(b, sizeof(b), m->max_size),
2510                  format_bytes(c, sizeof(c), m->min_size),
2511                  format_bytes(d, sizeof(d), m->keep_free));
2512 }
2513
2514 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2515         assert(f);
2516         assert(from || to);
2517
2518         if (from) {
2519                 if (f->header->head_entry_realtime == 0)
2520                         return -ENOENT;
2521
2522                 *from = le64toh(f->header->head_entry_realtime);
2523         }
2524
2525         if (to) {
2526                 if (f->header->tail_entry_realtime == 0)
2527                         return -ENOENT;
2528
2529                 *to = le64toh(f->header->tail_entry_realtime);
2530         }
2531
2532         return 1;
2533 }
2534
2535 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2536         char t[9+32+1] = "_BOOT_ID=";
2537         Object *o;
2538         uint64_t p;
2539         int r;
2540
2541         assert(f);
2542         assert(from || to);
2543
2544         sd_id128_to_string(boot_id, t + 9);
2545
2546         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2547         if (r <= 0)
2548                 return r;
2549
2550         if (le64toh(o->data.n_entries) <= 0)
2551                 return 0;
2552
2553         if (from) {
2554                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2555                 if (r < 0)
2556                         return r;
2557
2558                 *from = le64toh(o->entry.monotonic);
2559         }
2560
2561         if (to) {
2562                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2563                 if (r < 0)
2564                         return r;
2565
2566                 r = generic_array_get_plus_one(f,
2567                                                le64toh(o->data.entry_offset),
2568                                                le64toh(o->data.entry_array_offset),
2569                                                le64toh(o->data.n_entries)-1,
2570                                                &o, NULL);
2571                 if (r <= 0)
2572                         return r;
2573
2574                 *to = le64toh(o->entry.monotonic);
2575         }
2576
2577         return 1;
2578 }
2579
2580 bool journal_file_rotate_suggested(JournalFile *f) {
2581         assert(f);
2582
2583         /* If we gained new header fields we gained new features,
2584          * hence suggest a rotation */
2585         if (le64toh(f->header->header_size) < sizeof(Header))
2586                 return true;
2587
2588         /* Let's check if the hash tables grew over a certain fill
2589          * level (75%, borrowing this value from Java's hash table
2590          * implementation), and if so suggest a rotation. To calculate
2591          * the fill level we need the n_data field, which only exists
2592          * in newer versions. */
2593
2594         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2595                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL)
2596                         return true;
2597
2598         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2599                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL)
2600                         return true;
2601
2602         return false;
2603 }