chiark / gitweb /
journal: automatically rotate journal files if the data hash table is full > 75%
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
/* Rounds x up to the next multiple of 8 */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)

/* Sizes, in bytes, of the hash table objects created in fresh files */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
#define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)

/* Size mappings are widened to when a smaller range is requested */
#define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)

/* Payloads shorter than this are never handed to the compressor */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */

/* Lower and upper bounds applied when max_use is deduced from the
 * file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* Upper bound applied when max_size is deduced from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* Upper bound applied when keep_free is deduced from the file system
 * size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* keep_free value used when the file system size cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* n_data was the first field added after the initial file format
 * design, hence everything before it must always be present */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* True if the on-disk header is large enough to fully include the
 * named field */
#define JOURNAL_HEADER_CONTAINS(h, field) \
        (le64toh((h)->header_size) >= offsetof(Header, field) + sizeof((h)->field))

/* Magic bytes every journal file starts with */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
70
71 void journal_file_close(JournalFile *f) {
72         int t;
73
74         assert(f);
75
76         if (f->header) {
77                 if (f->writable)
78                         f->header->state = STATE_OFFLINE;
79
80                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
81         }
82
83         for (t = 0; t < _WINDOW_MAX; t++)
84                 if (f->windows[t].ptr)
85                         munmap(f->windows[t].ptr, f->windows[t].size);
86
87         if (f->fd >= 0)
88                 close_nointr_nofail(f->fd);
89
90         free(f->path);
91
92 #ifdef HAVE_XZ
93         free(f->compress_buffer);
94 #endif
95
96         free(f);
97 }
98
99 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
100         Header h;
101         ssize_t k;
102         int r;
103
104         assert(f);
105
106         zero(h);
107         memcpy(h.signature, signature, 8);
108         h.header_size = htole64(ALIGN64(sizeof(h)));
109
110         r = sd_id128_randomize(&h.file_id);
111         if (r < 0)
112                 return r;
113
114         if (template) {
115                 h.seqnum_id = template->header->seqnum_id;
116                 h.tail_seqnum = template->header->tail_seqnum;
117         } else
118                 h.seqnum_id = h.file_id;
119
120         k = pwrite(f->fd, &h, sizeof(h), 0);
121         if (k < 0)
122                 return -errno;
123
124         if (k != sizeof(h))
125                 return -EIO;
126
127         return 0;
128 }
129
130 static int journal_file_refresh_header(JournalFile *f) {
131         int r;
132         sd_id128_t boot_id;
133
134         assert(f);
135
136         r = sd_id128_get_machine(&f->header->machine_id);
137         if (r < 0)
138                 return r;
139
140         r = sd_id128_get_boot(&boot_id);
141         if (r < 0)
142                 return r;
143
144         if (sd_id128_equal(boot_id, f->header->boot_id))
145                 f->tail_entry_monotonic_valid = true;
146
147         f->header->boot_id = boot_id;
148
149         f->header->state = STATE_ONLINE;
150
151         __sync_synchronize();
152
153         return 0;
154 }
155
156 static int journal_file_verify_header(JournalFile *f) {
157         assert(f);
158
159         if (memcmp(f->header, signature, 8))
160                 return -EBADMSG;
161
162 #ifdef HAVE_XZ
163         if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
164                 return -EPROTONOSUPPORT;
165 #else
166         if (f->header->incompatible_flags != 0)
167                 return -EPROTONOSUPPORT;
168 #endif
169
170         /* The first addition was n_data, so check that we are at least this large */
171         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
172                 return -EBADMSG;
173
174         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
175                 return -ENODATA;
176
177         if (f->writable) {
178                 uint8_t state;
179                 sd_id128_t machine_id;
180                 int r;
181
182                 r = sd_id128_get_machine(&machine_id);
183                 if (r < 0)
184                         return r;
185
186                 if (!sd_id128_equal(machine_id, f->header->machine_id))
187                         return -EHOSTDOWN;
188
189                 state = f->header->state;
190
191                 if (state == STATE_ONLINE)
192                         log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
193                         /* FIXME: immediately rotate */
194                 else if (state == STATE_ARCHIVED)
195                         return -ESHUTDOWN;
196                 else if (state != STATE_OFFLINE)
197                         log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
198         }
199
200         return 0;
201 }
202
203 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
204         uint64_t old_size, new_size;
205         int r;
206
207         assert(f);
208
209         /* We assume that this file is not sparse, and we know that
210          * for sure, since we always call posix_fallocate()
211          * ourselves */
212
213         old_size =
214                 le64toh(f->header->header_size) +
215                 le64toh(f->header->arena_size);
216
217         new_size = PAGE_ALIGN(offset + size);
218         if (new_size < le64toh(f->header->header_size))
219                 new_size = le64toh(f->header->header_size);
220
221         if (new_size <= old_size)
222                 return 0;
223
224         if (f->metrics.max_size > 0 &&
225             new_size > f->metrics.max_size)
226                 return -E2BIG;
227
228         if (new_size > f->metrics.min_size &&
229             f->metrics.keep_free > 0) {
230                 struct statvfs svfs;
231
232                 if (fstatvfs(f->fd, &svfs) >= 0) {
233                         uint64_t available;
234
235                         available = svfs.f_bfree * svfs.f_bsize;
236
237                         if (available >= f->metrics.keep_free)
238                                 available -= f->metrics.keep_free;
239                         else
240                                 available = 0;
241
242                         if (new_size - old_size > available)
243                                 return -E2BIG;
244                 }
245         }
246
247         /* Note that the glibc fallocate() fallback is very
248            inefficient, hence we try to minimize the allocation area
249            as we can. */
250         r = posix_fallocate(f->fd, old_size, new_size - old_size);
251         if (r != 0)
252                 return -r;
253
254         if (fstat(f->fd, &f->last_stat) < 0)
255                 return -errno;
256
257         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
258
259         return 0;
260 }
261
262 static int journal_file_map(
263                 JournalFile *f,
264                 uint64_t offset,
265                 uint64_t size,
266                 void **_window,
267                 uint64_t *_woffset,
268                 uint64_t *_wsize,
269                 void **ret) {
270
271         uint64_t woffset, wsize;
272         void *window;
273
274         assert(f);
275         assert(size > 0);
276         assert(ret);
277
278         woffset = offset & ~((uint64_t) page_size() - 1ULL);
279         wsize = size + (offset - woffset);
280         wsize = PAGE_ALIGN(wsize);
281
282         /* Avoid SIGBUS on invalid accesses */
283         if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
284                 return -EADDRNOTAVAIL;
285
286         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
287         if (window == MAP_FAILED)
288                 return -errno;
289
290         if (_window)
291                 *_window = window;
292
293         if (_woffset)
294                 *_woffset = woffset;
295
296         if (_wsize)
297                 *_wsize = wsize;
298
299         *ret = (uint8_t*) window + (offset - woffset);
300
301         return 0;
302 }
303
304 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
305         void *p = NULL;
306         uint64_t delta;
307         int r;
308         Window *w;
309
310         assert(f);
311         assert(ret);
312         assert(wt >= 0);
313         assert(wt < _WINDOW_MAX);
314
315         if (offset + size > (uint64_t) f->last_stat.st_size) {
316                 /* Hmm, out of range? Let's refresh the fstat() data
317                  * first, before we trust that check. */
318
319                 if (fstat(f->fd, &f->last_stat) < 0 ||
320                     offset + size > (uint64_t) f->last_stat.st_size)
321                         return -EADDRNOTAVAIL;
322         }
323
324         w = f->windows + wt;
325
326         if (_likely_(w->ptr &&
327                      w->offset <= offset &&
328                      w->offset + w->size >= offset + size)) {
329
330                 *ret = (uint8_t*) w->ptr + (offset - w->offset);
331                 return 0;
332         }
333
334         if (w->ptr) {
335                 if (munmap(w->ptr, w->size) < 0)
336                         return -errno;
337
338                 w->ptr = NULL;
339                 w->size = w->offset = 0;
340         }
341
342         if (size < DEFAULT_WINDOW_SIZE) {
343                 /* If the default window size is larger then what was
344                  * asked for extend the mapping a bit in the hope to
345                  * minimize needed remappings later on. We add half
346                  * the window space before and half behind the
347                  * requested mapping */
348
349                 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
350
351                 if (delta > offset)
352                         delta = offset;
353
354                 offset -= delta;
355                 size = DEFAULT_WINDOW_SIZE;
356         } else
357                 delta = 0;
358
359         if (offset + size > (uint64_t) f->last_stat.st_size)
360                 size = (uint64_t) f->last_stat.st_size - offset;
361
362         if (size <= 0)
363                 return -EADDRNOTAVAIL;
364
365         r = journal_file_map(f,
366                              offset, size,
367                              &w->ptr, &w->offset, &w->size,
368                              &p);
369
370         if (r < 0)
371                 return r;
372
373         *ret = (uint8_t*) p + delta;
374         return 0;
375 }
376
377 static bool verify_hash(Object *o) {
378         uint64_t h1, h2;
379
380         assert(o);
381
382         if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
383                 h1 = le64toh(o->data.hash);
384                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
385         } else if (o->object.type == OBJECT_FIELD) {
386                 h1 = le64toh(o->field.hash);
387                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
388         } else
389                 return true;
390
391         return h1 == h2;
392 }
393
394 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
395         int r;
396         void *t;
397         Object *o;
398         uint64_t s;
399
400         assert(f);
401         assert(ret);
402         assert(type < _OBJECT_TYPE_MAX);
403
404         r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
405         if (r < 0)
406                 return r;
407
408         o = (Object*) t;
409         s = le64toh(o->object.size);
410
411         if (s < sizeof(ObjectHeader))
412                 return -EBADMSG;
413
414         if (type >= 0 && o->object.type != type)
415                 return -EBADMSG;
416
417         if (s > sizeof(ObjectHeader)) {
418                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
419                 if (r < 0)
420                         return r;
421
422                 o = (Object*) t;
423         }
424
425         if (!verify_hash(o))
426                 return -EBADMSG;
427
428         *ret = o;
429         return 0;
430 }
431
432 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
433         uint64_t r;
434
435         assert(f);
436
437         r = le64toh(f->header->tail_seqnum) + 1;
438
439         if (seqnum) {
440                 /* If an external seqnum counter was passed, we update
441                  * both the local and the external one, and set it to
442                  * the maximum of both */
443
444                 if (*seqnum + 1 > r)
445                         r = *seqnum + 1;
446
447                 *seqnum = r;
448         }
449
450         f->header->tail_seqnum = htole64(r);
451
452         if (f->header->head_seqnum == 0)
453                 f->header->head_seqnum = htole64(r);
454
455         return r;
456 }
457
458 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
459         int r;
460         uint64_t p;
461         Object *tail, *o;
462         void *t;
463
464         assert(f);
465         assert(size >= sizeof(ObjectHeader));
466         assert(offset);
467         assert(ret);
468
469         p = le64toh(f->header->tail_object_offset);
470         if (p == 0)
471                 p = le64toh(f->header->header_size);
472         else {
473                 r = journal_file_move_to_object(f, -1, p, &tail);
474                 if (r < 0)
475                         return r;
476
477                 p += ALIGN64(le64toh(tail->object.size));
478         }
479
480         r = journal_file_allocate(f, p, size);
481         if (r < 0)
482                 return r;
483
484         r = journal_file_move_to(f, type, p, size, &t);
485         if (r < 0)
486                 return r;
487
488         o = (Object*) t;
489
490         zero(o->object);
491         o->object.type = type;
492         o->object.size = htole64(size);
493
494         f->header->tail_object_offset = htole64(p);
495         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
496
497         *ret = o;
498         *offset = p;
499
500         return 0;
501 }
502
503 static int journal_file_setup_data_hash_table(JournalFile *f) {
504         uint64_t s, p;
505         Object *o;
506         int r;
507
508         assert(f);
509
510         s = DEFAULT_DATA_HASH_TABLE_SIZE;
511         r = journal_file_append_object(f,
512                                        OBJECT_DATA_HASH_TABLE,
513                                        offsetof(Object, hash_table.items) + s,
514                                        &o, &p);
515         if (r < 0)
516                 return r;
517
518         memset(o->hash_table.items, 0, s);
519
520         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
521         f->header->data_hash_table_size = htole64(s);
522
523         return 0;
524 }
525
526 static int journal_file_setup_field_hash_table(JournalFile *f) {
527         uint64_t s, p;
528         Object *o;
529         int r;
530
531         assert(f);
532
533         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
534         r = journal_file_append_object(f,
535                                        OBJECT_FIELD_HASH_TABLE,
536                                        offsetof(Object, hash_table.items) + s,
537                                        &o, &p);
538         if (r < 0)
539                 return r;
540
541         memset(o->hash_table.items, 0, s);
542
543         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
544         f->header->field_hash_table_size = htole64(s);
545
546         return 0;
547 }
548
549 static int journal_file_map_data_hash_table(JournalFile *f) {
550         uint64_t s, p;
551         void *t;
552         int r;
553
554         assert(f);
555
556         p = le64toh(f->header->data_hash_table_offset);
557         s = le64toh(f->header->data_hash_table_size);
558
559         r = journal_file_move_to(f,
560                                  WINDOW_DATA_HASH_TABLE,
561                                  p, s,
562                                  &t);
563         if (r < 0)
564                 return r;
565
566         f->data_hash_table = t;
567         return 0;
568 }
569
570 static int journal_file_map_field_hash_table(JournalFile *f) {
571         uint64_t s, p;
572         void *t;
573         int r;
574
575         assert(f);
576
577         p = le64toh(f->header->field_hash_table_offset);
578         s = le64toh(f->header->field_hash_table_size);
579
580         r = journal_file_move_to(f,
581                                  WINDOW_FIELD_HASH_TABLE,
582                                  p, s,
583                                  &t);
584         if (r < 0)
585                 return r;
586
587         f->field_hash_table = t;
588         return 0;
589 }
590
591 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
592         uint64_t p, h;
593         int r;
594
595         assert(f);
596         assert(o);
597         assert(offset > 0);
598         assert(o->object.type == OBJECT_DATA);
599
600         /* This might alter the window we are looking at */
601
602         o->data.next_hash_offset = o->data.next_field_offset = 0;
603         o->data.entry_offset = o->data.entry_array_offset = 0;
604         o->data.n_entries = 0;
605
606         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
607         p = le64toh(f->data_hash_table[h].tail_hash_offset);
608         if (p == 0) {
609                 /* Only entry in the hash table is easy */
610                 f->data_hash_table[h].head_hash_offset = htole64(offset);
611         } else {
612                 /* Move back to the previous data object, to patch in
613                  * pointer */
614
615                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
616                 if (r < 0)
617                         return r;
618
619                 o->data.next_hash_offset = htole64(offset);
620         }
621
622         f->data_hash_table[h].tail_hash_offset = htole64(offset);
623
624         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
625                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
626
627         return 0;
628 }
629
630 int journal_file_find_data_object_with_hash(
631                 JournalFile *f,
632                 const void *data, uint64_t size, uint64_t hash,
633                 Object **ret, uint64_t *offset) {
634
635         uint64_t p, osize, h;
636         int r;
637
638         assert(f);
639         assert(data || size == 0);
640
641         osize = offsetof(Object, data.payload) + size;
642
643         if (f->header->data_hash_table_size == 0)
644                 return -EBADMSG;
645
646         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
647         p = le64toh(f->data_hash_table[h].head_hash_offset);
648
649         while (p > 0) {
650                 Object *o;
651
652                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
653                 if (r < 0)
654                         return r;
655
656                 if (le64toh(o->data.hash) != hash)
657                         goto next;
658
659                 if (o->object.flags & OBJECT_COMPRESSED) {
660 #ifdef HAVE_XZ
661                         uint64_t l, rsize;
662
663                         l = le64toh(o->object.size);
664                         if (l <= offsetof(Object, data.payload))
665                                 return -EBADMSG;
666
667                         l -= offsetof(Object, data.payload);
668
669                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
670                                 return -EBADMSG;
671
672                         if (rsize == size &&
673                             memcmp(f->compress_buffer, data, size) == 0) {
674
675                                 if (ret)
676                                         *ret = o;
677
678                                 if (offset)
679                                         *offset = p;
680
681                                 return 1;
682                         }
683 #else
684                         return -EPROTONOSUPPORT;
685 #endif
686
687                 } else if (le64toh(o->object.size) == osize &&
688                            memcmp(o->data.payload, data, size) == 0) {
689
690                         if (ret)
691                                 *ret = o;
692
693                         if (offset)
694                                 *offset = p;
695
696                         return 1;
697                 }
698
699         next:
700                 p = le64toh(o->data.next_hash_offset);
701         }
702
703         return 0;
704 }
705
706 int journal_file_find_data_object(
707                 JournalFile *f,
708                 const void *data, uint64_t size,
709                 Object **ret, uint64_t *offset) {
710
711         uint64_t hash;
712
713         assert(f);
714         assert(data || size == 0);
715
716         hash = hash64(data, size);
717
718         return journal_file_find_data_object_with_hash(f,
719                                                        data, size, hash,
720                                                        ret, offset);
721 }
722
723 static int journal_file_append_data(
724                 JournalFile *f,
725                 const void *data, uint64_t size,
726                 Object **ret, uint64_t *offset) {
727
728         uint64_t hash, p;
729         uint64_t osize;
730         Object *o;
731         int r;
732         bool compressed = false;
733
734         assert(f);
735         assert(data || size == 0);
736
737         hash = hash64(data, size);
738
739         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
740         if (r < 0)
741                 return r;
742         else if (r > 0) {
743
744                 if (ret)
745                         *ret = o;
746
747                 if (offset)
748                         *offset = p;
749
750                 return 0;
751         }
752
753         osize = offsetof(Object, data.payload) + size;
754         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
755         if (r < 0)
756                 return r;
757
758         o->data.hash = htole64(hash);
759
760 #ifdef HAVE_XZ
761         if (f->compress &&
762             size >= COMPRESSION_SIZE_THRESHOLD) {
763                 uint64_t rsize;
764
765                 compressed = compress_blob(data, size, o->data.payload, &rsize);
766
767                 if (compressed) {
768                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
769                         o->object.flags |= OBJECT_COMPRESSED;
770
771                         f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
772
773                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
774                 }
775         }
776 #endif
777
778         if (!compressed)
779                 memcpy(o->data.payload, data, size);
780
781         r = journal_file_link_data(f, o, p, hash);
782         if (r < 0)
783                 return r;
784
785         /* The linking might have altered the window, so let's
786          * refresh our pointer */
787         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
788         if (r < 0)
789                 return r;
790
791         if (ret)
792                 *ret = o;
793
794         if (offset)
795                 *offset = p;
796
797         return 0;
798 }
799
800 uint64_t journal_file_entry_n_items(Object *o) {
801         assert(o);
802         assert(o->object.type == OBJECT_ENTRY);
803
804         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
805 }
806
807 static uint64_t journal_file_entry_array_n_items(Object *o) {
808         assert(o);
809         assert(o->object.type == OBJECT_ENTRY_ARRAY);
810
811         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
812 }
813
814 static int link_entry_into_array(JournalFile *f,
815                                  le64_t *first,
816                                  le64_t *idx,
817                                  uint64_t p) {
818         int r;
819         uint64_t n = 0, ap = 0, q, i, a, hidx;
820         Object *o;
821
822         assert(f);
823         assert(first);
824         assert(idx);
825         assert(p > 0);
826
827         a = le64toh(*first);
828         i = hidx = le64toh(*idx);
829         while (a > 0) {
830
831                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
832                 if (r < 0)
833                         return r;
834
835                 n = journal_file_entry_array_n_items(o);
836                 if (i < n) {
837                         o->entry_array.items[i] = htole64(p);
838                         *idx = htole64(hidx + 1);
839                         return 0;
840                 }
841
842                 i -= n;
843                 ap = a;
844                 a = le64toh(o->entry_array.next_entry_array_offset);
845         }
846
847         if (hidx > n)
848                 n = (hidx+1) * 2;
849         else
850                 n = n * 2;
851
852         if (n < 4)
853                 n = 4;
854
855         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
856                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
857                                        &o, &q);
858         if (r < 0)
859                 return r;
860
861         o->entry_array.items[i] = htole64(p);
862
863         if (ap == 0)
864                 *first = htole64(q);
865         else {
866                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
867                 if (r < 0)
868                         return r;
869
870                 o->entry_array.next_entry_array_offset = htole64(q);
871         }
872
873         *idx = htole64(hidx + 1);
874
875         return 0;
876 }
877
878 static int link_entry_into_array_plus_one(JournalFile *f,
879                                           le64_t *extra,
880                                           le64_t *first,
881                                           le64_t *idx,
882                                           uint64_t p) {
883
884         int r;
885
886         assert(f);
887         assert(extra);
888         assert(first);
889         assert(idx);
890         assert(p > 0);
891
892         if (*idx == 0)
893                 *extra = htole64(p);
894         else {
895                 le64_t i;
896
897                 i = htole64(le64toh(*idx) - 1);
898                 r = link_entry_into_array(f, first, &i, p);
899                 if (r < 0)
900                         return r;
901         }
902
903         *idx = htole64(le64toh(*idx) + 1);
904         return 0;
905 }
906
907 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
908         uint64_t p;
909         int r;
910         assert(f);
911         assert(o);
912         assert(offset > 0);
913
914         p = le64toh(o->entry.items[i].object_offset);
915         if (p == 0)
916                 return -EINVAL;
917
918         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
919         if (r < 0)
920                 return r;
921
922         return link_entry_into_array_plus_one(f,
923                                               &o->data.entry_offset,
924                                               &o->data.entry_array_offset,
925                                               &o->data.n_entries,
926                                               offset);
927 }
928
929 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
930         uint64_t n, i;
931         int r;
932
933         assert(f);
934         assert(o);
935         assert(offset > 0);
936         assert(o->object.type == OBJECT_ENTRY);
937
938         __sync_synchronize();
939
940         /* Link up the entry itself */
941         r = link_entry_into_array(f,
942                                   &f->header->entry_array_offset,
943                                   &f->header->n_entries,
944                                   offset);
945         if (r < 0)
946                 return r;
947
948         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
949
950         if (f->header->head_entry_realtime == 0)
951                 f->header->head_entry_realtime = o->entry.realtime;
952
953         f->header->tail_entry_realtime = o->entry.realtime;
954         f->header->tail_entry_monotonic = o->entry.monotonic;
955
956         f->tail_entry_monotonic_valid = true;
957
958         /* Link up the items */
959         n = journal_file_entry_n_items(o);
960         for (i = 0; i < n; i++) {
961                 r = journal_file_link_entry_item(f, o, offset, i);
962                 if (r < 0)
963                         return r;
964         }
965
966         return 0;
967 }
968
969 static int journal_file_append_entry_internal(
970                 JournalFile *f,
971                 const dual_timestamp *ts,
972                 uint64_t xor_hash,
973                 const EntryItem items[], unsigned n_items,
974                 uint64_t *seqnum,
975                 Object **ret, uint64_t *offset) {
976         uint64_t np;
977         uint64_t osize;
978         Object *o;
979         int r;
980
981         assert(f);
982         assert(items || n_items == 0);
983         assert(ts);
984
985         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
986
987         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
988         if (r < 0)
989                 return r;
990
991         o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
992         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
993         o->entry.realtime = htole64(ts->realtime);
994         o->entry.monotonic = htole64(ts->monotonic);
995         o->entry.xor_hash = htole64(xor_hash);
996         o->entry.boot_id = f->header->boot_id;
997
998         r = journal_file_link_entry(f, o, np);
999         if (r < 0)
1000                 return r;
1001
1002         if (ret)
1003                 *ret = o;
1004
1005         if (offset)
1006                 *offset = np;
1007
1008         return 0;
1009 }
1010
1011 void journal_file_post_change(JournalFile *f) {
1012         assert(f);
1013
1014         /* inotify() does not receive IN_MODIFY events from file
1015          * accesses done via mmap(). After each access we hence
1016          * trigger IN_MODIFY by truncating the journal file to its
1017          * current size which triggers IN_MODIFY. */
1018
1019         __sync_synchronize();
1020
1021         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1022                 log_error("Failed to to truncate file to its own size: %m");
1023 }
1024
1025 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1026         unsigned i;
1027         EntryItem *items;
1028         int r;
1029         uint64_t xor_hash = 0;
1030         struct dual_timestamp _ts;
1031
1032         assert(f);
1033         assert(iovec || n_iovec == 0);
1034
1035         if (!f->writable)
1036                 return -EPERM;
1037
1038         if (!ts) {
1039                 dual_timestamp_get(&_ts);
1040                 ts = &_ts;
1041         }
1042
1043         if (f->tail_entry_monotonic_valid &&
1044             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1045                 return -EINVAL;
1046
1047         items = alloca(sizeof(EntryItem) * n_iovec);
1048
1049         for (i = 0; i < n_iovec; i++) {
1050                 uint64_t p;
1051                 Object *o;
1052
1053                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1054                 if (r < 0)
1055                         return r;
1056
1057                 xor_hash ^= le64toh(o->data.hash);
1058                 items[i].object_offset = htole64(p);
1059                 items[i].hash = o->data.hash;
1060         }
1061
1062         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1063
1064         journal_file_post_change(f);
1065
1066         return r;
1067 }
1068
1069 static int generic_array_get(JournalFile *f,
1070                              uint64_t first,
1071                              uint64_t i,
1072                              Object **ret, uint64_t *offset) {
1073
1074         Object *o;
1075         uint64_t p = 0, a;
1076         int r;
1077
1078         assert(f);
1079
1080         a = first;
1081         while (a > 0) {
1082                 uint64_t n;
1083
1084                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1085                 if (r < 0)
1086                         return r;
1087
1088                 n = journal_file_entry_array_n_items(o);
1089                 if (i < n) {
1090                         p = le64toh(o->entry_array.items[i]);
1091                         break;
1092                 }
1093
1094                 i -= n;
1095                 a = le64toh(o->entry_array.next_entry_array_offset);
1096         }
1097
1098         if (a <= 0 || p <= 0)
1099                 return 0;
1100
1101         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1102         if (r < 0)
1103                 return r;
1104
1105         if (ret)
1106                 *ret = o;
1107
1108         if (offset)
1109                 *offset = p;
1110
1111         return 1;
1112 }
1113
1114 static int generic_array_get_plus_one(JournalFile *f,
1115                                       uint64_t extra,
1116                                       uint64_t first,
1117                                       uint64_t i,
1118                                       Object **ret, uint64_t *offset) {
1119
1120         Object *o;
1121
1122         assert(f);
1123
1124         if (i == 0) {
1125                 int r;
1126
1127                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1128                 if (r < 0)
1129                         return r;
1130
1131                 if (ret)
1132                         *ret = o;
1133
1134                 if (offset)
1135                         *offset = extra;
1136
1137                 return 1;
1138         }
1139
1140         return generic_array_get(f, first, i-1, ret, offset);
1141 }
1142
/* Verdicts returned by the test_object callbacks used for bisection */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object compares lower than the needle */
        TEST_RIGHT = 2  /* the tested object compares higher than the needle */
};
1148
1149 static int generic_array_bisect(JournalFile *f,
1150                                 uint64_t first,
1151                                 uint64_t n,
1152                                 uint64_t needle,
1153                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1154                                 direction_t direction,
1155                                 Object **ret,
1156                                 uint64_t *offset,
1157                                 uint64_t *idx) {
1158
1159         uint64_t a, p, t = 0, i = 0, last_p = 0;
1160         bool subtract_one = false;
1161         Object *o, *array = NULL;
1162         int r;
1163
1164         assert(f);
1165         assert(test_object);
1166
1167         a = first;
1168         while (a > 0) {
1169                 uint64_t left, right, k, lp;
1170
1171                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1172                 if (r < 0)
1173                         return r;
1174
1175                 k = journal_file_entry_array_n_items(array);
1176                 right = MIN(k, n);
1177                 if (right <= 0)
1178                         return 0;
1179
1180                 i = right - 1;
1181                 lp = p = le64toh(array->entry_array.items[i]);
1182                 if (p <= 0)
1183                         return -EBADMSG;
1184
1185                 r = test_object(f, p, needle);
1186                 if (r < 0)
1187                         return r;
1188
1189                 if (r == TEST_FOUND)
1190                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1191
1192                 if (r == TEST_RIGHT) {
1193                         left = 0;
1194                         right -= 1;
1195                         for (;;) {
1196                                 if (left == right) {
1197                                         if (direction == DIRECTION_UP)
1198                                                 subtract_one = true;
1199
1200                                         i = left;
1201                                         goto found;
1202                                 }
1203
1204                                 assert(left < right);
1205
1206                                 i = (left + right) / 2;
1207                                 p = le64toh(array->entry_array.items[i]);
1208                                 if (p <= 0)
1209                                         return -EBADMSG;
1210
1211                                 r = test_object(f, p, needle);
1212                                 if (r < 0)
1213                                         return r;
1214
1215                                 if (r == TEST_FOUND)
1216                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1217
1218                                 if (r == TEST_RIGHT)
1219                                         right = i;
1220                                 else
1221                                         left = i + 1;
1222                         }
1223                 }
1224
1225                 if (k > n) {
1226                         if (direction == DIRECTION_UP) {
1227                                 i = n;
1228                                 subtract_one = true;
1229                                 goto found;
1230                         }
1231
1232                         return 0;
1233                 }
1234
1235                 last_p = lp;
1236
1237                 n -= k;
1238                 t += k;
1239                 a = le64toh(array->entry_array.next_entry_array_offset);
1240         }
1241
1242         return 0;
1243
1244 found:
1245         if (subtract_one && t == 0 && i == 0)
1246                 return 0;
1247
1248         if (subtract_one && i == 0)
1249                 p = last_p;
1250         else if (subtract_one)
1251                 p = le64toh(array->entry_array.items[i-1]);
1252         else
1253                 p = le64toh(array->entry_array.items[i]);
1254
1255         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1256         if (r < 0)
1257                 return r;
1258
1259         if (ret)
1260                 *ret = o;
1261
1262         if (offset)
1263                 *offset = p;
1264
1265         if (idx)
1266                 *idx = t + i + (subtract_one ? -1 : 0);
1267
1268         return 1;
1269 }
1270
1271 static int generic_array_bisect_plus_one(JournalFile *f,
1272                                          uint64_t extra,
1273                                          uint64_t first,
1274                                          uint64_t n,
1275                                          uint64_t needle,
1276                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1277                                          direction_t direction,
1278                                          Object **ret,
1279                                          uint64_t *offset,
1280                                          uint64_t *idx) {
1281
1282         int r;
1283         bool step_back = false;
1284         Object *o;
1285
1286         assert(f);
1287         assert(test_object);
1288
1289         if (n <= 0)
1290                 return 0;
1291
1292         /* This bisects the array in object 'first', but first checks
1293          * an extra  */
1294         r = test_object(f, extra, needle);
1295         if (r < 0)
1296                 return r;
1297
1298         if (r == TEST_FOUND)
1299                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1300
1301         /* if we are looking with DIRECTION_UP then we need to first
1302            see if in the actual array there is a matching entry, and
1303            return the last one of that. But if there isn't any we need
1304            to return this one. Hence remember this, and return it
1305            below. */
1306         if (r == TEST_LEFT)
1307                 step_back = direction == DIRECTION_UP;
1308
1309         if (r == TEST_RIGHT) {
1310                 if (direction == DIRECTION_DOWN)
1311                         goto found;
1312                 else
1313                         return 0;
1314         }
1315
1316         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1317
1318         if (r == 0 && step_back)
1319                 goto found;
1320
1321         if (r > 0 && idx)
1322                 (*idx) ++;
1323
1324         return r;
1325
1326 found:
1327         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1328         if (r < 0)
1329                 return r;
1330
1331         if (ret)
1332                 *ret = o;
1333
1334         if (offset)
1335                 *offset = extra;
1336
1337         if (idx)
1338                 *idx = 0;
1339
1340         return 1;
1341 }
1342
1343 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1344         assert(f);
1345         assert(p > 0);
1346
1347         if (p == needle)
1348                 return TEST_FOUND;
1349         else if (p < needle)
1350                 return TEST_LEFT;
1351         else
1352                 return TEST_RIGHT;
1353 }
1354
1355 int journal_file_move_to_entry_by_offset(
1356                 JournalFile *f,
1357                 uint64_t p,
1358                 direction_t direction,
1359                 Object **ret,
1360                 uint64_t *offset) {
1361
1362         return generic_array_bisect(f,
1363                                     le64toh(f->header->entry_array_offset),
1364                                     le64toh(f->header->n_entries),
1365                                     p,
1366                                     test_object_offset,
1367                                     direction,
1368                                     ret, offset, NULL);
1369 }
1370
1371
1372 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1373         Object *o;
1374         int r;
1375
1376         assert(f);
1377         assert(p > 0);
1378
1379         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1380         if (r < 0)
1381                 return r;
1382
1383         if (le64toh(o->entry.seqnum) == needle)
1384                 return TEST_FOUND;
1385         else if (le64toh(o->entry.seqnum) < needle)
1386                 return TEST_LEFT;
1387         else
1388                 return TEST_RIGHT;
1389 }
1390
1391 int journal_file_move_to_entry_by_seqnum(
1392                 JournalFile *f,
1393                 uint64_t seqnum,
1394                 direction_t direction,
1395                 Object **ret,
1396                 uint64_t *offset) {
1397
1398         return generic_array_bisect(f,
1399                                     le64toh(f->header->entry_array_offset),
1400                                     le64toh(f->header->n_entries),
1401                                     seqnum,
1402                                     test_object_seqnum,
1403                                     direction,
1404                                     ret, offset, NULL);
1405 }
1406
1407 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1408         Object *o;
1409         int r;
1410
1411         assert(f);
1412         assert(p > 0);
1413
1414         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1415         if (r < 0)
1416                 return r;
1417
1418         if (le64toh(o->entry.realtime) == needle)
1419                 return TEST_FOUND;
1420         else if (le64toh(o->entry.realtime) < needle)
1421                 return TEST_LEFT;
1422         else
1423                 return TEST_RIGHT;
1424 }
1425
1426 int journal_file_move_to_entry_by_realtime(
1427                 JournalFile *f,
1428                 uint64_t realtime,
1429                 direction_t direction,
1430                 Object **ret,
1431                 uint64_t *offset) {
1432
1433         return generic_array_bisect(f,
1434                                     le64toh(f->header->entry_array_offset),
1435                                     le64toh(f->header->n_entries),
1436                                     realtime,
1437                                     test_object_realtime,
1438                                     direction,
1439                                     ret, offset, NULL);
1440 }
1441
1442 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1443         Object *o;
1444         int r;
1445
1446         assert(f);
1447         assert(p > 0);
1448
1449         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1450         if (r < 0)
1451                 return r;
1452
1453         if (le64toh(o->entry.monotonic) == needle)
1454                 return TEST_FOUND;
1455         else if (le64toh(o->entry.monotonic) < needle)
1456                 return TEST_LEFT;
1457         else
1458                 return TEST_RIGHT;
1459 }
1460
1461 int journal_file_move_to_entry_by_monotonic(
1462                 JournalFile *f,
1463                 sd_id128_t boot_id,
1464                 uint64_t monotonic,
1465                 direction_t direction,
1466                 Object **ret,
1467                 uint64_t *offset) {
1468
1469         char t[9+32+1] = "_BOOT_ID=";
1470         Object *o;
1471         int r;
1472
1473         assert(f);
1474
1475         sd_id128_to_string(boot_id, t + 9);
1476         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1477         if (r < 0)
1478                 return r;
1479         if (r == 0)
1480                 return -ENOENT;
1481
1482         return generic_array_bisect_plus_one(f,
1483                                              le64toh(o->data.entry_offset),
1484                                              le64toh(o->data.entry_array_offset),
1485                                              le64toh(o->data.n_entries),
1486                                              monotonic,
1487                                              test_object_monotonic,
1488                                              direction,
1489                                              ret, offset, NULL);
1490 }
1491
1492 int journal_file_next_entry(
1493                 JournalFile *f,
1494                 Object *o, uint64_t p,
1495                 direction_t direction,
1496                 Object **ret, uint64_t *offset) {
1497
1498         uint64_t i, n;
1499         int r;
1500
1501         assert(f);
1502         assert(p > 0 || !o);
1503
1504         n = le64toh(f->header->n_entries);
1505         if (n <= 0)
1506                 return 0;
1507
1508         if (!o)
1509                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1510         else {
1511                 if (o->object.type != OBJECT_ENTRY)
1512                         return -EINVAL;
1513
1514                 r = generic_array_bisect(f,
1515                                          le64toh(f->header->entry_array_offset),
1516                                          le64toh(f->header->n_entries),
1517                                          p,
1518                                          test_object_offset,
1519                                          DIRECTION_DOWN,
1520                                          NULL, NULL,
1521                                          &i);
1522                 if (r <= 0)
1523                         return r;
1524
1525                 if (direction == DIRECTION_DOWN) {
1526                         if (i >= n - 1)
1527                                 return 0;
1528
1529                         i++;
1530                 } else {
1531                         if (i <= 0)
1532                                 return 0;
1533
1534                         i--;
1535                 }
1536         }
1537
1538         /* And jump to it */
1539         return generic_array_get(f,
1540                                  le64toh(f->header->entry_array_offset),
1541                                  i,
1542                                  ret, offset);
1543 }
1544
1545 int journal_file_skip_entry(
1546                 JournalFile *f,
1547                 Object *o, uint64_t p,
1548                 int64_t skip,
1549                 Object **ret, uint64_t *offset) {
1550
1551         uint64_t i, n;
1552         int r;
1553
1554         assert(f);
1555         assert(o);
1556         assert(p > 0);
1557
1558         if (o->object.type != OBJECT_ENTRY)
1559                 return -EINVAL;
1560
1561         r = generic_array_bisect(f,
1562                                  le64toh(f->header->entry_array_offset),
1563                                  le64toh(f->header->n_entries),
1564                                  p,
1565                                  test_object_offset,
1566                                  DIRECTION_DOWN,
1567                                  NULL, NULL,
1568                                  &i);
1569         if (r <= 0)
1570                 return r;
1571
1572         /* Calculate new index */
1573         if (skip < 0) {
1574                 if ((uint64_t) -skip >= i)
1575                         i = 0;
1576                 else
1577                         i = i - (uint64_t) -skip;
1578         } else
1579                 i  += (uint64_t) skip;
1580
1581         n = le64toh(f->header->n_entries);
1582         if (n <= 0)
1583                 return -EBADMSG;
1584
1585         if (i >= n)
1586                 i = n-1;
1587
1588         return generic_array_get(f,
1589                                  le64toh(f->header->entry_array_offset),
1590                                  i,
1591                                  ret, offset);
1592 }
1593
1594 int journal_file_next_entry_for_data(
1595                 JournalFile *f,
1596                 Object *o, uint64_t p,
1597                 uint64_t data_offset,
1598                 direction_t direction,
1599                 Object **ret, uint64_t *offset) {
1600
1601         uint64_t n, i;
1602         int r;
1603         Object *d;
1604
1605         assert(f);
1606         assert(p > 0 || !o);
1607
1608         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1609         if (r < 0)
1610                 return r;
1611
1612         n = le64toh(d->data.n_entries);
1613         if (n <= 0)
1614                 return n;
1615
1616         if (!o)
1617                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1618         else {
1619                 if (o->object.type != OBJECT_ENTRY)
1620                         return -EINVAL;
1621
1622                 r = generic_array_bisect_plus_one(f,
1623                                                   le64toh(d->data.entry_offset),
1624                                                   le64toh(d->data.entry_array_offset),
1625                                                   le64toh(d->data.n_entries),
1626                                                   p,
1627                                                   test_object_offset,
1628                                                   DIRECTION_DOWN,
1629                                                   NULL, NULL,
1630                                                   &i);
1631
1632                 if (r <= 0)
1633                         return r;
1634
1635                 if (direction == DIRECTION_DOWN) {
1636                         if (i >= n - 1)
1637                                 return 0;
1638
1639                         i++;
1640                 } else {
1641                         if (i <= 0)
1642                                 return 0;
1643
1644                         i--;
1645                 }
1646
1647         }
1648
1649         return generic_array_get_plus_one(f,
1650                                           le64toh(d->data.entry_offset),
1651                                           le64toh(d->data.entry_array_offset),
1652                                           i,
1653                                           ret, offset);
1654 }
1655
1656 int journal_file_move_to_entry_by_offset_for_data(
1657                 JournalFile *f,
1658                 uint64_t data_offset,
1659                 uint64_t p,
1660                 direction_t direction,
1661                 Object **ret, uint64_t *offset) {
1662
1663         int r;
1664         Object *d;
1665
1666         assert(f);
1667
1668         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1669         if (r < 0)
1670                 return r;
1671
1672         return generic_array_bisect_plus_one(f,
1673                                              le64toh(d->data.entry_offset),
1674                                              le64toh(d->data.entry_array_offset),
1675                                              le64toh(d->data.n_entries),
1676                                              p,
1677                                              test_object_offset,
1678                                              direction,
1679                                              ret, offset, NULL);
1680 }
1681
1682 int journal_file_move_to_entry_by_monotonic_for_data(
1683                 JournalFile *f,
1684                 uint64_t data_offset,
1685                 sd_id128_t boot_id,
1686                 uint64_t monotonic,
1687                 direction_t direction,
1688                 Object **ret, uint64_t *offset) {
1689
1690         char t[9+32+1] = "_BOOT_ID=";
1691         Object *o, *d;
1692         int r;
1693         uint64_t b, z;
1694
1695         assert(f);
1696
1697         /* First, seek by time */
1698         sd_id128_to_string(boot_id, t + 9);
1699         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1700         if (r < 0)
1701                 return r;
1702         if (r == 0)
1703                 return -ENOENT;
1704
1705         r = generic_array_bisect_plus_one(f,
1706                                           le64toh(o->data.entry_offset),
1707                                           le64toh(o->data.entry_array_offset),
1708                                           le64toh(o->data.n_entries),
1709                                           monotonic,
1710                                           test_object_monotonic,
1711                                           direction,
1712                                           NULL, &z, NULL);
1713         if (r <= 0)
1714                 return r;
1715
1716         /* And now, continue seeking until we find an entry that
1717          * exists in both bisection arrays */
1718
1719         for (;;) {
1720                 Object *qo;
1721                 uint64_t p, q;
1722
1723                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1724                 if (r < 0)
1725                         return r;
1726
1727                 r = generic_array_bisect_plus_one(f,
1728                                                   le64toh(d->data.entry_offset),
1729                                                   le64toh(d->data.entry_array_offset),
1730                                                   le64toh(d->data.n_entries),
1731                                                   z,
1732                                                   test_object_offset,
1733                                                   direction,
1734                                                   NULL, &p, NULL);
1735                 if (r <= 0)
1736                         return r;
1737
1738                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1739                 if (r < 0)
1740                         return r;
1741
1742                 r = generic_array_bisect_plus_one(f,
1743                                                   le64toh(o->data.entry_offset),
1744                                                   le64toh(o->data.entry_array_offset),
1745                                                   le64toh(o->data.n_entries),
1746                                                   p,
1747                                                   test_object_offset,
1748                                                   direction,
1749                                                   &qo, &q, NULL);
1750
1751                 if (r <= 0)
1752                         return r;
1753
1754                 if (p == q) {
1755                         if (ret)
1756                                 *ret = qo;
1757                         if (offset)
1758                                 *offset = q;
1759
1760                         return 1;
1761                 }
1762
1763                 z = q;
1764         }
1765
1766         return 0;
1767 }
1768
1769 int journal_file_move_to_entry_by_seqnum_for_data(
1770                 JournalFile *f,
1771                 uint64_t data_offset,
1772                 uint64_t seqnum,
1773                 direction_t direction,
1774                 Object **ret, uint64_t *offset) {
1775
1776         Object *d;
1777         int r;
1778
1779         assert(f);
1780
1781         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1782         if (r < 0)
1783                 return r;
1784
1785         return generic_array_bisect_plus_one(f,
1786                                              le64toh(d->data.entry_offset),
1787                                              le64toh(d->data.entry_array_offset),
1788                                              le64toh(d->data.n_entries),
1789                                              seqnum,
1790                                              test_object_seqnum,
1791                                              direction,
1792                                              ret, offset, NULL);
1793 }
1794
1795 int journal_file_move_to_entry_by_realtime_for_data(
1796                 JournalFile *f,
1797                 uint64_t data_offset,
1798                 uint64_t realtime,
1799                 direction_t direction,
1800                 Object **ret, uint64_t *offset) {
1801
1802         Object *d;
1803         int r;
1804
1805         assert(f);
1806
1807         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1808         if (r < 0)
1809                 return r;
1810
1811         return generic_array_bisect_plus_one(f,
1812                                              le64toh(d->data.entry_offset),
1813                                              le64toh(d->data.entry_array_offset),
1814                                              le64toh(d->data.n_entries),
1815                                              realtime,
1816                                              test_object_realtime,
1817                                              direction,
1818                                              ret, offset, NULL);
1819 }
1820
1821 void journal_file_dump(JournalFile *f) {
1822         Object *o;
1823         int r;
1824         uint64_t p;
1825
1826         assert(f);
1827
1828         journal_file_print_header(f);
1829
1830         p = le64toh(f->header->header_size);
1831         while (p != 0) {
1832                 r = journal_file_move_to_object(f, -1, p, &o);
1833                 if (r < 0)
1834                         goto fail;
1835
1836                 switch (o->object.type) {
1837
1838                 case OBJECT_UNUSED:
1839                         printf("Type: OBJECT_UNUSED\n");
1840                         break;
1841
1842                 case OBJECT_DATA:
1843                         printf("Type: OBJECT_DATA\n");
1844                         break;
1845
1846                 case OBJECT_ENTRY:
1847                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1848                                (unsigned long long) le64toh(o->entry.seqnum),
1849                                (unsigned long long) le64toh(o->entry.monotonic),
1850                                (unsigned long long) le64toh(o->entry.realtime));
1851                         break;
1852
1853                 case OBJECT_FIELD_HASH_TABLE:
1854                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1855                         break;
1856
1857                 case OBJECT_DATA_HASH_TABLE:
1858                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1859                         break;
1860
1861                 case OBJECT_ENTRY_ARRAY:
1862                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1863                         break;
1864
1865                 case OBJECT_SIGNATURE:
1866                         printf("Type: OBJECT_SIGNATURE\n");
1867                         break;
1868                 }
1869
1870                 if (o->object.flags & OBJECT_COMPRESSED)
1871                         printf("Flags: COMPRESSED\n");
1872
1873                 if (p == le64toh(f->header->tail_object_offset))
1874                         p = 0;
1875                 else
1876                         p = p + ALIGN64(le64toh(o->object.size));
1877         }
1878
1879         return;
1880 fail:
1881         log_error("File corrupt");
1882 }
1883
1884 void journal_file_print_header(JournalFile *f) {
1885         char a[33], b[33], c[33];
1886         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
1887
1888         assert(f);
1889
1890         printf("File Path: %s\n"
1891                "File ID: %s\n"
1892                "Machine ID: %s\n"
1893                "Boot ID: %s\n"
1894                "Sequential Number ID: %s\n"
1895                "Header size: %llu\n"
1896                "Arena size: %llu\n"
1897                "Data Hash Table Size: %llu\n"
1898                "Field Hash Table Size: %llu\n"
1899                "Objects: %llu\n"
1900                "Entry Objects: %llu\n"
1901                "Rotate Suggested: %s\n"
1902                "Head Sequential Number: %llu\n"
1903                "Tail Sequential Number: %llu\n"
1904                "Head Realtime Timestamp: %s\n"
1905                "Tail Realtime Timestamp: %s\n",
1906                f->path,
1907                sd_id128_to_string(f->header->file_id, a),
1908                sd_id128_to_string(f->header->machine_id, b),
1909                sd_id128_to_string(f->header->boot_id, c),
1910                sd_id128_to_string(f->header->seqnum_id, c),
1911                (unsigned long long) le64toh(f->header->header_size),
1912                (unsigned long long) le64toh(f->header->arena_size),
1913                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1914                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
1915                (unsigned long long) le64toh(f->header->n_objects),
1916                (unsigned long long) le64toh(f->header->n_entries),
1917                yes_no(journal_file_rotate_suggested(f)),
1918                (unsigned long long) le64toh(f->header->head_seqnum),
1919                (unsigned long long) le64toh(f->header->tail_seqnum),
1920                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
1921                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)));
1922
1923         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1924                 printf("Data Objects: %llu\n"
1925                        "Data Hash Table Fill: %.1f%%\n",
1926                        (unsigned long long) le64toh(f->header->n_data),
1927                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
1928
1929         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1930                 printf("Field Objects: %llu\n"
1931                        "Field Hash Table Fill: %.1f%%\n",
1932                        (unsigned long long) le64toh(f->header->n_fields),
1933                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
1934 }
1935
1936 int journal_file_open(
1937                 const char *fname,
1938                 int flags,
1939                 mode_t mode,
1940                 JournalFile *template,
1941                 JournalFile **ret) {
1942
1943         JournalFile *f;
1944         int r;
1945         bool newly_created = false;
1946
1947         assert(fname);
1948
1949         if ((flags & O_ACCMODE) != O_RDONLY &&
1950             (flags & O_ACCMODE) != O_RDWR)
1951                 return -EINVAL;
1952
1953         if (!endswith(fname, ".journal"))
1954                 return -EINVAL;
1955
1956         f = new0(JournalFile, 1);
1957         if (!f)
1958                 return -ENOMEM;
1959
1960         f->fd = -1;
1961         f->flags = flags;
1962         f->mode = mode;
1963         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1964         f->prot = prot_from_flags(flags);
1965
1966         if (template) {
1967                 f->metrics = template->metrics;
1968                 f->compress = template->compress;
1969         }
1970
1971         f->path = strdup(fname);
1972         if (!f->path) {
1973                 r = -ENOMEM;
1974                 goto fail;
1975         }
1976
1977         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1978         if (f->fd < 0) {
1979                 r = -errno;
1980                 goto fail;
1981         }
1982
1983         if (fstat(f->fd, &f->last_stat) < 0) {
1984                 r = -errno;
1985                 goto fail;
1986         }
1987
1988         if (f->last_stat.st_size == 0 && f->writable) {
1989                 newly_created = true;
1990
1991                 r = journal_file_init_header(f, template);
1992                 if (r < 0)
1993                         goto fail;
1994
1995                 if (fstat(f->fd, &f->last_stat) < 0) {
1996                         r = -errno;
1997                         goto fail;
1998                 }
1999         }
2000
2001         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2002                 r = -EIO;
2003                 goto fail;
2004         }
2005
2006         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2007         if (f->header == MAP_FAILED) {
2008                 f->header = NULL;
2009                 r = -errno;
2010                 goto fail;
2011         }
2012
2013         if (!newly_created) {
2014                 r = journal_file_verify_header(f);
2015                 if (r < 0)
2016                         goto fail;
2017         }
2018
2019         if (f->writable) {
2020                 r = journal_file_refresh_header(f);
2021                 if (r < 0)
2022                         goto fail;
2023         }
2024
2025         if (newly_created) {
2026
2027                 r = journal_file_setup_field_hash_table(f);
2028                 if (r < 0)
2029                         goto fail;
2030
2031                 r = journal_file_setup_data_hash_table(f);
2032                 if (r < 0)
2033                         goto fail;
2034         }
2035
2036         r = journal_file_map_field_hash_table(f);
2037         if (r < 0)
2038                 goto fail;
2039
2040         r = journal_file_map_data_hash_table(f);
2041         if (r < 0)
2042                 goto fail;
2043
2044         if (ret)
2045                 *ret = f;
2046
2047         return 0;
2048
2049 fail:
2050         journal_file_close(f);
2051
2052         return r;
2053 }
2054
2055 int journal_file_rotate(JournalFile **f) {
2056         char *p;
2057         size_t l;
2058         JournalFile *old_file, *new_file = NULL;
2059         int r;
2060
2061         assert(f);
2062         assert(*f);
2063
2064         old_file = *f;
2065
2066         if (!old_file->writable)
2067                 return -EINVAL;
2068
2069         if (!endswith(old_file->path, ".journal"))
2070                 return -EINVAL;
2071
2072         l = strlen(old_file->path);
2073
2074         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2075         if (!p)
2076                 return -ENOMEM;
2077
2078         memcpy(p, old_file->path, l - 8);
2079         p[l-8] = '@';
2080         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2081         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2082                  "-%016llx-%016llx.journal",
2083                  (unsigned long long) le64toh((*f)->header->tail_seqnum),
2084                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2085
2086         r = rename(old_file->path, p);
2087         free(p);
2088
2089         if (r < 0)
2090                 return -errno;
2091
2092         old_file->header->state = STATE_ARCHIVED;
2093
2094         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
2095         journal_file_close(old_file);
2096
2097         *f = new_file;
2098         return r;
2099 }
2100
2101 int journal_file_open_reliably(
2102                 const char *fname,
2103                 int flags,
2104                 mode_t mode,
2105                 JournalFile *template,
2106                 JournalFile **ret) {
2107
2108         int r;
2109         size_t l;
2110         char *p;
2111
2112         r = journal_file_open(fname, flags, mode, template, ret);
2113         if (r != -EBADMSG && /* corrupted */
2114             r != -ENODATA && /* truncated */
2115             r != -EHOSTDOWN && /* other machine */
2116             r != -EPROTONOSUPPORT) /* incompatible feature */
2117                 return r;
2118
2119         if ((flags & O_ACCMODE) == O_RDONLY)
2120                 return r;
2121
2122         if (!(flags & O_CREAT))
2123                 return r;
2124
2125         /* The file is corrupted. Rotate it away and try it again (but only once) */
2126
2127         l = strlen(fname);
2128         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2129                      (int) (l-8), fname,
2130                      (unsigned long long) now(CLOCK_REALTIME),
2131                      random_ull()) < 0)
2132                 return -ENOMEM;
2133
2134         r = rename(fname, p);
2135         free(p);
2136         if (r < 0)
2137                 return -errno;
2138
2139         log_warning("File %s corrupted, renaming and replacing.", fname);
2140
2141         return journal_file_open(fname, flags, mode, template, ret);
2142 }
2143
2144 struct vacuum_info {
2145         off_t usage;
2146         char *filename;
2147
2148         uint64_t realtime;
2149         sd_id128_t seqnum_id;
2150         uint64_t seqnum;
2151
2152         bool have_seqnum;
2153 };
2154
2155 static int vacuum_compare(const void *_a, const void *_b) {
2156         const struct vacuum_info *a, *b;
2157
2158         a = _a;
2159         b = _b;
2160
2161         if (a->have_seqnum && b->have_seqnum &&
2162             sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
2163                 if (a->seqnum < b->seqnum)
2164                         return -1;
2165                 else if (a->seqnum > b->seqnum)
2166                         return 1;
2167                 else
2168                         return 0;
2169         }
2170
2171         if (a->realtime < b->realtime)
2172                 return -1;
2173         else if (a->realtime > b->realtime)
2174                 return 1;
2175         else if (a->have_seqnum && b->have_seqnum)
2176                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
2177         else
2178                 return strcmp(a->filename, b->filename);
2179 }
2180
2181 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
2182         DIR *d;
2183         int r = 0;
2184         struct vacuum_info *list = NULL;
2185         unsigned n_list = 0, n_allocated = 0, i;
2186         uint64_t sum = 0;
2187
2188         assert(directory);
2189
2190         if (max_use <= 0)
2191                 return 0;
2192
2193         d = opendir(directory);
2194         if (!d)
2195                 return -errno;
2196
2197         for (;;) {
2198                 int k;
2199                 struct dirent buf, *de;
2200                 size_t q;
2201                 struct stat st;
2202                 char *p;
2203                 unsigned long long seqnum = 0, realtime;
2204                 sd_id128_t seqnum_id;
2205                 bool have_seqnum;
2206
2207                 k = readdir_r(d, &buf, &de);
2208                 if (k != 0) {
2209                         r = -k;
2210                         goto finish;
2211                 }
2212
2213                 if (!de)
2214                         break;
2215
2216                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2217                         continue;
2218
2219                 if (!S_ISREG(st.st_mode))
2220                         continue;
2221
2222                 q = strlen(de->d_name);
2223
2224                 if (endswith(de->d_name, ".journal")) {
2225
2226                         /* Vacuum archived files */
2227
2228                         if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2229                                 continue;
2230
2231                         if (de->d_name[q-8-16-1] != '-' ||
2232                             de->d_name[q-8-16-1-16-1] != '-' ||
2233                             de->d_name[q-8-16-1-16-1-32-1] != '@')
2234                                 continue;
2235
2236                         p = strdup(de->d_name);
2237                         if (!p) {
2238                                 r = -ENOMEM;
2239                                 goto finish;
2240                         }
2241
2242                         de->d_name[q-8-16-1-16-1] = 0;
2243                         if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2244                                 free(p);
2245                                 continue;
2246                         }
2247
2248                         if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2249                                 free(p);
2250                                 continue;
2251                         }
2252
2253                         have_seqnum = true;
2254
2255                 } else if (endswith(de->d_name, ".journal~")) {
2256                         unsigned long long tmp;
2257
2258                         /* Vacuum corrupted files */
2259
2260                         if (q < 1 + 16 + 1 + 16 + 8 + 1)
2261                                 continue;
2262
2263                         if (de->d_name[q-1-8-16-1] != '-' ||
2264                             de->d_name[q-1-8-16-1-16-1] != '@')
2265                                 continue;
2266
2267                         p = strdup(de->d_name);
2268                         if (!p) {
2269                                 r = -ENOMEM;
2270                                 goto finish;
2271                         }
2272
2273                         if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2274                                 free(p);
2275                                 continue;
2276                         }
2277
2278                         have_seqnum = false;
2279                 } else
2280                         continue;
2281
2282                 if (n_list >= n_allocated) {
2283                         struct vacuum_info *j;
2284
2285                         n_allocated = MAX(n_allocated * 2U, 8U);
2286                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2287                         if (!j) {
2288                                 free(p);
2289                                 r = -ENOMEM;
2290                                 goto finish;
2291                         }
2292
2293                         list = j;
2294                 }
2295
2296                 list[n_list].filename = p;
2297                 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2298                 list[n_list].seqnum = seqnum;
2299                 list[n_list].realtime = realtime;
2300                 list[n_list].seqnum_id = seqnum_id;
2301                 list[n_list].have_seqnum = have_seqnum;
2302
2303                 sum += list[n_list].usage;
2304
2305                 n_list ++;
2306         }
2307
2308         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2309
2310         for(i = 0; i < n_list; i++) {
2311                 struct statvfs ss;
2312
2313                 if (fstatvfs(dirfd(d), &ss) < 0) {
2314                         r = -errno;
2315                         goto finish;
2316                 }
2317
2318                 if (sum <= max_use &&
2319                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2320                         break;
2321
2322                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2323                         log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2324                         sum -= list[i].usage;
2325                 } else if (errno != ENOENT)
2326                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2327         }
2328
2329 finish:
2330         for (i = 0; i < n_list; i++)
2331                 free(list[i].filename);
2332
2333         free(list);
2334
2335         if (d)
2336                 closedir(d);
2337
2338         return r;
2339 }
2340
2341 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2342         uint64_t i, n;
2343         uint64_t q, xor_hash = 0;
2344         int r;
2345         EntryItem *items;
2346         dual_timestamp ts;
2347
2348         assert(from);
2349         assert(to);
2350         assert(o);
2351         assert(p);
2352
2353         if (!to->writable)
2354                 return -EPERM;
2355
2356         ts.monotonic = le64toh(o->entry.monotonic);
2357         ts.realtime = le64toh(o->entry.realtime);
2358
2359         if (to->tail_entry_monotonic_valid &&
2360             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2361                 return -EINVAL;
2362
2363         n = journal_file_entry_n_items(o);
2364         items = alloca(sizeof(EntryItem) * n);
2365
2366         for (i = 0; i < n; i++) {
2367                 uint64_t l, h;
2368                 le64_t le_hash;
2369                 size_t t;
2370                 void *data;
2371                 Object *u;
2372
2373                 q = le64toh(o->entry.items[i].object_offset);
2374                 le_hash = o->entry.items[i].hash;
2375
2376                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2377                 if (r < 0)
2378                         return r;
2379
2380                 if (le_hash != o->data.hash)
2381                         return -EBADMSG;
2382
2383                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2384                 t = (size_t) l;
2385
2386                 /* We hit the limit on 32bit machines */
2387                 if ((uint64_t) t != l)
2388                         return -E2BIG;
2389
2390                 if (o->object.flags & OBJECT_COMPRESSED) {
2391 #ifdef HAVE_XZ
2392                         uint64_t rsize;
2393
2394                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2395                                 return -EBADMSG;
2396
2397                         data = from->compress_buffer;
2398                         l = rsize;
2399 #else
2400                         return -EPROTONOSUPPORT;
2401 #endif
2402                 } else
2403                         data = o->data.payload;
2404
2405                 r = journal_file_append_data(to, data, l, &u, &h);
2406                 if (r < 0)
2407                         return r;
2408
2409                 xor_hash ^= le64toh(u->data.hash);
2410                 items[i].object_offset = htole64(h);
2411                 items[i].hash = u->data.hash;
2412
2413                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2414                 if (r < 0)
2415                         return r;
2416         }
2417
2418         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2419 }
2420
2421 void journal_default_metrics(JournalMetrics *m, int fd) {
2422         uint64_t fs_size = 0;
2423         struct statvfs ss;
2424         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2425
2426         assert(m);
2427         assert(fd >= 0);
2428
2429         if (fstatvfs(fd, &ss) >= 0)
2430                 fs_size = ss.f_frsize * ss.f_blocks;
2431
2432         if (m->max_use == (uint64_t) -1) {
2433
2434                 if (fs_size > 0) {
2435                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2436
2437                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2438                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2439
2440                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2441                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2442                 } else
2443                         m->max_use = DEFAULT_MAX_USE_LOWER;
2444         } else {
2445                 m->max_use = PAGE_ALIGN(m->max_use);
2446
2447                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2448                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2449         }
2450
2451         if (m->max_size == (uint64_t) -1) {
2452                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2453
2454                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2455                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2456         } else
2457                 m->max_size = PAGE_ALIGN(m->max_size);
2458
2459         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2460                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2461
2462         if (m->max_size*2 > m->max_use)
2463                 m->max_use = m->max_size*2;
2464
2465         if (m->min_size == (uint64_t) -1)
2466                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2467         else {
2468                 m->min_size = PAGE_ALIGN(m->min_size);
2469
2470                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2471                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2472
2473                 if (m->min_size > m->max_size)
2474                         m->max_size = m->min_size;
2475         }
2476
2477         if (m->keep_free == (uint64_t) -1) {
2478
2479                 if (fs_size > 0) {
2480                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2481
2482                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2483                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2484
2485                 } else
2486                         m->keep_free = DEFAULT_KEEP_FREE;
2487         }
2488
2489         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2490                  format_bytes(a, sizeof(a), m->max_use),
2491                  format_bytes(b, sizeof(b), m->max_size),
2492                  format_bytes(c, sizeof(c), m->min_size),
2493                  format_bytes(d, sizeof(d), m->keep_free));
2494 }
2495
2496 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2497         assert(f);
2498         assert(from || to);
2499
2500         if (from) {
2501                 if (f->header->head_entry_realtime == 0)
2502                         return -ENOENT;
2503
2504                 *from = le64toh(f->header->head_entry_realtime);
2505         }
2506
2507         if (to) {
2508                 if (f->header->tail_entry_realtime == 0)
2509                         return -ENOENT;
2510
2511                 *to = le64toh(f->header->tail_entry_realtime);
2512         }
2513
2514         return 1;
2515 }
2516
2517 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2518         char t[9+32+1] = "_BOOT_ID=";
2519         Object *o;
2520         uint64_t p;
2521         int r;
2522
2523         assert(f);
2524         assert(from || to);
2525
2526         sd_id128_to_string(boot_id, t + 9);
2527
2528         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2529         if (r <= 0)
2530                 return r;
2531
2532         if (le64toh(o->data.n_entries) <= 0)
2533                 return 0;
2534
2535         if (from) {
2536                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2537                 if (r < 0)
2538                         return r;
2539
2540                 *from = le64toh(o->entry.monotonic);
2541         }
2542
2543         if (to) {
2544                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2545                 if (r < 0)
2546                         return r;
2547
2548                 r = generic_array_get_plus_one(f,
2549                                                le64toh(o->data.entry_offset),
2550                                                le64toh(o->data.entry_array_offset),
2551                                                le64toh(o->data.n_entries)-1,
2552                                                &o, NULL);
2553                 if (r <= 0)
2554                         return r;
2555
2556                 *to = le64toh(o->entry.monotonic);
2557         }
2558
2559         return 1;
2560 }
2561
2562 bool journal_file_rotate_suggested(JournalFile *f) {
2563         assert(f);
2564
2565         /* If we gained new header fields we gained new features,
2566          * hence suggest a rotation */
2567         if (le64toh(f->header->header_size) < sizeof(Header))
2568                 return true;
2569
2570         /* Let's check if the hash tables grew over a certain fill
2571          * level (75%, borrowing this value from Java's hash table
2572          * implementation), and if so suggest a rotation. To calculate
2573          * the fill level we need the n_data field, which only exists
2574          * in newer versions. */
2575
2576         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2577                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL)
2578                         return true;
2579
2580         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2581                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL)
2582                         return true;
2583
2584         return false;
2585 }