chiark / gitweb /
journal: immediately rotate when the journal was previously not closed properly
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
/* Rounds x up to the next multiple of 8 bytes (64 bit) */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)

/* Sizes, in bytes, used for the data and field hash tables of newly
 * set up journal files */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
#define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)

/* Size of a mapping window when the caller asks for less */
#define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)             /* 8 MiB */

/* Payloads shorter than this are never compressed */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */

/* Lower and upper bounds applied when max_use is derived from the
 * size of the file system */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* Upper bound applied when max_size is derived from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* Upper bound applied when keep_free is derived from the size of the
 * file system */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* keep_free value used when the file system size cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* n_data was the first field added after the initial file format
 * design, hence everything in front of it must always be present */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* True if the header size recorded in the file is large enough to
 * include the given header field */
#define JOURNAL_HEADER_CONTAINS(h, field) \
        (le64toh((h)->header_size) >= offsetof(Header, field) + sizeof((h)->field))

/* Magic bytes expected at the very start of every journal file */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
70
71 void journal_file_close(JournalFile *f) {
72         int t;
73
74         assert(f);
75
76         if (f->header) {
77                 if (f->writable)
78                         f->header->state = STATE_OFFLINE;
79
80                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
81         }
82
83         for (t = 0; t < _WINDOW_MAX; t++)
84                 if (f->windows[t].ptr)
85                         munmap(f->windows[t].ptr, f->windows[t].size);
86
87         if (f->fd >= 0)
88                 close_nointr_nofail(f->fd);
89
90         free(f->path);
91
92 #ifdef HAVE_XZ
93         free(f->compress_buffer);
94 #endif
95
96         free(f);
97 }
98
99 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
100         Header h;
101         ssize_t k;
102         int r;
103
104         assert(f);
105
106         zero(h);
107         memcpy(h.signature, signature, 8);
108         h.header_size = htole64(ALIGN64(sizeof(h)));
109
110         r = sd_id128_randomize(&h.file_id);
111         if (r < 0)
112                 return r;
113
114         if (template) {
115                 h.seqnum_id = template->header->seqnum_id;
116                 h.tail_seqnum = template->header->tail_seqnum;
117         } else
118                 h.seqnum_id = h.file_id;
119
120         k = pwrite(f->fd, &h, sizeof(h), 0);
121         if (k < 0)
122                 return -errno;
123
124         if (k != sizeof(h))
125                 return -EIO;
126
127         return 0;
128 }
129
130 static int journal_file_refresh_header(JournalFile *f) {
131         int r;
132         sd_id128_t boot_id;
133
134         assert(f);
135
136         r = sd_id128_get_machine(&f->header->machine_id);
137         if (r < 0)
138                 return r;
139
140         r = sd_id128_get_boot(&boot_id);
141         if (r < 0)
142                 return r;
143
144         if (sd_id128_equal(boot_id, f->header->boot_id))
145                 f->tail_entry_monotonic_valid = true;
146
147         f->header->boot_id = boot_id;
148
149         f->header->state = STATE_ONLINE;
150
151         __sync_synchronize();
152
153         return 0;
154 }
155
156 static int journal_file_verify_header(JournalFile *f) {
157         assert(f);
158
159         if (memcmp(f->header, signature, 8))
160                 return -EBADMSG;
161
162 #ifdef HAVE_XZ
163         if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
164                 return -EPROTONOSUPPORT;
165 #else
166         if (f->header->incompatible_flags != 0)
167                 return -EPROTONOSUPPORT;
168 #endif
169
170         /* The first addition was n_data, so check that we are at least this large */
171         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
172                 return -EBADMSG;
173
174         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
175                 return -ENODATA;
176
177         if (f->writable) {
178                 uint8_t state;
179                 sd_id128_t machine_id;
180                 int r;
181
182                 r = sd_id128_get_machine(&machine_id);
183                 if (r < 0)
184                         return r;
185
186                 if (!sd_id128_equal(machine_id, f->header->machine_id))
187                         return -EHOSTDOWN;
188
189                 state = f->header->state;
190
191                 if (state == STATE_ONLINE) {
192                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
193                         return -EBUSY;
194                 } else if (state == STATE_ARCHIVED)
195                         return -ESHUTDOWN;
196                 else if (state != STATE_OFFLINE) {
197                         log_debug("Journal file %s has unknown state %u.", f->path, state);
198                         return -EBUSY;
199                 }
200         }
201
202         return 0;
203 }
204
205 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
206         uint64_t old_size, new_size;
207         int r;
208
209         assert(f);
210
211         /* We assume that this file is not sparse, and we know that
212          * for sure, since we always call posix_fallocate()
213          * ourselves */
214
215         old_size =
216                 le64toh(f->header->header_size) +
217                 le64toh(f->header->arena_size);
218
219         new_size = PAGE_ALIGN(offset + size);
220         if (new_size < le64toh(f->header->header_size))
221                 new_size = le64toh(f->header->header_size);
222
223         if (new_size <= old_size)
224                 return 0;
225
226         if (f->metrics.max_size > 0 &&
227             new_size > f->metrics.max_size)
228                 return -E2BIG;
229
230         if (new_size > f->metrics.min_size &&
231             f->metrics.keep_free > 0) {
232                 struct statvfs svfs;
233
234                 if (fstatvfs(f->fd, &svfs) >= 0) {
235                         uint64_t available;
236
237                         available = svfs.f_bfree * svfs.f_bsize;
238
239                         if (available >= f->metrics.keep_free)
240                                 available -= f->metrics.keep_free;
241                         else
242                                 available = 0;
243
244                         if (new_size - old_size > available)
245                                 return -E2BIG;
246                 }
247         }
248
249         /* Note that the glibc fallocate() fallback is very
250            inefficient, hence we try to minimize the allocation area
251            as we can. */
252         r = posix_fallocate(f->fd, old_size, new_size - old_size);
253         if (r != 0)
254                 return -r;
255
256         if (fstat(f->fd, &f->last_stat) < 0)
257                 return -errno;
258
259         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
260
261         return 0;
262 }
263
264 static int journal_file_map(
265                 JournalFile *f,
266                 uint64_t offset,
267                 uint64_t size,
268                 void **_window,
269                 uint64_t *_woffset,
270                 uint64_t *_wsize,
271                 void **ret) {
272
273         uint64_t woffset, wsize;
274         void *window;
275
276         assert(f);
277         assert(size > 0);
278         assert(ret);
279
280         woffset = offset & ~((uint64_t) page_size() - 1ULL);
281         wsize = size + (offset - woffset);
282         wsize = PAGE_ALIGN(wsize);
283
284         /* Avoid SIGBUS on invalid accesses */
285         if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
286                 return -EADDRNOTAVAIL;
287
288         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
289         if (window == MAP_FAILED)
290                 return -errno;
291
292         if (_window)
293                 *_window = window;
294
295         if (_woffset)
296                 *_woffset = woffset;
297
298         if (_wsize)
299                 *_wsize = wsize;
300
301         *ret = (uint8_t*) window + (offset - woffset);
302
303         return 0;
304 }
305
306 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
307         void *p = NULL;
308         uint64_t delta;
309         int r;
310         Window *w;
311
312         assert(f);
313         assert(ret);
314         assert(wt >= 0);
315         assert(wt < _WINDOW_MAX);
316
317         if (offset + size > (uint64_t) f->last_stat.st_size) {
318                 /* Hmm, out of range? Let's refresh the fstat() data
319                  * first, before we trust that check. */
320
321                 if (fstat(f->fd, &f->last_stat) < 0 ||
322                     offset + size > (uint64_t) f->last_stat.st_size)
323                         return -EADDRNOTAVAIL;
324         }
325
326         w = f->windows + wt;
327
328         if (_likely_(w->ptr &&
329                      w->offset <= offset &&
330                      w->offset + w->size >= offset + size)) {
331
332                 *ret = (uint8_t*) w->ptr + (offset - w->offset);
333                 return 0;
334         }
335
336         if (w->ptr) {
337                 if (munmap(w->ptr, w->size) < 0)
338                         return -errno;
339
340                 w->ptr = NULL;
341                 w->size = w->offset = 0;
342         }
343
344         if (size < DEFAULT_WINDOW_SIZE) {
345                 /* If the default window size is larger then what was
346                  * asked for extend the mapping a bit in the hope to
347                  * minimize needed remappings later on. We add half
348                  * the window space before and half behind the
349                  * requested mapping */
350
351                 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
352
353                 if (delta > offset)
354                         delta = offset;
355
356                 offset -= delta;
357                 size = DEFAULT_WINDOW_SIZE;
358         } else
359                 delta = 0;
360
361         if (offset + size > (uint64_t) f->last_stat.st_size)
362                 size = (uint64_t) f->last_stat.st_size - offset;
363
364         if (size <= 0)
365                 return -EADDRNOTAVAIL;
366
367         r = journal_file_map(f,
368                              offset, size,
369                              &w->ptr, &w->offset, &w->size,
370                              &p);
371
372         if (r < 0)
373                 return r;
374
375         *ret = (uint8_t*) p + delta;
376         return 0;
377 }
378
379 static bool verify_hash(Object *o) {
380         uint64_t h1, h2;
381
382         assert(o);
383
384         if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
385                 h1 = le64toh(o->data.hash);
386                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
387         } else if (o->object.type == OBJECT_FIELD) {
388                 h1 = le64toh(o->field.hash);
389                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
390         } else
391                 return true;
392
393         return h1 == h2;
394 }
395
396 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
397         int r;
398         void *t;
399         Object *o;
400         uint64_t s;
401
402         assert(f);
403         assert(ret);
404         assert(type < _OBJECT_TYPE_MAX);
405
406         r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
407         if (r < 0)
408                 return r;
409
410         o = (Object*) t;
411         s = le64toh(o->object.size);
412
413         if (s < sizeof(ObjectHeader))
414                 return -EBADMSG;
415
416         if (type >= 0 && o->object.type != type)
417                 return -EBADMSG;
418
419         if (s > sizeof(ObjectHeader)) {
420                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
421                 if (r < 0)
422                         return r;
423
424                 o = (Object*) t;
425         }
426
427         if (!verify_hash(o))
428                 return -EBADMSG;
429
430         *ret = o;
431         return 0;
432 }
433
434 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
435         uint64_t r;
436
437         assert(f);
438
439         r = le64toh(f->header->tail_seqnum) + 1;
440
441         if (seqnum) {
442                 /* If an external seqnum counter was passed, we update
443                  * both the local and the external one, and set it to
444                  * the maximum of both */
445
446                 if (*seqnum + 1 > r)
447                         r = *seqnum + 1;
448
449                 *seqnum = r;
450         }
451
452         f->header->tail_seqnum = htole64(r);
453
454         if (f->header->head_seqnum == 0)
455                 f->header->head_seqnum = htole64(r);
456
457         return r;
458 }
459
460 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
461         int r;
462         uint64_t p;
463         Object *tail, *o;
464         void *t;
465
466         assert(f);
467         assert(size >= sizeof(ObjectHeader));
468         assert(offset);
469         assert(ret);
470
471         p = le64toh(f->header->tail_object_offset);
472         if (p == 0)
473                 p = le64toh(f->header->header_size);
474         else {
475                 r = journal_file_move_to_object(f, -1, p, &tail);
476                 if (r < 0)
477                         return r;
478
479                 p += ALIGN64(le64toh(tail->object.size));
480         }
481
482         r = journal_file_allocate(f, p, size);
483         if (r < 0)
484                 return r;
485
486         r = journal_file_move_to(f, type, p, size, &t);
487         if (r < 0)
488                 return r;
489
490         o = (Object*) t;
491
492         zero(o->object);
493         o->object.type = type;
494         o->object.size = htole64(size);
495
496         f->header->tail_object_offset = htole64(p);
497         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
498
499         *ret = o;
500         *offset = p;
501
502         return 0;
503 }
504
505 static int journal_file_setup_data_hash_table(JournalFile *f) {
506         uint64_t s, p;
507         Object *o;
508         int r;
509
510         assert(f);
511
512         s = DEFAULT_DATA_HASH_TABLE_SIZE;
513         r = journal_file_append_object(f,
514                                        OBJECT_DATA_HASH_TABLE,
515                                        offsetof(Object, hash_table.items) + s,
516                                        &o, &p);
517         if (r < 0)
518                 return r;
519
520         memset(o->hash_table.items, 0, s);
521
522         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
523         f->header->data_hash_table_size = htole64(s);
524
525         return 0;
526 }
527
528 static int journal_file_setup_field_hash_table(JournalFile *f) {
529         uint64_t s, p;
530         Object *o;
531         int r;
532
533         assert(f);
534
535         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
536         r = journal_file_append_object(f,
537                                        OBJECT_FIELD_HASH_TABLE,
538                                        offsetof(Object, hash_table.items) + s,
539                                        &o, &p);
540         if (r < 0)
541                 return r;
542
543         memset(o->hash_table.items, 0, s);
544
545         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
546         f->header->field_hash_table_size = htole64(s);
547
548         return 0;
549 }
550
551 static int journal_file_map_data_hash_table(JournalFile *f) {
552         uint64_t s, p;
553         void *t;
554         int r;
555
556         assert(f);
557
558         p = le64toh(f->header->data_hash_table_offset);
559         s = le64toh(f->header->data_hash_table_size);
560
561         r = journal_file_move_to(f,
562                                  WINDOW_DATA_HASH_TABLE,
563                                  p, s,
564                                  &t);
565         if (r < 0)
566                 return r;
567
568         f->data_hash_table = t;
569         return 0;
570 }
571
572 static int journal_file_map_field_hash_table(JournalFile *f) {
573         uint64_t s, p;
574         void *t;
575         int r;
576
577         assert(f);
578
579         p = le64toh(f->header->field_hash_table_offset);
580         s = le64toh(f->header->field_hash_table_size);
581
582         r = journal_file_move_to(f,
583                                  WINDOW_FIELD_HASH_TABLE,
584                                  p, s,
585                                  &t);
586         if (r < 0)
587                 return r;
588
589         f->field_hash_table = t;
590         return 0;
591 }
592
593 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
594         uint64_t p, h;
595         int r;
596
597         assert(f);
598         assert(o);
599         assert(offset > 0);
600         assert(o->object.type == OBJECT_DATA);
601
602         /* This might alter the window we are looking at */
603
604         o->data.next_hash_offset = o->data.next_field_offset = 0;
605         o->data.entry_offset = o->data.entry_array_offset = 0;
606         o->data.n_entries = 0;
607
608         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
609         p = le64toh(f->data_hash_table[h].tail_hash_offset);
610         if (p == 0) {
611                 /* Only entry in the hash table is easy */
612                 f->data_hash_table[h].head_hash_offset = htole64(offset);
613         } else {
614                 /* Move back to the previous data object, to patch in
615                  * pointer */
616
617                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
618                 if (r < 0)
619                         return r;
620
621                 o->data.next_hash_offset = htole64(offset);
622         }
623
624         f->data_hash_table[h].tail_hash_offset = htole64(offset);
625
626         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
627                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
628
629         return 0;
630 }
631
632 int journal_file_find_data_object_with_hash(
633                 JournalFile *f,
634                 const void *data, uint64_t size, uint64_t hash,
635                 Object **ret, uint64_t *offset) {
636
637         uint64_t p, osize, h;
638         int r;
639
640         assert(f);
641         assert(data || size == 0);
642
643         osize = offsetof(Object, data.payload) + size;
644
645         if (f->header->data_hash_table_size == 0)
646                 return -EBADMSG;
647
648         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
649         p = le64toh(f->data_hash_table[h].head_hash_offset);
650
651         while (p > 0) {
652                 Object *o;
653
654                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
655                 if (r < 0)
656                         return r;
657
658                 if (le64toh(o->data.hash) != hash)
659                         goto next;
660
661                 if (o->object.flags & OBJECT_COMPRESSED) {
662 #ifdef HAVE_XZ
663                         uint64_t l, rsize;
664
665                         l = le64toh(o->object.size);
666                         if (l <= offsetof(Object, data.payload))
667                                 return -EBADMSG;
668
669                         l -= offsetof(Object, data.payload);
670
671                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
672                                 return -EBADMSG;
673
674                         if (rsize == size &&
675                             memcmp(f->compress_buffer, data, size) == 0) {
676
677                                 if (ret)
678                                         *ret = o;
679
680                                 if (offset)
681                                         *offset = p;
682
683                                 return 1;
684                         }
685 #else
686                         return -EPROTONOSUPPORT;
687 #endif
688
689                 } else if (le64toh(o->object.size) == osize &&
690                            memcmp(o->data.payload, data, size) == 0) {
691
692                         if (ret)
693                                 *ret = o;
694
695                         if (offset)
696                                 *offset = p;
697
698                         return 1;
699                 }
700
701         next:
702                 p = le64toh(o->data.next_hash_offset);
703         }
704
705         return 0;
706 }
707
708 int journal_file_find_data_object(
709                 JournalFile *f,
710                 const void *data, uint64_t size,
711                 Object **ret, uint64_t *offset) {
712
713         uint64_t hash;
714
715         assert(f);
716         assert(data || size == 0);
717
718         hash = hash64(data, size);
719
720         return journal_file_find_data_object_with_hash(f,
721                                                        data, size, hash,
722                                                        ret, offset);
723 }
724
725 static int journal_file_append_data(
726                 JournalFile *f,
727                 const void *data, uint64_t size,
728                 Object **ret, uint64_t *offset) {
729
730         uint64_t hash, p;
731         uint64_t osize;
732         Object *o;
733         int r;
734         bool compressed = false;
735
736         assert(f);
737         assert(data || size == 0);
738
739         hash = hash64(data, size);
740
741         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
742         if (r < 0)
743                 return r;
744         else if (r > 0) {
745
746                 if (ret)
747                         *ret = o;
748
749                 if (offset)
750                         *offset = p;
751
752                 return 0;
753         }
754
755         osize = offsetof(Object, data.payload) + size;
756         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
757         if (r < 0)
758                 return r;
759
760         o->data.hash = htole64(hash);
761
762 #ifdef HAVE_XZ
763         if (f->compress &&
764             size >= COMPRESSION_SIZE_THRESHOLD) {
765                 uint64_t rsize;
766
767                 compressed = compress_blob(data, size, o->data.payload, &rsize);
768
769                 if (compressed) {
770                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
771                         o->object.flags |= OBJECT_COMPRESSED;
772
773                         f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
774
775                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
776                 }
777         }
778 #endif
779
780         if (!compressed)
781                 memcpy(o->data.payload, data, size);
782
783         r = journal_file_link_data(f, o, p, hash);
784         if (r < 0)
785                 return r;
786
787         /* The linking might have altered the window, so let's
788          * refresh our pointer */
789         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
790         if (r < 0)
791                 return r;
792
793         if (ret)
794                 *ret = o;
795
796         if (offset)
797                 *offset = p;
798
799         return 0;
800 }
801
802 uint64_t journal_file_entry_n_items(Object *o) {
803         assert(o);
804         assert(o->object.type == OBJECT_ENTRY);
805
806         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
807 }
808
809 static uint64_t journal_file_entry_array_n_items(Object *o) {
810         assert(o);
811         assert(o->object.type == OBJECT_ENTRY_ARRAY);
812
813         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
814 }
815
816 static int link_entry_into_array(JournalFile *f,
817                                  le64_t *first,
818                                  le64_t *idx,
819                                  uint64_t p) {
820         int r;
821         uint64_t n = 0, ap = 0, q, i, a, hidx;
822         Object *o;
823
824         assert(f);
825         assert(first);
826         assert(idx);
827         assert(p > 0);
828
829         a = le64toh(*first);
830         i = hidx = le64toh(*idx);
831         while (a > 0) {
832
833                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
834                 if (r < 0)
835                         return r;
836
837                 n = journal_file_entry_array_n_items(o);
838                 if (i < n) {
839                         o->entry_array.items[i] = htole64(p);
840                         *idx = htole64(hidx + 1);
841                         return 0;
842                 }
843
844                 i -= n;
845                 ap = a;
846                 a = le64toh(o->entry_array.next_entry_array_offset);
847         }
848
849         if (hidx > n)
850                 n = (hidx+1) * 2;
851         else
852                 n = n * 2;
853
854         if (n < 4)
855                 n = 4;
856
857         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
858                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
859                                        &o, &q);
860         if (r < 0)
861                 return r;
862
863         o->entry_array.items[i] = htole64(p);
864
865         if (ap == 0)
866                 *first = htole64(q);
867         else {
868                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
869                 if (r < 0)
870                         return r;
871
872                 o->entry_array.next_entry_array_offset = htole64(q);
873         }
874
875         *idx = htole64(hidx + 1);
876
877         return 0;
878 }
879
880 static int link_entry_into_array_plus_one(JournalFile *f,
881                                           le64_t *extra,
882                                           le64_t *first,
883                                           le64_t *idx,
884                                           uint64_t p) {
885
886         int r;
887
888         assert(f);
889         assert(extra);
890         assert(first);
891         assert(idx);
892         assert(p > 0);
893
894         if (*idx == 0)
895                 *extra = htole64(p);
896         else {
897                 le64_t i;
898
899                 i = htole64(le64toh(*idx) - 1);
900                 r = link_entry_into_array(f, first, &i, p);
901                 if (r < 0)
902                         return r;
903         }
904
905         *idx = htole64(le64toh(*idx) + 1);
906         return 0;
907 }
908
909 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
910         uint64_t p;
911         int r;
912         assert(f);
913         assert(o);
914         assert(offset > 0);
915
916         p = le64toh(o->entry.items[i].object_offset);
917         if (p == 0)
918                 return -EINVAL;
919
920         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
921         if (r < 0)
922                 return r;
923
924         return link_entry_into_array_plus_one(f,
925                                               &o->data.entry_offset,
926                                               &o->data.entry_array_offset,
927                                               &o->data.n_entries,
928                                               offset);
929 }
930
931 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
932         uint64_t n, i;
933         int r;
934
935         assert(f);
936         assert(o);
937         assert(offset > 0);
938         assert(o->object.type == OBJECT_ENTRY);
939
940         __sync_synchronize();
941
942         /* Link up the entry itself */
943         r = link_entry_into_array(f,
944                                   &f->header->entry_array_offset,
945                                   &f->header->n_entries,
946                                   offset);
947         if (r < 0)
948                 return r;
949
950         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
951
952         if (f->header->head_entry_realtime == 0)
953                 f->header->head_entry_realtime = o->entry.realtime;
954
955         f->header->tail_entry_realtime = o->entry.realtime;
956         f->header->tail_entry_monotonic = o->entry.monotonic;
957
958         f->tail_entry_monotonic_valid = true;
959
960         /* Link up the items */
961         n = journal_file_entry_n_items(o);
962         for (i = 0; i < n; i++) {
963                 r = journal_file_link_entry_item(f, o, offset, i);
964                 if (r < 0)
965                         return r;
966         }
967
968         return 0;
969 }
970
971 static int journal_file_append_entry_internal(
972                 JournalFile *f,
973                 const dual_timestamp *ts,
974                 uint64_t xor_hash,
975                 const EntryItem items[], unsigned n_items,
976                 uint64_t *seqnum,
977                 Object **ret, uint64_t *offset) {
978         uint64_t np;
979         uint64_t osize;
980         Object *o;
981         int r;
982
983         assert(f);
984         assert(items || n_items == 0);
985         assert(ts);
986
987         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
988
989         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
990         if (r < 0)
991                 return r;
992
993         o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
994         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
995         o->entry.realtime = htole64(ts->realtime);
996         o->entry.monotonic = htole64(ts->monotonic);
997         o->entry.xor_hash = htole64(xor_hash);
998         o->entry.boot_id = f->header->boot_id;
999
1000         r = journal_file_link_entry(f, o, np);
1001         if (r < 0)
1002                 return r;
1003
1004         if (ret)
1005                 *ret = o;
1006
1007         if (offset)
1008                 *offset = np;
1009
1010         return 0;
1011 }
1012
1013 void journal_file_post_change(JournalFile *f) {
1014         assert(f);
1015
1016         /* inotify() does not receive IN_MODIFY events from file
1017          * accesses done via mmap(). After each access we hence
1018          * trigger IN_MODIFY by truncating the journal file to its
1019          * current size which triggers IN_MODIFY. */
1020
1021         __sync_synchronize();
1022
1023         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1024                 log_error("Failed to to truncate file to its own size: %m");
1025 }
1026
1027 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1028         unsigned i;
1029         EntryItem *items;
1030         int r;
1031         uint64_t xor_hash = 0;
1032         struct dual_timestamp _ts;
1033
1034         assert(f);
1035         assert(iovec || n_iovec == 0);
1036
1037         if (!f->writable)
1038                 return -EPERM;
1039
1040         if (!ts) {
1041                 dual_timestamp_get(&_ts);
1042                 ts = &_ts;
1043         }
1044
1045         if (f->tail_entry_monotonic_valid &&
1046             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1047                 return -EINVAL;
1048
1049         items = alloca(sizeof(EntryItem) * n_iovec);
1050
1051         for (i = 0; i < n_iovec; i++) {
1052                 uint64_t p;
1053                 Object *o;
1054
1055                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1056                 if (r < 0)
1057                         return r;
1058
1059                 xor_hash ^= le64toh(o->data.hash);
1060                 items[i].object_offset = htole64(p);
1061                 items[i].hash = o->data.hash;
1062         }
1063
1064         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1065
1066         journal_file_post_change(f);
1067
1068         return r;
1069 }
1070
1071 static int generic_array_get(JournalFile *f,
1072                              uint64_t first,
1073                              uint64_t i,
1074                              Object **ret, uint64_t *offset) {
1075
1076         Object *o;
1077         uint64_t p = 0, a;
1078         int r;
1079
1080         assert(f);
1081
1082         a = first;
1083         while (a > 0) {
1084                 uint64_t n;
1085
1086                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1087                 if (r < 0)
1088                         return r;
1089
1090                 n = journal_file_entry_array_n_items(o);
1091                 if (i < n) {
1092                         p = le64toh(o->entry_array.items[i]);
1093                         break;
1094                 }
1095
1096                 i -= n;
1097                 a = le64toh(o->entry_array.next_entry_array_offset);
1098         }
1099
1100         if (a <= 0 || p <= 0)
1101                 return 0;
1102
1103         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1104         if (r < 0)
1105                 return r;
1106
1107         if (ret)
1108                 *ret = o;
1109
1110         if (offset)
1111                 *offset = p;
1112
1113         return 1;
1114 }
1115
1116 static int generic_array_get_plus_one(JournalFile *f,
1117                                       uint64_t extra,
1118                                       uint64_t first,
1119                                       uint64_t i,
1120                                       Object **ret, uint64_t *offset) {
1121
1122         Object *o;
1123
1124         assert(f);
1125
1126         if (i == 0) {
1127                 int r;
1128
1129                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1130                 if (r < 0)
1131                         return r;
1132
1133                 if (ret)
1134                         *ret = o;
1135
1136                 if (offset)
1137                         *offset = extra;
1138
1139                 return 1;
1140         }
1141
1142         return generic_array_get(f, first, i-1, ret, offset);
1143 }
1144
/* Verdicts returned by the test_object() callbacks used for bisection */
enum {
        TEST_FOUND = 0,  /* the tested entry matches the needle */
        TEST_LEFT  = 1,  /* the tested entry sorts before the needle */
        TEST_RIGHT = 2   /* the tested entry sorts after the needle */
};
1150
1151 static int generic_array_bisect(JournalFile *f,
1152                                 uint64_t first,
1153                                 uint64_t n,
1154                                 uint64_t needle,
1155                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1156                                 direction_t direction,
1157                                 Object **ret,
1158                                 uint64_t *offset,
1159                                 uint64_t *idx) {
1160
1161         uint64_t a, p, t = 0, i = 0, last_p = 0;
1162         bool subtract_one = false;
1163         Object *o, *array = NULL;
1164         int r;
1165
1166         assert(f);
1167         assert(test_object);
1168
1169         a = first;
1170         while (a > 0) {
1171                 uint64_t left, right, k, lp;
1172
1173                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1174                 if (r < 0)
1175                         return r;
1176
1177                 k = journal_file_entry_array_n_items(array);
1178                 right = MIN(k, n);
1179                 if (right <= 0)
1180                         return 0;
1181
1182                 i = right - 1;
1183                 lp = p = le64toh(array->entry_array.items[i]);
1184                 if (p <= 0)
1185                         return -EBADMSG;
1186
1187                 r = test_object(f, p, needle);
1188                 if (r < 0)
1189                         return r;
1190
1191                 if (r == TEST_FOUND)
1192                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1193
1194                 if (r == TEST_RIGHT) {
1195                         left = 0;
1196                         right -= 1;
1197                         for (;;) {
1198                                 if (left == right) {
1199                                         if (direction == DIRECTION_UP)
1200                                                 subtract_one = true;
1201
1202                                         i = left;
1203                                         goto found;
1204                                 }
1205
1206                                 assert(left < right);
1207
1208                                 i = (left + right) / 2;
1209                                 p = le64toh(array->entry_array.items[i]);
1210                                 if (p <= 0)
1211                                         return -EBADMSG;
1212
1213                                 r = test_object(f, p, needle);
1214                                 if (r < 0)
1215                                         return r;
1216
1217                                 if (r == TEST_FOUND)
1218                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1219
1220                                 if (r == TEST_RIGHT)
1221                                         right = i;
1222                                 else
1223                                         left = i + 1;
1224                         }
1225                 }
1226
1227                 if (k > n) {
1228                         if (direction == DIRECTION_UP) {
1229                                 i = n;
1230                                 subtract_one = true;
1231                                 goto found;
1232                         }
1233
1234                         return 0;
1235                 }
1236
1237                 last_p = lp;
1238
1239                 n -= k;
1240                 t += k;
1241                 a = le64toh(array->entry_array.next_entry_array_offset);
1242         }
1243
1244         return 0;
1245
1246 found:
1247         if (subtract_one && t == 0 && i == 0)
1248                 return 0;
1249
1250         if (subtract_one && i == 0)
1251                 p = last_p;
1252         else if (subtract_one)
1253                 p = le64toh(array->entry_array.items[i-1]);
1254         else
1255                 p = le64toh(array->entry_array.items[i]);
1256
1257         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1258         if (r < 0)
1259                 return r;
1260
1261         if (ret)
1262                 *ret = o;
1263
1264         if (offset)
1265                 *offset = p;
1266
1267         if (idx)
1268                 *idx = t + i + (subtract_one ? -1 : 0);
1269
1270         return 1;
1271 }
1272
1273 static int generic_array_bisect_plus_one(JournalFile *f,
1274                                          uint64_t extra,
1275                                          uint64_t first,
1276                                          uint64_t n,
1277                                          uint64_t needle,
1278                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1279                                          direction_t direction,
1280                                          Object **ret,
1281                                          uint64_t *offset,
1282                                          uint64_t *idx) {
1283
1284         int r;
1285         bool step_back = false;
1286         Object *o;
1287
1288         assert(f);
1289         assert(test_object);
1290
1291         if (n <= 0)
1292                 return 0;
1293
1294         /* This bisects the array in object 'first', but first checks
1295          * an extra  */
1296         r = test_object(f, extra, needle);
1297         if (r < 0)
1298                 return r;
1299
1300         if (r == TEST_FOUND)
1301                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1302
1303         /* if we are looking with DIRECTION_UP then we need to first
1304            see if in the actual array there is a matching entry, and
1305            return the last one of that. But if there isn't any we need
1306            to return this one. Hence remember this, and return it
1307            below. */
1308         if (r == TEST_LEFT)
1309                 step_back = direction == DIRECTION_UP;
1310
1311         if (r == TEST_RIGHT) {
1312                 if (direction == DIRECTION_DOWN)
1313                         goto found;
1314                 else
1315                         return 0;
1316         }
1317
1318         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1319
1320         if (r == 0 && step_back)
1321                 goto found;
1322
1323         if (r > 0 && idx)
1324                 (*idx) ++;
1325
1326         return r;
1327
1328 found:
1329         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1330         if (r < 0)
1331                 return r;
1332
1333         if (ret)
1334                 *ret = o;
1335
1336         if (offset)
1337                 *offset = extra;
1338
1339         if (idx)
1340                 *idx = 0;
1341
1342         return 1;
1343 }
1344
1345 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1346         assert(f);
1347         assert(p > 0);
1348
1349         if (p == needle)
1350                 return TEST_FOUND;
1351         else if (p < needle)
1352                 return TEST_LEFT;
1353         else
1354                 return TEST_RIGHT;
1355 }
1356
1357 int journal_file_move_to_entry_by_offset(
1358                 JournalFile *f,
1359                 uint64_t p,
1360                 direction_t direction,
1361                 Object **ret,
1362                 uint64_t *offset) {
1363
1364         return generic_array_bisect(f,
1365                                     le64toh(f->header->entry_array_offset),
1366                                     le64toh(f->header->n_entries),
1367                                     p,
1368                                     test_object_offset,
1369                                     direction,
1370                                     ret, offset, NULL);
1371 }
1372
1373
1374 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1375         Object *o;
1376         int r;
1377
1378         assert(f);
1379         assert(p > 0);
1380
1381         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1382         if (r < 0)
1383                 return r;
1384
1385         if (le64toh(o->entry.seqnum) == needle)
1386                 return TEST_FOUND;
1387         else if (le64toh(o->entry.seqnum) < needle)
1388                 return TEST_LEFT;
1389         else
1390                 return TEST_RIGHT;
1391 }
1392
1393 int journal_file_move_to_entry_by_seqnum(
1394                 JournalFile *f,
1395                 uint64_t seqnum,
1396                 direction_t direction,
1397                 Object **ret,
1398                 uint64_t *offset) {
1399
1400         return generic_array_bisect(f,
1401                                     le64toh(f->header->entry_array_offset),
1402                                     le64toh(f->header->n_entries),
1403                                     seqnum,
1404                                     test_object_seqnum,
1405                                     direction,
1406                                     ret, offset, NULL);
1407 }
1408
1409 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1410         Object *o;
1411         int r;
1412
1413         assert(f);
1414         assert(p > 0);
1415
1416         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1417         if (r < 0)
1418                 return r;
1419
1420         if (le64toh(o->entry.realtime) == needle)
1421                 return TEST_FOUND;
1422         else if (le64toh(o->entry.realtime) < needle)
1423                 return TEST_LEFT;
1424         else
1425                 return TEST_RIGHT;
1426 }
1427
1428 int journal_file_move_to_entry_by_realtime(
1429                 JournalFile *f,
1430                 uint64_t realtime,
1431                 direction_t direction,
1432                 Object **ret,
1433                 uint64_t *offset) {
1434
1435         return generic_array_bisect(f,
1436                                     le64toh(f->header->entry_array_offset),
1437                                     le64toh(f->header->n_entries),
1438                                     realtime,
1439                                     test_object_realtime,
1440                                     direction,
1441                                     ret, offset, NULL);
1442 }
1443
1444 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1445         Object *o;
1446         int r;
1447
1448         assert(f);
1449         assert(p > 0);
1450
1451         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1452         if (r < 0)
1453                 return r;
1454
1455         if (le64toh(o->entry.monotonic) == needle)
1456                 return TEST_FOUND;
1457         else if (le64toh(o->entry.monotonic) < needle)
1458                 return TEST_LEFT;
1459         else
1460                 return TEST_RIGHT;
1461 }
1462
1463 int journal_file_move_to_entry_by_monotonic(
1464                 JournalFile *f,
1465                 sd_id128_t boot_id,
1466                 uint64_t monotonic,
1467                 direction_t direction,
1468                 Object **ret,
1469                 uint64_t *offset) {
1470
1471         char t[9+32+1] = "_BOOT_ID=";
1472         Object *o;
1473         int r;
1474
1475         assert(f);
1476
1477         sd_id128_to_string(boot_id, t + 9);
1478         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1479         if (r < 0)
1480                 return r;
1481         if (r == 0)
1482                 return -ENOENT;
1483
1484         return generic_array_bisect_plus_one(f,
1485                                              le64toh(o->data.entry_offset),
1486                                              le64toh(o->data.entry_array_offset),
1487                                              le64toh(o->data.n_entries),
1488                                              monotonic,
1489                                              test_object_monotonic,
1490                                              direction,
1491                                              ret, offset, NULL);
1492 }
1493
1494 int journal_file_next_entry(
1495                 JournalFile *f,
1496                 Object *o, uint64_t p,
1497                 direction_t direction,
1498                 Object **ret, uint64_t *offset) {
1499
1500         uint64_t i, n;
1501         int r;
1502
1503         assert(f);
1504         assert(p > 0 || !o);
1505
1506         n = le64toh(f->header->n_entries);
1507         if (n <= 0)
1508                 return 0;
1509
1510         if (!o)
1511                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1512         else {
1513                 if (o->object.type != OBJECT_ENTRY)
1514                         return -EINVAL;
1515
1516                 r = generic_array_bisect(f,
1517                                          le64toh(f->header->entry_array_offset),
1518                                          le64toh(f->header->n_entries),
1519                                          p,
1520                                          test_object_offset,
1521                                          DIRECTION_DOWN,
1522                                          NULL, NULL,
1523                                          &i);
1524                 if (r <= 0)
1525                         return r;
1526
1527                 if (direction == DIRECTION_DOWN) {
1528                         if (i >= n - 1)
1529                                 return 0;
1530
1531                         i++;
1532                 } else {
1533                         if (i <= 0)
1534                                 return 0;
1535
1536                         i--;
1537                 }
1538         }
1539
1540         /* And jump to it */
1541         return generic_array_get(f,
1542                                  le64toh(f->header->entry_array_offset),
1543                                  i,
1544                                  ret, offset);
1545 }
1546
1547 int journal_file_skip_entry(
1548                 JournalFile *f,
1549                 Object *o, uint64_t p,
1550                 int64_t skip,
1551                 Object **ret, uint64_t *offset) {
1552
1553         uint64_t i, n;
1554         int r;
1555
1556         assert(f);
1557         assert(o);
1558         assert(p > 0);
1559
1560         if (o->object.type != OBJECT_ENTRY)
1561                 return -EINVAL;
1562
1563         r = generic_array_bisect(f,
1564                                  le64toh(f->header->entry_array_offset),
1565                                  le64toh(f->header->n_entries),
1566                                  p,
1567                                  test_object_offset,
1568                                  DIRECTION_DOWN,
1569                                  NULL, NULL,
1570                                  &i);
1571         if (r <= 0)
1572                 return r;
1573
1574         /* Calculate new index */
1575         if (skip < 0) {
1576                 if ((uint64_t) -skip >= i)
1577                         i = 0;
1578                 else
1579                         i = i - (uint64_t) -skip;
1580         } else
1581                 i  += (uint64_t) skip;
1582
1583         n = le64toh(f->header->n_entries);
1584         if (n <= 0)
1585                 return -EBADMSG;
1586
1587         if (i >= n)
1588                 i = n-1;
1589
1590         return generic_array_get(f,
1591                                  le64toh(f->header->entry_array_offset),
1592                                  i,
1593                                  ret, offset);
1594 }
1595
1596 int journal_file_next_entry_for_data(
1597                 JournalFile *f,
1598                 Object *o, uint64_t p,
1599                 uint64_t data_offset,
1600                 direction_t direction,
1601                 Object **ret, uint64_t *offset) {
1602
1603         uint64_t n, i;
1604         int r;
1605         Object *d;
1606
1607         assert(f);
1608         assert(p > 0 || !o);
1609
1610         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1611         if (r < 0)
1612                 return r;
1613
1614         n = le64toh(d->data.n_entries);
1615         if (n <= 0)
1616                 return n;
1617
1618         if (!o)
1619                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1620         else {
1621                 if (o->object.type != OBJECT_ENTRY)
1622                         return -EINVAL;
1623
1624                 r = generic_array_bisect_plus_one(f,
1625                                                   le64toh(d->data.entry_offset),
1626                                                   le64toh(d->data.entry_array_offset),
1627                                                   le64toh(d->data.n_entries),
1628                                                   p,
1629                                                   test_object_offset,
1630                                                   DIRECTION_DOWN,
1631                                                   NULL, NULL,
1632                                                   &i);
1633
1634                 if (r <= 0)
1635                         return r;
1636
1637                 if (direction == DIRECTION_DOWN) {
1638                         if (i >= n - 1)
1639                                 return 0;
1640
1641                         i++;
1642                 } else {
1643                         if (i <= 0)
1644                                 return 0;
1645
1646                         i--;
1647                 }
1648
1649         }
1650
1651         return generic_array_get_plus_one(f,
1652                                           le64toh(d->data.entry_offset),
1653                                           le64toh(d->data.entry_array_offset),
1654                                           i,
1655                                           ret, offset);
1656 }
1657
1658 int journal_file_move_to_entry_by_offset_for_data(
1659                 JournalFile *f,
1660                 uint64_t data_offset,
1661                 uint64_t p,
1662                 direction_t direction,
1663                 Object **ret, uint64_t *offset) {
1664
1665         int r;
1666         Object *d;
1667
1668         assert(f);
1669
1670         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1671         if (r < 0)
1672                 return r;
1673
1674         return generic_array_bisect_plus_one(f,
1675                                              le64toh(d->data.entry_offset),
1676                                              le64toh(d->data.entry_array_offset),
1677                                              le64toh(d->data.n_entries),
1678                                              p,
1679                                              test_object_offset,
1680                                              direction,
1681                                              ret, offset, NULL);
1682 }
1683
1684 int journal_file_move_to_entry_by_monotonic_for_data(
1685                 JournalFile *f,
1686                 uint64_t data_offset,
1687                 sd_id128_t boot_id,
1688                 uint64_t monotonic,
1689                 direction_t direction,
1690                 Object **ret, uint64_t *offset) {
1691
1692         char t[9+32+1] = "_BOOT_ID=";
1693         Object *o, *d;
1694         int r;
1695         uint64_t b, z;
1696
1697         assert(f);
1698
1699         /* First, seek by time */
1700         sd_id128_to_string(boot_id, t + 9);
1701         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1702         if (r < 0)
1703                 return r;
1704         if (r == 0)
1705                 return -ENOENT;
1706
1707         r = generic_array_bisect_plus_one(f,
1708                                           le64toh(o->data.entry_offset),
1709                                           le64toh(o->data.entry_array_offset),
1710                                           le64toh(o->data.n_entries),
1711                                           monotonic,
1712                                           test_object_monotonic,
1713                                           direction,
1714                                           NULL, &z, NULL);
1715         if (r <= 0)
1716                 return r;
1717
1718         /* And now, continue seeking until we find an entry that
1719          * exists in both bisection arrays */
1720
1721         for (;;) {
1722                 Object *qo;
1723                 uint64_t p, q;
1724
1725                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1726                 if (r < 0)
1727                         return r;
1728
1729                 r = generic_array_bisect_plus_one(f,
1730                                                   le64toh(d->data.entry_offset),
1731                                                   le64toh(d->data.entry_array_offset),
1732                                                   le64toh(d->data.n_entries),
1733                                                   z,
1734                                                   test_object_offset,
1735                                                   direction,
1736                                                   NULL, &p, NULL);
1737                 if (r <= 0)
1738                         return r;
1739
1740                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1741                 if (r < 0)
1742                         return r;
1743
1744                 r = generic_array_bisect_plus_one(f,
1745                                                   le64toh(o->data.entry_offset),
1746                                                   le64toh(o->data.entry_array_offset),
1747                                                   le64toh(o->data.n_entries),
1748                                                   p,
1749                                                   test_object_offset,
1750                                                   direction,
1751                                                   &qo, &q, NULL);
1752
1753                 if (r <= 0)
1754                         return r;
1755
1756                 if (p == q) {
1757                         if (ret)
1758                                 *ret = qo;
1759                         if (offset)
1760                                 *offset = q;
1761
1762                         return 1;
1763                 }
1764
1765                 z = q;
1766         }
1767
1768         return 0;
1769 }
1770
1771 int journal_file_move_to_entry_by_seqnum_for_data(
1772                 JournalFile *f,
1773                 uint64_t data_offset,
1774                 uint64_t seqnum,
1775                 direction_t direction,
1776                 Object **ret, uint64_t *offset) {
1777
1778         Object *d;
1779         int r;
1780
1781         assert(f);
1782
1783         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1784         if (r < 0)
1785                 return r;
1786
1787         return generic_array_bisect_plus_one(f,
1788                                              le64toh(d->data.entry_offset),
1789                                              le64toh(d->data.entry_array_offset),
1790                                              le64toh(d->data.n_entries),
1791                                              seqnum,
1792                                              test_object_seqnum,
1793                                              direction,
1794                                              ret, offset, NULL);
1795 }
1796
1797 int journal_file_move_to_entry_by_realtime_for_data(
1798                 JournalFile *f,
1799                 uint64_t data_offset,
1800                 uint64_t realtime,
1801                 direction_t direction,
1802                 Object **ret, uint64_t *offset) {
1803
1804         Object *d;
1805         int r;
1806
1807         assert(f);
1808
1809         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1810         if (r < 0)
1811                 return r;
1812
1813         return generic_array_bisect_plus_one(f,
1814                                              le64toh(d->data.entry_offset),
1815                                              le64toh(d->data.entry_array_offset),
1816                                              le64toh(d->data.n_entries),
1817                                              realtime,
1818                                              test_object_realtime,
1819                                              direction,
1820                                              ret, offset, NULL);
1821 }
1822
1823 void journal_file_dump(JournalFile *f) {
1824         Object *o;
1825         int r;
1826         uint64_t p;
1827
1828         assert(f);
1829
1830         journal_file_print_header(f);
1831
1832         p = le64toh(f->header->header_size);
1833         while (p != 0) {
1834                 r = journal_file_move_to_object(f, -1, p, &o);
1835                 if (r < 0)
1836                         goto fail;
1837
1838                 switch (o->object.type) {
1839
1840                 case OBJECT_UNUSED:
1841                         printf("Type: OBJECT_UNUSED\n");
1842                         break;
1843
1844                 case OBJECT_DATA:
1845                         printf("Type: OBJECT_DATA\n");
1846                         break;
1847
1848                 case OBJECT_ENTRY:
1849                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1850                                (unsigned long long) le64toh(o->entry.seqnum),
1851                                (unsigned long long) le64toh(o->entry.monotonic),
1852                                (unsigned long long) le64toh(o->entry.realtime));
1853                         break;
1854
1855                 case OBJECT_FIELD_HASH_TABLE:
1856                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1857                         break;
1858
1859                 case OBJECT_DATA_HASH_TABLE:
1860                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1861                         break;
1862
1863                 case OBJECT_ENTRY_ARRAY:
1864                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1865                         break;
1866
1867                 case OBJECT_SIGNATURE:
1868                         printf("Type: OBJECT_SIGNATURE\n");
1869                         break;
1870                 }
1871
1872                 if (o->object.flags & OBJECT_COMPRESSED)
1873                         printf("Flags: COMPRESSED\n");
1874
1875                 if (p == le64toh(f->header->tail_object_offset))
1876                         p = 0;
1877                 else
1878                         p = p + ALIGN64(le64toh(o->object.size));
1879         }
1880
1881         return;
1882 fail:
1883         log_error("File corrupt");
1884 }
1885
1886 void journal_file_print_header(JournalFile *f) {
1887         char a[33], b[33], c[33];
1888         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
1889
1890         assert(f);
1891
1892         printf("File Path: %s\n"
1893                "File ID: %s\n"
1894                "Machine ID: %s\n"
1895                "Boot ID: %s\n"
1896                "Sequential Number ID: %s\n"
1897                "Header size: %llu\n"
1898                "Arena size: %llu\n"
1899                "Data Hash Table Size: %llu\n"
1900                "Field Hash Table Size: %llu\n"
1901                "Objects: %llu\n"
1902                "Entry Objects: %llu\n"
1903                "Rotate Suggested: %s\n"
1904                "Head Sequential Number: %llu\n"
1905                "Tail Sequential Number: %llu\n"
1906                "Head Realtime Timestamp: %s\n"
1907                "Tail Realtime Timestamp: %s\n",
1908                f->path,
1909                sd_id128_to_string(f->header->file_id, a),
1910                sd_id128_to_string(f->header->machine_id, b),
1911                sd_id128_to_string(f->header->boot_id, c),
1912                sd_id128_to_string(f->header->seqnum_id, c),
1913                (unsigned long long) le64toh(f->header->header_size),
1914                (unsigned long long) le64toh(f->header->arena_size),
1915                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1916                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
1917                (unsigned long long) le64toh(f->header->n_objects),
1918                (unsigned long long) le64toh(f->header->n_entries),
1919                yes_no(journal_file_rotate_suggested(f)),
1920                (unsigned long long) le64toh(f->header->head_seqnum),
1921                (unsigned long long) le64toh(f->header->tail_seqnum),
1922                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
1923                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)));
1924
1925         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1926                 printf("Data Objects: %llu\n"
1927                        "Data Hash Table Fill: %.1f%%\n",
1928                        (unsigned long long) le64toh(f->header->n_data),
1929                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
1930
1931         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1932                 printf("Field Objects: %llu\n"
1933                        "Field Hash Table Fill: %.1f%%\n",
1934                        (unsigned long long) le64toh(f->header->n_fields),
1935                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
1936 }
1937
1938 int journal_file_open(
1939                 const char *fname,
1940                 int flags,
1941                 mode_t mode,
1942                 JournalFile *template,
1943                 JournalFile **ret) {
1944
1945         JournalFile *f;
1946         int r;
1947         bool newly_created = false;
1948
1949         assert(fname);
1950
1951         if ((flags & O_ACCMODE) != O_RDONLY &&
1952             (flags & O_ACCMODE) != O_RDWR)
1953                 return -EINVAL;
1954
1955         if (!endswith(fname, ".journal"))
1956                 return -EINVAL;
1957
1958         f = new0(JournalFile, 1);
1959         if (!f)
1960                 return -ENOMEM;
1961
1962         f->fd = -1;
1963         f->flags = flags;
1964         f->mode = mode;
1965         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1966         f->prot = prot_from_flags(flags);
1967
1968         if (template) {
1969                 f->metrics = template->metrics;
1970                 f->compress = template->compress;
1971         }
1972
1973         f->path = strdup(fname);
1974         if (!f->path) {
1975                 r = -ENOMEM;
1976                 goto fail;
1977         }
1978
1979         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1980         if (f->fd < 0) {
1981                 r = -errno;
1982                 goto fail;
1983         }
1984
1985         if (fstat(f->fd, &f->last_stat) < 0) {
1986                 r = -errno;
1987                 goto fail;
1988         }
1989
1990         if (f->last_stat.st_size == 0 && f->writable) {
1991                 newly_created = true;
1992
1993                 r = journal_file_init_header(f, template);
1994                 if (r < 0)
1995                         goto fail;
1996
1997                 if (fstat(f->fd, &f->last_stat) < 0) {
1998                         r = -errno;
1999                         goto fail;
2000                 }
2001         }
2002
2003         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2004                 r = -EIO;
2005                 goto fail;
2006         }
2007
2008         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2009         if (f->header == MAP_FAILED) {
2010                 f->header = NULL;
2011                 r = -errno;
2012                 goto fail;
2013         }
2014
2015         if (!newly_created) {
2016                 r = journal_file_verify_header(f);
2017                 if (r < 0)
2018                         goto fail;
2019         }
2020
2021         if (f->writable) {
2022                 r = journal_file_refresh_header(f);
2023                 if (r < 0)
2024                         goto fail;
2025         }
2026
2027         if (newly_created) {
2028
2029                 r = journal_file_setup_field_hash_table(f);
2030                 if (r < 0)
2031                         goto fail;
2032
2033                 r = journal_file_setup_data_hash_table(f);
2034                 if (r < 0)
2035                         goto fail;
2036         }
2037
2038         r = journal_file_map_field_hash_table(f);
2039         if (r < 0)
2040                 goto fail;
2041
2042         r = journal_file_map_data_hash_table(f);
2043         if (r < 0)
2044                 goto fail;
2045
2046         if (ret)
2047                 *ret = f;
2048
2049         return 0;
2050
2051 fail:
2052         journal_file_close(f);
2053
2054         return r;
2055 }
2056
2057 int journal_file_rotate(JournalFile **f) {
2058         char *p;
2059         size_t l;
2060         JournalFile *old_file, *new_file = NULL;
2061         int r;
2062
2063         assert(f);
2064         assert(*f);
2065
2066         old_file = *f;
2067
2068         if (!old_file->writable)
2069                 return -EINVAL;
2070
2071         if (!endswith(old_file->path, ".journal"))
2072                 return -EINVAL;
2073
2074         l = strlen(old_file->path);
2075
2076         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2077         if (!p)
2078                 return -ENOMEM;
2079
2080         memcpy(p, old_file->path, l - 8);
2081         p[l-8] = '@';
2082         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2083         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2084                  "-%016llx-%016llx.journal",
2085                  (unsigned long long) le64toh((*f)->header->tail_seqnum),
2086                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2087
2088         r = rename(old_file->path, p);
2089         free(p);
2090
2091         if (r < 0)
2092                 return -errno;
2093
2094         old_file->header->state = STATE_ARCHIVED;
2095
2096         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
2097         journal_file_close(old_file);
2098
2099         *f = new_file;
2100         return r;
2101 }
2102
2103 int journal_file_open_reliably(
2104                 const char *fname,
2105                 int flags,
2106                 mode_t mode,
2107                 JournalFile *template,
2108                 JournalFile **ret) {
2109
2110         int r;
2111         size_t l;
2112         char *p;
2113
2114         r = journal_file_open(fname, flags, mode, template, ret);
2115         if (r != -EBADMSG && /* corrupted */
2116             r != -ENODATA && /* truncated */
2117             r != -EHOSTDOWN && /* other machine */
2118             r != -EPROTONOSUPPORT) /* incompatible feature */
2119                 return r;
2120
2121         if ((flags & O_ACCMODE) == O_RDONLY)
2122                 return r;
2123
2124         if (!(flags & O_CREAT))
2125                 return r;
2126
2127         /* The file is corrupted. Rotate it away and try it again (but only once) */
2128
2129         l = strlen(fname);
2130         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2131                      (int) (l-8), fname,
2132                      (unsigned long long) now(CLOCK_REALTIME),
2133                      random_ull()) < 0)
2134                 return -ENOMEM;
2135
2136         r = rename(fname, p);
2137         free(p);
2138         if (r < 0)
2139                 return -errno;
2140
2141         log_warning("File %s corrupted, renaming and replacing.", fname);
2142
2143         return journal_file_open(fname, flags, mode, template, ret);
2144 }
2145
2146 struct vacuum_info {
2147         off_t usage;
2148         char *filename;
2149
2150         uint64_t realtime;
2151         sd_id128_t seqnum_id;
2152         uint64_t seqnum;
2153
2154         bool have_seqnum;
2155 };
2156
2157 static int vacuum_compare(const void *_a, const void *_b) {
2158         const struct vacuum_info *a, *b;
2159
2160         a = _a;
2161         b = _b;
2162
2163         if (a->have_seqnum && b->have_seqnum &&
2164             sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
2165                 if (a->seqnum < b->seqnum)
2166                         return -1;
2167                 else if (a->seqnum > b->seqnum)
2168                         return 1;
2169                 else
2170                         return 0;
2171         }
2172
2173         if (a->realtime < b->realtime)
2174                 return -1;
2175         else if (a->realtime > b->realtime)
2176                 return 1;
2177         else if (a->have_seqnum && b->have_seqnum)
2178                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
2179         else
2180                 return strcmp(a->filename, b->filename);
2181 }
2182
2183 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
2184         DIR *d;
2185         int r = 0;
2186         struct vacuum_info *list = NULL;
2187         unsigned n_list = 0, n_allocated = 0, i;
2188         uint64_t sum = 0;
2189
2190         assert(directory);
2191
2192         if (max_use <= 0)
2193                 return 0;
2194
2195         d = opendir(directory);
2196         if (!d)
2197                 return -errno;
2198
2199         for (;;) {
2200                 int k;
2201                 struct dirent buf, *de;
2202                 size_t q;
2203                 struct stat st;
2204                 char *p;
2205                 unsigned long long seqnum = 0, realtime;
2206                 sd_id128_t seqnum_id;
2207                 bool have_seqnum;
2208
2209                 k = readdir_r(d, &buf, &de);
2210                 if (k != 0) {
2211                         r = -k;
2212                         goto finish;
2213                 }
2214
2215                 if (!de)
2216                         break;
2217
2218                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2219                         continue;
2220
2221                 if (!S_ISREG(st.st_mode))
2222                         continue;
2223
2224                 q = strlen(de->d_name);
2225
2226                 if (endswith(de->d_name, ".journal")) {
2227
2228                         /* Vacuum archived files */
2229
2230                         if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2231                                 continue;
2232
2233                         if (de->d_name[q-8-16-1] != '-' ||
2234                             de->d_name[q-8-16-1-16-1] != '-' ||
2235                             de->d_name[q-8-16-1-16-1-32-1] != '@')
2236                                 continue;
2237
2238                         p = strdup(de->d_name);
2239                         if (!p) {
2240                                 r = -ENOMEM;
2241                                 goto finish;
2242                         }
2243
2244                         de->d_name[q-8-16-1-16-1] = 0;
2245                         if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2246                                 free(p);
2247                                 continue;
2248                         }
2249
2250                         if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2251                                 free(p);
2252                                 continue;
2253                         }
2254
2255                         have_seqnum = true;
2256
2257                 } else if (endswith(de->d_name, ".journal~")) {
2258                         unsigned long long tmp;
2259
2260                         /* Vacuum corrupted files */
2261
2262                         if (q < 1 + 16 + 1 + 16 + 8 + 1)
2263                                 continue;
2264
2265                         if (de->d_name[q-1-8-16-1] != '-' ||
2266                             de->d_name[q-1-8-16-1-16-1] != '@')
2267                                 continue;
2268
2269                         p = strdup(de->d_name);
2270                         if (!p) {
2271                                 r = -ENOMEM;
2272                                 goto finish;
2273                         }
2274
2275                         if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2276                                 free(p);
2277                                 continue;
2278                         }
2279
2280                         have_seqnum = false;
2281                 } else
2282                         continue;
2283
2284                 if (n_list >= n_allocated) {
2285                         struct vacuum_info *j;
2286
2287                         n_allocated = MAX(n_allocated * 2U, 8U);
2288                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2289                         if (!j) {
2290                                 free(p);
2291                                 r = -ENOMEM;
2292                                 goto finish;
2293                         }
2294
2295                         list = j;
2296                 }
2297
2298                 list[n_list].filename = p;
2299                 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2300                 list[n_list].seqnum = seqnum;
2301                 list[n_list].realtime = realtime;
2302                 list[n_list].seqnum_id = seqnum_id;
2303                 list[n_list].have_seqnum = have_seqnum;
2304
2305                 sum += list[n_list].usage;
2306
2307                 n_list ++;
2308         }
2309
2310         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2311
2312         for(i = 0; i < n_list; i++) {
2313                 struct statvfs ss;
2314
2315                 if (fstatvfs(dirfd(d), &ss) < 0) {
2316                         r = -errno;
2317                         goto finish;
2318                 }
2319
2320                 if (sum <= max_use &&
2321                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2322                         break;
2323
2324                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2325                         log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2326                         sum -= list[i].usage;
2327                 } else if (errno != ENOENT)
2328                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2329         }
2330
2331 finish:
2332         for (i = 0; i < n_list; i++)
2333                 free(list[i].filename);
2334
2335         free(list);
2336
2337         if (d)
2338                 closedir(d);
2339
2340         return r;
2341 }
2342
2343 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2344         uint64_t i, n;
2345         uint64_t q, xor_hash = 0;
2346         int r;
2347         EntryItem *items;
2348         dual_timestamp ts;
2349
2350         assert(from);
2351         assert(to);
2352         assert(o);
2353         assert(p);
2354
2355         if (!to->writable)
2356                 return -EPERM;
2357
2358         ts.monotonic = le64toh(o->entry.monotonic);
2359         ts.realtime = le64toh(o->entry.realtime);
2360
2361         if (to->tail_entry_monotonic_valid &&
2362             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2363                 return -EINVAL;
2364
2365         n = journal_file_entry_n_items(o);
2366         items = alloca(sizeof(EntryItem) * n);
2367
2368         for (i = 0; i < n; i++) {
2369                 uint64_t l, h;
2370                 le64_t le_hash;
2371                 size_t t;
2372                 void *data;
2373                 Object *u;
2374
2375                 q = le64toh(o->entry.items[i].object_offset);
2376                 le_hash = o->entry.items[i].hash;
2377
2378                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2379                 if (r < 0)
2380                         return r;
2381
2382                 if (le_hash != o->data.hash)
2383                         return -EBADMSG;
2384
2385                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2386                 t = (size_t) l;
2387
2388                 /* We hit the limit on 32bit machines */
2389                 if ((uint64_t) t != l)
2390                         return -E2BIG;
2391
2392                 if (o->object.flags & OBJECT_COMPRESSED) {
2393 #ifdef HAVE_XZ
2394                         uint64_t rsize;
2395
2396                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2397                                 return -EBADMSG;
2398
2399                         data = from->compress_buffer;
2400                         l = rsize;
2401 #else
2402                         return -EPROTONOSUPPORT;
2403 #endif
2404                 } else
2405                         data = o->data.payload;
2406
2407                 r = journal_file_append_data(to, data, l, &u, &h);
2408                 if (r < 0)
2409                         return r;
2410
2411                 xor_hash ^= le64toh(u->data.hash);
2412                 items[i].object_offset = htole64(h);
2413                 items[i].hash = u->data.hash;
2414
2415                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2416                 if (r < 0)
2417                         return r;
2418         }
2419
2420         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2421 }
2422
2423 void journal_default_metrics(JournalMetrics *m, int fd) {
2424         uint64_t fs_size = 0;
2425         struct statvfs ss;
2426         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2427
2428         assert(m);
2429         assert(fd >= 0);
2430
2431         if (fstatvfs(fd, &ss) >= 0)
2432                 fs_size = ss.f_frsize * ss.f_blocks;
2433
2434         if (m->max_use == (uint64_t) -1) {
2435
2436                 if (fs_size > 0) {
2437                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2438
2439                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2440                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2441
2442                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2443                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2444                 } else
2445                         m->max_use = DEFAULT_MAX_USE_LOWER;
2446         } else {
2447                 m->max_use = PAGE_ALIGN(m->max_use);
2448
2449                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2450                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2451         }
2452
2453         if (m->max_size == (uint64_t) -1) {
2454                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2455
2456                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2457                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2458         } else
2459                 m->max_size = PAGE_ALIGN(m->max_size);
2460
2461         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2462                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2463
2464         if (m->max_size*2 > m->max_use)
2465                 m->max_use = m->max_size*2;
2466
2467         if (m->min_size == (uint64_t) -1)
2468                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2469         else {
2470                 m->min_size = PAGE_ALIGN(m->min_size);
2471
2472                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2473                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2474
2475                 if (m->min_size > m->max_size)
2476                         m->max_size = m->min_size;
2477         }
2478
2479         if (m->keep_free == (uint64_t) -1) {
2480
2481                 if (fs_size > 0) {
2482                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2483
2484                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2485                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2486
2487                 } else
2488                         m->keep_free = DEFAULT_KEEP_FREE;
2489         }
2490
2491         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2492                  format_bytes(a, sizeof(a), m->max_use),
2493                  format_bytes(b, sizeof(b), m->max_size),
2494                  format_bytes(c, sizeof(c), m->min_size),
2495                  format_bytes(d, sizeof(d), m->keep_free));
2496 }
2497
2498 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2499         assert(f);
2500         assert(from || to);
2501
2502         if (from) {
2503                 if (f->header->head_entry_realtime == 0)
2504                         return -ENOENT;
2505
2506                 *from = le64toh(f->header->head_entry_realtime);
2507         }
2508
2509         if (to) {
2510                 if (f->header->tail_entry_realtime == 0)
2511                         return -ENOENT;
2512
2513                 *to = le64toh(f->header->tail_entry_realtime);
2514         }
2515
2516         return 1;
2517 }
2518
2519 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2520         char t[9+32+1] = "_BOOT_ID=";
2521         Object *o;
2522         uint64_t p;
2523         int r;
2524
2525         assert(f);
2526         assert(from || to);
2527
2528         sd_id128_to_string(boot_id, t + 9);
2529
2530         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2531         if (r <= 0)
2532                 return r;
2533
2534         if (le64toh(o->data.n_entries) <= 0)
2535                 return 0;
2536
2537         if (from) {
2538                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2539                 if (r < 0)
2540                         return r;
2541
2542                 *from = le64toh(o->entry.monotonic);
2543         }
2544
2545         if (to) {
2546                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2547                 if (r < 0)
2548                         return r;
2549
2550                 r = generic_array_get_plus_one(f,
2551                                                le64toh(o->data.entry_offset),
2552                                                le64toh(o->data.entry_array_offset),
2553                                                le64toh(o->data.n_entries)-1,
2554                                                &o, NULL);
2555                 if (r <= 0)
2556                         return r;
2557
2558                 *to = le64toh(o->entry.monotonic);
2559         }
2560
2561         return 1;
2562 }
2563
2564 bool journal_file_rotate_suggested(JournalFile *f) {
2565         assert(f);
2566
2567         /* If we gained new header fields we gained new features,
2568          * hence suggest a rotation */
2569         if (le64toh(f->header->header_size) < sizeof(Header))
2570                 return true;
2571
2572         /* Let's check if the hash tables grew over a certain fill
2573          * level (75%, borrowing this value from Java's hash table
2574          * implementation), and if so suggest a rotation. To calculate
2575          * the fill level we need the n_data field, which only exists
2576          * in newer versions. */
2577
2578         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2579                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL)
2580                         return true;
2581
2582         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2583                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL)
2584                         return true;
2585
2586         return false;
2587 }