chiark / gitweb /
man: document kernel journal fields
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
/* Default sizes, in bytes, of the data and field hash tables */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Mappings smaller than this are widened to this size when a window
 * is (re)established */
#define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)

/* Data payloads below this size are never handed to the compressor */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */

/* Lower and upper bounds applied when max_use is derived from the
 * size of the file system */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* Upper bound applied when max_size is derived from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* Upper bound applied when keep_free is derived from the size of the
 * file system */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* keep_free value used when the file system size cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* Rounds x up to the next multiple of 8 */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)

/* n_data was the first field appended to the header after the initial
 * file format design, so everything before it must always be present */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* True if the header of this particular file is large enough to
 * contain the given field in full */
#define JOURNAL_HEADER_CONTAINS(h, field) \
        (le64toh((h)->header_size) >= offsetof(Header, field) + sizeof((h)->field))

/* Magic bytes every journal file starts with */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
70
71 void journal_file_close(JournalFile *f) {
72         int t;
73
74         assert(f);
75
76         if (f->header) {
77                 /* Mark the file offline. Don't override the archived state if it already is set */
78                 if (f->writable && f->header->state == STATE_ONLINE)
79                         f->header->state = STATE_OFFLINE;
80
81                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
82         }
83
84         for (t = 0; t < _WINDOW_MAX; t++)
85                 if (f->windows[t].ptr)
86                         munmap(f->windows[t].ptr, f->windows[t].size);
87
88         if (f->fd >= 0)
89                 close_nointr_nofail(f->fd);
90
91         free(f->path);
92
93 #ifdef HAVE_XZ
94         free(f->compress_buffer);
95 #endif
96
97         free(f);
98 }
99
100 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
101         Header h;
102         ssize_t k;
103         int r;
104
105         assert(f);
106
107         zero(h);
108         memcpy(h.signature, signature, 8);
109         h.header_size = htole64(ALIGN64(sizeof(h)));
110
111         r = sd_id128_randomize(&h.file_id);
112         if (r < 0)
113                 return r;
114
115         if (template) {
116                 h.seqnum_id = template->header->seqnum_id;
117                 h.tail_seqnum = template->header->tail_seqnum;
118         } else
119                 h.seqnum_id = h.file_id;
120
121         k = pwrite(f->fd, &h, sizeof(h), 0);
122         if (k < 0)
123                 return -errno;
124
125         if (k != sizeof(h))
126                 return -EIO;
127
128         return 0;
129 }
130
131 static int journal_file_refresh_header(JournalFile *f) {
132         int r;
133         sd_id128_t boot_id;
134
135         assert(f);
136
137         r = sd_id128_get_machine(&f->header->machine_id);
138         if (r < 0)
139                 return r;
140
141         r = sd_id128_get_boot(&boot_id);
142         if (r < 0)
143                 return r;
144
145         if (sd_id128_equal(boot_id, f->header->boot_id))
146                 f->tail_entry_monotonic_valid = true;
147
148         f->header->boot_id = boot_id;
149
150         f->header->state = STATE_ONLINE;
151
152         __sync_synchronize();
153
154         return 0;
155 }
156
157 static int journal_file_verify_header(JournalFile *f) {
158         assert(f);
159
160         if (memcmp(f->header, signature, 8))
161                 return -EBADMSG;
162
163 #ifdef HAVE_XZ
164         if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
165                 return -EPROTONOSUPPORT;
166 #else
167         if (f->header->incompatible_flags != 0)
168                 return -EPROTONOSUPPORT;
169 #endif
170
171         /* The first addition was n_data, so check that we are at least this large */
172         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
173                 return -EBADMSG;
174
175         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
176                 return -ENODATA;
177
178         if (f->writable) {
179                 uint8_t state;
180                 sd_id128_t machine_id;
181                 int r;
182
183                 r = sd_id128_get_machine(&machine_id);
184                 if (r < 0)
185                         return r;
186
187                 if (!sd_id128_equal(machine_id, f->header->machine_id))
188                         return -EHOSTDOWN;
189
190                 state = f->header->state;
191
192                 if (state == STATE_ONLINE) {
193                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
194                         return -EBUSY;
195                 } else if (state == STATE_ARCHIVED)
196                         return -ESHUTDOWN;
197                 else if (state != STATE_OFFLINE) {
198                         log_debug("Journal file %s has unknown state %u.", f->path, state);
199                         return -EBUSY;
200                 }
201         }
202
203         return 0;
204 }
205
206 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
207         uint64_t old_size, new_size;
208         int r;
209
210         assert(f);
211
212         /* We assume that this file is not sparse, and we know that
213          * for sure, since we always call posix_fallocate()
214          * ourselves */
215
216         old_size =
217                 le64toh(f->header->header_size) +
218                 le64toh(f->header->arena_size);
219
220         new_size = PAGE_ALIGN(offset + size);
221         if (new_size < le64toh(f->header->header_size))
222                 new_size = le64toh(f->header->header_size);
223
224         if (new_size <= old_size)
225                 return 0;
226
227         if (f->metrics.max_size > 0 &&
228             new_size > f->metrics.max_size)
229                 return -E2BIG;
230
231         if (new_size > f->metrics.min_size &&
232             f->metrics.keep_free > 0) {
233                 struct statvfs svfs;
234
235                 if (fstatvfs(f->fd, &svfs) >= 0) {
236                         uint64_t available;
237
238                         available = svfs.f_bfree * svfs.f_bsize;
239
240                         if (available >= f->metrics.keep_free)
241                                 available -= f->metrics.keep_free;
242                         else
243                                 available = 0;
244
245                         if (new_size - old_size > available)
246                                 return -E2BIG;
247                 }
248         }
249
250         /* Note that the glibc fallocate() fallback is very
251            inefficient, hence we try to minimize the allocation area
252            as we can. */
253         r = posix_fallocate(f->fd, old_size, new_size - old_size);
254         if (r != 0)
255                 return -r;
256
257         if (fstat(f->fd, &f->last_stat) < 0)
258                 return -errno;
259
260         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
261
262         return 0;
263 }
264
265 static int journal_file_map(
266                 JournalFile *f,
267                 uint64_t offset,
268                 uint64_t size,
269                 void **_window,
270                 uint64_t *_woffset,
271                 uint64_t *_wsize,
272                 void **ret) {
273
274         uint64_t woffset, wsize;
275         void *window;
276
277         assert(f);
278         assert(size > 0);
279         assert(ret);
280
281         woffset = offset & ~((uint64_t) page_size() - 1ULL);
282         wsize = size + (offset - woffset);
283         wsize = PAGE_ALIGN(wsize);
284
285         /* Avoid SIGBUS on invalid accesses */
286         if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
287                 return -EADDRNOTAVAIL;
288
289         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
290         if (window == MAP_FAILED)
291                 return -errno;
292
293         if (_window)
294                 *_window = window;
295
296         if (_woffset)
297                 *_woffset = woffset;
298
299         if (_wsize)
300                 *_wsize = wsize;
301
302         *ret = (uint8_t*) window + (offset - woffset);
303
304         return 0;
305 }
306
307 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
308         void *p = NULL;
309         uint64_t delta;
310         int r;
311         Window *w;
312
313         assert(f);
314         assert(ret);
315         assert(wt >= 0);
316         assert(wt < _WINDOW_MAX);
317
318         if (offset + size > (uint64_t) f->last_stat.st_size) {
319                 /* Hmm, out of range? Let's refresh the fstat() data
320                  * first, before we trust that check. */
321
322                 if (fstat(f->fd, &f->last_stat) < 0 ||
323                     offset + size > (uint64_t) f->last_stat.st_size)
324                         return -EADDRNOTAVAIL;
325         }
326
327         w = f->windows + wt;
328
329         if (_likely_(w->ptr &&
330                      w->offset <= offset &&
331                      w->offset + w->size >= offset + size)) {
332
333                 *ret = (uint8_t*) w->ptr + (offset - w->offset);
334                 return 0;
335         }
336
337         if (w->ptr) {
338                 if (munmap(w->ptr, w->size) < 0)
339                         return -errno;
340
341                 w->ptr = NULL;
342                 w->size = w->offset = 0;
343         }
344
345         if (size < DEFAULT_WINDOW_SIZE) {
346                 /* If the default window size is larger then what was
347                  * asked for extend the mapping a bit in the hope to
348                  * minimize needed remappings later on. We add half
349                  * the window space before and half behind the
350                  * requested mapping */
351
352                 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
353
354                 if (delta > offset)
355                         delta = offset;
356
357                 offset -= delta;
358                 size = DEFAULT_WINDOW_SIZE;
359         } else
360                 delta = 0;
361
362         if (offset + size > (uint64_t) f->last_stat.st_size)
363                 size = (uint64_t) f->last_stat.st_size - offset;
364
365         if (size <= 0)
366                 return -EADDRNOTAVAIL;
367
368         r = journal_file_map(f,
369                              offset, size,
370                              &w->ptr, &w->offset, &w->size,
371                              &p);
372
373         if (r < 0)
374                 return r;
375
376         *ret = (uint8_t*) p + delta;
377         return 0;
378 }
379
380 static bool verify_hash(Object *o) {
381         uint64_t h1, h2;
382
383         assert(o);
384
385         if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
386                 h1 = le64toh(o->data.hash);
387                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
388         } else if (o->object.type == OBJECT_FIELD) {
389                 h1 = le64toh(o->field.hash);
390                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
391         } else
392                 return true;
393
394         return h1 == h2;
395 }
396
397 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
398         int r;
399         void *t;
400         Object *o;
401         uint64_t s;
402
403         assert(f);
404         assert(ret);
405         assert(type < _OBJECT_TYPE_MAX);
406
407         r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
408         if (r < 0)
409                 return r;
410
411         o = (Object*) t;
412         s = le64toh(o->object.size);
413
414         if (s < sizeof(ObjectHeader))
415                 return -EBADMSG;
416
417         if (type >= 0 && o->object.type != type)
418                 return -EBADMSG;
419
420         if (s > sizeof(ObjectHeader)) {
421                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
422                 if (r < 0)
423                         return r;
424
425                 o = (Object*) t;
426         }
427
428         if (!verify_hash(o))
429                 return -EBADMSG;
430
431         *ret = o;
432         return 0;
433 }
434
435 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
436         uint64_t r;
437
438         assert(f);
439
440         r = le64toh(f->header->tail_seqnum) + 1;
441
442         if (seqnum) {
443                 /* If an external seqnum counter was passed, we update
444                  * both the local and the external one, and set it to
445                  * the maximum of both */
446
447                 if (*seqnum + 1 > r)
448                         r = *seqnum + 1;
449
450                 *seqnum = r;
451         }
452
453         f->header->tail_seqnum = htole64(r);
454
455         if (f->header->head_seqnum == 0)
456                 f->header->head_seqnum = htole64(r);
457
458         return r;
459 }
460
461 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
462         int r;
463         uint64_t p;
464         Object *tail, *o;
465         void *t;
466
467         assert(f);
468         assert(size >= sizeof(ObjectHeader));
469         assert(offset);
470         assert(ret);
471
472         p = le64toh(f->header->tail_object_offset);
473         if (p == 0)
474                 p = le64toh(f->header->header_size);
475         else {
476                 r = journal_file_move_to_object(f, -1, p, &tail);
477                 if (r < 0)
478                         return r;
479
480                 p += ALIGN64(le64toh(tail->object.size));
481         }
482
483         r = journal_file_allocate(f, p, size);
484         if (r < 0)
485                 return r;
486
487         r = journal_file_move_to(f, type, p, size, &t);
488         if (r < 0)
489                 return r;
490
491         o = (Object*) t;
492
493         zero(o->object);
494         o->object.type = type;
495         o->object.size = htole64(size);
496
497         f->header->tail_object_offset = htole64(p);
498         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
499
500         *ret = o;
501         *offset = p;
502
503         return 0;
504 }
505
506 static int journal_file_setup_data_hash_table(JournalFile *f) {
507         uint64_t s, p;
508         Object *o;
509         int r;
510
511         assert(f);
512
513         /* We estimate that we need 1 hash table entry per 768 of
514            journal file and we want to make sure we never get beyond
515            75% fill level. Calculate the hash table size for the
516            maximum file size based on these metrics. */
517
518         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
519         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
520                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
521
522         log_info("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
523
524         r = journal_file_append_object(f,
525                                        OBJECT_DATA_HASH_TABLE,
526                                        offsetof(Object, hash_table.items) + s,
527                                        &o, &p);
528         if (r < 0)
529                 return r;
530
531         memset(o->hash_table.items, 0, s);
532
533         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
534         f->header->data_hash_table_size = htole64(s);
535
536         return 0;
537 }
538
539 static int journal_file_setup_field_hash_table(JournalFile *f) {
540         uint64_t s, p;
541         Object *o;
542         int r;
543
544         assert(f);
545
546         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
547         r = journal_file_append_object(f,
548                                        OBJECT_FIELD_HASH_TABLE,
549                                        offsetof(Object, hash_table.items) + s,
550                                        &o, &p);
551         if (r < 0)
552                 return r;
553
554         memset(o->hash_table.items, 0, s);
555
556         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
557         f->header->field_hash_table_size = htole64(s);
558
559         return 0;
560 }
561
562 static int journal_file_map_data_hash_table(JournalFile *f) {
563         uint64_t s, p;
564         void *t;
565         int r;
566
567         assert(f);
568
569         p = le64toh(f->header->data_hash_table_offset);
570         s = le64toh(f->header->data_hash_table_size);
571
572         r = journal_file_move_to(f,
573                                  WINDOW_DATA_HASH_TABLE,
574                                  p, s,
575                                  &t);
576         if (r < 0)
577                 return r;
578
579         f->data_hash_table = t;
580         return 0;
581 }
582
583 static int journal_file_map_field_hash_table(JournalFile *f) {
584         uint64_t s, p;
585         void *t;
586         int r;
587
588         assert(f);
589
590         p = le64toh(f->header->field_hash_table_offset);
591         s = le64toh(f->header->field_hash_table_size);
592
593         r = journal_file_move_to(f,
594                                  WINDOW_FIELD_HASH_TABLE,
595                                  p, s,
596                                  &t);
597         if (r < 0)
598                 return r;
599
600         f->field_hash_table = t;
601         return 0;
602 }
603
604 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
605         uint64_t p, h;
606         int r;
607
608         assert(f);
609         assert(o);
610         assert(offset > 0);
611         assert(o->object.type == OBJECT_DATA);
612
613         /* This might alter the window we are looking at */
614
615         o->data.next_hash_offset = o->data.next_field_offset = 0;
616         o->data.entry_offset = o->data.entry_array_offset = 0;
617         o->data.n_entries = 0;
618
619         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
620         p = le64toh(f->data_hash_table[h].tail_hash_offset);
621         if (p == 0) {
622                 /* Only entry in the hash table is easy */
623                 f->data_hash_table[h].head_hash_offset = htole64(offset);
624         } else {
625                 /* Move back to the previous data object, to patch in
626                  * pointer */
627
628                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
629                 if (r < 0)
630                         return r;
631
632                 o->data.next_hash_offset = htole64(offset);
633         }
634
635         f->data_hash_table[h].tail_hash_offset = htole64(offset);
636
637         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
638                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
639
640         return 0;
641 }
642
643 int journal_file_find_data_object_with_hash(
644                 JournalFile *f,
645                 const void *data, uint64_t size, uint64_t hash,
646                 Object **ret, uint64_t *offset) {
647
648         uint64_t p, osize, h;
649         int r;
650
651         assert(f);
652         assert(data || size == 0);
653
654         osize = offsetof(Object, data.payload) + size;
655
656         if (f->header->data_hash_table_size == 0)
657                 return -EBADMSG;
658
659         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
660         p = le64toh(f->data_hash_table[h].head_hash_offset);
661
662         while (p > 0) {
663                 Object *o;
664
665                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
666                 if (r < 0)
667                         return r;
668
669                 if (le64toh(o->data.hash) != hash)
670                         goto next;
671
672                 if (o->object.flags & OBJECT_COMPRESSED) {
673 #ifdef HAVE_XZ
674                         uint64_t l, rsize;
675
676                         l = le64toh(o->object.size);
677                         if (l <= offsetof(Object, data.payload))
678                                 return -EBADMSG;
679
680                         l -= offsetof(Object, data.payload);
681
682                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
683                                 return -EBADMSG;
684
685                         if (rsize == size &&
686                             memcmp(f->compress_buffer, data, size) == 0) {
687
688                                 if (ret)
689                                         *ret = o;
690
691                                 if (offset)
692                                         *offset = p;
693
694                                 return 1;
695                         }
696 #else
697                         return -EPROTONOSUPPORT;
698 #endif
699
700                 } else if (le64toh(o->object.size) == osize &&
701                            memcmp(o->data.payload, data, size) == 0) {
702
703                         if (ret)
704                                 *ret = o;
705
706                         if (offset)
707                                 *offset = p;
708
709                         return 1;
710                 }
711
712         next:
713                 p = le64toh(o->data.next_hash_offset);
714         }
715
716         return 0;
717 }
718
719 int journal_file_find_data_object(
720                 JournalFile *f,
721                 const void *data, uint64_t size,
722                 Object **ret, uint64_t *offset) {
723
724         uint64_t hash;
725
726         assert(f);
727         assert(data || size == 0);
728
729         hash = hash64(data, size);
730
731         return journal_file_find_data_object_with_hash(f,
732                                                        data, size, hash,
733                                                        ret, offset);
734 }
735
736 static int journal_file_append_data(
737                 JournalFile *f,
738                 const void *data, uint64_t size,
739                 Object **ret, uint64_t *offset) {
740
741         uint64_t hash, p;
742         uint64_t osize;
743         Object *o;
744         int r;
745         bool compressed = false;
746
747         assert(f);
748         assert(data || size == 0);
749
750         hash = hash64(data, size);
751
752         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
753         if (r < 0)
754                 return r;
755         else if (r > 0) {
756
757                 if (ret)
758                         *ret = o;
759
760                 if (offset)
761                         *offset = p;
762
763                 return 0;
764         }
765
766         osize = offsetof(Object, data.payload) + size;
767         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
768         if (r < 0)
769                 return r;
770
771         o->data.hash = htole64(hash);
772
773 #ifdef HAVE_XZ
774         if (f->compress &&
775             size >= COMPRESSION_SIZE_THRESHOLD) {
776                 uint64_t rsize;
777
778                 compressed = compress_blob(data, size, o->data.payload, &rsize);
779
780                 if (compressed) {
781                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
782                         o->object.flags |= OBJECT_COMPRESSED;
783
784                         f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
785
786                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
787                 }
788         }
789 #endif
790
791         if (!compressed && size > 0)
792                 memcpy(o->data.payload, data, size);
793
794         r = journal_file_link_data(f, o, p, hash);
795         if (r < 0)
796                 return r;
797
798         /* The linking might have altered the window, so let's
799          * refresh our pointer */
800         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
801         if (r < 0)
802                 return r;
803
804         if (ret)
805                 *ret = o;
806
807         if (offset)
808                 *offset = p;
809
810         return 0;
811 }
812
813 uint64_t journal_file_entry_n_items(Object *o) {
814         assert(o);
815         assert(o->object.type == OBJECT_ENTRY);
816
817         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
818 }
819
820 static uint64_t journal_file_entry_array_n_items(Object *o) {
821         assert(o);
822         assert(o->object.type == OBJECT_ENTRY_ARRAY);
823
824         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
825 }
826
827 static int link_entry_into_array(JournalFile *f,
828                                  le64_t *first,
829                                  le64_t *idx,
830                                  uint64_t p) {
831         int r;
832         uint64_t n = 0, ap = 0, q, i, a, hidx;
833         Object *o;
834
835         assert(f);
836         assert(first);
837         assert(idx);
838         assert(p > 0);
839
840         a = le64toh(*first);
841         i = hidx = le64toh(*idx);
842         while (a > 0) {
843
844                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
845                 if (r < 0)
846                         return r;
847
848                 n = journal_file_entry_array_n_items(o);
849                 if (i < n) {
850                         o->entry_array.items[i] = htole64(p);
851                         *idx = htole64(hidx + 1);
852                         return 0;
853                 }
854
855                 i -= n;
856                 ap = a;
857                 a = le64toh(o->entry_array.next_entry_array_offset);
858         }
859
860         if (hidx > n)
861                 n = (hidx+1) * 2;
862         else
863                 n = n * 2;
864
865         if (n < 4)
866                 n = 4;
867
868         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
869                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
870                                        &o, &q);
871         if (r < 0)
872                 return r;
873
874         o->entry_array.items[i] = htole64(p);
875
876         if (ap == 0)
877                 *first = htole64(q);
878         else {
879                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
880                 if (r < 0)
881                         return r;
882
883                 o->entry_array.next_entry_array_offset = htole64(q);
884         }
885
886         *idx = htole64(hidx + 1);
887
888         return 0;
889 }
890
891 static int link_entry_into_array_plus_one(JournalFile *f,
892                                           le64_t *extra,
893                                           le64_t *first,
894                                           le64_t *idx,
895                                           uint64_t p) {
896
897         int r;
898
899         assert(f);
900         assert(extra);
901         assert(first);
902         assert(idx);
903         assert(p > 0);
904
905         if (*idx == 0)
906                 *extra = htole64(p);
907         else {
908                 le64_t i;
909
910                 i = htole64(le64toh(*idx) - 1);
911                 r = link_entry_into_array(f, first, &i, p);
912                 if (r < 0)
913                         return r;
914         }
915
916         *idx = htole64(le64toh(*idx) + 1);
917         return 0;
918 }
919
920 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
921         uint64_t p;
922         int r;
923         assert(f);
924         assert(o);
925         assert(offset > 0);
926
927         p = le64toh(o->entry.items[i].object_offset);
928         if (p == 0)
929                 return -EINVAL;
930
931         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
932         if (r < 0)
933                 return r;
934
935         return link_entry_into_array_plus_one(f,
936                                               &o->data.entry_offset,
937                                               &o->data.entry_array_offset,
938                                               &o->data.n_entries,
939                                               offset);
940 }
941
942 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
943         uint64_t n, i;
944         int r;
945
946         assert(f);
947         assert(o);
948         assert(offset > 0);
949         assert(o->object.type == OBJECT_ENTRY);
950
951         __sync_synchronize();
952
953         /* Link up the entry itself */
954         r = link_entry_into_array(f,
955                                   &f->header->entry_array_offset,
956                                   &f->header->n_entries,
957                                   offset);
958         if (r < 0)
959                 return r;
960
961         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
962
963         if (f->header->head_entry_realtime == 0)
964                 f->header->head_entry_realtime = o->entry.realtime;
965
966         f->header->tail_entry_realtime = o->entry.realtime;
967         f->header->tail_entry_monotonic = o->entry.monotonic;
968
969         f->tail_entry_monotonic_valid = true;
970
971         /* Link up the items */
972         n = journal_file_entry_n_items(o);
973         for (i = 0; i < n; i++) {
974                 r = journal_file_link_entry_item(f, o, offset, i);
975                 if (r < 0)
976                         return r;
977         }
978
979         return 0;
980 }
981
982 static int journal_file_append_entry_internal(
983                 JournalFile *f,
984                 const dual_timestamp *ts,
985                 uint64_t xor_hash,
986                 const EntryItem items[], unsigned n_items,
987                 uint64_t *seqnum,
988                 Object **ret, uint64_t *offset) {
989         uint64_t np;
990         uint64_t osize;
991         Object *o;
992         int r;
993
994         assert(f);
995         assert(items || n_items == 0);
996         assert(ts);
997
998         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
999
1000         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1001         if (r < 0)
1002                 return r;
1003
1004         o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
1005         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1006         o->entry.realtime = htole64(ts->realtime);
1007         o->entry.monotonic = htole64(ts->monotonic);
1008         o->entry.xor_hash = htole64(xor_hash);
1009         o->entry.boot_id = f->header->boot_id;
1010
1011         r = journal_file_link_entry(f, o, np);
1012         if (r < 0)
1013                 return r;
1014
1015         if (ret)
1016                 *ret = o;
1017
1018         if (offset)
1019                 *offset = np;
1020
1021         return 0;
1022 }
1023
1024 void journal_file_post_change(JournalFile *f) {
1025         assert(f);
1026
1027         /* inotify() does not receive IN_MODIFY events from file
1028          * accesses done via mmap(). After each access we hence
1029          * trigger IN_MODIFY by truncating the journal file to its
1030          * current size which triggers IN_MODIFY. */
1031
1032         __sync_synchronize();
1033
1034         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1035                 log_error("Failed to to truncate file to its own size: %m");
1036 }
1037
1038 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1039         unsigned i;
1040         EntryItem *items;
1041         int r;
1042         uint64_t xor_hash = 0;
1043         struct dual_timestamp _ts;
1044
1045         assert(f);
1046         assert(iovec || n_iovec == 0);
1047
1048         if (!f->writable)
1049                 return -EPERM;
1050
1051         if (!ts) {
1052                 dual_timestamp_get(&_ts);
1053                 ts = &_ts;
1054         }
1055
1056         if (f->tail_entry_monotonic_valid &&
1057             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1058                 return -EINVAL;
1059
1060         /* alloca() can't take 0, hence let's allocate at least one */
1061         items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1062
1063         for (i = 0; i < n_iovec; i++) {
1064                 uint64_t p;
1065                 Object *o;
1066
1067                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1068                 if (r < 0)
1069                         return r;
1070
1071                 xor_hash ^= le64toh(o->data.hash);
1072                 items[i].object_offset = htole64(p);
1073                 items[i].hash = o->data.hash;
1074         }
1075
1076         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1077
1078         journal_file_post_change(f);
1079
1080         return r;
1081 }
1082
1083 static int generic_array_get(JournalFile *f,
1084                              uint64_t first,
1085                              uint64_t i,
1086                              Object **ret, uint64_t *offset) {
1087
1088         Object *o;
1089         uint64_t p = 0, a;
1090         int r;
1091
1092         assert(f);
1093
1094         a = first;
1095         while (a > 0) {
1096                 uint64_t n;
1097
1098                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1099                 if (r < 0)
1100                         return r;
1101
1102                 n = journal_file_entry_array_n_items(o);
1103                 if (i < n) {
1104                         p = le64toh(o->entry_array.items[i]);
1105                         break;
1106                 }
1107
1108                 i -= n;
1109                 a = le64toh(o->entry_array.next_entry_array_offset);
1110         }
1111
1112         if (a <= 0 || p <= 0)
1113                 return 0;
1114
1115         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1116         if (r < 0)
1117                 return r;
1118
1119         if (ret)
1120                 *ret = o;
1121
1122         if (offset)
1123                 *offset = p;
1124
1125         return 1;
1126 }
1127
1128 static int generic_array_get_plus_one(JournalFile *f,
1129                                       uint64_t extra,
1130                                       uint64_t first,
1131                                       uint64_t i,
1132                                       Object **ret, uint64_t *offset) {
1133
1134         Object *o;
1135
1136         assert(f);
1137
1138         if (i == 0) {
1139                 int r;
1140
1141                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1142                 if (r < 0)
1143                         return r;
1144
1145                 if (ret)
1146                         *ret = o;
1147
1148                 if (offset)
1149                         *offset = extra;
1150
1151                 return 1;
1152         }
1153
1154         return generic_array_get(f, first, i-1, ret, offset);
1155 }
1156
/* Results of the test_object callbacks used by the bisection helpers */
enum {
        TEST_FOUND = 0,  /* the probed entry matches the needle */
        TEST_LEFT  = 1,  /* the probed entry is smaller than the needle */
        TEST_RIGHT = 2   /* the probed entry is larger than the needle */
};
1162
1163 static int generic_array_bisect(JournalFile *f,
1164                                 uint64_t first,
1165                                 uint64_t n,
1166                                 uint64_t needle,
1167                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1168                                 direction_t direction,
1169                                 Object **ret,
1170                                 uint64_t *offset,
1171                                 uint64_t *idx) {
1172
1173         uint64_t a, p, t = 0, i = 0, last_p = 0;
1174         bool subtract_one = false;
1175         Object *o, *array = NULL;
1176         int r;
1177
1178         assert(f);
1179         assert(test_object);
1180
1181         a = first;
1182         while (a > 0) {
1183                 uint64_t left, right, k, lp;
1184
1185                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1186                 if (r < 0)
1187                         return r;
1188
1189                 k = journal_file_entry_array_n_items(array);
1190                 right = MIN(k, n);
1191                 if (right <= 0)
1192                         return 0;
1193
1194                 i = right - 1;
1195                 lp = p = le64toh(array->entry_array.items[i]);
1196                 if (p <= 0)
1197                         return -EBADMSG;
1198
1199                 r = test_object(f, p, needle);
1200                 if (r < 0)
1201                         return r;
1202
1203                 if (r == TEST_FOUND)
1204                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1205
1206                 if (r == TEST_RIGHT) {
1207                         left = 0;
1208                         right -= 1;
1209                         for (;;) {
1210                                 if (left == right) {
1211                                         if (direction == DIRECTION_UP)
1212                                                 subtract_one = true;
1213
1214                                         i = left;
1215                                         goto found;
1216                                 }
1217
1218                                 assert(left < right);
1219
1220                                 i = (left + right) / 2;
1221                                 p = le64toh(array->entry_array.items[i]);
1222                                 if (p <= 0)
1223                                         return -EBADMSG;
1224
1225                                 r = test_object(f, p, needle);
1226                                 if (r < 0)
1227                                         return r;
1228
1229                                 if (r == TEST_FOUND)
1230                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1231
1232                                 if (r == TEST_RIGHT)
1233                                         right = i;
1234                                 else
1235                                         left = i + 1;
1236                         }
1237                 }
1238
1239                 if (k > n) {
1240                         if (direction == DIRECTION_UP) {
1241                                 i = n;
1242                                 subtract_one = true;
1243                                 goto found;
1244                         }
1245
1246                         return 0;
1247                 }
1248
1249                 last_p = lp;
1250
1251                 n -= k;
1252                 t += k;
1253                 a = le64toh(array->entry_array.next_entry_array_offset);
1254         }
1255
1256         return 0;
1257
1258 found:
1259         if (subtract_one && t == 0 && i == 0)
1260                 return 0;
1261
1262         if (subtract_one && i == 0)
1263                 p = last_p;
1264         else if (subtract_one)
1265                 p = le64toh(array->entry_array.items[i-1]);
1266         else
1267                 p = le64toh(array->entry_array.items[i]);
1268
1269         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1270         if (r < 0)
1271                 return r;
1272
1273         if (ret)
1274                 *ret = o;
1275
1276         if (offset)
1277                 *offset = p;
1278
1279         if (idx)
1280                 *idx = t + i + (subtract_one ? -1 : 0);
1281
1282         return 1;
1283 }
1284
1285 static int generic_array_bisect_plus_one(JournalFile *f,
1286                                          uint64_t extra,
1287                                          uint64_t first,
1288                                          uint64_t n,
1289                                          uint64_t needle,
1290                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1291                                          direction_t direction,
1292                                          Object **ret,
1293                                          uint64_t *offset,
1294                                          uint64_t *idx) {
1295
1296         int r;
1297         bool step_back = false;
1298         Object *o;
1299
1300         assert(f);
1301         assert(test_object);
1302
1303         if (n <= 0)
1304                 return 0;
1305
1306         /* This bisects the array in object 'first', but first checks
1307          * an extra  */
1308         r = test_object(f, extra, needle);
1309         if (r < 0)
1310                 return r;
1311
1312         if (r == TEST_FOUND)
1313                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1314
1315         /* if we are looking with DIRECTION_UP then we need to first
1316            see if in the actual array there is a matching entry, and
1317            return the last one of that. But if there isn't any we need
1318            to return this one. Hence remember this, and return it
1319            below. */
1320         if (r == TEST_LEFT)
1321                 step_back = direction == DIRECTION_UP;
1322
1323         if (r == TEST_RIGHT) {
1324                 if (direction == DIRECTION_DOWN)
1325                         goto found;
1326                 else
1327                         return 0;
1328         }
1329
1330         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1331
1332         if (r == 0 && step_back)
1333                 goto found;
1334
1335         if (r > 0 && idx)
1336                 (*idx) ++;
1337
1338         return r;
1339
1340 found:
1341         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1342         if (r < 0)
1343                 return r;
1344
1345         if (ret)
1346                 *ret = o;
1347
1348         if (offset)
1349                 *offset = extra;
1350
1351         if (idx)
1352                 *idx = 0;
1353
1354         return 1;
1355 }
1356
1357 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1358         assert(f);
1359         assert(p > 0);
1360
1361         if (p == needle)
1362                 return TEST_FOUND;
1363         else if (p < needle)
1364                 return TEST_LEFT;
1365         else
1366                 return TEST_RIGHT;
1367 }
1368
1369 int journal_file_move_to_entry_by_offset(
1370                 JournalFile *f,
1371                 uint64_t p,
1372                 direction_t direction,
1373                 Object **ret,
1374                 uint64_t *offset) {
1375
1376         return generic_array_bisect(f,
1377                                     le64toh(f->header->entry_array_offset),
1378                                     le64toh(f->header->n_entries),
1379                                     p,
1380                                     test_object_offset,
1381                                     direction,
1382                                     ret, offset, NULL);
1383 }
1384
1385
1386 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1387         Object *o;
1388         int r;
1389
1390         assert(f);
1391         assert(p > 0);
1392
1393         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1394         if (r < 0)
1395                 return r;
1396
1397         if (le64toh(o->entry.seqnum) == needle)
1398                 return TEST_FOUND;
1399         else if (le64toh(o->entry.seqnum) < needle)
1400                 return TEST_LEFT;
1401         else
1402                 return TEST_RIGHT;
1403 }
1404
1405 int journal_file_move_to_entry_by_seqnum(
1406                 JournalFile *f,
1407                 uint64_t seqnum,
1408                 direction_t direction,
1409                 Object **ret,
1410                 uint64_t *offset) {
1411
1412         return generic_array_bisect(f,
1413                                     le64toh(f->header->entry_array_offset),
1414                                     le64toh(f->header->n_entries),
1415                                     seqnum,
1416                                     test_object_seqnum,
1417                                     direction,
1418                                     ret, offset, NULL);
1419 }
1420
1421 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1422         Object *o;
1423         int r;
1424
1425         assert(f);
1426         assert(p > 0);
1427
1428         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1429         if (r < 0)
1430                 return r;
1431
1432         if (le64toh(o->entry.realtime) == needle)
1433                 return TEST_FOUND;
1434         else if (le64toh(o->entry.realtime) < needle)
1435                 return TEST_LEFT;
1436         else
1437                 return TEST_RIGHT;
1438 }
1439
1440 int journal_file_move_to_entry_by_realtime(
1441                 JournalFile *f,
1442                 uint64_t realtime,
1443                 direction_t direction,
1444                 Object **ret,
1445                 uint64_t *offset) {
1446
1447         return generic_array_bisect(f,
1448                                     le64toh(f->header->entry_array_offset),
1449                                     le64toh(f->header->n_entries),
1450                                     realtime,
1451                                     test_object_realtime,
1452                                     direction,
1453                                     ret, offset, NULL);
1454 }
1455
1456 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1457         Object *o;
1458         int r;
1459
1460         assert(f);
1461         assert(p > 0);
1462
1463         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1464         if (r < 0)
1465                 return r;
1466
1467         if (le64toh(o->entry.monotonic) == needle)
1468                 return TEST_FOUND;
1469         else if (le64toh(o->entry.monotonic) < needle)
1470                 return TEST_LEFT;
1471         else
1472                 return TEST_RIGHT;
1473 }
1474
1475 int journal_file_move_to_entry_by_monotonic(
1476                 JournalFile *f,
1477                 sd_id128_t boot_id,
1478                 uint64_t monotonic,
1479                 direction_t direction,
1480                 Object **ret,
1481                 uint64_t *offset) {
1482
1483         char t[9+32+1] = "_BOOT_ID=";
1484         Object *o;
1485         int r;
1486
1487         assert(f);
1488
1489         sd_id128_to_string(boot_id, t + 9);
1490         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1491         if (r < 0)
1492                 return r;
1493         if (r == 0)
1494                 return -ENOENT;
1495
1496         return generic_array_bisect_plus_one(f,
1497                                              le64toh(o->data.entry_offset),
1498                                              le64toh(o->data.entry_array_offset),
1499                                              le64toh(o->data.n_entries),
1500                                              monotonic,
1501                                              test_object_monotonic,
1502                                              direction,
1503                                              ret, offset, NULL);
1504 }
1505
1506 int journal_file_next_entry(
1507                 JournalFile *f,
1508                 Object *o, uint64_t p,
1509                 direction_t direction,
1510                 Object **ret, uint64_t *offset) {
1511
1512         uint64_t i, n;
1513         int r;
1514
1515         assert(f);
1516         assert(p > 0 || !o);
1517
1518         n = le64toh(f->header->n_entries);
1519         if (n <= 0)
1520                 return 0;
1521
1522         if (!o)
1523                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1524         else {
1525                 if (o->object.type != OBJECT_ENTRY)
1526                         return -EINVAL;
1527
1528                 r = generic_array_bisect(f,
1529                                          le64toh(f->header->entry_array_offset),
1530                                          le64toh(f->header->n_entries),
1531                                          p,
1532                                          test_object_offset,
1533                                          DIRECTION_DOWN,
1534                                          NULL, NULL,
1535                                          &i);
1536                 if (r <= 0)
1537                         return r;
1538
1539                 if (direction == DIRECTION_DOWN) {
1540                         if (i >= n - 1)
1541                                 return 0;
1542
1543                         i++;
1544                 } else {
1545                         if (i <= 0)
1546                                 return 0;
1547
1548                         i--;
1549                 }
1550         }
1551
1552         /* And jump to it */
1553         return generic_array_get(f,
1554                                  le64toh(f->header->entry_array_offset),
1555                                  i,
1556                                  ret, offset);
1557 }
1558
1559 int journal_file_skip_entry(
1560                 JournalFile *f,
1561                 Object *o, uint64_t p,
1562                 int64_t skip,
1563                 Object **ret, uint64_t *offset) {
1564
1565         uint64_t i, n;
1566         int r;
1567
1568         assert(f);
1569         assert(o);
1570         assert(p > 0);
1571
1572         if (o->object.type != OBJECT_ENTRY)
1573                 return -EINVAL;
1574
1575         r = generic_array_bisect(f,
1576                                  le64toh(f->header->entry_array_offset),
1577                                  le64toh(f->header->n_entries),
1578                                  p,
1579                                  test_object_offset,
1580                                  DIRECTION_DOWN,
1581                                  NULL, NULL,
1582                                  &i);
1583         if (r <= 0)
1584                 return r;
1585
1586         /* Calculate new index */
1587         if (skip < 0) {
1588                 if ((uint64_t) -skip >= i)
1589                         i = 0;
1590                 else
1591                         i = i - (uint64_t) -skip;
1592         } else
1593                 i  += (uint64_t) skip;
1594
1595         n = le64toh(f->header->n_entries);
1596         if (n <= 0)
1597                 return -EBADMSG;
1598
1599         if (i >= n)
1600                 i = n-1;
1601
1602         return generic_array_get(f,
1603                                  le64toh(f->header->entry_array_offset),
1604                                  i,
1605                                  ret, offset);
1606 }
1607
1608 int journal_file_next_entry_for_data(
1609                 JournalFile *f,
1610                 Object *o, uint64_t p,
1611                 uint64_t data_offset,
1612                 direction_t direction,
1613                 Object **ret, uint64_t *offset) {
1614
1615         uint64_t n, i;
1616         int r;
1617         Object *d;
1618
1619         assert(f);
1620         assert(p > 0 || !o);
1621
1622         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1623         if (r < 0)
1624                 return r;
1625
1626         n = le64toh(d->data.n_entries);
1627         if (n <= 0)
1628                 return n;
1629
1630         if (!o)
1631                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1632         else {
1633                 if (o->object.type != OBJECT_ENTRY)
1634                         return -EINVAL;
1635
1636                 r = generic_array_bisect_plus_one(f,
1637                                                   le64toh(d->data.entry_offset),
1638                                                   le64toh(d->data.entry_array_offset),
1639                                                   le64toh(d->data.n_entries),
1640                                                   p,
1641                                                   test_object_offset,
1642                                                   DIRECTION_DOWN,
1643                                                   NULL, NULL,
1644                                                   &i);
1645
1646                 if (r <= 0)
1647                         return r;
1648
1649                 if (direction == DIRECTION_DOWN) {
1650                         if (i >= n - 1)
1651                                 return 0;
1652
1653                         i++;
1654                 } else {
1655                         if (i <= 0)
1656                                 return 0;
1657
1658                         i--;
1659                 }
1660
1661         }
1662
1663         return generic_array_get_plus_one(f,
1664                                           le64toh(d->data.entry_offset),
1665                                           le64toh(d->data.entry_array_offset),
1666                                           i,
1667                                           ret, offset);
1668 }
1669
1670 int journal_file_move_to_entry_by_offset_for_data(
1671                 JournalFile *f,
1672                 uint64_t data_offset,
1673                 uint64_t p,
1674                 direction_t direction,
1675                 Object **ret, uint64_t *offset) {
1676
1677         int r;
1678         Object *d;
1679
1680         assert(f);
1681
1682         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1683         if (r < 0)
1684                 return r;
1685
1686         return generic_array_bisect_plus_one(f,
1687                                              le64toh(d->data.entry_offset),
1688                                              le64toh(d->data.entry_array_offset),
1689                                              le64toh(d->data.n_entries),
1690                                              p,
1691                                              test_object_offset,
1692                                              direction,
1693                                              ret, offset, NULL);
1694 }
1695
1696 int journal_file_move_to_entry_by_monotonic_for_data(
1697                 JournalFile *f,
1698                 uint64_t data_offset,
1699                 sd_id128_t boot_id,
1700                 uint64_t monotonic,
1701                 direction_t direction,
1702                 Object **ret, uint64_t *offset) {
1703
1704         char t[9+32+1] = "_BOOT_ID=";
1705         Object *o, *d;
1706         int r;
1707         uint64_t b, z;
1708
1709         assert(f);
1710
1711         /* First, seek by time */
1712         sd_id128_to_string(boot_id, t + 9);
1713         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1714         if (r < 0)
1715                 return r;
1716         if (r == 0)
1717                 return -ENOENT;
1718
1719         r = generic_array_bisect_plus_one(f,
1720                                           le64toh(o->data.entry_offset),
1721                                           le64toh(o->data.entry_array_offset),
1722                                           le64toh(o->data.n_entries),
1723                                           monotonic,
1724                                           test_object_monotonic,
1725                                           direction,
1726                                           NULL, &z, NULL);
1727         if (r <= 0)
1728                 return r;
1729
1730         /* And now, continue seeking until we find an entry that
1731          * exists in both bisection arrays */
1732
1733         for (;;) {
1734                 Object *qo;
1735                 uint64_t p, q;
1736
1737                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1738                 if (r < 0)
1739                         return r;
1740
1741                 r = generic_array_bisect_plus_one(f,
1742                                                   le64toh(d->data.entry_offset),
1743                                                   le64toh(d->data.entry_array_offset),
1744                                                   le64toh(d->data.n_entries),
1745                                                   z,
1746                                                   test_object_offset,
1747                                                   direction,
1748                                                   NULL, &p, NULL);
1749                 if (r <= 0)
1750                         return r;
1751
1752                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1753                 if (r < 0)
1754                         return r;
1755
1756                 r = generic_array_bisect_plus_one(f,
1757                                                   le64toh(o->data.entry_offset),
1758                                                   le64toh(o->data.entry_array_offset),
1759                                                   le64toh(o->data.n_entries),
1760                                                   p,
1761                                                   test_object_offset,
1762                                                   direction,
1763                                                   &qo, &q, NULL);
1764
1765                 if (r <= 0)
1766                         return r;
1767
1768                 if (p == q) {
1769                         if (ret)
1770                                 *ret = qo;
1771                         if (offset)
1772                                 *offset = q;
1773
1774                         return 1;
1775                 }
1776
1777                 z = q;
1778         }
1779
1780         return 0;
1781 }
1782
1783 int journal_file_move_to_entry_by_seqnum_for_data(
1784                 JournalFile *f,
1785                 uint64_t data_offset,
1786                 uint64_t seqnum,
1787                 direction_t direction,
1788                 Object **ret, uint64_t *offset) {
1789
1790         Object *d;
1791         int r;
1792
1793         assert(f);
1794
1795         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1796         if (r < 0)
1797                 return r;
1798
1799         return generic_array_bisect_plus_one(f,
1800                                              le64toh(d->data.entry_offset),
1801                                              le64toh(d->data.entry_array_offset),
1802                                              le64toh(d->data.n_entries),
1803                                              seqnum,
1804                                              test_object_seqnum,
1805                                              direction,
1806                                              ret, offset, NULL);
1807 }
1808
1809 int journal_file_move_to_entry_by_realtime_for_data(
1810                 JournalFile *f,
1811                 uint64_t data_offset,
1812                 uint64_t realtime,
1813                 direction_t direction,
1814                 Object **ret, uint64_t *offset) {
1815
1816         Object *d;
1817         int r;
1818
1819         assert(f);
1820
1821         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1822         if (r < 0)
1823                 return r;
1824
1825         return generic_array_bisect_plus_one(f,
1826                                              le64toh(d->data.entry_offset),
1827                                              le64toh(d->data.entry_array_offset),
1828                                              le64toh(d->data.n_entries),
1829                                              realtime,
1830                                              test_object_realtime,
1831                                              direction,
1832                                              ret, offset, NULL);
1833 }
1834
1835 void journal_file_dump(JournalFile *f) {
1836         Object *o;
1837         int r;
1838         uint64_t p;
1839
1840         assert(f);
1841
1842         journal_file_print_header(f);
1843
1844         p = le64toh(f->header->header_size);
1845         while (p != 0) {
1846                 r = journal_file_move_to_object(f, -1, p, &o);
1847                 if (r < 0)
1848                         goto fail;
1849
1850                 switch (o->object.type) {
1851
1852                 case OBJECT_UNUSED:
1853                         printf("Type: OBJECT_UNUSED\n");
1854                         break;
1855
1856                 case OBJECT_DATA:
1857                         printf("Type: OBJECT_DATA\n");
1858                         break;
1859
1860                 case OBJECT_ENTRY:
1861                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1862                                (unsigned long long) le64toh(o->entry.seqnum),
1863                                (unsigned long long) le64toh(o->entry.monotonic),
1864                                (unsigned long long) le64toh(o->entry.realtime));
1865                         break;
1866
1867                 case OBJECT_FIELD_HASH_TABLE:
1868                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1869                         break;
1870
1871                 case OBJECT_DATA_HASH_TABLE:
1872                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1873                         break;
1874
1875                 case OBJECT_ENTRY_ARRAY:
1876                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1877                         break;
1878
1879                 case OBJECT_SIGNATURE:
1880                         printf("Type: OBJECT_SIGNATURE\n");
1881                         break;
1882                 }
1883
1884                 if (o->object.flags & OBJECT_COMPRESSED)
1885                         printf("Flags: COMPRESSED\n");
1886
1887                 if (p == le64toh(f->header->tail_object_offset))
1888                         p = 0;
1889                 else
1890                         p = p + ALIGN64(le64toh(o->object.size));
1891         }
1892
1893         return;
1894 fail:
1895         log_error("File corrupt");
1896 }
1897
1898 void journal_file_print_header(JournalFile *f) {
1899         char a[33], b[33], c[33];
1900         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
1901
1902         assert(f);
1903
1904         printf("File Path: %s\n"
1905                "File ID: %s\n"
1906                "Machine ID: %s\n"
1907                "Boot ID: %s\n"
1908                "Sequential Number ID: %s\n"
1909                "State: %s\n"
1910                "Compatible Flags:%s%s\n"
1911                "Incompatible Flags:%s%s\n"
1912                "Header size: %llu\n"
1913                "Arena size: %llu\n"
1914                "Data Hash Table Size: %llu\n"
1915                "Field Hash Table Size: %llu\n"
1916                "Objects: %llu\n"
1917                "Entry Objects: %llu\n"
1918                "Rotate Suggested: %s\n"
1919                "Head Sequential Number: %llu\n"
1920                "Tail Sequential Number: %llu\n"
1921                "Head Realtime Timestamp: %s\n"
1922                "Tail Realtime Timestamp: %s\n",
1923                f->path,
1924                sd_id128_to_string(f->header->file_id, a),
1925                sd_id128_to_string(f->header->machine_id, b),
1926                sd_id128_to_string(f->header->boot_id, c),
1927                sd_id128_to_string(f->header->seqnum_id, c),
1928                f->header->state == STATE_OFFLINE ? "offline" :
1929                f->header->state == STATE_ONLINE ? "online" :
1930                f->header->state == STATE_ARCHIVED ? "archived" : "unknown",
1931                (f->header->compatible_flags & HEADER_COMPATIBLE_SIGNED) ? " SIGNED" : "",
1932                (f->header->compatible_flags & ~HEADER_COMPATIBLE_SIGNED) ? " ???" : "",
1933                (f->header->incompatible_flags & HEADER_INCOMPATIBLE_COMPRESSED) ? " COMPRESSED" : "",
1934                (f->header->incompatible_flags & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
1935                (unsigned long long) le64toh(f->header->header_size),
1936                (unsigned long long) le64toh(f->header->arena_size),
1937                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1938                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
1939                (unsigned long long) le64toh(f->header->n_objects),
1940                (unsigned long long) le64toh(f->header->n_entries),
1941                yes_no(journal_file_rotate_suggested(f)),
1942                (unsigned long long) le64toh(f->header->head_seqnum),
1943                (unsigned long long) le64toh(f->header->tail_seqnum),
1944                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
1945                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)));
1946
1947         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1948                 printf("Data Objects: %llu\n"
1949                        "Data Hash Table Fill: %.1f%%\n",
1950                        (unsigned long long) le64toh(f->header->n_data),
1951                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
1952
1953         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1954                 printf("Field Objects: %llu\n"
1955                        "Field Hash Table Fill: %.1f%%\n",
1956                        (unsigned long long) le64toh(f->header->n_fields),
1957                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
1958 }
1959
1960 int journal_file_open(
1961                 const char *fname,
1962                 int flags,
1963                 mode_t mode,
1964                 JournalMetrics *metrics,
1965                 JournalFile *template,
1966                 JournalFile **ret) {
1967
1968         JournalFile *f;
1969         int r;
1970         bool newly_created = false;
1971
1972         assert(fname);
1973
1974         if ((flags & O_ACCMODE) != O_RDONLY &&
1975             (flags & O_ACCMODE) != O_RDWR)
1976                 return -EINVAL;
1977
1978         if (!endswith(fname, ".journal"))
1979                 return -EINVAL;
1980
1981         f = new0(JournalFile, 1);
1982         if (!f)
1983                 return -ENOMEM;
1984
1985         f->fd = -1;
1986         f->flags = flags;
1987         f->mode = mode;
1988         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1989         f->prot = prot_from_flags(flags);
1990
1991         if (template)
1992                 f->compress = template->compress;
1993
1994         f->path = strdup(fname);
1995         if (!f->path) {
1996                 r = -ENOMEM;
1997                 goto fail;
1998         }
1999
2000         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2001         if (f->fd < 0) {
2002                 r = -errno;
2003                 goto fail;
2004         }
2005
2006         if (fstat(f->fd, &f->last_stat) < 0) {
2007                 r = -errno;
2008                 goto fail;
2009         }
2010
2011         if (f->last_stat.st_size == 0 && f->writable) {
2012                 newly_created = true;
2013
2014                 r = journal_file_init_header(f, template);
2015                 if (r < 0)
2016                         goto fail;
2017
2018                 if (fstat(f->fd, &f->last_stat) < 0) {
2019                         r = -errno;
2020                         goto fail;
2021                 }
2022         }
2023
2024         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2025                 r = -EIO;
2026                 goto fail;
2027         }
2028
2029         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2030         if (f->header == MAP_FAILED) {
2031                 f->header = NULL;
2032                 r = -errno;
2033                 goto fail;
2034         }
2035
2036         if (!newly_created) {
2037                 r = journal_file_verify_header(f);
2038                 if (r < 0)
2039                         goto fail;
2040         }
2041
2042         if (f->writable) {
2043                 if (metrics) {
2044                         journal_default_metrics(metrics, f->fd);
2045                         f->metrics = *metrics;
2046                 } else if (template)
2047                         f->metrics = template->metrics;
2048
2049                 r = journal_file_refresh_header(f);
2050                 if (r < 0)
2051                         goto fail;
2052         }
2053
2054         if (newly_created) {
2055
2056                 r = journal_file_setup_field_hash_table(f);
2057                 if (r < 0)
2058                         goto fail;
2059
2060                 r = journal_file_setup_data_hash_table(f);
2061                 if (r < 0)
2062                         goto fail;
2063         }
2064
2065         r = journal_file_map_field_hash_table(f);
2066         if (r < 0)
2067                 goto fail;
2068
2069         r = journal_file_map_data_hash_table(f);
2070         if (r < 0)
2071                 goto fail;
2072
2073         if (ret)
2074                 *ret = f;
2075
2076         return 0;
2077
2078 fail:
2079         journal_file_close(f);
2080
2081         return r;
2082 }
2083
2084 int journal_file_rotate(JournalFile **f) {
2085         char *p;
2086         size_t l;
2087         JournalFile *old_file, *new_file = NULL;
2088         int r;
2089
2090         assert(f);
2091         assert(*f);
2092
2093         old_file = *f;
2094
2095         if (!old_file->writable)
2096                 return -EINVAL;
2097
2098         if (!endswith(old_file->path, ".journal"))
2099                 return -EINVAL;
2100
2101         l = strlen(old_file->path);
2102
2103         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2104         if (!p)
2105                 return -ENOMEM;
2106
2107         memcpy(p, old_file->path, l - 8);
2108         p[l-8] = '@';
2109         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2110         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2111                  "-%016llx-%016llx.journal",
2112                  (unsigned long long) le64toh((*f)->header->tail_seqnum),
2113                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2114
2115         r = rename(old_file->path, p);
2116         free(p);
2117
2118         if (r < 0)
2119                 return -errno;
2120
2121         old_file->header->state = STATE_ARCHIVED;
2122
2123         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, NULL, old_file, &new_file);
2124         journal_file_close(old_file);
2125
2126         *f = new_file;
2127         return r;
2128 }
2129
2130 int journal_file_open_reliably(
2131                 const char *fname,
2132                 int flags,
2133                 mode_t mode,
2134                 JournalMetrics *metrics,
2135                 JournalFile *template,
2136                 JournalFile **ret) {
2137
2138         int r;
2139         size_t l;
2140         char *p;
2141
2142         r = journal_file_open(fname, flags, mode, metrics, template, ret);
2143         if (r != -EBADMSG && /* corrupted */
2144             r != -ENODATA && /* truncated */
2145             r != -EHOSTDOWN && /* other machine */
2146             r != -EPROTONOSUPPORT && /* incompatible feature */
2147             r != -EBUSY && /* unclean shutdown */
2148             r != -ESHUTDOWN /* already archived */)
2149                 return r;
2150
2151         if ((flags & O_ACCMODE) == O_RDONLY)
2152                 return r;
2153
2154         if (!(flags & O_CREAT))
2155                 return r;
2156
2157         /* The file is corrupted. Rotate it away and try it again (but only once) */
2158
2159         l = strlen(fname);
2160         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2161                      (int) (l-8), fname,
2162                      (unsigned long long) now(CLOCK_REALTIME),
2163                      random_ull()) < 0)
2164                 return -ENOMEM;
2165
2166         r = rename(fname, p);
2167         free(p);
2168         if (r < 0)
2169                 return -errno;
2170
2171         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2172
2173         return journal_file_open(fname, flags, mode, metrics, template, ret);
2174 }
2175
2176 struct vacuum_info {
2177         off_t usage;
2178         char *filename;
2179
2180         uint64_t realtime;
2181         sd_id128_t seqnum_id;
2182         uint64_t seqnum;
2183
2184         bool have_seqnum;
2185 };
2186
2187 static int vacuum_compare(const void *_a, const void *_b) {
2188         const struct vacuum_info *a, *b;
2189
2190         a = _a;
2191         b = _b;
2192
2193         if (a->have_seqnum && b->have_seqnum &&
2194             sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
2195                 if (a->seqnum < b->seqnum)
2196                         return -1;
2197                 else if (a->seqnum > b->seqnum)
2198                         return 1;
2199                 else
2200                         return 0;
2201         }
2202
2203         if (a->realtime < b->realtime)
2204                 return -1;
2205         else if (a->realtime > b->realtime)
2206                 return 1;
2207         else if (a->have_seqnum && b->have_seqnum)
2208                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
2209         else
2210                 return strcmp(a->filename, b->filename);
2211 }
2212
2213 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
2214         DIR *d;
2215         int r = 0;
2216         struct vacuum_info *list = NULL;
2217         unsigned n_list = 0, n_allocated = 0, i;
2218         uint64_t sum = 0;
2219
2220         assert(directory);
2221
2222         if (max_use <= 0)
2223                 return 0;
2224
2225         d = opendir(directory);
2226         if (!d)
2227                 return -errno;
2228
2229         for (;;) {
2230                 int k;
2231                 struct dirent buf, *de;
2232                 size_t q;
2233                 struct stat st;
2234                 char *p;
2235                 unsigned long long seqnum = 0, realtime;
2236                 sd_id128_t seqnum_id;
2237                 bool have_seqnum;
2238
2239                 k = readdir_r(d, &buf, &de);
2240                 if (k != 0) {
2241                         r = -k;
2242                         goto finish;
2243                 }
2244
2245                 if (!de)
2246                         break;
2247
2248                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2249                         continue;
2250
2251                 if (!S_ISREG(st.st_mode))
2252                         continue;
2253
2254                 q = strlen(de->d_name);
2255
2256                 if (endswith(de->d_name, ".journal")) {
2257
2258                         /* Vacuum archived files */
2259
2260                         if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2261                                 continue;
2262
2263                         if (de->d_name[q-8-16-1] != '-' ||
2264                             de->d_name[q-8-16-1-16-1] != '-' ||
2265                             de->d_name[q-8-16-1-16-1-32-1] != '@')
2266                                 continue;
2267
2268                         p = strdup(de->d_name);
2269                         if (!p) {
2270                                 r = -ENOMEM;
2271                                 goto finish;
2272                         }
2273
2274                         de->d_name[q-8-16-1-16-1] = 0;
2275                         if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2276                                 free(p);
2277                                 continue;
2278                         }
2279
2280                         if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2281                                 free(p);
2282                                 continue;
2283                         }
2284
2285                         have_seqnum = true;
2286
2287                 } else if (endswith(de->d_name, ".journal~")) {
2288                         unsigned long long tmp;
2289
2290                         /* Vacuum corrupted files */
2291
2292                         if (q < 1 + 16 + 1 + 16 + 8 + 1)
2293                                 continue;
2294
2295                         if (de->d_name[q-1-8-16-1] != '-' ||
2296                             de->d_name[q-1-8-16-1-16-1] != '@')
2297                                 continue;
2298
2299                         p = strdup(de->d_name);
2300                         if (!p) {
2301                                 r = -ENOMEM;
2302                                 goto finish;
2303                         }
2304
2305                         if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2306                                 free(p);
2307                                 continue;
2308                         }
2309
2310                         have_seqnum = false;
2311                 } else
2312                         continue;
2313
2314                 if (n_list >= n_allocated) {
2315                         struct vacuum_info *j;
2316
2317                         n_allocated = MAX(n_allocated * 2U, 8U);
2318                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2319                         if (!j) {
2320                                 free(p);
2321                                 r = -ENOMEM;
2322                                 goto finish;
2323                         }
2324
2325                         list = j;
2326                 }
2327
2328                 list[n_list].filename = p;
2329                 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2330                 list[n_list].seqnum = seqnum;
2331                 list[n_list].realtime = realtime;
2332                 list[n_list].seqnum_id = seqnum_id;
2333                 list[n_list].have_seqnum = have_seqnum;
2334
2335                 sum += list[n_list].usage;
2336
2337                 n_list ++;
2338         }
2339
2340         if (n_list > 0)
2341                 qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2342
2343         for(i = 0; i < n_list; i++) {
2344                 struct statvfs ss;
2345
2346                 if (fstatvfs(dirfd(d), &ss) < 0) {
2347                         r = -errno;
2348                         goto finish;
2349                 }
2350
2351                 if (sum <= max_use &&
2352                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2353                         break;
2354
2355                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2356                         log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2357                         sum -= list[i].usage;
2358                 } else if (errno != ENOENT)
2359                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2360         }
2361
2362 finish:
2363         for (i = 0; i < n_list; i++)
2364                 free(list[i].filename);
2365
2366         free(list);
2367
2368         if (d)
2369                 closedir(d);
2370
2371         return r;
2372 }
2373
2374 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2375         uint64_t i, n;
2376         uint64_t q, xor_hash = 0;
2377         int r;
2378         EntryItem *items;
2379         dual_timestamp ts;
2380
2381         assert(from);
2382         assert(to);
2383         assert(o);
2384         assert(p);
2385
2386         if (!to->writable)
2387                 return -EPERM;
2388
2389         ts.monotonic = le64toh(o->entry.monotonic);
2390         ts.realtime = le64toh(o->entry.realtime);
2391
2392         if (to->tail_entry_monotonic_valid &&
2393             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2394                 return -EINVAL;
2395
2396         n = journal_file_entry_n_items(o);
2397         items = alloca(sizeof(EntryItem) * n);
2398
2399         for (i = 0; i < n; i++) {
2400                 uint64_t l, h;
2401                 le64_t le_hash;
2402                 size_t t;
2403                 void *data;
2404                 Object *u;
2405
2406                 q = le64toh(o->entry.items[i].object_offset);
2407                 le_hash = o->entry.items[i].hash;
2408
2409                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2410                 if (r < 0)
2411                         return r;
2412
2413                 if (le_hash != o->data.hash)
2414                         return -EBADMSG;
2415
2416                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2417                 t = (size_t) l;
2418
2419                 /* We hit the limit on 32bit machines */
2420                 if ((uint64_t) t != l)
2421                         return -E2BIG;
2422
2423                 if (o->object.flags & OBJECT_COMPRESSED) {
2424 #ifdef HAVE_XZ
2425                         uint64_t rsize;
2426
2427                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2428                                 return -EBADMSG;
2429
2430                         data = from->compress_buffer;
2431                         l = rsize;
2432 #else
2433                         return -EPROTONOSUPPORT;
2434 #endif
2435                 } else
2436                         data = o->data.payload;
2437
2438                 r = journal_file_append_data(to, data, l, &u, &h);
2439                 if (r < 0)
2440                         return r;
2441
2442                 xor_hash ^= le64toh(u->data.hash);
2443                 items[i].object_offset = htole64(h);
2444                 items[i].hash = u->data.hash;
2445
2446                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2447                 if (r < 0)
2448                         return r;
2449         }
2450
2451         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2452 }
2453
2454 void journal_default_metrics(JournalMetrics *m, int fd) {
2455         uint64_t fs_size = 0;
2456         struct statvfs ss;
2457         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2458
2459         assert(m);
2460         assert(fd >= 0);
2461
2462         if (fstatvfs(fd, &ss) >= 0)
2463                 fs_size = ss.f_frsize * ss.f_blocks;
2464
2465         if (m->max_use == (uint64_t) -1) {
2466
2467                 if (fs_size > 0) {
2468                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2469
2470                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2471                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2472
2473                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2474                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2475                 } else
2476                         m->max_use = DEFAULT_MAX_USE_LOWER;
2477         } else {
2478                 m->max_use = PAGE_ALIGN(m->max_use);
2479
2480                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2481                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2482         }
2483
2484         if (m->max_size == (uint64_t) -1) {
2485                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2486
2487                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2488                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2489         } else
2490                 m->max_size = PAGE_ALIGN(m->max_size);
2491
2492         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2493                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2494
2495         if (m->max_size*2 > m->max_use)
2496                 m->max_use = m->max_size*2;
2497
2498         if (m->min_size == (uint64_t) -1)
2499                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2500         else {
2501                 m->min_size = PAGE_ALIGN(m->min_size);
2502
2503                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2504                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2505
2506                 if (m->min_size > m->max_size)
2507                         m->max_size = m->min_size;
2508         }
2509
2510         if (m->keep_free == (uint64_t) -1) {
2511
2512                 if (fs_size > 0) {
2513                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2514
2515                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2516                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2517
2518                 } else
2519                         m->keep_free = DEFAULT_KEEP_FREE;
2520         }
2521
2522         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2523                  format_bytes(a, sizeof(a), m->max_use),
2524                  format_bytes(b, sizeof(b), m->max_size),
2525                  format_bytes(c, sizeof(c), m->min_size),
2526                  format_bytes(d, sizeof(d), m->keep_free));
2527 }
2528
2529 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2530         assert(f);
2531         assert(from || to);
2532
2533         if (from) {
2534                 if (f->header->head_entry_realtime == 0)
2535                         return -ENOENT;
2536
2537                 *from = le64toh(f->header->head_entry_realtime);
2538         }
2539
2540         if (to) {
2541                 if (f->header->tail_entry_realtime == 0)
2542                         return -ENOENT;
2543
2544                 *to = le64toh(f->header->tail_entry_realtime);
2545         }
2546
2547         return 1;
2548 }
2549
2550 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2551         char t[9+32+1] = "_BOOT_ID=";
2552         Object *o;
2553         uint64_t p;
2554         int r;
2555
2556         assert(f);
2557         assert(from || to);
2558
2559         sd_id128_to_string(boot_id, t + 9);
2560
2561         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2562         if (r <= 0)
2563                 return r;
2564
2565         if (le64toh(o->data.n_entries) <= 0)
2566                 return 0;
2567
2568         if (from) {
2569                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2570                 if (r < 0)
2571                         return r;
2572
2573                 *from = le64toh(o->entry.monotonic);
2574         }
2575
2576         if (to) {
2577                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2578                 if (r < 0)
2579                         return r;
2580
2581                 r = generic_array_get_plus_one(f,
2582                                                le64toh(o->data.entry_offset),
2583                                                le64toh(o->data.entry_array_offset),
2584                                                le64toh(o->data.n_entries)-1,
2585                                                &o, NULL);
2586                 if (r <= 0)
2587                         return r;
2588
2589                 *to = le64toh(o->entry.monotonic);
2590         }
2591
2592         return 1;
2593 }
2594
2595 bool journal_file_rotate_suggested(JournalFile *f) {
2596         assert(f);
2597
2598         /* If we gained new header fields we gained new features,
2599          * hence suggest a rotation */
2600         if (le64toh(f->header->header_size) < sizeof(Header)) {
2601                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2602                 return true;
2603         }
2604
2605         /* Let's check if the hash tables grew over a certain fill
2606          * level (75%, borrowing this value from Java's hash table
2607          * implementation), and if so suggest a rotation. To calculate
2608          * the fill level we need the n_data field, which only exists
2609          * in newer versions. */
2610
2611         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2612                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2613                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2614                                   f->path,
2615                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2616                                   (unsigned long long) le64toh(f->header->n_data),
2617                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2618                                   (unsigned long long) (f->last_stat.st_size),
2619                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2620                         return true;
2621                 }
2622
2623         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2624                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2625                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2626                                   f->path,
2627                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2628                                   (unsigned long long) le64toh(f->header->n_fields),
2629                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2630                         return true;
2631                 }
2632
2633         return false;
2634 }