chiark / gitweb /
test: allow deletion of temporary files from normal fs
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
/* Rounds x up to the next multiple of 8 */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)

/* Sizes (in bytes) used for the data and field hash tables of a newly
 * created file. The data hash table may be made larger than this, see
 * journal_file_setup_data_hash_table(). */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Mappings smaller than this are extended to this size */
#define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)             /* 8 MiB */

/* Data payloads shorter than this are never compressed */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* The smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */

/* Lower and upper bounds applied when max_use is derived from the
 * size of the file system */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* Upper bound applied when max_size is derived from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* Upper bound applied when keep_free is derived from the size of the
 * file system */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* keep_free value used when the file system size cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* n_data was the first field added after the initial file format
 * design, hence every valid header reaches at least up to it */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* True if the header of the file is large enough to contain the
 * specified field */
#define JOURNAL_HEADER_CONTAINS(h, field) \
        (le64toh((h)->header_size) >= offsetof(Header, field) + sizeof((h)->field))

/* The 8 magic bytes every journal file starts with */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
70
71 void journal_file_close(JournalFile *f) {
72         int t;
73
74         assert(f);
75
76         if (f->header) {
77                 /* Mark the file offline. Don't override the archived state if it already is set */
78                 if (f->writable && f->header->state == STATE_ONLINE)
79                         f->header->state = STATE_OFFLINE;
80
81                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
82         }
83
84         for (t = 0; t < _WINDOW_MAX; t++)
85                 if (f->windows[t].ptr)
86                         munmap(f->windows[t].ptr, f->windows[t].size);
87
88         if (f->fd >= 0)
89                 close_nointr_nofail(f->fd);
90
91         free(f->path);
92
93 #ifdef HAVE_XZ
94         free(f->compress_buffer);
95 #endif
96
97         free(f);
98 }
99
100 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
101         Header h;
102         ssize_t k;
103         int r;
104
105         assert(f);
106
107         zero(h);
108         memcpy(h.signature, signature, 8);
109         h.header_size = htole64(ALIGN64(sizeof(h)));
110
111         r = sd_id128_randomize(&h.file_id);
112         if (r < 0)
113                 return r;
114
115         if (template) {
116                 h.seqnum_id = template->header->seqnum_id;
117                 h.tail_seqnum = template->header->tail_seqnum;
118         } else
119                 h.seqnum_id = h.file_id;
120
121         k = pwrite(f->fd, &h, sizeof(h), 0);
122         if (k < 0)
123                 return -errno;
124
125         if (k != sizeof(h))
126                 return -EIO;
127
128         return 0;
129 }
130
131 static int journal_file_refresh_header(JournalFile *f) {
132         int r;
133         sd_id128_t boot_id;
134
135         assert(f);
136
137         r = sd_id128_get_machine(&f->header->machine_id);
138         if (r < 0)
139                 return r;
140
141         r = sd_id128_get_boot(&boot_id);
142         if (r < 0)
143                 return r;
144
145         if (sd_id128_equal(boot_id, f->header->boot_id))
146                 f->tail_entry_monotonic_valid = true;
147
148         f->header->boot_id = boot_id;
149
150         f->header->state = STATE_ONLINE;
151
152         __sync_synchronize();
153
154         return 0;
155 }
156
157 static int journal_file_verify_header(JournalFile *f) {
158         assert(f);
159
160         if (memcmp(f->header, signature, 8))
161                 return -EBADMSG;
162
163 #ifdef HAVE_XZ
164         if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
165                 return -EPROTONOSUPPORT;
166 #else
167         if (f->header->incompatible_flags != 0)
168                 return -EPROTONOSUPPORT;
169 #endif
170
171         /* The first addition was n_data, so check that we are at least this large */
172         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
173                 return -EBADMSG;
174
175         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
176                 return -ENODATA;
177
178         if (f->writable) {
179                 uint8_t state;
180                 sd_id128_t machine_id;
181                 int r;
182
183                 r = sd_id128_get_machine(&machine_id);
184                 if (r < 0)
185                         return r;
186
187                 if (!sd_id128_equal(machine_id, f->header->machine_id))
188                         return -EHOSTDOWN;
189
190                 state = f->header->state;
191
192                 if (state == STATE_ONLINE) {
193                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
194                         return -EBUSY;
195                 } else if (state == STATE_ARCHIVED)
196                         return -ESHUTDOWN;
197                 else if (state != STATE_OFFLINE) {
198                         log_debug("Journal file %s has unknown state %u.", f->path, state);
199                         return -EBUSY;
200                 }
201         }
202
203         return 0;
204 }
205
206 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
207         uint64_t old_size, new_size;
208         int r;
209
210         assert(f);
211
212         /* We assume that this file is not sparse, and we know that
213          * for sure, since we always call posix_fallocate()
214          * ourselves */
215
216         old_size =
217                 le64toh(f->header->header_size) +
218                 le64toh(f->header->arena_size);
219
220         new_size = PAGE_ALIGN(offset + size);
221         if (new_size < le64toh(f->header->header_size))
222                 new_size = le64toh(f->header->header_size);
223
224         if (new_size <= old_size)
225                 return 0;
226
227         if (f->metrics.max_size > 0 &&
228             new_size > f->metrics.max_size)
229                 return -E2BIG;
230
231         if (new_size > f->metrics.min_size &&
232             f->metrics.keep_free > 0) {
233                 struct statvfs svfs;
234
235                 if (fstatvfs(f->fd, &svfs) >= 0) {
236                         uint64_t available;
237
238                         available = svfs.f_bfree * svfs.f_bsize;
239
240                         if (available >= f->metrics.keep_free)
241                                 available -= f->metrics.keep_free;
242                         else
243                                 available = 0;
244
245                         if (new_size - old_size > available)
246                                 return -E2BIG;
247                 }
248         }
249
250         /* Note that the glibc fallocate() fallback is very
251            inefficient, hence we try to minimize the allocation area
252            as we can. */
253         r = posix_fallocate(f->fd, old_size, new_size - old_size);
254         if (r != 0)
255                 return -r;
256
257         if (fstat(f->fd, &f->last_stat) < 0)
258                 return -errno;
259
260         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
261
262         return 0;
263 }
264
265 static int journal_file_map(
266                 JournalFile *f,
267                 uint64_t offset,
268                 uint64_t size,
269                 void **_window,
270                 uint64_t *_woffset,
271                 uint64_t *_wsize,
272                 void **ret) {
273
274         uint64_t woffset, wsize;
275         void *window;
276
277         assert(f);
278         assert(size > 0);
279         assert(ret);
280
281         woffset = offset & ~((uint64_t) page_size() - 1ULL);
282         wsize = size + (offset - woffset);
283         wsize = PAGE_ALIGN(wsize);
284
285         /* Avoid SIGBUS on invalid accesses */
286         if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
287                 return -EADDRNOTAVAIL;
288
289         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
290         if (window == MAP_FAILED)
291                 return -errno;
292
293         if (_window)
294                 *_window = window;
295
296         if (_woffset)
297                 *_woffset = woffset;
298
299         if (_wsize)
300                 *_wsize = wsize;
301
302         *ret = (uint8_t*) window + (offset - woffset);
303
304         return 0;
305 }
306
307 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
308         void *p = NULL;
309         uint64_t delta;
310         int r;
311         Window *w;
312
313         assert(f);
314         assert(ret);
315         assert(wt >= 0);
316         assert(wt < _WINDOW_MAX);
317
318         if (offset + size > (uint64_t) f->last_stat.st_size) {
319                 /* Hmm, out of range? Let's refresh the fstat() data
320                  * first, before we trust that check. */
321
322                 if (fstat(f->fd, &f->last_stat) < 0 ||
323                     offset + size > (uint64_t) f->last_stat.st_size)
324                         return -EADDRNOTAVAIL;
325         }
326
327         w = f->windows + wt;
328
329         if (_likely_(w->ptr &&
330                      w->offset <= offset &&
331                      w->offset + w->size >= offset + size)) {
332
333                 *ret = (uint8_t*) w->ptr + (offset - w->offset);
334                 return 0;
335         }
336
337         if (w->ptr) {
338                 if (munmap(w->ptr, w->size) < 0)
339                         return -errno;
340
341                 w->ptr = NULL;
342                 w->size = w->offset = 0;
343         }
344
345         if (size < DEFAULT_WINDOW_SIZE) {
346                 /* If the default window size is larger then what was
347                  * asked for extend the mapping a bit in the hope to
348                  * minimize needed remappings later on. We add half
349                  * the window space before and half behind the
350                  * requested mapping */
351
352                 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
353
354                 if (delta > offset)
355                         delta = offset;
356
357                 offset -= delta;
358                 size = DEFAULT_WINDOW_SIZE;
359         } else
360                 delta = 0;
361
362         if (offset + size > (uint64_t) f->last_stat.st_size)
363                 size = (uint64_t) f->last_stat.st_size - offset;
364
365         if (size <= 0)
366                 return -EADDRNOTAVAIL;
367
368         r = journal_file_map(f,
369                              offset, size,
370                              &w->ptr, &w->offset, &w->size,
371                              &p);
372
373         if (r < 0)
374                 return r;
375
376         *ret = (uint8_t*) p + delta;
377         return 0;
378 }
379
380 static bool verify_hash(Object *o) {
381         uint64_t h1, h2;
382
383         assert(o);
384
385         if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
386                 h1 = le64toh(o->data.hash);
387                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
388         } else if (o->object.type == OBJECT_FIELD) {
389                 h1 = le64toh(o->field.hash);
390                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
391         } else
392                 return true;
393
394         return h1 == h2;
395 }
396
397 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
398         int r;
399         void *t;
400         Object *o;
401         uint64_t s;
402
403         assert(f);
404         assert(ret);
405         assert(type < _OBJECT_TYPE_MAX);
406
407         r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
408         if (r < 0)
409                 return r;
410
411         o = (Object*) t;
412         s = le64toh(o->object.size);
413
414         if (s < sizeof(ObjectHeader))
415                 return -EBADMSG;
416
417         if (type >= 0 && o->object.type != type)
418                 return -EBADMSG;
419
420         if (s > sizeof(ObjectHeader)) {
421                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
422                 if (r < 0)
423                         return r;
424
425                 o = (Object*) t;
426         }
427
428         if (!verify_hash(o))
429                 return -EBADMSG;
430
431         *ret = o;
432         return 0;
433 }
434
435 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
436         uint64_t r;
437
438         assert(f);
439
440         r = le64toh(f->header->tail_seqnum) + 1;
441
442         if (seqnum) {
443                 /* If an external seqnum counter was passed, we update
444                  * both the local and the external one, and set it to
445                  * the maximum of both */
446
447                 if (*seqnum + 1 > r)
448                         r = *seqnum + 1;
449
450                 *seqnum = r;
451         }
452
453         f->header->tail_seqnum = htole64(r);
454
455         if (f->header->head_seqnum == 0)
456                 f->header->head_seqnum = htole64(r);
457
458         return r;
459 }
460
461 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
462         int r;
463         uint64_t p;
464         Object *tail, *o;
465         void *t;
466
467         assert(f);
468         assert(size >= sizeof(ObjectHeader));
469         assert(offset);
470         assert(ret);
471
472         p = le64toh(f->header->tail_object_offset);
473         if (p == 0)
474                 p = le64toh(f->header->header_size);
475         else {
476                 r = journal_file_move_to_object(f, -1, p, &tail);
477                 if (r < 0)
478                         return r;
479
480                 p += ALIGN64(le64toh(tail->object.size));
481         }
482
483         r = journal_file_allocate(f, p, size);
484         if (r < 0)
485                 return r;
486
487         r = journal_file_move_to(f, type, p, size, &t);
488         if (r < 0)
489                 return r;
490
491         o = (Object*) t;
492
493         zero(o->object);
494         o->object.type = type;
495         o->object.size = htole64(size);
496
497         f->header->tail_object_offset = htole64(p);
498         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
499
500         *ret = o;
501         *offset = p;
502
503         return 0;
504 }
505
506 static int journal_file_setup_data_hash_table(JournalFile *f) {
507         uint64_t s, p;
508         Object *o;
509         int r;
510
511         assert(f);
512
513         /* We estimate that we need 1 hash table entry per 768 of
514            journal file and we want to make sure we never get beyond
515            75% fill level. Calculate the hash table size for the
516            maximum file size based on these metrics. */
517
518         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
519         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
520                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
521
522         log_info("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
523
524         r = journal_file_append_object(f,
525                                        OBJECT_DATA_HASH_TABLE,
526                                        offsetof(Object, hash_table.items) + s,
527                                        &o, &p);
528         if (r < 0)
529                 return r;
530
531         memset(o->hash_table.items, 0, s);
532
533         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
534         f->header->data_hash_table_size = htole64(s);
535
536         return 0;
537 }
538
539 static int journal_file_setup_field_hash_table(JournalFile *f) {
540         uint64_t s, p;
541         Object *o;
542         int r;
543
544         assert(f);
545
546         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
547         r = journal_file_append_object(f,
548                                        OBJECT_FIELD_HASH_TABLE,
549                                        offsetof(Object, hash_table.items) + s,
550                                        &o, &p);
551         if (r < 0)
552                 return r;
553
554         memset(o->hash_table.items, 0, s);
555
556         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
557         f->header->field_hash_table_size = htole64(s);
558
559         return 0;
560 }
561
562 static int journal_file_map_data_hash_table(JournalFile *f) {
563         uint64_t s, p;
564         void *t;
565         int r;
566
567         assert(f);
568
569         p = le64toh(f->header->data_hash_table_offset);
570         s = le64toh(f->header->data_hash_table_size);
571
572         r = journal_file_move_to(f,
573                                  WINDOW_DATA_HASH_TABLE,
574                                  p, s,
575                                  &t);
576         if (r < 0)
577                 return r;
578
579         f->data_hash_table = t;
580         return 0;
581 }
582
583 static int journal_file_map_field_hash_table(JournalFile *f) {
584         uint64_t s, p;
585         void *t;
586         int r;
587
588         assert(f);
589
590         p = le64toh(f->header->field_hash_table_offset);
591         s = le64toh(f->header->field_hash_table_size);
592
593         r = journal_file_move_to(f,
594                                  WINDOW_FIELD_HASH_TABLE,
595                                  p, s,
596                                  &t);
597         if (r < 0)
598                 return r;
599
600         f->field_hash_table = t;
601         return 0;
602 }
603
604 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
605         uint64_t p, h;
606         int r;
607
608         assert(f);
609         assert(o);
610         assert(offset > 0);
611         assert(o->object.type == OBJECT_DATA);
612
613         /* This might alter the window we are looking at */
614
615         o->data.next_hash_offset = o->data.next_field_offset = 0;
616         o->data.entry_offset = o->data.entry_array_offset = 0;
617         o->data.n_entries = 0;
618
619         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
620         p = le64toh(f->data_hash_table[h].tail_hash_offset);
621         if (p == 0) {
622                 /* Only entry in the hash table is easy */
623                 f->data_hash_table[h].head_hash_offset = htole64(offset);
624         } else {
625                 /* Move back to the previous data object, to patch in
626                  * pointer */
627
628                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
629                 if (r < 0)
630                         return r;
631
632                 o->data.next_hash_offset = htole64(offset);
633         }
634
635         f->data_hash_table[h].tail_hash_offset = htole64(offset);
636
637         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
638                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
639
640         return 0;
641 }
642
643 int journal_file_find_data_object_with_hash(
644                 JournalFile *f,
645                 const void *data, uint64_t size, uint64_t hash,
646                 Object **ret, uint64_t *offset) {
647
648         uint64_t p, osize, h;
649         int r;
650
651         assert(f);
652         assert(data || size == 0);
653
654         osize = offsetof(Object, data.payload) + size;
655
656         if (f->header->data_hash_table_size == 0)
657                 return -EBADMSG;
658
659         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
660         p = le64toh(f->data_hash_table[h].head_hash_offset);
661
662         while (p > 0) {
663                 Object *o;
664
665                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
666                 if (r < 0)
667                         return r;
668
669                 if (le64toh(o->data.hash) != hash)
670                         goto next;
671
672                 if (o->object.flags & OBJECT_COMPRESSED) {
673 #ifdef HAVE_XZ
674                         uint64_t l, rsize;
675
676                         l = le64toh(o->object.size);
677                         if (l <= offsetof(Object, data.payload))
678                                 return -EBADMSG;
679
680                         l -= offsetof(Object, data.payload);
681
682                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
683                                 return -EBADMSG;
684
685                         if (rsize == size &&
686                             memcmp(f->compress_buffer, data, size) == 0) {
687
688                                 if (ret)
689                                         *ret = o;
690
691                                 if (offset)
692                                         *offset = p;
693
694                                 return 1;
695                         }
696 #else
697                         return -EPROTONOSUPPORT;
698 #endif
699
700                 } else if (le64toh(o->object.size) == osize &&
701                            memcmp(o->data.payload, data, size) == 0) {
702
703                         if (ret)
704                                 *ret = o;
705
706                         if (offset)
707                                 *offset = p;
708
709                         return 1;
710                 }
711
712         next:
713                 p = le64toh(o->data.next_hash_offset);
714         }
715
716         return 0;
717 }
718
719 int journal_file_find_data_object(
720                 JournalFile *f,
721                 const void *data, uint64_t size,
722                 Object **ret, uint64_t *offset) {
723
724         uint64_t hash;
725
726         assert(f);
727         assert(data || size == 0);
728
729         hash = hash64(data, size);
730
731         return journal_file_find_data_object_with_hash(f,
732                                                        data, size, hash,
733                                                        ret, offset);
734 }
735
736 static int journal_file_append_data(
737                 JournalFile *f,
738                 const void *data, uint64_t size,
739                 Object **ret, uint64_t *offset) {
740
741         uint64_t hash, p;
742         uint64_t osize;
743         Object *o;
744         int r;
745         bool compressed = false;
746
747         assert(f);
748         assert(data || size == 0);
749
750         hash = hash64(data, size);
751
752         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
753         if (r < 0)
754                 return r;
755         else if (r > 0) {
756
757                 if (ret)
758                         *ret = o;
759
760                 if (offset)
761                         *offset = p;
762
763                 return 0;
764         }
765
766         osize = offsetof(Object, data.payload) + size;
767         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
768         if (r < 0)
769                 return r;
770
771         o->data.hash = htole64(hash);
772
773 #ifdef HAVE_XZ
774         if (f->compress &&
775             size >= COMPRESSION_SIZE_THRESHOLD) {
776                 uint64_t rsize;
777
778                 compressed = compress_blob(data, size, o->data.payload, &rsize);
779
780                 if (compressed) {
781                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
782                         o->object.flags |= OBJECT_COMPRESSED;
783
784                         f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
785
786                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
787                 }
788         }
789 #endif
790
791         if (!compressed)
792                 memcpy(o->data.payload, data, size);
793
794         r = journal_file_link_data(f, o, p, hash);
795         if (r < 0)
796                 return r;
797
798         /* The linking might have altered the window, so let's
799          * refresh our pointer */
800         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
801         if (r < 0)
802                 return r;
803
804         if (ret)
805                 *ret = o;
806
807         if (offset)
808                 *offset = p;
809
810         return 0;
811 }
812
813 uint64_t journal_file_entry_n_items(Object *o) {
814         assert(o);
815         assert(o->object.type == OBJECT_ENTRY);
816
817         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
818 }
819
820 static uint64_t journal_file_entry_array_n_items(Object *o) {
821         assert(o);
822         assert(o->object.type == OBJECT_ENTRY_ARRAY);
823
824         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
825 }
826
827 static int link_entry_into_array(JournalFile *f,
828                                  le64_t *first,
829                                  le64_t *idx,
830                                  uint64_t p) {
831         int r;
832         uint64_t n = 0, ap = 0, q, i, a, hidx;
833         Object *o;
834
835         assert(f);
836         assert(first);
837         assert(idx);
838         assert(p > 0);
839
840         a = le64toh(*first);
841         i = hidx = le64toh(*idx);
842         while (a > 0) {
843
844                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
845                 if (r < 0)
846                         return r;
847
848                 n = journal_file_entry_array_n_items(o);
849                 if (i < n) {
850                         o->entry_array.items[i] = htole64(p);
851                         *idx = htole64(hidx + 1);
852                         return 0;
853                 }
854
855                 i -= n;
856                 ap = a;
857                 a = le64toh(o->entry_array.next_entry_array_offset);
858         }
859
860         if (hidx > n)
861                 n = (hidx+1) * 2;
862         else
863                 n = n * 2;
864
865         if (n < 4)
866                 n = 4;
867
868         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
869                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
870                                        &o, &q);
871         if (r < 0)
872                 return r;
873
874         o->entry_array.items[i] = htole64(p);
875
876         if (ap == 0)
877                 *first = htole64(q);
878         else {
879                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
880                 if (r < 0)
881                         return r;
882
883                 o->entry_array.next_entry_array_offset = htole64(q);
884         }
885
886         *idx = htole64(hidx + 1);
887
888         return 0;
889 }
890
891 static int link_entry_into_array_plus_one(JournalFile *f,
892                                           le64_t *extra,
893                                           le64_t *first,
894                                           le64_t *idx,
895                                           uint64_t p) {
896
897         int r;
898
899         assert(f);
900         assert(extra);
901         assert(first);
902         assert(idx);
903         assert(p > 0);
904
905         if (*idx == 0)
906                 *extra = htole64(p);
907         else {
908                 le64_t i;
909
910                 i = htole64(le64toh(*idx) - 1);
911                 r = link_entry_into_array(f, first, &i, p);
912                 if (r < 0)
913                         return r;
914         }
915
916         *idx = htole64(le64toh(*idx) + 1);
917         return 0;
918 }
919
920 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
921         uint64_t p;
922         int r;
923         assert(f);
924         assert(o);
925         assert(offset > 0);
926
927         p = le64toh(o->entry.items[i].object_offset);
928         if (p == 0)
929                 return -EINVAL;
930
931         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
932         if (r < 0)
933                 return r;
934
935         return link_entry_into_array_plus_one(f,
936                                               &o->data.entry_offset,
937                                               &o->data.entry_array_offset,
938                                               &o->data.n_entries,
939                                               offset);
940 }
941
942 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
943         uint64_t n, i;
944         int r;
945
946         assert(f);
947         assert(o);
948         assert(offset > 0);
949         assert(o->object.type == OBJECT_ENTRY);
950
951         __sync_synchronize();
952
953         /* Link up the entry itself */
954         r = link_entry_into_array(f,
955                                   &f->header->entry_array_offset,
956                                   &f->header->n_entries,
957                                   offset);
958         if (r < 0)
959                 return r;
960
961         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
962
963         if (f->header->head_entry_realtime == 0)
964                 f->header->head_entry_realtime = o->entry.realtime;
965
966         f->header->tail_entry_realtime = o->entry.realtime;
967         f->header->tail_entry_monotonic = o->entry.monotonic;
968
969         f->tail_entry_monotonic_valid = true;
970
971         /* Link up the items */
972         n = journal_file_entry_n_items(o);
973         for (i = 0; i < n; i++) {
974                 r = journal_file_link_entry_item(f, o, offset, i);
975                 if (r < 0)
976                         return r;
977         }
978
979         return 0;
980 }
981
982 static int journal_file_append_entry_internal(
983                 JournalFile *f,
984                 const dual_timestamp *ts,
985                 uint64_t xor_hash,
986                 const EntryItem items[], unsigned n_items,
987                 uint64_t *seqnum,
988                 Object **ret, uint64_t *offset) {
989         uint64_t np;
990         uint64_t osize;
991         Object *o;
992         int r;
993
994         assert(f);
995         assert(items || n_items == 0);
996         assert(ts);
997
998         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
999
1000         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1001         if (r < 0)
1002                 return r;
1003
1004         o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
1005         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1006         o->entry.realtime = htole64(ts->realtime);
1007         o->entry.monotonic = htole64(ts->monotonic);
1008         o->entry.xor_hash = htole64(xor_hash);
1009         o->entry.boot_id = f->header->boot_id;
1010
1011         r = journal_file_link_entry(f, o, np);
1012         if (r < 0)
1013                 return r;
1014
1015         if (ret)
1016                 *ret = o;
1017
1018         if (offset)
1019                 *offset = np;
1020
1021         return 0;
1022 }
1023
1024 void journal_file_post_change(JournalFile *f) {
1025         assert(f);
1026
1027         /* inotify() does not receive IN_MODIFY events from file
1028          * accesses done via mmap(). After each access we hence
1029          * trigger IN_MODIFY by truncating the journal file to its
1030          * current size which triggers IN_MODIFY. */
1031
1032         __sync_synchronize();
1033
1034         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1035                 log_error("Failed to to truncate file to its own size: %m");
1036 }
1037
1038 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1039         unsigned i;
1040         EntryItem *items;
1041         int r;
1042         uint64_t xor_hash = 0;
1043         struct dual_timestamp _ts;
1044
1045         assert(f);
1046         assert(iovec || n_iovec == 0);
1047
1048         if (!f->writable)
1049                 return -EPERM;
1050
1051         if (!ts) {
1052                 dual_timestamp_get(&_ts);
1053                 ts = &_ts;
1054         }
1055
1056         if (f->tail_entry_monotonic_valid &&
1057             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1058                 return -EINVAL;
1059
1060         items = alloca(sizeof(EntryItem) * n_iovec);
1061
1062         for (i = 0; i < n_iovec; i++) {
1063                 uint64_t p;
1064                 Object *o;
1065
1066                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1067                 if (r < 0)
1068                         return r;
1069
1070                 xor_hash ^= le64toh(o->data.hash);
1071                 items[i].object_offset = htole64(p);
1072                 items[i].hash = o->data.hash;
1073         }
1074
1075         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1076
1077         journal_file_post_change(f);
1078
1079         return r;
1080 }
1081
1082 static int generic_array_get(JournalFile *f,
1083                              uint64_t first,
1084                              uint64_t i,
1085                              Object **ret, uint64_t *offset) {
1086
1087         Object *o;
1088         uint64_t p = 0, a;
1089         int r;
1090
1091         assert(f);
1092
1093         a = first;
1094         while (a > 0) {
1095                 uint64_t n;
1096
1097                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1098                 if (r < 0)
1099                         return r;
1100
1101                 n = journal_file_entry_array_n_items(o);
1102                 if (i < n) {
1103                         p = le64toh(o->entry_array.items[i]);
1104                         break;
1105                 }
1106
1107                 i -= n;
1108                 a = le64toh(o->entry_array.next_entry_array_offset);
1109         }
1110
1111         if (a <= 0 || p <= 0)
1112                 return 0;
1113
1114         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1115         if (r < 0)
1116                 return r;
1117
1118         if (ret)
1119                 *ret = o;
1120
1121         if (offset)
1122                 *offset = p;
1123
1124         return 1;
1125 }
1126
1127 static int generic_array_get_plus_one(JournalFile *f,
1128                                       uint64_t extra,
1129                                       uint64_t first,
1130                                       uint64_t i,
1131                                       Object **ret, uint64_t *offset) {
1132
1133         Object *o;
1134
1135         assert(f);
1136
1137         if (i == 0) {
1138                 int r;
1139
1140                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1141                 if (r < 0)
1142                         return r;
1143
1144                 if (ret)
1145                         *ret = o;
1146
1147                 if (offset)
1148                         *offset = extra;
1149
1150                 return 1;
1151         }
1152
1153         return generic_array_get(f, first, i-1, ret, offset);
1154 }
1155
/* Results of the test_object() callbacks used for bisection */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object is smaller than the needle */
        TEST_RIGHT = 2  /* the tested object is larger than the needle */
};
1161
1162 static int generic_array_bisect(JournalFile *f,
1163                                 uint64_t first,
1164                                 uint64_t n,
1165                                 uint64_t needle,
1166                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1167                                 direction_t direction,
1168                                 Object **ret,
1169                                 uint64_t *offset,
1170                                 uint64_t *idx) {
1171
1172         uint64_t a, p, t = 0, i = 0, last_p = 0;
1173         bool subtract_one = false;
1174         Object *o, *array = NULL;
1175         int r;
1176
1177         assert(f);
1178         assert(test_object);
1179
1180         a = first;
1181         while (a > 0) {
1182                 uint64_t left, right, k, lp;
1183
1184                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1185                 if (r < 0)
1186                         return r;
1187
1188                 k = journal_file_entry_array_n_items(array);
1189                 right = MIN(k, n);
1190                 if (right <= 0)
1191                         return 0;
1192
1193                 i = right - 1;
1194                 lp = p = le64toh(array->entry_array.items[i]);
1195                 if (p <= 0)
1196                         return -EBADMSG;
1197
1198                 r = test_object(f, p, needle);
1199                 if (r < 0)
1200                         return r;
1201
1202                 if (r == TEST_FOUND)
1203                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1204
1205                 if (r == TEST_RIGHT) {
1206                         left = 0;
1207                         right -= 1;
1208                         for (;;) {
1209                                 if (left == right) {
1210                                         if (direction == DIRECTION_UP)
1211                                                 subtract_one = true;
1212
1213                                         i = left;
1214                                         goto found;
1215                                 }
1216
1217                                 assert(left < right);
1218
1219                                 i = (left + right) / 2;
1220                                 p = le64toh(array->entry_array.items[i]);
1221                                 if (p <= 0)
1222                                         return -EBADMSG;
1223
1224                                 r = test_object(f, p, needle);
1225                                 if (r < 0)
1226                                         return r;
1227
1228                                 if (r == TEST_FOUND)
1229                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1230
1231                                 if (r == TEST_RIGHT)
1232                                         right = i;
1233                                 else
1234                                         left = i + 1;
1235                         }
1236                 }
1237
1238                 if (k > n) {
1239                         if (direction == DIRECTION_UP) {
1240                                 i = n;
1241                                 subtract_one = true;
1242                                 goto found;
1243                         }
1244
1245                         return 0;
1246                 }
1247
1248                 last_p = lp;
1249
1250                 n -= k;
1251                 t += k;
1252                 a = le64toh(array->entry_array.next_entry_array_offset);
1253         }
1254
1255         return 0;
1256
1257 found:
1258         if (subtract_one && t == 0 && i == 0)
1259                 return 0;
1260
1261         if (subtract_one && i == 0)
1262                 p = last_p;
1263         else if (subtract_one)
1264                 p = le64toh(array->entry_array.items[i-1]);
1265         else
1266                 p = le64toh(array->entry_array.items[i]);
1267
1268         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1269         if (r < 0)
1270                 return r;
1271
1272         if (ret)
1273                 *ret = o;
1274
1275         if (offset)
1276                 *offset = p;
1277
1278         if (idx)
1279                 *idx = t + i + (subtract_one ? -1 : 0);
1280
1281         return 1;
1282 }
1283
1284 static int generic_array_bisect_plus_one(JournalFile *f,
1285                                          uint64_t extra,
1286                                          uint64_t first,
1287                                          uint64_t n,
1288                                          uint64_t needle,
1289                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1290                                          direction_t direction,
1291                                          Object **ret,
1292                                          uint64_t *offset,
1293                                          uint64_t *idx) {
1294
1295         int r;
1296         bool step_back = false;
1297         Object *o;
1298
1299         assert(f);
1300         assert(test_object);
1301
1302         if (n <= 0)
1303                 return 0;
1304
1305         /* This bisects the array in object 'first', but first checks
1306          * an extra  */
1307         r = test_object(f, extra, needle);
1308         if (r < 0)
1309                 return r;
1310
1311         if (r == TEST_FOUND)
1312                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1313
1314         /* if we are looking with DIRECTION_UP then we need to first
1315            see if in the actual array there is a matching entry, and
1316            return the last one of that. But if there isn't any we need
1317            to return this one. Hence remember this, and return it
1318            below. */
1319         if (r == TEST_LEFT)
1320                 step_back = direction == DIRECTION_UP;
1321
1322         if (r == TEST_RIGHT) {
1323                 if (direction == DIRECTION_DOWN)
1324                         goto found;
1325                 else
1326                         return 0;
1327         }
1328
1329         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1330
1331         if (r == 0 && step_back)
1332                 goto found;
1333
1334         if (r > 0 && idx)
1335                 (*idx) ++;
1336
1337         return r;
1338
1339 found:
1340         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1341         if (r < 0)
1342                 return r;
1343
1344         if (ret)
1345                 *ret = o;
1346
1347         if (offset)
1348                 *offset = extra;
1349
1350         if (idx)
1351                 *idx = 0;
1352
1353         return 1;
1354 }
1355
1356 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1357         assert(f);
1358         assert(p > 0);
1359
1360         if (p == needle)
1361                 return TEST_FOUND;
1362         else if (p < needle)
1363                 return TEST_LEFT;
1364         else
1365                 return TEST_RIGHT;
1366 }
1367
1368 int journal_file_move_to_entry_by_offset(
1369                 JournalFile *f,
1370                 uint64_t p,
1371                 direction_t direction,
1372                 Object **ret,
1373                 uint64_t *offset) {
1374
1375         return generic_array_bisect(f,
1376                                     le64toh(f->header->entry_array_offset),
1377                                     le64toh(f->header->n_entries),
1378                                     p,
1379                                     test_object_offset,
1380                                     direction,
1381                                     ret, offset, NULL);
1382 }
1383
1384
1385 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1386         Object *o;
1387         int r;
1388
1389         assert(f);
1390         assert(p > 0);
1391
1392         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1393         if (r < 0)
1394                 return r;
1395
1396         if (le64toh(o->entry.seqnum) == needle)
1397                 return TEST_FOUND;
1398         else if (le64toh(o->entry.seqnum) < needle)
1399                 return TEST_LEFT;
1400         else
1401                 return TEST_RIGHT;
1402 }
1403
1404 int journal_file_move_to_entry_by_seqnum(
1405                 JournalFile *f,
1406                 uint64_t seqnum,
1407                 direction_t direction,
1408                 Object **ret,
1409                 uint64_t *offset) {
1410
1411         return generic_array_bisect(f,
1412                                     le64toh(f->header->entry_array_offset),
1413                                     le64toh(f->header->n_entries),
1414                                     seqnum,
1415                                     test_object_seqnum,
1416                                     direction,
1417                                     ret, offset, NULL);
1418 }
1419
1420 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1421         Object *o;
1422         int r;
1423
1424         assert(f);
1425         assert(p > 0);
1426
1427         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1428         if (r < 0)
1429                 return r;
1430
1431         if (le64toh(o->entry.realtime) == needle)
1432                 return TEST_FOUND;
1433         else if (le64toh(o->entry.realtime) < needle)
1434                 return TEST_LEFT;
1435         else
1436                 return TEST_RIGHT;
1437 }
1438
1439 int journal_file_move_to_entry_by_realtime(
1440                 JournalFile *f,
1441                 uint64_t realtime,
1442                 direction_t direction,
1443                 Object **ret,
1444                 uint64_t *offset) {
1445
1446         return generic_array_bisect(f,
1447                                     le64toh(f->header->entry_array_offset),
1448                                     le64toh(f->header->n_entries),
1449                                     realtime,
1450                                     test_object_realtime,
1451                                     direction,
1452                                     ret, offset, NULL);
1453 }
1454
1455 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1456         Object *o;
1457         int r;
1458
1459         assert(f);
1460         assert(p > 0);
1461
1462         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1463         if (r < 0)
1464                 return r;
1465
1466         if (le64toh(o->entry.monotonic) == needle)
1467                 return TEST_FOUND;
1468         else if (le64toh(o->entry.monotonic) < needle)
1469                 return TEST_LEFT;
1470         else
1471                 return TEST_RIGHT;
1472 }
1473
1474 int journal_file_move_to_entry_by_monotonic(
1475                 JournalFile *f,
1476                 sd_id128_t boot_id,
1477                 uint64_t monotonic,
1478                 direction_t direction,
1479                 Object **ret,
1480                 uint64_t *offset) {
1481
1482         char t[9+32+1] = "_BOOT_ID=";
1483         Object *o;
1484         int r;
1485
1486         assert(f);
1487
1488         sd_id128_to_string(boot_id, t + 9);
1489         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1490         if (r < 0)
1491                 return r;
1492         if (r == 0)
1493                 return -ENOENT;
1494
1495         return generic_array_bisect_plus_one(f,
1496                                              le64toh(o->data.entry_offset),
1497                                              le64toh(o->data.entry_array_offset),
1498                                              le64toh(o->data.n_entries),
1499                                              monotonic,
1500                                              test_object_monotonic,
1501                                              direction,
1502                                              ret, offset, NULL);
1503 }
1504
1505 int journal_file_next_entry(
1506                 JournalFile *f,
1507                 Object *o, uint64_t p,
1508                 direction_t direction,
1509                 Object **ret, uint64_t *offset) {
1510
1511         uint64_t i, n;
1512         int r;
1513
1514         assert(f);
1515         assert(p > 0 || !o);
1516
1517         n = le64toh(f->header->n_entries);
1518         if (n <= 0)
1519                 return 0;
1520
1521         if (!o)
1522                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1523         else {
1524                 if (o->object.type != OBJECT_ENTRY)
1525                         return -EINVAL;
1526
1527                 r = generic_array_bisect(f,
1528                                          le64toh(f->header->entry_array_offset),
1529                                          le64toh(f->header->n_entries),
1530                                          p,
1531                                          test_object_offset,
1532                                          DIRECTION_DOWN,
1533                                          NULL, NULL,
1534                                          &i);
1535                 if (r <= 0)
1536                         return r;
1537
1538                 if (direction == DIRECTION_DOWN) {
1539                         if (i >= n - 1)
1540                                 return 0;
1541
1542                         i++;
1543                 } else {
1544                         if (i <= 0)
1545                                 return 0;
1546
1547                         i--;
1548                 }
1549         }
1550
1551         /* And jump to it */
1552         return generic_array_get(f,
1553                                  le64toh(f->header->entry_array_offset),
1554                                  i,
1555                                  ret, offset);
1556 }
1557
1558 int journal_file_skip_entry(
1559                 JournalFile *f,
1560                 Object *o, uint64_t p,
1561                 int64_t skip,
1562                 Object **ret, uint64_t *offset) {
1563
1564         uint64_t i, n;
1565         int r;
1566
1567         assert(f);
1568         assert(o);
1569         assert(p > 0);
1570
1571         if (o->object.type != OBJECT_ENTRY)
1572                 return -EINVAL;
1573
1574         r = generic_array_bisect(f,
1575                                  le64toh(f->header->entry_array_offset),
1576                                  le64toh(f->header->n_entries),
1577                                  p,
1578                                  test_object_offset,
1579                                  DIRECTION_DOWN,
1580                                  NULL, NULL,
1581                                  &i);
1582         if (r <= 0)
1583                 return r;
1584
1585         /* Calculate new index */
1586         if (skip < 0) {
1587                 if ((uint64_t) -skip >= i)
1588                         i = 0;
1589                 else
1590                         i = i - (uint64_t) -skip;
1591         } else
1592                 i  += (uint64_t) skip;
1593
1594         n = le64toh(f->header->n_entries);
1595         if (n <= 0)
1596                 return -EBADMSG;
1597
1598         if (i >= n)
1599                 i = n-1;
1600
1601         return generic_array_get(f,
1602                                  le64toh(f->header->entry_array_offset),
1603                                  i,
1604                                  ret, offset);
1605 }
1606
1607 int journal_file_next_entry_for_data(
1608                 JournalFile *f,
1609                 Object *o, uint64_t p,
1610                 uint64_t data_offset,
1611                 direction_t direction,
1612                 Object **ret, uint64_t *offset) {
1613
1614         uint64_t n, i;
1615         int r;
1616         Object *d;
1617
1618         assert(f);
1619         assert(p > 0 || !o);
1620
1621         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1622         if (r < 0)
1623                 return r;
1624
1625         n = le64toh(d->data.n_entries);
1626         if (n <= 0)
1627                 return n;
1628
1629         if (!o)
1630                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1631         else {
1632                 if (o->object.type != OBJECT_ENTRY)
1633                         return -EINVAL;
1634
1635                 r = generic_array_bisect_plus_one(f,
1636                                                   le64toh(d->data.entry_offset),
1637                                                   le64toh(d->data.entry_array_offset),
1638                                                   le64toh(d->data.n_entries),
1639                                                   p,
1640                                                   test_object_offset,
1641                                                   DIRECTION_DOWN,
1642                                                   NULL, NULL,
1643                                                   &i);
1644
1645                 if (r <= 0)
1646                         return r;
1647
1648                 if (direction == DIRECTION_DOWN) {
1649                         if (i >= n - 1)
1650                                 return 0;
1651
1652                         i++;
1653                 } else {
1654                         if (i <= 0)
1655                                 return 0;
1656
1657                         i--;
1658                 }
1659
1660         }
1661
1662         return generic_array_get_plus_one(f,
1663                                           le64toh(d->data.entry_offset),
1664                                           le64toh(d->data.entry_array_offset),
1665                                           i,
1666                                           ret, offset);
1667 }
1668
1669 int journal_file_move_to_entry_by_offset_for_data(
1670                 JournalFile *f,
1671                 uint64_t data_offset,
1672                 uint64_t p,
1673                 direction_t direction,
1674                 Object **ret, uint64_t *offset) {
1675
1676         int r;
1677         Object *d;
1678
1679         assert(f);
1680
1681         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1682         if (r < 0)
1683                 return r;
1684
1685         return generic_array_bisect_plus_one(f,
1686                                              le64toh(d->data.entry_offset),
1687                                              le64toh(d->data.entry_array_offset),
1688                                              le64toh(d->data.n_entries),
1689                                              p,
1690                                              test_object_offset,
1691                                              direction,
1692                                              ret, offset, NULL);
1693 }
1694
1695 int journal_file_move_to_entry_by_monotonic_for_data(
1696                 JournalFile *f,
1697                 uint64_t data_offset,
1698                 sd_id128_t boot_id,
1699                 uint64_t monotonic,
1700                 direction_t direction,
1701                 Object **ret, uint64_t *offset) {
1702
1703         char t[9+32+1] = "_BOOT_ID=";
1704         Object *o, *d;
1705         int r;
1706         uint64_t b, z;
1707
1708         assert(f);
1709
1710         /* First, seek by time */
1711         sd_id128_to_string(boot_id, t + 9);
1712         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1713         if (r < 0)
1714                 return r;
1715         if (r == 0)
1716                 return -ENOENT;
1717
1718         r = generic_array_bisect_plus_one(f,
1719                                           le64toh(o->data.entry_offset),
1720                                           le64toh(o->data.entry_array_offset),
1721                                           le64toh(o->data.n_entries),
1722                                           monotonic,
1723                                           test_object_monotonic,
1724                                           direction,
1725                                           NULL, &z, NULL);
1726         if (r <= 0)
1727                 return r;
1728
1729         /* And now, continue seeking until we find an entry that
1730          * exists in both bisection arrays */
1731
1732         for (;;) {
1733                 Object *qo;
1734                 uint64_t p, q;
1735
1736                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1737                 if (r < 0)
1738                         return r;
1739
1740                 r = generic_array_bisect_plus_one(f,
1741                                                   le64toh(d->data.entry_offset),
1742                                                   le64toh(d->data.entry_array_offset),
1743                                                   le64toh(d->data.n_entries),
1744                                                   z,
1745                                                   test_object_offset,
1746                                                   direction,
1747                                                   NULL, &p, NULL);
1748                 if (r <= 0)
1749                         return r;
1750
1751                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1752                 if (r < 0)
1753                         return r;
1754
1755                 r = generic_array_bisect_plus_one(f,
1756                                                   le64toh(o->data.entry_offset),
1757                                                   le64toh(o->data.entry_array_offset),
1758                                                   le64toh(o->data.n_entries),
1759                                                   p,
1760                                                   test_object_offset,
1761                                                   direction,
1762                                                   &qo, &q, NULL);
1763
1764                 if (r <= 0)
1765                         return r;
1766
1767                 if (p == q) {
1768                         if (ret)
1769                                 *ret = qo;
1770                         if (offset)
1771                                 *offset = q;
1772
1773                         return 1;
1774                 }
1775
1776                 z = q;
1777         }
1778
1779         return 0;
1780 }
1781
1782 int journal_file_move_to_entry_by_seqnum_for_data(
1783                 JournalFile *f,
1784                 uint64_t data_offset,
1785                 uint64_t seqnum,
1786                 direction_t direction,
1787                 Object **ret, uint64_t *offset) {
1788
1789         Object *d;
1790         int r;
1791
1792         assert(f);
1793
1794         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1795         if (r < 0)
1796                 return r;
1797
1798         return generic_array_bisect_plus_one(f,
1799                                              le64toh(d->data.entry_offset),
1800                                              le64toh(d->data.entry_array_offset),
1801                                              le64toh(d->data.n_entries),
1802                                              seqnum,
1803                                              test_object_seqnum,
1804                                              direction,
1805                                              ret, offset, NULL);
1806 }
1807
1808 int journal_file_move_to_entry_by_realtime_for_data(
1809                 JournalFile *f,
1810                 uint64_t data_offset,
1811                 uint64_t realtime,
1812                 direction_t direction,
1813                 Object **ret, uint64_t *offset) {
1814
1815         Object *d;
1816         int r;
1817
1818         assert(f);
1819
1820         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1821         if (r < 0)
1822                 return r;
1823
1824         return generic_array_bisect_plus_one(f,
1825                                              le64toh(d->data.entry_offset),
1826                                              le64toh(d->data.entry_array_offset),
1827                                              le64toh(d->data.n_entries),
1828                                              realtime,
1829                                              test_object_realtime,
1830                                              direction,
1831                                              ret, offset, NULL);
1832 }
1833
1834 void journal_file_dump(JournalFile *f) {
1835         Object *o;
1836         int r;
1837         uint64_t p;
1838
1839         assert(f);
1840
1841         journal_file_print_header(f);
1842
1843         p = le64toh(f->header->header_size);
1844         while (p != 0) {
1845                 r = journal_file_move_to_object(f, -1, p, &o);
1846                 if (r < 0)
1847                         goto fail;
1848
1849                 switch (o->object.type) {
1850
1851                 case OBJECT_UNUSED:
1852                         printf("Type: OBJECT_UNUSED\n");
1853                         break;
1854
1855                 case OBJECT_DATA:
1856                         printf("Type: OBJECT_DATA\n");
1857                         break;
1858
1859                 case OBJECT_ENTRY:
1860                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1861                                (unsigned long long) le64toh(o->entry.seqnum),
1862                                (unsigned long long) le64toh(o->entry.monotonic),
1863                                (unsigned long long) le64toh(o->entry.realtime));
1864                         break;
1865
1866                 case OBJECT_FIELD_HASH_TABLE:
1867                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1868                         break;
1869
1870                 case OBJECT_DATA_HASH_TABLE:
1871                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1872                         break;
1873
1874                 case OBJECT_ENTRY_ARRAY:
1875                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1876                         break;
1877
1878                 case OBJECT_SIGNATURE:
1879                         printf("Type: OBJECT_SIGNATURE\n");
1880                         break;
1881                 }
1882
1883                 if (o->object.flags & OBJECT_COMPRESSED)
1884                         printf("Flags: COMPRESSED\n");
1885
1886                 if (p == le64toh(f->header->tail_object_offset))
1887                         p = 0;
1888                 else
1889                         p = p + ALIGN64(le64toh(o->object.size));
1890         }
1891
1892         return;
1893 fail:
1894         log_error("File corrupt");
1895 }
1896
1897 void journal_file_print_header(JournalFile *f) {
1898         char a[33], b[33], c[33];
1899         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
1900
1901         assert(f);
1902
1903         printf("File Path: %s\n"
1904                "File ID: %s\n"
1905                "Machine ID: %s\n"
1906                "Boot ID: %s\n"
1907                "Sequential Number ID: %s\n"
1908                "State: %s\n"
1909                "Compatible Flags:%s%s\n"
1910                "Incompatible Flags:%s%s\n"
1911                "Header size: %llu\n"
1912                "Arena size: %llu\n"
1913                "Data Hash Table Size: %llu\n"
1914                "Field Hash Table Size: %llu\n"
1915                "Objects: %llu\n"
1916                "Entry Objects: %llu\n"
1917                "Rotate Suggested: %s\n"
1918                "Head Sequential Number: %llu\n"
1919                "Tail Sequential Number: %llu\n"
1920                "Head Realtime Timestamp: %s\n"
1921                "Tail Realtime Timestamp: %s\n",
1922                f->path,
1923                sd_id128_to_string(f->header->file_id, a),
1924                sd_id128_to_string(f->header->machine_id, b),
1925                sd_id128_to_string(f->header->boot_id, c),
1926                sd_id128_to_string(f->header->seqnum_id, c),
1927                f->header->state == STATE_OFFLINE ? "offline" :
1928                f->header->state == STATE_ONLINE ? "online" :
1929                f->header->state == STATE_ARCHIVED ? "archived" : "unknown",
1930                (f->header->compatible_flags & HEADER_COMPATIBLE_SIGNED) ? " SIGNED" : "",
1931                (f->header->compatible_flags & ~HEADER_COMPATIBLE_SIGNED) ? " ???" : "",
1932                (f->header->incompatible_flags & HEADER_INCOMPATIBLE_COMPRESSED) ? " COMPRESSED" : "",
1933                (f->header->incompatible_flags & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
1934                (unsigned long long) le64toh(f->header->header_size),
1935                (unsigned long long) le64toh(f->header->arena_size),
1936                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1937                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
1938                (unsigned long long) le64toh(f->header->n_objects),
1939                (unsigned long long) le64toh(f->header->n_entries),
1940                yes_no(journal_file_rotate_suggested(f)),
1941                (unsigned long long) le64toh(f->header->head_seqnum),
1942                (unsigned long long) le64toh(f->header->tail_seqnum),
1943                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
1944                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)));
1945
1946         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1947                 printf("Data Objects: %llu\n"
1948                        "Data Hash Table Fill: %.1f%%\n",
1949                        (unsigned long long) le64toh(f->header->n_data),
1950                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
1951
1952         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1953                 printf("Field Objects: %llu\n"
1954                        "Field Hash Table Fill: %.1f%%\n",
1955                        (unsigned long long) le64toh(f->header->n_fields),
1956                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
1957 }
1958
1959 int journal_file_open(
1960                 const char *fname,
1961                 int flags,
1962                 mode_t mode,
1963                 JournalMetrics *metrics,
1964                 JournalFile *template,
1965                 JournalFile **ret) {
1966
1967         JournalFile *f;
1968         int r;
1969         bool newly_created = false;
1970
1971         assert(fname);
1972
1973         if ((flags & O_ACCMODE) != O_RDONLY &&
1974             (flags & O_ACCMODE) != O_RDWR)
1975                 return -EINVAL;
1976
1977         if (!endswith(fname, ".journal"))
1978                 return -EINVAL;
1979
1980         f = new0(JournalFile, 1);
1981         if (!f)
1982                 return -ENOMEM;
1983
1984         f->fd = -1;
1985         f->flags = flags;
1986         f->mode = mode;
1987         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1988         f->prot = prot_from_flags(flags);
1989
1990         if (template)
1991                 f->compress = template->compress;
1992
1993         f->path = strdup(fname);
1994         if (!f->path) {
1995                 r = -ENOMEM;
1996                 goto fail;
1997         }
1998
1999         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2000         if (f->fd < 0) {
2001                 r = -errno;
2002                 goto fail;
2003         }
2004
2005         if (fstat(f->fd, &f->last_stat) < 0) {
2006                 r = -errno;
2007                 goto fail;
2008         }
2009
2010         if (f->last_stat.st_size == 0 && f->writable) {
2011                 newly_created = true;
2012
2013                 r = journal_file_init_header(f, template);
2014                 if (r < 0)
2015                         goto fail;
2016
2017                 if (fstat(f->fd, &f->last_stat) < 0) {
2018                         r = -errno;
2019                         goto fail;
2020                 }
2021         }
2022
2023         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2024                 r = -EIO;
2025                 goto fail;
2026         }
2027
2028         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2029         if (f->header == MAP_FAILED) {
2030                 f->header = NULL;
2031                 r = -errno;
2032                 goto fail;
2033         }
2034
2035         if (!newly_created) {
2036                 r = journal_file_verify_header(f);
2037                 if (r < 0)
2038                         goto fail;
2039         }
2040
2041         if (f->writable) {
2042                 if (metrics) {
2043                         journal_default_metrics(metrics, f->fd);
2044                         f->metrics = *metrics;
2045                 } else if (template)
2046                         f->metrics = template->metrics;
2047
2048                 r = journal_file_refresh_header(f);
2049                 if (r < 0)
2050                         goto fail;
2051         }
2052
2053         if (newly_created) {
2054
2055                 r = journal_file_setup_field_hash_table(f);
2056                 if (r < 0)
2057                         goto fail;
2058
2059                 r = journal_file_setup_data_hash_table(f);
2060                 if (r < 0)
2061                         goto fail;
2062         }
2063
2064         r = journal_file_map_field_hash_table(f);
2065         if (r < 0)
2066                 goto fail;
2067
2068         r = journal_file_map_data_hash_table(f);
2069         if (r < 0)
2070                 goto fail;
2071
2072         if (ret)
2073                 *ret = f;
2074
2075         return 0;
2076
2077 fail:
2078         journal_file_close(f);
2079
2080         return r;
2081 }
2082
2083 int journal_file_rotate(JournalFile **f) {
2084         char *p;
2085         size_t l;
2086         JournalFile *old_file, *new_file = NULL;
2087         int r;
2088
2089         assert(f);
2090         assert(*f);
2091
2092         old_file = *f;
2093
2094         if (!old_file->writable)
2095                 return -EINVAL;
2096
2097         if (!endswith(old_file->path, ".journal"))
2098                 return -EINVAL;
2099
2100         l = strlen(old_file->path);
2101
2102         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2103         if (!p)
2104                 return -ENOMEM;
2105
2106         memcpy(p, old_file->path, l - 8);
2107         p[l-8] = '@';
2108         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2109         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2110                  "-%016llx-%016llx.journal",
2111                  (unsigned long long) le64toh((*f)->header->tail_seqnum),
2112                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2113
2114         r = rename(old_file->path, p);
2115         free(p);
2116
2117         if (r < 0)
2118                 return -errno;
2119
2120         old_file->header->state = STATE_ARCHIVED;
2121
2122         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, NULL, old_file, &new_file);
2123         journal_file_close(old_file);
2124
2125         *f = new_file;
2126         return r;
2127 }
2128
2129 int journal_file_open_reliably(
2130                 const char *fname,
2131                 int flags,
2132                 mode_t mode,
2133                 JournalMetrics *metrics,
2134                 JournalFile *template,
2135                 JournalFile **ret) {
2136
2137         int r;
2138         size_t l;
2139         char *p;
2140
2141         r = journal_file_open(fname, flags, mode, metrics, template, ret);
2142         if (r != -EBADMSG && /* corrupted */
2143             r != -ENODATA && /* truncated */
2144             r != -EHOSTDOWN && /* other machine */
2145             r != -EPROTONOSUPPORT && /* incompatible feature */
2146             r != -EBUSY && /* unclean shutdown */
2147             r != -ESHUTDOWN /* already archived */)
2148                 return r;
2149
2150         if ((flags & O_ACCMODE) == O_RDONLY)
2151                 return r;
2152
2153         if (!(flags & O_CREAT))
2154                 return r;
2155
2156         /* The file is corrupted. Rotate it away and try it again (but only once) */
2157
2158         l = strlen(fname);
2159         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2160                      (int) (l-8), fname,
2161                      (unsigned long long) now(CLOCK_REALTIME),
2162                      random_ull()) < 0)
2163                 return -ENOMEM;
2164
2165         r = rename(fname, p);
2166         free(p);
2167         if (r < 0)
2168                 return -errno;
2169
2170         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2171
2172         return journal_file_open(fname, flags, mode, metrics, template, ret);
2173 }
2174
2175 struct vacuum_info {
2176         off_t usage;
2177         char *filename;
2178
2179         uint64_t realtime;
2180         sd_id128_t seqnum_id;
2181         uint64_t seqnum;
2182
2183         bool have_seqnum;
2184 };
2185
2186 static int vacuum_compare(const void *_a, const void *_b) {
2187         const struct vacuum_info *a, *b;
2188
2189         a = _a;
2190         b = _b;
2191
2192         if (a->have_seqnum && b->have_seqnum &&
2193             sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
2194                 if (a->seqnum < b->seqnum)
2195                         return -1;
2196                 else if (a->seqnum > b->seqnum)
2197                         return 1;
2198                 else
2199                         return 0;
2200         }
2201
2202         if (a->realtime < b->realtime)
2203                 return -1;
2204         else if (a->realtime > b->realtime)
2205                 return 1;
2206         else if (a->have_seqnum && b->have_seqnum)
2207                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
2208         else
2209                 return strcmp(a->filename, b->filename);
2210 }
2211
2212 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
2213         DIR *d;
2214         int r = 0;
2215         struct vacuum_info *list = NULL;
2216         unsigned n_list = 0, n_allocated = 0, i;
2217         uint64_t sum = 0;
2218
2219         assert(directory);
2220
2221         if (max_use <= 0)
2222                 return 0;
2223
2224         d = opendir(directory);
2225         if (!d)
2226                 return -errno;
2227
2228         for (;;) {
2229                 int k;
2230                 struct dirent buf, *de;
2231                 size_t q;
2232                 struct stat st;
2233                 char *p;
2234                 unsigned long long seqnum = 0, realtime;
2235                 sd_id128_t seqnum_id;
2236                 bool have_seqnum;
2237
2238                 k = readdir_r(d, &buf, &de);
2239                 if (k != 0) {
2240                         r = -k;
2241                         goto finish;
2242                 }
2243
2244                 if (!de)
2245                         break;
2246
2247                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2248                         continue;
2249
2250                 if (!S_ISREG(st.st_mode))
2251                         continue;
2252
2253                 q = strlen(de->d_name);
2254
2255                 if (endswith(de->d_name, ".journal")) {
2256
2257                         /* Vacuum archived files */
2258
2259                         if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2260                                 continue;
2261
2262                         if (de->d_name[q-8-16-1] != '-' ||
2263                             de->d_name[q-8-16-1-16-1] != '-' ||
2264                             de->d_name[q-8-16-1-16-1-32-1] != '@')
2265                                 continue;
2266
2267                         p = strdup(de->d_name);
2268                         if (!p) {
2269                                 r = -ENOMEM;
2270                                 goto finish;
2271                         }
2272
2273                         de->d_name[q-8-16-1-16-1] = 0;
2274                         if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2275                                 free(p);
2276                                 continue;
2277                         }
2278
2279                         if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2280                                 free(p);
2281                                 continue;
2282                         }
2283
2284                         have_seqnum = true;
2285
2286                 } else if (endswith(de->d_name, ".journal~")) {
2287                         unsigned long long tmp;
2288
2289                         /* Vacuum corrupted files */
2290
2291                         if (q < 1 + 16 + 1 + 16 + 8 + 1)
2292                                 continue;
2293
2294                         if (de->d_name[q-1-8-16-1] != '-' ||
2295                             de->d_name[q-1-8-16-1-16-1] != '@')
2296                                 continue;
2297
2298                         p = strdup(de->d_name);
2299                         if (!p) {
2300                                 r = -ENOMEM;
2301                                 goto finish;
2302                         }
2303
2304                         if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2305                                 free(p);
2306                                 continue;
2307                         }
2308
2309                         have_seqnum = false;
2310                 } else
2311                         continue;
2312
2313                 if (n_list >= n_allocated) {
2314                         struct vacuum_info *j;
2315
2316                         n_allocated = MAX(n_allocated * 2U, 8U);
2317                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2318                         if (!j) {
2319                                 free(p);
2320                                 r = -ENOMEM;
2321                                 goto finish;
2322                         }
2323
2324                         list = j;
2325                 }
2326
2327                 list[n_list].filename = p;
2328                 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2329                 list[n_list].seqnum = seqnum;
2330                 list[n_list].realtime = realtime;
2331                 list[n_list].seqnum_id = seqnum_id;
2332                 list[n_list].have_seqnum = have_seqnum;
2333
2334                 sum += list[n_list].usage;
2335
2336                 n_list ++;
2337         }
2338
2339         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2340
2341         for(i = 0; i < n_list; i++) {
2342                 struct statvfs ss;
2343
2344                 if (fstatvfs(dirfd(d), &ss) < 0) {
2345                         r = -errno;
2346                         goto finish;
2347                 }
2348
2349                 if (sum <= max_use &&
2350                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2351                         break;
2352
2353                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2354                         log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2355                         sum -= list[i].usage;
2356                 } else if (errno != ENOENT)
2357                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2358         }
2359
2360 finish:
2361         for (i = 0; i < n_list; i++)
2362                 free(list[i].filename);
2363
2364         free(list);
2365
2366         if (d)
2367                 closedir(d);
2368
2369         return r;
2370 }
2371
2372 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2373         uint64_t i, n;
2374         uint64_t q, xor_hash = 0;
2375         int r;
2376         EntryItem *items;
2377         dual_timestamp ts;
2378
2379         assert(from);
2380         assert(to);
2381         assert(o);
2382         assert(p);
2383
2384         if (!to->writable)
2385                 return -EPERM;
2386
2387         ts.monotonic = le64toh(o->entry.monotonic);
2388         ts.realtime = le64toh(o->entry.realtime);
2389
2390         if (to->tail_entry_monotonic_valid &&
2391             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2392                 return -EINVAL;
2393
2394         n = journal_file_entry_n_items(o);
2395         items = alloca(sizeof(EntryItem) * n);
2396
2397         for (i = 0; i < n; i++) {
2398                 uint64_t l, h;
2399                 le64_t le_hash;
2400                 size_t t;
2401                 void *data;
2402                 Object *u;
2403
2404                 q = le64toh(o->entry.items[i].object_offset);
2405                 le_hash = o->entry.items[i].hash;
2406
2407                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2408                 if (r < 0)
2409                         return r;
2410
2411                 if (le_hash != o->data.hash)
2412                         return -EBADMSG;
2413
2414                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2415                 t = (size_t) l;
2416
2417                 /* We hit the limit on 32bit machines */
2418                 if ((uint64_t) t != l)
2419                         return -E2BIG;
2420
2421                 if (o->object.flags & OBJECT_COMPRESSED) {
2422 #ifdef HAVE_XZ
2423                         uint64_t rsize;
2424
2425                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2426                                 return -EBADMSG;
2427
2428                         data = from->compress_buffer;
2429                         l = rsize;
2430 #else
2431                         return -EPROTONOSUPPORT;
2432 #endif
2433                 } else
2434                         data = o->data.payload;
2435
2436                 r = journal_file_append_data(to, data, l, &u, &h);
2437                 if (r < 0)
2438                         return r;
2439
2440                 xor_hash ^= le64toh(u->data.hash);
2441                 items[i].object_offset = htole64(h);
2442                 items[i].hash = u->data.hash;
2443
2444                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2445                 if (r < 0)
2446                         return r;
2447         }
2448
2449         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2450 }
2451
2452 void journal_default_metrics(JournalMetrics *m, int fd) {
2453         uint64_t fs_size = 0;
2454         struct statvfs ss;
2455         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2456
2457         assert(m);
2458         assert(fd >= 0);
2459
2460         if (fstatvfs(fd, &ss) >= 0)
2461                 fs_size = ss.f_frsize * ss.f_blocks;
2462
2463         if (m->max_use == (uint64_t) -1) {
2464
2465                 if (fs_size > 0) {
2466                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2467
2468                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2469                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2470
2471                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2472                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2473                 } else
2474                         m->max_use = DEFAULT_MAX_USE_LOWER;
2475         } else {
2476                 m->max_use = PAGE_ALIGN(m->max_use);
2477
2478                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2479                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2480         }
2481
2482         if (m->max_size == (uint64_t) -1) {
2483                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2484
2485                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2486                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2487         } else
2488                 m->max_size = PAGE_ALIGN(m->max_size);
2489
2490         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2491                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2492
2493         if (m->max_size*2 > m->max_use)
2494                 m->max_use = m->max_size*2;
2495
2496         if (m->min_size == (uint64_t) -1)
2497                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2498         else {
2499                 m->min_size = PAGE_ALIGN(m->min_size);
2500
2501                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2502                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2503
2504                 if (m->min_size > m->max_size)
2505                         m->max_size = m->min_size;
2506         }
2507
2508         if (m->keep_free == (uint64_t) -1) {
2509
2510                 if (fs_size > 0) {
2511                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2512
2513                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2514                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2515
2516                 } else
2517                         m->keep_free = DEFAULT_KEEP_FREE;
2518         }
2519
2520         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2521                  format_bytes(a, sizeof(a), m->max_use),
2522                  format_bytes(b, sizeof(b), m->max_size),
2523                  format_bytes(c, sizeof(c), m->min_size),
2524                  format_bytes(d, sizeof(d), m->keep_free));
2525 }
2526
2527 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2528         assert(f);
2529         assert(from || to);
2530
2531         if (from) {
2532                 if (f->header->head_entry_realtime == 0)
2533                         return -ENOENT;
2534
2535                 *from = le64toh(f->header->head_entry_realtime);
2536         }
2537
2538         if (to) {
2539                 if (f->header->tail_entry_realtime == 0)
2540                         return -ENOENT;
2541
2542                 *to = le64toh(f->header->tail_entry_realtime);
2543         }
2544
2545         return 1;
2546 }
2547
2548 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2549         char t[9+32+1] = "_BOOT_ID=";
2550         Object *o;
2551         uint64_t p;
2552         int r;
2553
2554         assert(f);
2555         assert(from || to);
2556
2557         sd_id128_to_string(boot_id, t + 9);
2558
2559         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2560         if (r <= 0)
2561                 return r;
2562
2563         if (le64toh(o->data.n_entries) <= 0)
2564                 return 0;
2565
2566         if (from) {
2567                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2568                 if (r < 0)
2569                         return r;
2570
2571                 *from = le64toh(o->entry.monotonic);
2572         }
2573
2574         if (to) {
2575                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2576                 if (r < 0)
2577                         return r;
2578
2579                 r = generic_array_get_plus_one(f,
2580                                                le64toh(o->data.entry_offset),
2581                                                le64toh(o->data.entry_array_offset),
2582                                                le64toh(o->data.n_entries)-1,
2583                                                &o, NULL);
2584                 if (r <= 0)
2585                         return r;
2586
2587                 *to = le64toh(o->entry.monotonic);
2588         }
2589
2590         return 1;
2591 }
2592
2593 bool journal_file_rotate_suggested(JournalFile *f) {
2594         assert(f);
2595
2596         /* If we gained new header fields we gained new features,
2597          * hence suggest a rotation */
2598         if (le64toh(f->header->header_size) < sizeof(Header)) {
2599                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2600                 return true;
2601         }
2602
2603         /* Let's check if the hash tables grew over a certain fill
2604          * level (75%, borrowing this value from Java's hash table
2605          * implementation), and if so suggest a rotation. To calculate
2606          * the fill level we need the n_data field, which only exists
2607          * in newer versions. */
2608
2609         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2610                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2611                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2612                                   f->path,
2613                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2614                                   (unsigned long long) le64toh(f->header->n_data),
2615                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2616                                   (unsigned long long) (f->last_stat.st_size),
2617                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2618                         return true;
2619                 }
2620
2621         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2622                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2623                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2624                                   f->path,
2625                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2626                                   (unsigned long long) le64toh(f->header->n_fields),
2627                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2628                         return true;
2629                 }
2630
2631         return false;
2632 }