chiark / gitweb /
journal: actually set archived files to archived state
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
/* Sizes (in bytes) of the hash tables created for a fresh journal file */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Mapping windows are grown to this size when less is requested */
#define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)             /* 8 MiB */

/* Payloads shorter than this are never handed to the compressor */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */

/* Lower and upper bounds applied when max_use is deduced from the
 * file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* Upper bound applied when max_size is deduced from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* Upper bound applied when keep_free is deduced from the file system
 * size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* keep_free value used when the file system size cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* Rounds x up to the next multiple of 8 */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)

/* n_data was the first field added after the initial file format
 * design, hence everything in front of it is mandatory */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* True if the header, as sized on disk, is large enough to hold the
 * named field */
#define JOURNAL_HEADER_CONTAINS(h, field) \
        (le64toh((h)->header_size) >= offsetof(Header, field) + sizeof((h)->field))

/* Magic bytes every journal file starts with */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
70
71 void journal_file_close(JournalFile *f) {
72         int t;
73
74         assert(f);
75
76         if (f->header) {
77                 /* Mark the file offline. Don't override the archived state if it already is set */
78                 if (f->writable && f->header->state == STATE_ONLINE)
79                         f->header->state = STATE_OFFLINE;
80
81                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
82         }
83
84         for (t = 0; t < _WINDOW_MAX; t++)
85                 if (f->windows[t].ptr)
86                         munmap(f->windows[t].ptr, f->windows[t].size);
87
88         if (f->fd >= 0)
89                 close_nointr_nofail(f->fd);
90
91         free(f->path);
92
93 #ifdef HAVE_XZ
94         free(f->compress_buffer);
95 #endif
96
97         free(f);
98 }
99
100 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
101         Header h;
102         ssize_t k;
103         int r;
104
105         assert(f);
106
107         zero(h);
108         memcpy(h.signature, signature, 8);
109         h.header_size = htole64(ALIGN64(sizeof(h)));
110
111         r = sd_id128_randomize(&h.file_id);
112         if (r < 0)
113                 return r;
114
115         if (template) {
116                 h.seqnum_id = template->header->seqnum_id;
117                 h.tail_seqnum = template->header->tail_seqnum;
118         } else
119                 h.seqnum_id = h.file_id;
120
121         k = pwrite(f->fd, &h, sizeof(h), 0);
122         if (k < 0)
123                 return -errno;
124
125         if (k != sizeof(h))
126                 return -EIO;
127
128         return 0;
129 }
130
131 static int journal_file_refresh_header(JournalFile *f) {
132         int r;
133         sd_id128_t boot_id;
134
135         assert(f);
136
137         r = sd_id128_get_machine(&f->header->machine_id);
138         if (r < 0)
139                 return r;
140
141         r = sd_id128_get_boot(&boot_id);
142         if (r < 0)
143                 return r;
144
145         if (sd_id128_equal(boot_id, f->header->boot_id))
146                 f->tail_entry_monotonic_valid = true;
147
148         f->header->boot_id = boot_id;
149
150         f->header->state = STATE_ONLINE;
151
152         __sync_synchronize();
153
154         return 0;
155 }
156
157 static int journal_file_verify_header(JournalFile *f) {
158         assert(f);
159
160         if (memcmp(f->header, signature, 8))
161                 return -EBADMSG;
162
163 #ifdef HAVE_XZ
164         if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
165                 return -EPROTONOSUPPORT;
166 #else
167         if (f->header->incompatible_flags != 0)
168                 return -EPROTONOSUPPORT;
169 #endif
170
171         /* The first addition was n_data, so check that we are at least this large */
172         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
173                 return -EBADMSG;
174
175         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
176                 return -ENODATA;
177
178         if (f->writable) {
179                 uint8_t state;
180                 sd_id128_t machine_id;
181                 int r;
182
183                 r = sd_id128_get_machine(&machine_id);
184                 if (r < 0)
185                         return r;
186
187                 if (!sd_id128_equal(machine_id, f->header->machine_id))
188                         return -EHOSTDOWN;
189
190                 state = f->header->state;
191
192                 if (state == STATE_ONLINE) {
193                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
194                         return -EBUSY;
195                 } else if (state == STATE_ARCHIVED)
196                         return -ESHUTDOWN;
197                 else if (state != STATE_OFFLINE) {
198                         log_debug("Journal file %s has unknown state %u.", f->path, state);
199                         return -EBUSY;
200                 }
201         }
202
203         return 0;
204 }
205
206 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
207         uint64_t old_size, new_size;
208         int r;
209
210         assert(f);
211
212         /* We assume that this file is not sparse, and we know that
213          * for sure, since we always call posix_fallocate()
214          * ourselves */
215
216         old_size =
217                 le64toh(f->header->header_size) +
218                 le64toh(f->header->arena_size);
219
220         new_size = PAGE_ALIGN(offset + size);
221         if (new_size < le64toh(f->header->header_size))
222                 new_size = le64toh(f->header->header_size);
223
224         if (new_size <= old_size)
225                 return 0;
226
227         if (f->metrics.max_size > 0 &&
228             new_size > f->metrics.max_size)
229                 return -E2BIG;
230
231         if (new_size > f->metrics.min_size &&
232             f->metrics.keep_free > 0) {
233                 struct statvfs svfs;
234
235                 if (fstatvfs(f->fd, &svfs) >= 0) {
236                         uint64_t available;
237
238                         available = svfs.f_bfree * svfs.f_bsize;
239
240                         if (available >= f->metrics.keep_free)
241                                 available -= f->metrics.keep_free;
242                         else
243                                 available = 0;
244
245                         if (new_size - old_size > available)
246                                 return -E2BIG;
247                 }
248         }
249
250         /* Note that the glibc fallocate() fallback is very
251            inefficient, hence we try to minimize the allocation area
252            as we can. */
253         r = posix_fallocate(f->fd, old_size, new_size - old_size);
254         if (r != 0)
255                 return -r;
256
257         if (fstat(f->fd, &f->last_stat) < 0)
258                 return -errno;
259
260         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
261
262         return 0;
263 }
264
265 static int journal_file_map(
266                 JournalFile *f,
267                 uint64_t offset,
268                 uint64_t size,
269                 void **_window,
270                 uint64_t *_woffset,
271                 uint64_t *_wsize,
272                 void **ret) {
273
274         uint64_t woffset, wsize;
275         void *window;
276
277         assert(f);
278         assert(size > 0);
279         assert(ret);
280
281         woffset = offset & ~((uint64_t) page_size() - 1ULL);
282         wsize = size + (offset - woffset);
283         wsize = PAGE_ALIGN(wsize);
284
285         /* Avoid SIGBUS on invalid accesses */
286         if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
287                 return -EADDRNOTAVAIL;
288
289         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
290         if (window == MAP_FAILED)
291                 return -errno;
292
293         if (_window)
294                 *_window = window;
295
296         if (_woffset)
297                 *_woffset = woffset;
298
299         if (_wsize)
300                 *_wsize = wsize;
301
302         *ret = (uint8_t*) window + (offset - woffset);
303
304         return 0;
305 }
306
307 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
308         void *p = NULL;
309         uint64_t delta;
310         int r;
311         Window *w;
312
313         assert(f);
314         assert(ret);
315         assert(wt >= 0);
316         assert(wt < _WINDOW_MAX);
317
318         if (offset + size > (uint64_t) f->last_stat.st_size) {
319                 /* Hmm, out of range? Let's refresh the fstat() data
320                  * first, before we trust that check. */
321
322                 if (fstat(f->fd, &f->last_stat) < 0 ||
323                     offset + size > (uint64_t) f->last_stat.st_size)
324                         return -EADDRNOTAVAIL;
325         }
326
327         w = f->windows + wt;
328
329         if (_likely_(w->ptr &&
330                      w->offset <= offset &&
331                      w->offset + w->size >= offset + size)) {
332
333                 *ret = (uint8_t*) w->ptr + (offset - w->offset);
334                 return 0;
335         }
336
337         if (w->ptr) {
338                 if (munmap(w->ptr, w->size) < 0)
339                         return -errno;
340
341                 w->ptr = NULL;
342                 w->size = w->offset = 0;
343         }
344
345         if (size < DEFAULT_WINDOW_SIZE) {
346                 /* If the default window size is larger then what was
347                  * asked for extend the mapping a bit in the hope to
348                  * minimize needed remappings later on. We add half
349                  * the window space before and half behind the
350                  * requested mapping */
351
352                 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
353
354                 if (delta > offset)
355                         delta = offset;
356
357                 offset -= delta;
358                 size = DEFAULT_WINDOW_SIZE;
359         } else
360                 delta = 0;
361
362         if (offset + size > (uint64_t) f->last_stat.st_size)
363                 size = (uint64_t) f->last_stat.st_size - offset;
364
365         if (size <= 0)
366                 return -EADDRNOTAVAIL;
367
368         r = journal_file_map(f,
369                              offset, size,
370                              &w->ptr, &w->offset, &w->size,
371                              &p);
372
373         if (r < 0)
374                 return r;
375
376         *ret = (uint8_t*) p + delta;
377         return 0;
378 }
379
380 static bool verify_hash(Object *o) {
381         uint64_t h1, h2;
382
383         assert(o);
384
385         if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
386                 h1 = le64toh(o->data.hash);
387                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
388         } else if (o->object.type == OBJECT_FIELD) {
389                 h1 = le64toh(o->field.hash);
390                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
391         } else
392                 return true;
393
394         return h1 == h2;
395 }
396
397 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
398         int r;
399         void *t;
400         Object *o;
401         uint64_t s;
402
403         assert(f);
404         assert(ret);
405         assert(type < _OBJECT_TYPE_MAX);
406
407         r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
408         if (r < 0)
409                 return r;
410
411         o = (Object*) t;
412         s = le64toh(o->object.size);
413
414         if (s < sizeof(ObjectHeader))
415                 return -EBADMSG;
416
417         if (type >= 0 && o->object.type != type)
418                 return -EBADMSG;
419
420         if (s > sizeof(ObjectHeader)) {
421                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
422                 if (r < 0)
423                         return r;
424
425                 o = (Object*) t;
426         }
427
428         if (!verify_hash(o))
429                 return -EBADMSG;
430
431         *ret = o;
432         return 0;
433 }
434
435 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
436         uint64_t r;
437
438         assert(f);
439
440         r = le64toh(f->header->tail_seqnum) + 1;
441
442         if (seqnum) {
443                 /* If an external seqnum counter was passed, we update
444                  * both the local and the external one, and set it to
445                  * the maximum of both */
446
447                 if (*seqnum + 1 > r)
448                         r = *seqnum + 1;
449
450                 *seqnum = r;
451         }
452
453         f->header->tail_seqnum = htole64(r);
454
455         if (f->header->head_seqnum == 0)
456                 f->header->head_seqnum = htole64(r);
457
458         return r;
459 }
460
461 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
462         int r;
463         uint64_t p;
464         Object *tail, *o;
465         void *t;
466
467         assert(f);
468         assert(size >= sizeof(ObjectHeader));
469         assert(offset);
470         assert(ret);
471
472         p = le64toh(f->header->tail_object_offset);
473         if (p == 0)
474                 p = le64toh(f->header->header_size);
475         else {
476                 r = journal_file_move_to_object(f, -1, p, &tail);
477                 if (r < 0)
478                         return r;
479
480                 p += ALIGN64(le64toh(tail->object.size));
481         }
482
483         r = journal_file_allocate(f, p, size);
484         if (r < 0)
485                 return r;
486
487         r = journal_file_move_to(f, type, p, size, &t);
488         if (r < 0)
489                 return r;
490
491         o = (Object*) t;
492
493         zero(o->object);
494         o->object.type = type;
495         o->object.size = htole64(size);
496
497         f->header->tail_object_offset = htole64(p);
498         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
499
500         *ret = o;
501         *offset = p;
502
503         return 0;
504 }
505
506 static int journal_file_setup_data_hash_table(JournalFile *f) {
507         uint64_t s, p;
508         Object *o;
509         int r;
510
511         assert(f);
512
513         /* We estimate that we need 1 hash table entry per 2K of
514            journal file and we want to make sure we never get beyond
515            75% fill level. Calculate the hash table size for the
516            maximum file size based on these metrics. */
517
518         s = (f->metrics.max_size * 4 / 2048 / 3) * sizeof(HashItem);
519         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
520                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
521
522         log_info("Reserving %llu entries in hash table.", (unsigned long long) s);
523
524         r = journal_file_append_object(f,
525                                        OBJECT_DATA_HASH_TABLE,
526                                        offsetof(Object, hash_table.items) + s,
527                                        &o, &p);
528         if (r < 0)
529                 return r;
530
531         memset(o->hash_table.items, 0, s);
532
533         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
534         f->header->data_hash_table_size = htole64(s);
535
536         return 0;
537 }
538
539 static int journal_file_setup_field_hash_table(JournalFile *f) {
540         uint64_t s, p;
541         Object *o;
542         int r;
543
544         assert(f);
545
546         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
547         r = journal_file_append_object(f,
548                                        OBJECT_FIELD_HASH_TABLE,
549                                        offsetof(Object, hash_table.items) + s,
550                                        &o, &p);
551         if (r < 0)
552                 return r;
553
554         memset(o->hash_table.items, 0, s);
555
556         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
557         f->header->field_hash_table_size = htole64(s);
558
559         return 0;
560 }
561
562 static int journal_file_map_data_hash_table(JournalFile *f) {
563         uint64_t s, p;
564         void *t;
565         int r;
566
567         assert(f);
568
569         p = le64toh(f->header->data_hash_table_offset);
570         s = le64toh(f->header->data_hash_table_size);
571
572         r = journal_file_move_to(f,
573                                  WINDOW_DATA_HASH_TABLE,
574                                  p, s,
575                                  &t);
576         if (r < 0)
577                 return r;
578
579         f->data_hash_table = t;
580         return 0;
581 }
582
583 static int journal_file_map_field_hash_table(JournalFile *f) {
584         uint64_t s, p;
585         void *t;
586         int r;
587
588         assert(f);
589
590         p = le64toh(f->header->field_hash_table_offset);
591         s = le64toh(f->header->field_hash_table_size);
592
593         r = journal_file_move_to(f,
594                                  WINDOW_FIELD_HASH_TABLE,
595                                  p, s,
596                                  &t);
597         if (r < 0)
598                 return r;
599
600         f->field_hash_table = t;
601         return 0;
602 }
603
604 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
605         uint64_t p, h;
606         int r;
607
608         assert(f);
609         assert(o);
610         assert(offset > 0);
611         assert(o->object.type == OBJECT_DATA);
612
613         /* This might alter the window we are looking at */
614
615         o->data.next_hash_offset = o->data.next_field_offset = 0;
616         o->data.entry_offset = o->data.entry_array_offset = 0;
617         o->data.n_entries = 0;
618
619         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
620         p = le64toh(f->data_hash_table[h].tail_hash_offset);
621         if (p == 0) {
622                 /* Only entry in the hash table is easy */
623                 f->data_hash_table[h].head_hash_offset = htole64(offset);
624         } else {
625                 /* Move back to the previous data object, to patch in
626                  * pointer */
627
628                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
629                 if (r < 0)
630                         return r;
631
632                 o->data.next_hash_offset = htole64(offset);
633         }
634
635         f->data_hash_table[h].tail_hash_offset = htole64(offset);
636
637         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
638                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
639
640         return 0;
641 }
642
643 int journal_file_find_data_object_with_hash(
644                 JournalFile *f,
645                 const void *data, uint64_t size, uint64_t hash,
646                 Object **ret, uint64_t *offset) {
647
648         uint64_t p, osize, h;
649         int r;
650
651         assert(f);
652         assert(data || size == 0);
653
654         osize = offsetof(Object, data.payload) + size;
655
656         if (f->header->data_hash_table_size == 0)
657                 return -EBADMSG;
658
659         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
660         p = le64toh(f->data_hash_table[h].head_hash_offset);
661
662         while (p > 0) {
663                 Object *o;
664
665                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
666                 if (r < 0)
667                         return r;
668
669                 if (le64toh(o->data.hash) != hash)
670                         goto next;
671
672                 if (o->object.flags & OBJECT_COMPRESSED) {
673 #ifdef HAVE_XZ
674                         uint64_t l, rsize;
675
676                         l = le64toh(o->object.size);
677                         if (l <= offsetof(Object, data.payload))
678                                 return -EBADMSG;
679
680                         l -= offsetof(Object, data.payload);
681
682                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
683                                 return -EBADMSG;
684
685                         if (rsize == size &&
686                             memcmp(f->compress_buffer, data, size) == 0) {
687
688                                 if (ret)
689                                         *ret = o;
690
691                                 if (offset)
692                                         *offset = p;
693
694                                 return 1;
695                         }
696 #else
697                         return -EPROTONOSUPPORT;
698 #endif
699
700                 } else if (le64toh(o->object.size) == osize &&
701                            memcmp(o->data.payload, data, size) == 0) {
702
703                         if (ret)
704                                 *ret = o;
705
706                         if (offset)
707                                 *offset = p;
708
709                         return 1;
710                 }
711
712         next:
713                 p = le64toh(o->data.next_hash_offset);
714         }
715
716         return 0;
717 }
718
719 int journal_file_find_data_object(
720                 JournalFile *f,
721                 const void *data, uint64_t size,
722                 Object **ret, uint64_t *offset) {
723
724         uint64_t hash;
725
726         assert(f);
727         assert(data || size == 0);
728
729         hash = hash64(data, size);
730
731         return journal_file_find_data_object_with_hash(f,
732                                                        data, size, hash,
733                                                        ret, offset);
734 }
735
736 static int journal_file_append_data(
737                 JournalFile *f,
738                 const void *data, uint64_t size,
739                 Object **ret, uint64_t *offset) {
740
741         uint64_t hash, p;
742         uint64_t osize;
743         Object *o;
744         int r;
745         bool compressed = false;
746
747         assert(f);
748         assert(data || size == 0);
749
750         hash = hash64(data, size);
751
752         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
753         if (r < 0)
754                 return r;
755         else if (r > 0) {
756
757                 if (ret)
758                         *ret = o;
759
760                 if (offset)
761                         *offset = p;
762
763                 return 0;
764         }
765
766         osize = offsetof(Object, data.payload) + size;
767         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
768         if (r < 0)
769                 return r;
770
771         o->data.hash = htole64(hash);
772
773 #ifdef HAVE_XZ
774         if (f->compress &&
775             size >= COMPRESSION_SIZE_THRESHOLD) {
776                 uint64_t rsize;
777
778                 compressed = compress_blob(data, size, o->data.payload, &rsize);
779
780                 if (compressed) {
781                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
782                         o->object.flags |= OBJECT_COMPRESSED;
783
784                         f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
785
786                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
787                 }
788         }
789 #endif
790
791         if (!compressed)
792                 memcpy(o->data.payload, data, size);
793
794         r = journal_file_link_data(f, o, p, hash);
795         if (r < 0)
796                 return r;
797
798         /* The linking might have altered the window, so let's
799          * refresh our pointer */
800         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
801         if (r < 0)
802                 return r;
803
804         if (ret)
805                 *ret = o;
806
807         if (offset)
808                 *offset = p;
809
810         return 0;
811 }
812
813 uint64_t journal_file_entry_n_items(Object *o) {
814         assert(o);
815         assert(o->object.type == OBJECT_ENTRY);
816
817         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
818 }
819
820 static uint64_t journal_file_entry_array_n_items(Object *o) {
821         assert(o);
822         assert(o->object.type == OBJECT_ENTRY_ARRAY);
823
824         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
825 }
826
827 static int link_entry_into_array(JournalFile *f,
828                                  le64_t *first,
829                                  le64_t *idx,
830                                  uint64_t p) {
831         int r;
832         uint64_t n = 0, ap = 0, q, i, a, hidx;
833         Object *o;
834
835         assert(f);
836         assert(first);
837         assert(idx);
838         assert(p > 0);
839
840         a = le64toh(*first);
841         i = hidx = le64toh(*idx);
842         while (a > 0) {
843
844                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
845                 if (r < 0)
846                         return r;
847
848                 n = journal_file_entry_array_n_items(o);
849                 if (i < n) {
850                         o->entry_array.items[i] = htole64(p);
851                         *idx = htole64(hidx + 1);
852                         return 0;
853                 }
854
855                 i -= n;
856                 ap = a;
857                 a = le64toh(o->entry_array.next_entry_array_offset);
858         }
859
860         if (hidx > n)
861                 n = (hidx+1) * 2;
862         else
863                 n = n * 2;
864
865         if (n < 4)
866                 n = 4;
867
868         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
869                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
870                                        &o, &q);
871         if (r < 0)
872                 return r;
873
874         o->entry_array.items[i] = htole64(p);
875
876         if (ap == 0)
877                 *first = htole64(q);
878         else {
879                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
880                 if (r < 0)
881                         return r;
882
883                 o->entry_array.next_entry_array_offset = htole64(q);
884         }
885
886         *idx = htole64(hidx + 1);
887
888         return 0;
889 }
890
891 static int link_entry_into_array_plus_one(JournalFile *f,
892                                           le64_t *extra,
893                                           le64_t *first,
894                                           le64_t *idx,
895                                           uint64_t p) {
896
897         int r;
898
899         assert(f);
900         assert(extra);
901         assert(first);
902         assert(idx);
903         assert(p > 0);
904
905         if (*idx == 0)
906                 *extra = htole64(p);
907         else {
908                 le64_t i;
909
910                 i = htole64(le64toh(*idx) - 1);
911                 r = link_entry_into_array(f, first, &i, p);
912                 if (r < 0)
913                         return r;
914         }
915
916         *idx = htole64(le64toh(*idx) + 1);
917         return 0;
918 }
919
920 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
921         uint64_t p;
922         int r;
923         assert(f);
924         assert(o);
925         assert(offset > 0);
926
927         p = le64toh(o->entry.items[i].object_offset);
928         if (p == 0)
929                 return -EINVAL;
930
931         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
932         if (r < 0)
933                 return r;
934
935         return link_entry_into_array_plus_one(f,
936                                               &o->data.entry_offset,
937                                               &o->data.entry_array_offset,
938                                               &o->data.n_entries,
939                                               offset);
940 }
941
942 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
943         uint64_t n, i;
944         int r;
945
946         assert(f);
947         assert(o);
948         assert(offset > 0);
949         assert(o->object.type == OBJECT_ENTRY);
950
951         __sync_synchronize();
952
953         /* Link up the entry itself */
954         r = link_entry_into_array(f,
955                                   &f->header->entry_array_offset,
956                                   &f->header->n_entries,
957                                   offset);
958         if (r < 0)
959                 return r;
960
961         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
962
963         if (f->header->head_entry_realtime == 0)
964                 f->header->head_entry_realtime = o->entry.realtime;
965
966         f->header->tail_entry_realtime = o->entry.realtime;
967         f->header->tail_entry_monotonic = o->entry.monotonic;
968
969         f->tail_entry_monotonic_valid = true;
970
971         /* Link up the items */
972         n = journal_file_entry_n_items(o);
973         for (i = 0; i < n; i++) {
974                 r = journal_file_link_entry_item(f, o, offset, i);
975                 if (r < 0)
976                         return r;
977         }
978
979         return 0;
980 }
981
982 static int journal_file_append_entry_internal(
983                 JournalFile *f,
984                 const dual_timestamp *ts,
985                 uint64_t xor_hash,
986                 const EntryItem items[], unsigned n_items,
987                 uint64_t *seqnum,
988                 Object **ret, uint64_t *offset) {
989         uint64_t np;
990         uint64_t osize;
991         Object *o;
992         int r;
993
994         assert(f);
995         assert(items || n_items == 0);
996         assert(ts);
997
998         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
999
1000         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1001         if (r < 0)
1002                 return r;
1003
1004         o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
1005         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1006         o->entry.realtime = htole64(ts->realtime);
1007         o->entry.monotonic = htole64(ts->monotonic);
1008         o->entry.xor_hash = htole64(xor_hash);
1009         o->entry.boot_id = f->header->boot_id;
1010
1011         r = journal_file_link_entry(f, o, np);
1012         if (r < 0)
1013                 return r;
1014
1015         if (ret)
1016                 *ret = o;
1017
1018         if (offset)
1019                 *offset = np;
1020
1021         return 0;
1022 }
1023
1024 void journal_file_post_change(JournalFile *f) {
1025         assert(f);
1026
1027         /* inotify() does not receive IN_MODIFY events from file
1028          * accesses done via mmap(). After each access we hence
1029          * trigger IN_MODIFY by truncating the journal file to its
1030          * current size which triggers IN_MODIFY. */
1031
1032         __sync_synchronize();
1033
1034         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1035                 log_error("Failed to to truncate file to its own size: %m");
1036 }
1037
1038 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1039         unsigned i;
1040         EntryItem *items;
1041         int r;
1042         uint64_t xor_hash = 0;
1043         struct dual_timestamp _ts;
1044
1045         assert(f);
1046         assert(iovec || n_iovec == 0);
1047
1048         if (!f->writable)
1049                 return -EPERM;
1050
1051         if (!ts) {
1052                 dual_timestamp_get(&_ts);
1053                 ts = &_ts;
1054         }
1055
1056         if (f->tail_entry_monotonic_valid &&
1057             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1058                 return -EINVAL;
1059
1060         items = alloca(sizeof(EntryItem) * n_iovec);
1061
1062         for (i = 0; i < n_iovec; i++) {
1063                 uint64_t p;
1064                 Object *o;
1065
1066                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1067                 if (r < 0)
1068                         return r;
1069
1070                 xor_hash ^= le64toh(o->data.hash);
1071                 items[i].object_offset = htole64(p);
1072                 items[i].hash = o->data.hash;
1073         }
1074
1075         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1076
1077         journal_file_post_change(f);
1078
1079         return r;
1080 }
1081
1082 static int generic_array_get(JournalFile *f,
1083                              uint64_t first,
1084                              uint64_t i,
1085                              Object **ret, uint64_t *offset) {
1086
1087         Object *o;
1088         uint64_t p = 0, a;
1089         int r;
1090
1091         assert(f);
1092
1093         a = first;
1094         while (a > 0) {
1095                 uint64_t n;
1096
1097                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1098                 if (r < 0)
1099                         return r;
1100
1101                 n = journal_file_entry_array_n_items(o);
1102                 if (i < n) {
1103                         p = le64toh(o->entry_array.items[i]);
1104                         break;
1105                 }
1106
1107                 i -= n;
1108                 a = le64toh(o->entry_array.next_entry_array_offset);
1109         }
1110
1111         if (a <= 0 || p <= 0)
1112                 return 0;
1113
1114         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1115         if (r < 0)
1116                 return r;
1117
1118         if (ret)
1119                 *ret = o;
1120
1121         if (offset)
1122                 *offset = p;
1123
1124         return 1;
1125 }
1126
1127 static int generic_array_get_plus_one(JournalFile *f,
1128                                       uint64_t extra,
1129                                       uint64_t first,
1130                                       uint64_t i,
1131                                       Object **ret, uint64_t *offset) {
1132
1133         Object *o;
1134
1135         assert(f);
1136
1137         if (i == 0) {
1138                 int r;
1139
1140                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1141                 if (r < 0)
1142                         return r;
1143
1144                 if (ret)
1145                         *ret = o;
1146
1147                 if (offset)
1148                         *offset = extra;
1149
1150                 return 1;
1151         }
1152
1153         return generic_array_get(f, first, i-1, ret, offset);
1154 }
1155
/* Verdicts of the test_object() callbacks used for bisection; they
 * describe where the tested object sits relative to the needle. */
enum {
        TEST_FOUND = 0,   /* the tested object matches the needle */
        TEST_LEFT  = 1,   /* the tested object sorts before the needle */
        TEST_RIGHT = 2    /* the tested object sorts after the needle */
};
1161
1162 static int generic_array_bisect(JournalFile *f,
1163                                 uint64_t first,
1164                                 uint64_t n,
1165                                 uint64_t needle,
1166                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1167                                 direction_t direction,
1168                                 Object **ret,
1169                                 uint64_t *offset,
1170                                 uint64_t *idx) {
1171
1172         uint64_t a, p, t = 0, i = 0, last_p = 0;
1173         bool subtract_one = false;
1174         Object *o, *array = NULL;
1175         int r;
1176
1177         assert(f);
1178         assert(test_object);
1179
1180         a = first;
1181         while (a > 0) {
1182                 uint64_t left, right, k, lp;
1183
1184                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1185                 if (r < 0)
1186                         return r;
1187
1188                 k = journal_file_entry_array_n_items(array);
1189                 right = MIN(k, n);
1190                 if (right <= 0)
1191                         return 0;
1192
1193                 i = right - 1;
1194                 lp = p = le64toh(array->entry_array.items[i]);
1195                 if (p <= 0)
1196                         return -EBADMSG;
1197
1198                 r = test_object(f, p, needle);
1199                 if (r < 0)
1200                         return r;
1201
1202                 if (r == TEST_FOUND)
1203                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1204
1205                 if (r == TEST_RIGHT) {
1206                         left = 0;
1207                         right -= 1;
1208                         for (;;) {
1209                                 if (left == right) {
1210                                         if (direction == DIRECTION_UP)
1211                                                 subtract_one = true;
1212
1213                                         i = left;
1214                                         goto found;
1215                                 }
1216
1217                                 assert(left < right);
1218
1219                                 i = (left + right) / 2;
1220                                 p = le64toh(array->entry_array.items[i]);
1221                                 if (p <= 0)
1222                                         return -EBADMSG;
1223
1224                                 r = test_object(f, p, needle);
1225                                 if (r < 0)
1226                                         return r;
1227
1228                                 if (r == TEST_FOUND)
1229                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1230
1231                                 if (r == TEST_RIGHT)
1232                                         right = i;
1233                                 else
1234                                         left = i + 1;
1235                         }
1236                 }
1237
1238                 if (k > n) {
1239                         if (direction == DIRECTION_UP) {
1240                                 i = n;
1241                                 subtract_one = true;
1242                                 goto found;
1243                         }
1244
1245                         return 0;
1246                 }
1247
1248                 last_p = lp;
1249
1250                 n -= k;
1251                 t += k;
1252                 a = le64toh(array->entry_array.next_entry_array_offset);
1253         }
1254
1255         return 0;
1256
1257 found:
1258         if (subtract_one && t == 0 && i == 0)
1259                 return 0;
1260
1261         if (subtract_one && i == 0)
1262                 p = last_p;
1263         else if (subtract_one)
1264                 p = le64toh(array->entry_array.items[i-1]);
1265         else
1266                 p = le64toh(array->entry_array.items[i]);
1267
1268         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1269         if (r < 0)
1270                 return r;
1271
1272         if (ret)
1273                 *ret = o;
1274
1275         if (offset)
1276                 *offset = p;
1277
1278         if (idx)
1279                 *idx = t + i + (subtract_one ? -1 : 0);
1280
1281         return 1;
1282 }
1283
1284 static int generic_array_bisect_plus_one(JournalFile *f,
1285                                          uint64_t extra,
1286                                          uint64_t first,
1287                                          uint64_t n,
1288                                          uint64_t needle,
1289                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1290                                          direction_t direction,
1291                                          Object **ret,
1292                                          uint64_t *offset,
1293                                          uint64_t *idx) {
1294
1295         int r;
1296         bool step_back = false;
1297         Object *o;
1298
1299         assert(f);
1300         assert(test_object);
1301
1302         if (n <= 0)
1303                 return 0;
1304
1305         /* This bisects the array in object 'first', but first checks
1306          * an extra  */
1307         r = test_object(f, extra, needle);
1308         if (r < 0)
1309                 return r;
1310
1311         if (r == TEST_FOUND)
1312                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1313
1314         /* if we are looking with DIRECTION_UP then we need to first
1315            see if in the actual array there is a matching entry, and
1316            return the last one of that. But if there isn't any we need
1317            to return this one. Hence remember this, and return it
1318            below. */
1319         if (r == TEST_LEFT)
1320                 step_back = direction == DIRECTION_UP;
1321
1322         if (r == TEST_RIGHT) {
1323                 if (direction == DIRECTION_DOWN)
1324                         goto found;
1325                 else
1326                         return 0;
1327         }
1328
1329         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1330
1331         if (r == 0 && step_back)
1332                 goto found;
1333
1334         if (r > 0 && idx)
1335                 (*idx) ++;
1336
1337         return r;
1338
1339 found:
1340         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1341         if (r < 0)
1342                 return r;
1343
1344         if (ret)
1345                 *ret = o;
1346
1347         if (offset)
1348                 *offset = extra;
1349
1350         if (idx)
1351                 *idx = 0;
1352
1353         return 1;
1354 }
1355
1356 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1357         assert(f);
1358         assert(p > 0);
1359
1360         if (p == needle)
1361                 return TEST_FOUND;
1362         else if (p < needle)
1363                 return TEST_LEFT;
1364         else
1365                 return TEST_RIGHT;
1366 }
1367
1368 int journal_file_move_to_entry_by_offset(
1369                 JournalFile *f,
1370                 uint64_t p,
1371                 direction_t direction,
1372                 Object **ret,
1373                 uint64_t *offset) {
1374
1375         return generic_array_bisect(f,
1376                                     le64toh(f->header->entry_array_offset),
1377                                     le64toh(f->header->n_entries),
1378                                     p,
1379                                     test_object_offset,
1380                                     direction,
1381                                     ret, offset, NULL);
1382 }
1383
1384
1385 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1386         Object *o;
1387         int r;
1388
1389         assert(f);
1390         assert(p > 0);
1391
1392         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1393         if (r < 0)
1394                 return r;
1395
1396         if (le64toh(o->entry.seqnum) == needle)
1397                 return TEST_FOUND;
1398         else if (le64toh(o->entry.seqnum) < needle)
1399                 return TEST_LEFT;
1400         else
1401                 return TEST_RIGHT;
1402 }
1403
1404 int journal_file_move_to_entry_by_seqnum(
1405                 JournalFile *f,
1406                 uint64_t seqnum,
1407                 direction_t direction,
1408                 Object **ret,
1409                 uint64_t *offset) {
1410
1411         return generic_array_bisect(f,
1412                                     le64toh(f->header->entry_array_offset),
1413                                     le64toh(f->header->n_entries),
1414                                     seqnum,
1415                                     test_object_seqnum,
1416                                     direction,
1417                                     ret, offset, NULL);
1418 }
1419
1420 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1421         Object *o;
1422         int r;
1423
1424         assert(f);
1425         assert(p > 0);
1426
1427         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1428         if (r < 0)
1429                 return r;
1430
1431         if (le64toh(o->entry.realtime) == needle)
1432                 return TEST_FOUND;
1433         else if (le64toh(o->entry.realtime) < needle)
1434                 return TEST_LEFT;
1435         else
1436                 return TEST_RIGHT;
1437 }
1438
1439 int journal_file_move_to_entry_by_realtime(
1440                 JournalFile *f,
1441                 uint64_t realtime,
1442                 direction_t direction,
1443                 Object **ret,
1444                 uint64_t *offset) {
1445
1446         return generic_array_bisect(f,
1447                                     le64toh(f->header->entry_array_offset),
1448                                     le64toh(f->header->n_entries),
1449                                     realtime,
1450                                     test_object_realtime,
1451                                     direction,
1452                                     ret, offset, NULL);
1453 }
1454
1455 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1456         Object *o;
1457         int r;
1458
1459         assert(f);
1460         assert(p > 0);
1461
1462         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1463         if (r < 0)
1464                 return r;
1465
1466         if (le64toh(o->entry.monotonic) == needle)
1467                 return TEST_FOUND;
1468         else if (le64toh(o->entry.monotonic) < needle)
1469                 return TEST_LEFT;
1470         else
1471                 return TEST_RIGHT;
1472 }
1473
1474 int journal_file_move_to_entry_by_monotonic(
1475                 JournalFile *f,
1476                 sd_id128_t boot_id,
1477                 uint64_t monotonic,
1478                 direction_t direction,
1479                 Object **ret,
1480                 uint64_t *offset) {
1481
1482         char t[9+32+1] = "_BOOT_ID=";
1483         Object *o;
1484         int r;
1485
1486         assert(f);
1487
1488         sd_id128_to_string(boot_id, t + 9);
1489         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1490         if (r < 0)
1491                 return r;
1492         if (r == 0)
1493                 return -ENOENT;
1494
1495         return generic_array_bisect_plus_one(f,
1496                                              le64toh(o->data.entry_offset),
1497                                              le64toh(o->data.entry_array_offset),
1498                                              le64toh(o->data.n_entries),
1499                                              monotonic,
1500                                              test_object_monotonic,
1501                                              direction,
1502                                              ret, offset, NULL);
1503 }
1504
1505 int journal_file_next_entry(
1506                 JournalFile *f,
1507                 Object *o, uint64_t p,
1508                 direction_t direction,
1509                 Object **ret, uint64_t *offset) {
1510
1511         uint64_t i, n;
1512         int r;
1513
1514         assert(f);
1515         assert(p > 0 || !o);
1516
1517         n = le64toh(f->header->n_entries);
1518         if (n <= 0)
1519                 return 0;
1520
1521         if (!o)
1522                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1523         else {
1524                 if (o->object.type != OBJECT_ENTRY)
1525                         return -EINVAL;
1526
1527                 r = generic_array_bisect(f,
1528                                          le64toh(f->header->entry_array_offset),
1529                                          le64toh(f->header->n_entries),
1530                                          p,
1531                                          test_object_offset,
1532                                          DIRECTION_DOWN,
1533                                          NULL, NULL,
1534                                          &i);
1535                 if (r <= 0)
1536                         return r;
1537
1538                 if (direction == DIRECTION_DOWN) {
1539                         if (i >= n - 1)
1540                                 return 0;
1541
1542                         i++;
1543                 } else {
1544                         if (i <= 0)
1545                                 return 0;
1546
1547                         i--;
1548                 }
1549         }
1550
1551         /* And jump to it */
1552         return generic_array_get(f,
1553                                  le64toh(f->header->entry_array_offset),
1554                                  i,
1555                                  ret, offset);
1556 }
1557
1558 int journal_file_skip_entry(
1559                 JournalFile *f,
1560                 Object *o, uint64_t p,
1561                 int64_t skip,
1562                 Object **ret, uint64_t *offset) {
1563
1564         uint64_t i, n;
1565         int r;
1566
1567         assert(f);
1568         assert(o);
1569         assert(p > 0);
1570
1571         if (o->object.type != OBJECT_ENTRY)
1572                 return -EINVAL;
1573
1574         r = generic_array_bisect(f,
1575                                  le64toh(f->header->entry_array_offset),
1576                                  le64toh(f->header->n_entries),
1577                                  p,
1578                                  test_object_offset,
1579                                  DIRECTION_DOWN,
1580                                  NULL, NULL,
1581                                  &i);
1582         if (r <= 0)
1583                 return r;
1584
1585         /* Calculate new index */
1586         if (skip < 0) {
1587                 if ((uint64_t) -skip >= i)
1588                         i = 0;
1589                 else
1590                         i = i - (uint64_t) -skip;
1591         } else
1592                 i  += (uint64_t) skip;
1593
1594         n = le64toh(f->header->n_entries);
1595         if (n <= 0)
1596                 return -EBADMSG;
1597
1598         if (i >= n)
1599                 i = n-1;
1600
1601         return generic_array_get(f,
1602                                  le64toh(f->header->entry_array_offset),
1603                                  i,
1604                                  ret, offset);
1605 }
1606
1607 int journal_file_next_entry_for_data(
1608                 JournalFile *f,
1609                 Object *o, uint64_t p,
1610                 uint64_t data_offset,
1611                 direction_t direction,
1612                 Object **ret, uint64_t *offset) {
1613
1614         uint64_t n, i;
1615         int r;
1616         Object *d;
1617
1618         assert(f);
1619         assert(p > 0 || !o);
1620
1621         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1622         if (r < 0)
1623                 return r;
1624
1625         n = le64toh(d->data.n_entries);
1626         if (n <= 0)
1627                 return n;
1628
1629         if (!o)
1630                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1631         else {
1632                 if (o->object.type != OBJECT_ENTRY)
1633                         return -EINVAL;
1634
1635                 r = generic_array_bisect_plus_one(f,
1636                                                   le64toh(d->data.entry_offset),
1637                                                   le64toh(d->data.entry_array_offset),
1638                                                   le64toh(d->data.n_entries),
1639                                                   p,
1640                                                   test_object_offset,
1641                                                   DIRECTION_DOWN,
1642                                                   NULL, NULL,
1643                                                   &i);
1644
1645                 if (r <= 0)
1646                         return r;
1647
1648                 if (direction == DIRECTION_DOWN) {
1649                         if (i >= n - 1)
1650                                 return 0;
1651
1652                         i++;
1653                 } else {
1654                         if (i <= 0)
1655                                 return 0;
1656
1657                         i--;
1658                 }
1659
1660         }
1661
1662         return generic_array_get_plus_one(f,
1663                                           le64toh(d->data.entry_offset),
1664                                           le64toh(d->data.entry_array_offset),
1665                                           i,
1666                                           ret, offset);
1667 }
1668
1669 int journal_file_move_to_entry_by_offset_for_data(
1670                 JournalFile *f,
1671                 uint64_t data_offset,
1672                 uint64_t p,
1673                 direction_t direction,
1674                 Object **ret, uint64_t *offset) {
1675
1676         int r;
1677         Object *d;
1678
1679         assert(f);
1680
1681         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1682         if (r < 0)
1683                 return r;
1684
1685         return generic_array_bisect_plus_one(f,
1686                                              le64toh(d->data.entry_offset),
1687                                              le64toh(d->data.entry_array_offset),
1688                                              le64toh(d->data.n_entries),
1689                                              p,
1690                                              test_object_offset,
1691                                              direction,
1692                                              ret, offset, NULL);
1693 }
1694
1695 int journal_file_move_to_entry_by_monotonic_for_data(
1696                 JournalFile *f,
1697                 uint64_t data_offset,
1698                 sd_id128_t boot_id,
1699                 uint64_t monotonic,
1700                 direction_t direction,
1701                 Object **ret, uint64_t *offset) {
1702
1703         char t[9+32+1] = "_BOOT_ID=";
1704         Object *o, *d;
1705         int r;
1706         uint64_t b, z;
1707
1708         assert(f);
1709
1710         /* First, seek by time */
1711         sd_id128_to_string(boot_id, t + 9);
1712         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1713         if (r < 0)
1714                 return r;
1715         if (r == 0)
1716                 return -ENOENT;
1717
1718         r = generic_array_bisect_plus_one(f,
1719                                           le64toh(o->data.entry_offset),
1720                                           le64toh(o->data.entry_array_offset),
1721                                           le64toh(o->data.n_entries),
1722                                           monotonic,
1723                                           test_object_monotonic,
1724                                           direction,
1725                                           NULL, &z, NULL);
1726         if (r <= 0)
1727                 return r;
1728
1729         /* And now, continue seeking until we find an entry that
1730          * exists in both bisection arrays */
1731
1732         for (;;) {
1733                 Object *qo;
1734                 uint64_t p, q;
1735
1736                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1737                 if (r < 0)
1738                         return r;
1739
1740                 r = generic_array_bisect_plus_one(f,
1741                                                   le64toh(d->data.entry_offset),
1742                                                   le64toh(d->data.entry_array_offset),
1743                                                   le64toh(d->data.n_entries),
1744                                                   z,
1745                                                   test_object_offset,
1746                                                   direction,
1747                                                   NULL, &p, NULL);
1748                 if (r <= 0)
1749                         return r;
1750
1751                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1752                 if (r < 0)
1753                         return r;
1754
1755                 r = generic_array_bisect_plus_one(f,
1756                                                   le64toh(o->data.entry_offset),
1757                                                   le64toh(o->data.entry_array_offset),
1758                                                   le64toh(o->data.n_entries),
1759                                                   p,
1760                                                   test_object_offset,
1761                                                   direction,
1762                                                   &qo, &q, NULL);
1763
1764                 if (r <= 0)
1765                         return r;
1766
1767                 if (p == q) {
1768                         if (ret)
1769                                 *ret = qo;
1770                         if (offset)
1771                                 *offset = q;
1772
1773                         return 1;
1774                 }
1775
1776                 z = q;
1777         }
1778
1779         return 0;
1780 }
1781
1782 int journal_file_move_to_entry_by_seqnum_for_data(
1783                 JournalFile *f,
1784                 uint64_t data_offset,
1785                 uint64_t seqnum,
1786                 direction_t direction,
1787                 Object **ret, uint64_t *offset) {
1788
1789         Object *d;
1790         int r;
1791
1792         assert(f);
1793
1794         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1795         if (r < 0)
1796                 return r;
1797
1798         return generic_array_bisect_plus_one(f,
1799                                              le64toh(d->data.entry_offset),
1800                                              le64toh(d->data.entry_array_offset),
1801                                              le64toh(d->data.n_entries),
1802                                              seqnum,
1803                                              test_object_seqnum,
1804                                              direction,
1805                                              ret, offset, NULL);
1806 }
1807
1808 int journal_file_move_to_entry_by_realtime_for_data(
1809                 JournalFile *f,
1810                 uint64_t data_offset,
1811                 uint64_t realtime,
1812                 direction_t direction,
1813                 Object **ret, uint64_t *offset) {
1814
1815         Object *d;
1816         int r;
1817
1818         assert(f);
1819
1820         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1821         if (r < 0)
1822                 return r;
1823
1824         return generic_array_bisect_plus_one(f,
1825                                              le64toh(d->data.entry_offset),
1826                                              le64toh(d->data.entry_array_offset),
1827                                              le64toh(d->data.n_entries),
1828                                              realtime,
1829                                              test_object_realtime,
1830                                              direction,
1831                                              ret, offset, NULL);
1832 }
1833
1834 void journal_file_dump(JournalFile *f) {
1835         Object *o;
1836         int r;
1837         uint64_t p;
1838
1839         assert(f);
1840
1841         journal_file_print_header(f);
1842
1843         p = le64toh(f->header->header_size);
1844         while (p != 0) {
1845                 r = journal_file_move_to_object(f, -1, p, &o);
1846                 if (r < 0)
1847                         goto fail;
1848
1849                 switch (o->object.type) {
1850
1851                 case OBJECT_UNUSED:
1852                         printf("Type: OBJECT_UNUSED\n");
1853                         break;
1854
1855                 case OBJECT_DATA:
1856                         printf("Type: OBJECT_DATA\n");
1857                         break;
1858
1859                 case OBJECT_ENTRY:
1860                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1861                                (unsigned long long) le64toh(o->entry.seqnum),
1862                                (unsigned long long) le64toh(o->entry.monotonic),
1863                                (unsigned long long) le64toh(o->entry.realtime));
1864                         break;
1865
1866                 case OBJECT_FIELD_HASH_TABLE:
1867                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1868                         break;
1869
1870                 case OBJECT_DATA_HASH_TABLE:
1871                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1872                         break;
1873
1874                 case OBJECT_ENTRY_ARRAY:
1875                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1876                         break;
1877
1878                 case OBJECT_SIGNATURE:
1879                         printf("Type: OBJECT_SIGNATURE\n");
1880                         break;
1881                 }
1882
1883                 if (o->object.flags & OBJECT_COMPRESSED)
1884                         printf("Flags: COMPRESSED\n");
1885
1886                 if (p == le64toh(f->header->tail_object_offset))
1887                         p = 0;
1888                 else
1889                         p = p + ALIGN64(le64toh(o->object.size));
1890         }
1891
1892         return;
1893 fail:
1894         log_error("File corrupt");
1895 }
1896
1897 void journal_file_print_header(JournalFile *f) {
1898         char a[33], b[33], c[33];
1899         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
1900
1901         assert(f);
1902
1903         printf("File Path: %s\n"
1904                "File ID: %s\n"
1905                "Machine ID: %s\n"
1906                "Boot ID: %s\n"
1907                "Sequential Number ID: %s\n"
1908                "State: %s\n"
1909                "Compatible Flags:%s%s\n"
1910                "Incompatible Flags:%s%s\n"
1911                "Header size: %llu\n"
1912                "Arena size: %llu\n"
1913                "Data Hash Table Size: %llu\n"
1914                "Field Hash Table Size: %llu\n"
1915                "Objects: %llu\n"
1916                "Entry Objects: %llu\n"
1917                "Rotate Suggested: %s\n"
1918                "Head Sequential Number: %llu\n"
1919                "Tail Sequential Number: %llu\n"
1920                "Head Realtime Timestamp: %s\n"
1921                "Tail Realtime Timestamp: %s\n",
1922                f->path,
1923                sd_id128_to_string(f->header->file_id, a),
1924                sd_id128_to_string(f->header->machine_id, b),
1925                sd_id128_to_string(f->header->boot_id, c),
1926                sd_id128_to_string(f->header->seqnum_id, c),
1927                f->header->state == STATE_OFFLINE ? "offline" :
1928                f->header->state == STATE_ONLINE ? "online" :
1929                f->header->state == STATE_ARCHIVED ? "archived" : "unknown",
1930                (f->header->compatible_flags & HEADER_COMPATIBLE_SIGNED) ? " SIGNED" : "",
1931                (f->header->compatible_flags & ~HEADER_COMPATIBLE_SIGNED) ? " ???" : "",
1932                (f->header->incompatible_flags & HEADER_INCOMPATIBLE_COMPRESSED) ? " COMPRESSED" : "",
1933                (f->header->incompatible_flags & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
1934                (unsigned long long) le64toh(f->header->header_size),
1935                (unsigned long long) le64toh(f->header->arena_size),
1936                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1937                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
1938                (unsigned long long) le64toh(f->header->n_objects),
1939                (unsigned long long) le64toh(f->header->n_entries),
1940                yes_no(journal_file_rotate_suggested(f)),
1941                (unsigned long long) le64toh(f->header->head_seqnum),
1942                (unsigned long long) le64toh(f->header->tail_seqnum),
1943                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
1944                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)));
1945
1946         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1947                 printf("Data Objects: %llu\n"
1948                        "Data Hash Table Fill: %.1f%%\n",
1949                        (unsigned long long) le64toh(f->header->n_data),
1950                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
1951
1952         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1953                 printf("Field Objects: %llu\n"
1954                        "Field Hash Table Fill: %.1f%%\n",
1955                        (unsigned long long) le64toh(f->header->n_fields),
1956                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
1957 }
1958
1959 int journal_file_open(
1960                 const char *fname,
1961                 int flags,
1962                 mode_t mode,
1963                 JournalMetrics *metrics,
1964                 JournalFile *template,
1965                 JournalFile **ret) {
1966
1967         JournalFile *f;
1968         int r;
1969         bool newly_created = false;
1970
1971         assert(fname);
1972
1973         if ((flags & O_ACCMODE) != O_RDONLY &&
1974             (flags & O_ACCMODE) != O_RDWR)
1975                 return -EINVAL;
1976
1977         if (!endswith(fname, ".journal"))
1978                 return -EINVAL;
1979
1980         f = new0(JournalFile, 1);
1981         if (!f)
1982                 return -ENOMEM;
1983
1984         f->fd = -1;
1985         f->flags = flags;
1986         f->mode = mode;
1987         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1988         f->prot = prot_from_flags(flags);
1989
1990         if (template)
1991                 f->compress = template->compress;
1992
1993         f->path = strdup(fname);
1994         if (!f->path) {
1995                 r = -ENOMEM;
1996                 goto fail;
1997         }
1998
1999         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2000         if (f->fd < 0) {
2001                 r = -errno;
2002                 goto fail;
2003         }
2004
2005         if (fstat(f->fd, &f->last_stat) < 0) {
2006                 r = -errno;
2007                 goto fail;
2008         }
2009
2010         if (f->last_stat.st_size == 0 && f->writable) {
2011                 newly_created = true;
2012
2013                 r = journal_file_init_header(f, template);
2014                 if (r < 0)
2015                         goto fail;
2016
2017                 if (fstat(f->fd, &f->last_stat) < 0) {
2018                         r = -errno;
2019                         goto fail;
2020                 }
2021         }
2022
2023         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2024                 r = -EIO;
2025                 goto fail;
2026         }
2027
2028         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2029         if (f->header == MAP_FAILED) {
2030                 f->header = NULL;
2031                 r = -errno;
2032                 goto fail;
2033         }
2034
2035         if (!newly_created) {
2036                 r = journal_file_verify_header(f);
2037                 if (r < 0)
2038                         goto fail;
2039         }
2040
2041         if (f->writable) {
2042                 if (metrics) {
2043                         journal_default_metrics(metrics, f->fd);
2044                         f->metrics = *metrics;
2045                 } else if (template)
2046                         f->metrics = template->metrics;
2047
2048                 r = journal_file_refresh_header(f);
2049                 if (r < 0)
2050                         goto fail;
2051         }
2052
2053         if (newly_created) {
2054
2055                 r = journal_file_setup_field_hash_table(f);
2056                 if (r < 0)
2057                         goto fail;
2058
2059                 r = journal_file_setup_data_hash_table(f);
2060                 if (r < 0)
2061                         goto fail;
2062         }
2063
2064         r = journal_file_map_field_hash_table(f);
2065         if (r < 0)
2066                 goto fail;
2067
2068         r = journal_file_map_data_hash_table(f);
2069         if (r < 0)
2070                 goto fail;
2071
2072         if (ret)
2073                 *ret = f;
2074
2075         return 0;
2076
2077 fail:
2078         journal_file_close(f);
2079
2080         return r;
2081 }
2082
2083 int journal_file_rotate(JournalFile **f) {
2084         char *p;
2085         size_t l;
2086         JournalFile *old_file, *new_file = NULL;
2087         int r;
2088
2089         assert(f);
2090         assert(*f);
2091
2092         old_file = *f;
2093
2094         if (!old_file->writable)
2095                 return -EINVAL;
2096
2097         if (!endswith(old_file->path, ".journal"))
2098                 return -EINVAL;
2099
2100         l = strlen(old_file->path);
2101
2102         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2103         if (!p)
2104                 return -ENOMEM;
2105
2106         memcpy(p, old_file->path, l - 8);
2107         p[l-8] = '@';
2108         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2109         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2110                  "-%016llx-%016llx.journal",
2111                  (unsigned long long) le64toh((*f)->header->tail_seqnum),
2112                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2113
2114         r = rename(old_file->path, p);
2115         free(p);
2116
2117         if (r < 0)
2118                 return -errno;
2119
2120         old_file->header->state = STATE_ARCHIVED;
2121
2122         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, NULL, old_file, &new_file);
2123         journal_file_close(old_file);
2124
2125         *f = new_file;
2126         return r;
2127 }
2128
2129 int journal_file_open_reliably(
2130                 const char *fname,
2131                 int flags,
2132                 mode_t mode,
2133                 JournalMetrics *metrics,
2134                 JournalFile *template,
2135                 JournalFile **ret) {
2136
2137         int r;
2138         size_t l;
2139         char *p;
2140
2141         r = journal_file_open(fname, flags, mode, metrics, template, ret);
2142         if (r != -EBADMSG && /* corrupted */
2143             r != -ENODATA && /* truncated */
2144             r != -EHOSTDOWN && /* other machine */
2145             r != -EPROTONOSUPPORT) /* incompatible feature */
2146                 return r;
2147
2148         if ((flags & O_ACCMODE) == O_RDONLY)
2149                 return r;
2150
2151         if (!(flags & O_CREAT))
2152                 return r;
2153
2154         /* The file is corrupted. Rotate it away and try it again (but only once) */
2155
2156         l = strlen(fname);
2157         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2158                      (int) (l-8), fname,
2159                      (unsigned long long) now(CLOCK_REALTIME),
2160                      random_ull()) < 0)
2161                 return -ENOMEM;
2162
2163         r = rename(fname, p);
2164         free(p);
2165         if (r < 0)
2166                 return -errno;
2167
2168         log_warning("File %s corrupted, renaming and replacing.", fname);
2169
2170         return journal_file_open(fname, flags, mode, metrics, template, ret);
2171 }
2172
2173 struct vacuum_info {
2174         off_t usage;
2175         char *filename;
2176
2177         uint64_t realtime;
2178         sd_id128_t seqnum_id;
2179         uint64_t seqnum;
2180
2181         bool have_seqnum;
2182 };
2183
2184 static int vacuum_compare(const void *_a, const void *_b) {
2185         const struct vacuum_info *a, *b;
2186
2187         a = _a;
2188         b = _b;
2189
2190         if (a->have_seqnum && b->have_seqnum &&
2191             sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
2192                 if (a->seqnum < b->seqnum)
2193                         return -1;
2194                 else if (a->seqnum > b->seqnum)
2195                         return 1;
2196                 else
2197                         return 0;
2198         }
2199
2200         if (a->realtime < b->realtime)
2201                 return -1;
2202         else if (a->realtime > b->realtime)
2203                 return 1;
2204         else if (a->have_seqnum && b->have_seqnum)
2205                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
2206         else
2207                 return strcmp(a->filename, b->filename);
2208 }
2209
2210 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
2211         DIR *d;
2212         int r = 0;
2213         struct vacuum_info *list = NULL;
2214         unsigned n_list = 0, n_allocated = 0, i;
2215         uint64_t sum = 0;
2216
2217         assert(directory);
2218
2219         if (max_use <= 0)
2220                 return 0;
2221
2222         d = opendir(directory);
2223         if (!d)
2224                 return -errno;
2225
2226         for (;;) {
2227                 int k;
2228                 struct dirent buf, *de;
2229                 size_t q;
2230                 struct stat st;
2231                 char *p;
2232                 unsigned long long seqnum = 0, realtime;
2233                 sd_id128_t seqnum_id;
2234                 bool have_seqnum;
2235
2236                 k = readdir_r(d, &buf, &de);
2237                 if (k != 0) {
2238                         r = -k;
2239                         goto finish;
2240                 }
2241
2242                 if (!de)
2243                         break;
2244
2245                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2246                         continue;
2247
2248                 if (!S_ISREG(st.st_mode))
2249                         continue;
2250
2251                 q = strlen(de->d_name);
2252
2253                 if (endswith(de->d_name, ".journal")) {
2254
2255                         /* Vacuum archived files */
2256
2257                         if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2258                                 continue;
2259
2260                         if (de->d_name[q-8-16-1] != '-' ||
2261                             de->d_name[q-8-16-1-16-1] != '-' ||
2262                             de->d_name[q-8-16-1-16-1-32-1] != '@')
2263                                 continue;
2264
2265                         p = strdup(de->d_name);
2266                         if (!p) {
2267                                 r = -ENOMEM;
2268                                 goto finish;
2269                         }
2270
2271                         de->d_name[q-8-16-1-16-1] = 0;
2272                         if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2273                                 free(p);
2274                                 continue;
2275                         }
2276
2277                         if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2278                                 free(p);
2279                                 continue;
2280                         }
2281
2282                         have_seqnum = true;
2283
2284                 } else if (endswith(de->d_name, ".journal~")) {
2285                         unsigned long long tmp;
2286
2287                         /* Vacuum corrupted files */
2288
2289                         if (q < 1 + 16 + 1 + 16 + 8 + 1)
2290                                 continue;
2291
2292                         if (de->d_name[q-1-8-16-1] != '-' ||
2293                             de->d_name[q-1-8-16-1-16-1] != '@')
2294                                 continue;
2295
2296                         p = strdup(de->d_name);
2297                         if (!p) {
2298                                 r = -ENOMEM;
2299                                 goto finish;
2300                         }
2301
2302                         if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2303                                 free(p);
2304                                 continue;
2305                         }
2306
2307                         have_seqnum = false;
2308                 } else
2309                         continue;
2310
2311                 if (n_list >= n_allocated) {
2312                         struct vacuum_info *j;
2313
2314                         n_allocated = MAX(n_allocated * 2U, 8U);
2315                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2316                         if (!j) {
2317                                 free(p);
2318                                 r = -ENOMEM;
2319                                 goto finish;
2320                         }
2321
2322                         list = j;
2323                 }
2324
2325                 list[n_list].filename = p;
2326                 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2327                 list[n_list].seqnum = seqnum;
2328                 list[n_list].realtime = realtime;
2329                 list[n_list].seqnum_id = seqnum_id;
2330                 list[n_list].have_seqnum = have_seqnum;
2331
2332                 sum += list[n_list].usage;
2333
2334                 n_list ++;
2335         }
2336
2337         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2338
2339         for(i = 0; i < n_list; i++) {
2340                 struct statvfs ss;
2341
2342                 if (fstatvfs(dirfd(d), &ss) < 0) {
2343                         r = -errno;
2344                         goto finish;
2345                 }
2346
2347                 if (sum <= max_use &&
2348                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2349                         break;
2350
2351                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2352                         log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2353                         sum -= list[i].usage;
2354                 } else if (errno != ENOENT)
2355                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2356         }
2357
2358 finish:
2359         for (i = 0; i < n_list; i++)
2360                 free(list[i].filename);
2361
2362         free(list);
2363
2364         if (d)
2365                 closedir(d);
2366
2367         return r;
2368 }
2369
2370 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2371         uint64_t i, n;
2372         uint64_t q, xor_hash = 0;
2373         int r;
2374         EntryItem *items;
2375         dual_timestamp ts;
2376
2377         assert(from);
2378         assert(to);
2379         assert(o);
2380         assert(p);
2381
2382         if (!to->writable)
2383                 return -EPERM;
2384
2385         ts.monotonic = le64toh(o->entry.monotonic);
2386         ts.realtime = le64toh(o->entry.realtime);
2387
2388         if (to->tail_entry_monotonic_valid &&
2389             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2390                 return -EINVAL;
2391
2392         n = journal_file_entry_n_items(o);
2393         items = alloca(sizeof(EntryItem) * n);
2394
2395         for (i = 0; i < n; i++) {
2396                 uint64_t l, h;
2397                 le64_t le_hash;
2398                 size_t t;
2399                 void *data;
2400                 Object *u;
2401
2402                 q = le64toh(o->entry.items[i].object_offset);
2403                 le_hash = o->entry.items[i].hash;
2404
2405                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2406                 if (r < 0)
2407                         return r;
2408
2409                 if (le_hash != o->data.hash)
2410                         return -EBADMSG;
2411
2412                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2413                 t = (size_t) l;
2414
2415                 /* We hit the limit on 32bit machines */
2416                 if ((uint64_t) t != l)
2417                         return -E2BIG;
2418
2419                 if (o->object.flags & OBJECT_COMPRESSED) {
2420 #ifdef HAVE_XZ
2421                         uint64_t rsize;
2422
2423                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2424                                 return -EBADMSG;
2425
2426                         data = from->compress_buffer;
2427                         l = rsize;
2428 #else
2429                         return -EPROTONOSUPPORT;
2430 #endif
2431                 } else
2432                         data = o->data.payload;
2433
2434                 r = journal_file_append_data(to, data, l, &u, &h);
2435                 if (r < 0)
2436                         return r;
2437
2438                 xor_hash ^= le64toh(u->data.hash);
2439                 items[i].object_offset = htole64(h);
2440                 items[i].hash = u->data.hash;
2441
2442                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2443                 if (r < 0)
2444                         return r;
2445         }
2446
2447         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2448 }
2449
2450 void journal_default_metrics(JournalMetrics *m, int fd) {
2451         uint64_t fs_size = 0;
2452         struct statvfs ss;
2453         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2454
2455         assert(m);
2456         assert(fd >= 0);
2457
2458         if (fstatvfs(fd, &ss) >= 0)
2459                 fs_size = ss.f_frsize * ss.f_blocks;
2460
2461         if (m->max_use == (uint64_t) -1) {
2462
2463                 if (fs_size > 0) {
2464                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2465
2466                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2467                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2468
2469                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2470                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2471                 } else
2472                         m->max_use = DEFAULT_MAX_USE_LOWER;
2473         } else {
2474                 m->max_use = PAGE_ALIGN(m->max_use);
2475
2476                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2477                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2478         }
2479
2480         if (m->max_size == (uint64_t) -1) {
2481                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2482
2483                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2484                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2485         } else
2486                 m->max_size = PAGE_ALIGN(m->max_size);
2487
2488         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2489                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2490
2491         if (m->max_size*2 > m->max_use)
2492                 m->max_use = m->max_size*2;
2493
2494         if (m->min_size == (uint64_t) -1)
2495                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2496         else {
2497                 m->min_size = PAGE_ALIGN(m->min_size);
2498
2499                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2500                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2501
2502                 if (m->min_size > m->max_size)
2503                         m->max_size = m->min_size;
2504         }
2505
2506         if (m->keep_free == (uint64_t) -1) {
2507
2508                 if (fs_size > 0) {
2509                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2510
2511                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2512                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2513
2514                 } else
2515                         m->keep_free = DEFAULT_KEEP_FREE;
2516         }
2517
2518         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2519                  format_bytes(a, sizeof(a), m->max_use),
2520                  format_bytes(b, sizeof(b), m->max_size),
2521                  format_bytes(c, sizeof(c), m->min_size),
2522                  format_bytes(d, sizeof(d), m->keep_free));
2523 }
2524
2525 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2526         assert(f);
2527         assert(from || to);
2528
2529         if (from) {
2530                 if (f->header->head_entry_realtime == 0)
2531                         return -ENOENT;
2532
2533                 *from = le64toh(f->header->head_entry_realtime);
2534         }
2535
2536         if (to) {
2537                 if (f->header->tail_entry_realtime == 0)
2538                         return -ENOENT;
2539
2540                 *to = le64toh(f->header->tail_entry_realtime);
2541         }
2542
2543         return 1;
2544 }
2545
2546 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2547         char t[9+32+1] = "_BOOT_ID=";
2548         Object *o;
2549         uint64_t p;
2550         int r;
2551
2552         assert(f);
2553         assert(from || to);
2554
2555         sd_id128_to_string(boot_id, t + 9);
2556
2557         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2558         if (r <= 0)
2559                 return r;
2560
2561         if (le64toh(o->data.n_entries) <= 0)
2562                 return 0;
2563
2564         if (from) {
2565                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2566                 if (r < 0)
2567                         return r;
2568
2569                 *from = le64toh(o->entry.monotonic);
2570         }
2571
2572         if (to) {
2573                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2574                 if (r < 0)
2575                         return r;
2576
2577                 r = generic_array_get_plus_one(f,
2578                                                le64toh(o->data.entry_offset),
2579                                                le64toh(o->data.entry_array_offset),
2580                                                le64toh(o->data.n_entries)-1,
2581                                                &o, NULL);
2582                 if (r <= 0)
2583                         return r;
2584
2585                 *to = le64toh(o->entry.monotonic);
2586         }
2587
2588         return 1;
2589 }
2590
2591 bool journal_file_rotate_suggested(JournalFile *f) {
2592         assert(f);
2593
2594         /* If we gained new header fields we gained new features,
2595          * hence suggest a rotation */
2596         if (le64toh(f->header->header_size) < sizeof(Header))
2597                 return true;
2598
2599         /* Let's check if the hash tables grew over a certain fill
2600          * level (75%, borrowing this value from Java's hash table
2601          * implementation), and if so suggest a rotation. To calculate
2602          * the fill level we need the n_data field, which only exists
2603          * in newer versions. */
2604
2605         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2606                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL)
2607                         return true;
2608
2609         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2610                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL)
2611                         return true;
2612
2613         return false;
2614 }