chiark / gitweb /
5dd6e575fb9fb1f1766cdd85b9e82aa2d6f9e524
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
35 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)   /* Size in bytes of the data hash table set up for new files */
36 #define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)  /* Size in bytes of the field hash table set up for new files */
37
38 #define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)     /* 8 MiB; smaller mapping requests are widened to this size */
39
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)            /* Data payloads shorter than this (in bytes) are never compressed */
41
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */
44
45 /* These are the lower and upper bounds if we deduce the max_use value
46  * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
49
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
52
53 /* This is the upper bound if we deduce the keep_free value from the
54  * file system size */
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
56
57 /* This is the keep_free value when we can't determine the file
58  * system size */
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */
60
61 static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' }; /* 8-byte magic stored at the start of the file header */
62
63 #define ALIGN64(x) (((x) + 7ULL) & ~7ULL) /* Round x up to the next multiple of 8 */
64
65 void journal_file_close(JournalFile *f) {
66         int t;
67
68         assert(f);
69
70         if (f->header && f->writable)
71                 f->header->state = STATE_OFFLINE;
72
73
74         for (t = 0; t < _WINDOW_MAX; t++)
75                 if (f->windows[t].ptr)
76                         munmap(f->windows[t].ptr, f->windows[t].size);
77
78         if (f->fd >= 0)
79                 close_nointr_nofail(f->fd);
80
81         free(f->path);
82
83 #ifdef HAVE_XZ
84         free(f->compress_buffer);
85 #endif
86
87         free(f);
88 }
89
90 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
91         Header h;
92         ssize_t k;
93         int r;
94
95         assert(f);
96
97         zero(h);
98         memcpy(h.signature, signature, 8);
99         h.arena_offset = htole64(ALIGN64(sizeof(h)));
100
101         r = sd_id128_randomize(&h.file_id);
102         if (r < 0)
103                 return r;
104
105         if (template) {
106                 h.seqnum_id = template->header->seqnum_id;
107                 h.seqnum = template->header->seqnum;
108         } else
109                 h.seqnum_id = h.file_id;
110
111         k = pwrite(f->fd, &h, sizeof(h), 0);
112         if (k < 0)
113                 return -errno;
114
115         if (k != sizeof(h))
116                 return -EIO;
117
118         return 0;
119 }
120
121 static int journal_file_refresh_header(JournalFile *f) {
122         int r;
123         sd_id128_t boot_id;
124
125         assert(f);
126
127         r = sd_id128_get_machine(&f->header->machine_id);
128         if (r < 0)
129                 return r;
130
131         r = sd_id128_get_boot(&boot_id);
132         if (r < 0)
133                 return r;
134
135         if (sd_id128_equal(boot_id, f->header->boot_id))
136                 f->tail_entry_monotonic_valid = true;
137
138         f->header->boot_id = boot_id;
139
140         f->header->state = STATE_ONLINE;
141
142         __sync_synchronize();
143
144         return 0;
145 }
146
147 static int journal_file_verify_header(JournalFile *f) {
148         assert(f);
149
150         if (memcmp(f->header, signature, 8))
151                 return -EBADMSG;
152
153 #ifdef HAVE_XZ
154         if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
155                 return -EPROTONOSUPPORT;
156 #else
157         if (f->header->incompatible_flags != 0)
158                 return -EPROTONOSUPPORT;
159 #endif
160
161         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
162                 return -ENODATA;
163
164         if (f->writable) {
165                 uint8_t state;
166                 sd_id128_t machine_id;
167                 int r;
168
169                 r = sd_id128_get_machine(&machine_id);
170                 if (r < 0)
171                         return r;
172
173                 if (!sd_id128_equal(machine_id, f->header->machine_id))
174                         return -EHOSTDOWN;
175
176                 state = f->header->state;
177
178                 if (state == STATE_ONLINE)
179                         log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
180                 else if (state == STATE_ARCHIVED)
181                         return -ESHUTDOWN;
182                 else if (state != STATE_OFFLINE)
183                         log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
184         }
185
186         return 0;
187 }
188
189 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
190         uint64_t old_size, new_size;
191         int r;
192
193         assert(f);
194
195         /* We assume that this file is not sparse, and we know that
196          * for sure, since we always call posix_fallocate()
197          * ourselves */
198
199         old_size =
200                 le64toh(f->header->arena_offset) +
201                 le64toh(f->header->arena_size);
202
203         new_size = PAGE_ALIGN(offset + size);
204         if (new_size < le64toh(f->header->arena_offset))
205                 new_size = le64toh(f->header->arena_offset);
206
207         if (new_size <= old_size)
208                 return 0;
209
210         if (f->metrics.max_size > 0 &&
211             new_size > f->metrics.max_size)
212                 return -E2BIG;
213
214         if (new_size > f->metrics.min_size &&
215             f->metrics.keep_free > 0) {
216                 struct statvfs svfs;
217
218                 if (fstatvfs(f->fd, &svfs) >= 0) {
219                         uint64_t available;
220
221                         available = svfs.f_bfree * svfs.f_bsize;
222
223                         if (available >= f->metrics.keep_free)
224                                 available -= f->metrics.keep_free;
225                         else
226                                 available = 0;
227
228                         if (new_size - old_size > available)
229                                 return -E2BIG;
230                 }
231         }
232
233         /* Note that the glibc fallocate() fallback is very
234            inefficient, hence we try to minimize the allocation area
235            as we can. */
236         r = posix_fallocate(f->fd, old_size, new_size - old_size);
237         if (r != 0)
238                 return -r;
239
240         if (fstat(f->fd, &f->last_stat) < 0)
241                 return -errno;
242
243         f->header->arena_size = htole64(new_size - le64toh(f->header->arena_offset));
244
245         return 0;
246 }
247
248 static int journal_file_map(
249                 JournalFile *f,
250                 uint64_t offset,
251                 uint64_t size,
252                 void **_window,
253                 uint64_t *_woffset,
254                 uint64_t *_wsize,
255                 void **ret) {
256
257         uint64_t woffset, wsize;
258         void *window;
259
260         assert(f);
261         assert(size > 0);
262         assert(ret);
263
264         woffset = offset & ~((uint64_t) page_size() - 1ULL);
265         wsize = size + (offset - woffset);
266         wsize = PAGE_ALIGN(wsize);
267
268         /* Avoid SIGBUS on invalid accesses */
269         if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
270                 return -EADDRNOTAVAIL;
271
272         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
273         if (window == MAP_FAILED)
274                 return -errno;
275
276         if (_window)
277                 *_window = window;
278
279         if (_woffset)
280                 *_woffset = woffset;
281
282         if (_wsize)
283                 *_wsize = wsize;
284
285         *ret = (uint8_t*) window + (offset - woffset);
286
287         return 0;
288 }
289
290 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
291         void *p = NULL;
292         uint64_t delta;
293         int r;
294         Window *w;
295
296         assert(f);
297         assert(ret);
298         assert(wt >= 0);
299         assert(wt < _WINDOW_MAX);
300
301         if (offset + size > (uint64_t) f->last_stat.st_size) {
302                 /* Hmm, out of range? Let's refresh the fstat() data
303                  * first, before we trust that check. */
304
305                 if (fstat(f->fd, &f->last_stat) < 0 ||
306                     offset + size > (uint64_t) f->last_stat.st_size)
307                         return -EADDRNOTAVAIL;
308         }
309
310         w = f->windows + wt;
311
312         if (_likely_(w->ptr &&
313                      w->offset <= offset &&
314                      w->offset + w->size >= offset + size)) {
315
316                 *ret = (uint8_t*) w->ptr + (offset - w->offset);
317                 return 0;
318         }
319
320         if (w->ptr) {
321                 if (munmap(w->ptr, w->size) < 0)
322                         return -errno;
323
324                 w->ptr = NULL;
325                 w->size = w->offset = 0;
326         }
327
328         if (size < DEFAULT_WINDOW_SIZE) {
329                 /* If the default window size is larger then what was
330                  * asked for extend the mapping a bit in the hope to
331                  * minimize needed remappings later on. We add half
332                  * the window space before and half behind the
333                  * requested mapping */
334
335                 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
336
337                 if (delta > offset)
338                         delta = offset;
339
340                 offset -= delta;
341                 size = DEFAULT_WINDOW_SIZE;
342         } else
343                 delta = 0;
344
345         if (offset + size > (uint64_t) f->last_stat.st_size)
346                 size = (uint64_t) f->last_stat.st_size - offset;
347
348         if (size <= 0)
349                 return -EADDRNOTAVAIL;
350
351         r = journal_file_map(f,
352                              offset, size,
353                              &w->ptr, &w->offset, &w->size,
354                              &p);
355
356         if (r < 0)
357                 return r;
358
359         *ret = (uint8_t*) p + delta;
360         return 0;
361 }
362
363 static bool verify_hash(Object *o) {
364         uint64_t h1, h2;
365
366         assert(o);
367
368         if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
369                 h1 = le64toh(o->data.hash);
370                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
371         } else if (o->object.type == OBJECT_FIELD) {
372                 h1 = le64toh(o->field.hash);
373                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
374         } else
375                 return true;
376
377         return h1 == h2;
378 }
379
380 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
381         int r;
382         void *t;
383         Object *o;
384         uint64_t s;
385
386         assert(f);
387         assert(ret);
388         assert(type < _OBJECT_TYPE_MAX);
389
390         r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
391         if (r < 0)
392                 return r;
393
394         o = (Object*) t;
395         s = le64toh(o->object.size);
396
397         if (s < sizeof(ObjectHeader))
398                 return -EBADMSG;
399
400         if (type >= 0 && o->object.type != type)
401                 return -EBADMSG;
402
403         if (s > sizeof(ObjectHeader)) {
404                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
405                 if (r < 0)
406                         return r;
407
408                 o = (Object*) t;
409         }
410
411         if (!verify_hash(o))
412                 return -EBADMSG;
413
414         *ret = o;
415         return 0;
416 }
417
418 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
419         uint64_t r;
420
421         assert(f);
422
423         r = le64toh(f->header->seqnum) + 1;
424
425         if (seqnum) {
426                 /* If an external seqnum counter was passed, we update
427                  * both the local and the external one, and set it to
428                  * the maximum of both */
429
430                 if (*seqnum + 1 > r)
431                         r = *seqnum + 1;
432
433                 *seqnum = r;
434         }
435
436         f->header->seqnum = htole64(r);
437
438         if (f->header->first_seqnum == 0)
439                 f->header->first_seqnum = htole64(r);
440
441         return r;
442 }
443
444 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
445         int r;
446         uint64_t p;
447         Object *tail, *o;
448         void *t;
449
450         assert(f);
451         assert(size >= sizeof(ObjectHeader));
452         assert(offset);
453         assert(ret);
454
455         p = le64toh(f->header->tail_object_offset);
456         if (p == 0)
457                 p = le64toh(f->header->arena_offset);
458         else {
459                 r = journal_file_move_to_object(f, -1, p, &tail);
460                 if (r < 0)
461                         return r;
462
463                 p += ALIGN64(le64toh(tail->object.size));
464         }
465
466         r = journal_file_allocate(f, p, size);
467         if (r < 0)
468                 return r;
469
470         r = journal_file_move_to(f, type, p, size, &t);
471         if (r < 0)
472                 return r;
473
474         o = (Object*) t;
475
476         zero(o->object);
477         o->object.type = type;
478         o->object.size = htole64(size);
479
480         f->header->tail_object_offset = htole64(p);
481         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
482
483         *ret = o;
484         *offset = p;
485
486         return 0;
487 }
488
489 static int journal_file_setup_data_hash_table(JournalFile *f) {
490         uint64_t s, p;
491         Object *o;
492         int r;
493
494         assert(f);
495
496         s = DEFAULT_DATA_HASH_TABLE_SIZE;
497         r = journal_file_append_object(f,
498                                        OBJECT_DATA_HASH_TABLE,
499                                        offsetof(Object, hash_table.items) + s,
500                                        &o, &p);
501         if (r < 0)
502                 return r;
503
504         memset(o->hash_table.items, 0, s);
505
506         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
507         f->header->data_hash_table_size = htole64(s);
508
509         return 0;
510 }
511
512 static int journal_file_setup_field_hash_table(JournalFile *f) {
513         uint64_t s, p;
514         Object *o;
515         int r;
516
517         assert(f);
518
519         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
520         r = journal_file_append_object(f,
521                                        OBJECT_FIELD_HASH_TABLE,
522                                        offsetof(Object, hash_table.items) + s,
523                                        &o, &p);
524         if (r < 0)
525                 return r;
526
527         memset(o->hash_table.items, 0, s);
528
529         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
530         f->header->field_hash_table_size = htole64(s);
531
532         return 0;
533 }
534
535 static int journal_file_map_data_hash_table(JournalFile *f) {
536         uint64_t s, p;
537         void *t;
538         int r;
539
540         assert(f);
541
542         p = le64toh(f->header->data_hash_table_offset);
543         s = le64toh(f->header->data_hash_table_size);
544
545         r = journal_file_move_to(f,
546                                  WINDOW_DATA_HASH_TABLE,
547                                  p, s,
548                                  &t);
549         if (r < 0)
550                 return r;
551
552         f->data_hash_table = t;
553         return 0;
554 }
555
556 static int journal_file_map_field_hash_table(JournalFile *f) {
557         uint64_t s, p;
558         void *t;
559         int r;
560
561         assert(f);
562
563         p = le64toh(f->header->field_hash_table_offset);
564         s = le64toh(f->header->field_hash_table_size);
565
566         r = journal_file_move_to(f,
567                                  WINDOW_FIELD_HASH_TABLE,
568                                  p, s,
569                                  &t);
570         if (r < 0)
571                 return r;
572
573         f->field_hash_table = t;
574         return 0;
575 }
576
577 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
578         uint64_t p, h;
579         int r;
580
581         assert(f);
582         assert(o);
583         assert(offset > 0);
584         assert(o->object.type == OBJECT_DATA);
585
586         /* This might alter the window we are looking at */
587
588         o->data.next_hash_offset = o->data.next_field_offset = 0;
589         o->data.entry_offset = o->data.entry_array_offset = 0;
590         o->data.n_entries = 0;
591
592         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
593         p = le64toh(f->data_hash_table[h].head_hash_offset);
594         if (p == 0) {
595                 /* Only entry in the hash table is easy */
596                 f->data_hash_table[h].head_hash_offset = htole64(offset);
597         } else {
598                 /* Move back to the previous data object, to patch in
599                  * pointer */
600
601                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
602                 if (r < 0)
603                         return r;
604
605                 o->data.next_hash_offset = htole64(offset);
606         }
607
608         f->data_hash_table[h].tail_hash_offset = htole64(offset);
609
610         return 0;
611 }
612
613 int journal_file_find_data_object_with_hash(
614                 JournalFile *f,
615                 const void *data, uint64_t size, uint64_t hash,
616                 Object **ret, uint64_t *offset) {
617
618         uint64_t p, osize, h;
619         int r;
620
621         assert(f);
622         assert(data || size == 0);
623
624         osize = offsetof(Object, data.payload) + size;
625
626         if (f->header->data_hash_table_size == 0)
627                 return -EBADMSG;
628
629         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
630         p = le64toh(f->data_hash_table[h].head_hash_offset);
631
632         while (p > 0) {
633                 Object *o;
634
635                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
636                 if (r < 0)
637                         return r;
638
639                 if (le64toh(o->data.hash) != hash)
640                         goto next;
641
642                 if (o->object.flags & OBJECT_COMPRESSED) {
643 #ifdef HAVE_XZ
644                         uint64_t l, rsize;
645
646                         l = le64toh(o->object.size);
647                         if (l <= offsetof(Object, data.payload))
648                                 return -EBADMSG;
649
650                         l -= offsetof(Object, data.payload);
651
652                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
653                                 return -EBADMSG;
654
655                         if (rsize == size &&
656                             memcmp(f->compress_buffer, data, size) == 0) {
657
658                                 if (ret)
659                                         *ret = o;
660
661                                 if (offset)
662                                         *offset = p;
663
664                                 return 1;
665                         }
666 #else
667                         return -EPROTONOSUPPORT;
668 #endif
669
670                 } else if (le64toh(o->object.size) == osize &&
671                            memcmp(o->data.payload, data, size) == 0) {
672
673                         if (ret)
674                                 *ret = o;
675
676                         if (offset)
677                                 *offset = p;
678
679                         return 1;
680                 }
681
682         next:
683                 p = le64toh(o->data.next_hash_offset);
684         }
685
686         return 0;
687 }
688
689 int journal_file_find_data_object(
690                 JournalFile *f,
691                 const void *data, uint64_t size,
692                 Object **ret, uint64_t *offset) {
693
694         uint64_t hash;
695
696         assert(f);
697         assert(data || size == 0);
698
699         hash = hash64(data, size);
700
701         return journal_file_find_data_object_with_hash(f,
702                                                        data, size, hash,
703                                                        ret, offset);
704 }
705
706 static int journal_file_append_data(
707                 JournalFile *f,
708                 const void *data, uint64_t size,
709                 Object **ret, uint64_t *offset) {
710
711         uint64_t hash, p;
712         uint64_t osize;
713         Object *o;
714         int r;
715         bool compressed = false;
716
717         assert(f);
718         assert(data || size == 0);
719
720         hash = hash64(data, size);
721
722         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
723         if (r < 0)
724                 return r;
725         else if (r > 0) {
726
727                 if (ret)
728                         *ret = o;
729
730                 if (offset)
731                         *offset = p;
732
733                 return 0;
734         }
735
736         osize = offsetof(Object, data.payload) + size;
737         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
738         if (r < 0)
739                 return r;
740
741         o->data.hash = htole64(hash);
742
743 #ifdef HAVE_XZ
744         if (f->compress &&
745             size >= COMPRESSION_SIZE_THRESHOLD) {
746                 uint64_t rsize;
747
748                 compressed = compress_blob(data, size, o->data.payload, &rsize);
749
750                 if (compressed) {
751                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
752                         o->object.flags |= OBJECT_COMPRESSED;
753
754                         f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
755
756                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
757                 }
758         }
759 #endif
760
761         if (!compressed)
762                 memcpy(o->data.payload, data, size);
763
764         r = journal_file_link_data(f, o, p, hash);
765         if (r < 0)
766                 return r;
767
768         /* The linking might have altered the window, so let's
769          * refresh our pointer */
770         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
771         if (r < 0)
772                 return r;
773
774         if (ret)
775                 *ret = o;
776
777         if (offset)
778                 *offset = p;
779
780         return 0;
781 }
782
783 uint64_t journal_file_entry_n_items(Object *o) {
784         assert(o);
785         assert(o->object.type == OBJECT_ENTRY);
786
787         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
788 }
789
790 static uint64_t journal_file_entry_array_n_items(Object *o) {
791         assert(o);
792         assert(o->object.type == OBJECT_ENTRY_ARRAY);
793
794         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
795 }
796
797 static int link_entry_into_array(JournalFile *f,
798                                  le64_t *first,
799                                  le64_t *idx,
800                                  uint64_t p) {
801         int r;
802         uint64_t n = 0, ap = 0, q, i, a, hidx;
803         Object *o;
804
805         assert(f);
806         assert(first);
807         assert(idx);
808         assert(p > 0);
809
810         a = le64toh(*first);
811         i = hidx = le64toh(*idx);
812         while (a > 0) {
813
814                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
815                 if (r < 0)
816                         return r;
817
818                 n = journal_file_entry_array_n_items(o);
819                 if (i < n) {
820                         o->entry_array.items[i] = htole64(p);
821                         *idx = htole64(hidx + 1);
822                         return 0;
823                 }
824
825                 i -= n;
826                 ap = a;
827                 a = le64toh(o->entry_array.next_entry_array_offset);
828         }
829
830         if (hidx > n)
831                 n = (hidx+1) * 2;
832         else
833                 n = n * 2;
834
835         if (n < 4)
836                 n = 4;
837
838         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
839                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
840                                        &o, &q);
841         if (r < 0)
842                 return r;
843
844         o->entry_array.items[i] = htole64(p);
845
846         if (ap == 0)
847                 *first = htole64(q);
848         else {
849                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
850                 if (r < 0)
851                         return r;
852
853                 o->entry_array.next_entry_array_offset = htole64(q);
854         }
855
856         *idx = htole64(hidx + 1);
857
858         return 0;
859 }
860
861 static int link_entry_into_array_plus_one(JournalFile *f,
862                                           le64_t *extra,
863                                           le64_t *first,
864                                           le64_t *idx,
865                                           uint64_t p) {
866
867         int r;
868
869         assert(f);
870         assert(extra);
871         assert(first);
872         assert(idx);
873         assert(p > 0);
874
875         if (*idx == 0)
876                 *extra = htole64(p);
877         else {
878                 le64_t i;
879
880                 i = htole64(le64toh(*idx) - 1);
881                 r = link_entry_into_array(f, first, &i, p);
882                 if (r < 0)
883                         return r;
884         }
885
886         *idx = htole64(le64toh(*idx) + 1);
887         return 0;
888 }
889
890 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
891         uint64_t p;
892         int r;
893         assert(f);
894         assert(o);
895         assert(offset > 0);
896
897         p = le64toh(o->entry.items[i].object_offset);
898         if (p == 0)
899                 return -EINVAL;
900
901         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
902         if (r < 0)
903                 return r;
904
905         return link_entry_into_array_plus_one(f,
906                                               &o->data.entry_offset,
907                                               &o->data.entry_array_offset,
908                                               &o->data.n_entries,
909                                               offset);
910 }
911
912 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
913         uint64_t n, i;
914         int r;
915
916         assert(f);
917         assert(o);
918         assert(offset > 0);
919         assert(o->object.type == OBJECT_ENTRY);
920
921         __sync_synchronize();
922
923         /* Link up the entry itself */
924         r = link_entry_into_array(f,
925                                   &f->header->entry_array_offset,
926                                   &f->header->n_entries,
927                                   offset);
928         if (r < 0)
929                 return r;
930
931         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
932
933         if (f->header->head_entry_realtime == 0)
934                 f->header->head_entry_realtime = o->entry.realtime;
935
936         f->header->tail_entry_realtime = o->entry.realtime;
937         f->header->tail_entry_monotonic = o->entry.monotonic;
938
939         f->tail_entry_monotonic_valid = true;
940
941         /* Link up the items */
942         n = journal_file_entry_n_items(o);
943         for (i = 0; i < n; i++) {
944                 r = journal_file_link_entry_item(f, o, offset, i);
945                 if (r < 0)
946                         return r;
947         }
948
949         return 0;
950 }
951
952 static int journal_file_append_entry_internal(
953                 JournalFile *f,
954                 const dual_timestamp *ts,
955                 uint64_t xor_hash,
956                 const EntryItem items[], unsigned n_items,
957                 uint64_t *seqnum,
958                 Object **ret, uint64_t *offset) {
959         uint64_t np;
960         uint64_t osize;
961         Object *o;
962         int r;
963
964         assert(f);
965         assert(items || n_items == 0);
966         assert(ts);
967
968         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
969
970         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
971         if (r < 0)
972                 return r;
973
974         o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
975         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
976         o->entry.realtime = htole64(ts->realtime);
977         o->entry.monotonic = htole64(ts->monotonic);
978         o->entry.xor_hash = htole64(xor_hash);
979         o->entry.boot_id = f->header->boot_id;
980
981         r = journal_file_link_entry(f, o, np);
982         if (r < 0)
983                 return r;
984
985         if (ret)
986                 *ret = o;
987
988         if (offset)
989                 *offset = np;
990
991         return 0;
992 }
993
994 void journal_file_post_change(JournalFile *f) {
995         assert(f);
996
997         /* inotify() does not receive IN_MODIFY events from file
998          * accesses done via mmap(). After each access we hence
999          * trigger IN_MODIFY by truncating the journal file to its
1000          * current size which triggers IN_MODIFY. */
1001
1002         __sync_synchronize();
1003
1004         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1005                 log_error("Failed to to truncate file to its own size: %m");
1006 }
1007
1008 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1009         unsigned i;
1010         EntryItem *items;
1011         int r;
1012         uint64_t xor_hash = 0;
1013         struct dual_timestamp _ts;
1014
1015         assert(f);
1016         assert(iovec || n_iovec == 0);
1017
1018         if (!f->writable)
1019                 return -EPERM;
1020
1021         if (!ts) {
1022                 dual_timestamp_get(&_ts);
1023                 ts = &_ts;
1024         }
1025
1026         if (f->tail_entry_monotonic_valid &&
1027             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1028                 return -EINVAL;
1029
1030         items = alloca(sizeof(EntryItem) * n_iovec);
1031
1032         for (i = 0; i < n_iovec; i++) {
1033                 uint64_t p;
1034                 Object *o;
1035
1036                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1037                 if (r < 0)
1038                         return r;
1039
1040                 xor_hash ^= le64toh(o->data.hash);
1041                 items[i].object_offset = htole64(p);
1042                 items[i].hash = o->data.hash;
1043         }
1044
1045         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1046
1047         journal_file_post_change(f);
1048
1049         return r;
1050 }
1051
1052 static int generic_array_get(JournalFile *f,
1053                              uint64_t first,
1054                              uint64_t i,
1055                              Object **ret, uint64_t *offset) {
1056
1057         Object *o;
1058         uint64_t p = 0, a;
1059         int r;
1060
1061         assert(f);
1062
1063         a = first;
1064         while (a > 0) {
1065                 uint64_t n;
1066
1067                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1068                 if (r < 0)
1069                         return r;
1070
1071                 n = journal_file_entry_array_n_items(o);
1072                 if (i < n) {
1073                         p = le64toh(o->entry_array.items[i]);
1074                         break;
1075                 }
1076
1077                 i -= n;
1078                 a = le64toh(o->entry_array.next_entry_array_offset);
1079         }
1080
1081         if (a <= 0 || p <= 0)
1082                 return 0;
1083
1084         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1085         if (r < 0)
1086                 return r;
1087
1088         if (ret)
1089                 *ret = o;
1090
1091         if (offset)
1092                 *offset = p;
1093
1094         return 1;
1095 }
1096
1097 static int generic_array_get_plus_one(JournalFile *f,
1098                                       uint64_t extra,
1099                                       uint64_t first,
1100                                       uint64_t i,
1101                                       Object **ret, uint64_t *offset) {
1102
1103         Object *o;
1104
1105         assert(f);
1106
1107         if (i == 0) {
1108                 int r;
1109
1110                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1111                 if (r < 0)
1112                         return r;
1113
1114                 if (ret)
1115                         *ret = o;
1116
1117                 if (offset)
1118                         *offset = extra;
1119
1120                 return 1;
1121         }
1122
1123         return generic_array_get(f, first, i-1, ret, offset);
1124 }
1125
/* Results returned by the test_object_*() callbacks used for bisection */
enum {
        TEST_FOUND = 0, /* the object's value equals the needle */
        TEST_LEFT  = 1, /* the object's value is smaller than the needle */
        TEST_RIGHT = 2  /* the object's value is larger than the needle */
};
1131
1132 static int generic_array_bisect(JournalFile *f,
1133                                 uint64_t first,
1134                                 uint64_t n,
1135                                 uint64_t needle,
1136                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1137                                 direction_t direction,
1138                                 Object **ret,
1139                                 uint64_t *offset,
1140                                 uint64_t *idx) {
1141
1142         uint64_t a, p, t = 0, i = 0, last_p = 0;
1143         bool subtract_one = false;
1144         Object *o, *array = NULL;
1145         int r;
1146
1147         assert(f);
1148         assert(test_object);
1149
1150         a = first;
1151         while (a > 0) {
1152                 uint64_t left, right, k, lp;
1153
1154                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1155                 if (r < 0)
1156                         return r;
1157
1158                 k = journal_file_entry_array_n_items(array);
1159                 right = MIN(k, n);
1160                 if (right <= 0)
1161                         return 0;
1162
1163                 i = right - 1;
1164                 lp = p = le64toh(array->entry_array.items[i]);
1165                 if (p <= 0)
1166                         return -EBADMSG;
1167
1168                 r = test_object(f, p, needle);
1169                 if (r < 0)
1170                         return r;
1171
1172                 if (r == TEST_FOUND)
1173                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1174
1175                 if (r == TEST_RIGHT) {
1176                         left = 0;
1177                         right -= 1;
1178                         for (;;) {
1179                                 if (left == right) {
1180                                         if (direction == DIRECTION_UP)
1181                                                 subtract_one = true;
1182
1183                                         i = left;
1184                                         goto found;
1185                                 }
1186
1187                                 assert(left < right);
1188
1189                                 i = (left + right) / 2;
1190                                 p = le64toh(array->entry_array.items[i]);
1191                                 if (p <= 0)
1192                                         return -EBADMSG;
1193
1194                                 r = test_object(f, p, needle);
1195                                 if (r < 0)
1196                                         return r;
1197
1198                                 if (r == TEST_FOUND)
1199                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1200
1201                                 if (r == TEST_RIGHT)
1202                                         right = i;
1203                                 else
1204                                         left = i + 1;
1205                         }
1206                 }
1207
1208                 if (k > n)
1209                         return 0;
1210
1211                 last_p = lp;
1212
1213                 n -= k;
1214                 t += k;
1215                 a = le64toh(array->entry_array.next_entry_array_offset);
1216         }
1217
1218         return 0;
1219
1220 found:
1221         if (subtract_one && t == 0 && i == 0)
1222                 return 0;
1223
1224         if (subtract_one && i == 0)
1225                 p = last_p;
1226         else if (subtract_one)
1227                 p = le64toh(array->entry_array.items[i-1]);
1228         else
1229                 p = le64toh(array->entry_array.items[i]);
1230
1231         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1232         if (r < 0)
1233                 return r;
1234
1235         if (ret)
1236                 *ret = o;
1237
1238         if (offset)
1239                 *offset = p;
1240
1241         if (idx)
1242                 *idx = t + i - (subtract_one ? 1 : 0);
1243
1244         return 1;
1245 }
1246
1247 static int generic_array_bisect_plus_one(JournalFile *f,
1248                                          uint64_t extra,
1249                                          uint64_t first,
1250                                          uint64_t n,
1251                                          uint64_t needle,
1252                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1253                                          direction_t direction,
1254                                          Object **ret,
1255                                          uint64_t *offset,
1256                                          uint64_t *idx) {
1257
1258         int r;
1259
1260         assert(f);
1261         assert(test_object);
1262
1263         if (n <= 0)
1264                 return 0;
1265
1266         /* This bisects the array in object 'first', but first checks
1267          * an extra  */
1268         r = test_object(f, extra, needle);
1269         if (r < 0)
1270                 return r;
1271         else if (r == TEST_FOUND) {
1272                 Object *o;
1273
1274                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1275                 if (r < 0)
1276                         return r;
1277
1278                 if (ret)
1279                         *ret = o;
1280
1281                 if (offset)
1282                         *offset = extra;
1283
1284                 if (idx)
1285                         *idx = 0;
1286
1287                 return 1;
1288         } else if (r == TEST_RIGHT)
1289                 return 0;
1290
1291         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1292
1293         if (r > 0)
1294                 (*idx) ++;
1295
1296         return r;
1297 }
1298
1299 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1300         Object *o;
1301         int r;
1302
1303         assert(f);
1304         assert(p > 0);
1305
1306         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1307         if (r < 0)
1308                 return r;
1309
1310         if (le64toh(o->entry.seqnum) == needle)
1311                 return TEST_FOUND;
1312         else if (le64toh(o->entry.seqnum) < needle)
1313                 return TEST_LEFT;
1314         else
1315                 return TEST_RIGHT;
1316 }
1317
1318 int journal_file_move_to_entry_by_seqnum(
1319                 JournalFile *f,
1320                 uint64_t seqnum,
1321                 direction_t direction,
1322                 Object **ret,
1323                 uint64_t *offset) {
1324
1325         return generic_array_bisect(f,
1326                                     le64toh(f->header->entry_array_offset),
1327                                     le64toh(f->header->n_entries),
1328                                     seqnum,
1329                                     test_object_seqnum,
1330                                     direction,
1331                                     ret, offset, NULL);
1332 }
1333
1334 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1335         Object *o;
1336         int r;
1337
1338         assert(f);
1339         assert(p > 0);
1340
1341         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1342         if (r < 0)
1343                 return r;
1344
1345         if (le64toh(o->entry.realtime) == needle)
1346                 return TEST_FOUND;
1347         else if (le64toh(o->entry.realtime) < needle)
1348                 return TEST_LEFT;
1349         else
1350                 return TEST_RIGHT;
1351 }
1352
1353 int journal_file_move_to_entry_by_realtime(
1354                 JournalFile *f,
1355                 uint64_t realtime,
1356                 direction_t direction,
1357                 Object **ret,
1358                 uint64_t *offset) {
1359
1360         return generic_array_bisect(f,
1361                                     le64toh(f->header->entry_array_offset),
1362                                     le64toh(f->header->n_entries),
1363                                     realtime,
1364                                     test_object_realtime,
1365                                     direction,
1366                                     ret, offset, NULL);
1367 }
1368
1369 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1370         Object *o;
1371         int r;
1372
1373         assert(f);
1374         assert(p > 0);
1375
1376         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1377         if (r < 0)
1378                 return r;
1379
1380         if (le64toh(o->entry.monotonic) == needle)
1381                 return TEST_FOUND;
1382         else if (le64toh(o->entry.monotonic) < needle)
1383                 return TEST_LEFT;
1384         else
1385                 return TEST_RIGHT;
1386 }
1387
1388 int journal_file_move_to_entry_by_monotonic(
1389                 JournalFile *f,
1390                 sd_id128_t boot_id,
1391                 uint64_t monotonic,
1392                 direction_t direction,
1393                 Object **ret,
1394                 uint64_t *offset) {
1395
1396         char t[8+32+1] = "_BOOT_ID=";
1397         Object *o;
1398         int r;
1399
1400         sd_id128_to_string(boot_id, t + 8);
1401
1402         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1403         if (r < 0)
1404                 return r;
1405         else if (r == 0)
1406                 return -ENOENT;
1407
1408         return generic_array_bisect_plus_one(f,
1409                                              le64toh(o->data.entry_offset),
1410                                              le64toh(o->data.entry_array_offset),
1411                                              le64toh(o->data.n_entries),
1412                                              monotonic,
1413                                              test_object_monotonic,
1414                                              direction,
1415                                              ret, offset, NULL);
1416 }
1417
1418 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1419         assert(f);
1420         assert(p > 0);
1421
1422         if (p == needle)
1423                 return TEST_FOUND;
1424         else if (p < needle)
1425                 return TEST_LEFT;
1426         else
1427                 return TEST_RIGHT;
1428 }
1429
1430 int journal_file_next_entry(
1431                 JournalFile *f,
1432                 Object *o, uint64_t p,
1433                 direction_t direction,
1434                 Object **ret, uint64_t *offset) {
1435
1436         uint64_t i, n;
1437         int r;
1438
1439         assert(f);
1440         assert(p > 0 || !o);
1441
1442         n = le64toh(f->header->n_entries);
1443         if (n <= 0)
1444                 return 0;
1445
1446         if (!o)
1447                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1448         else {
1449                 if (o->object.type != OBJECT_ENTRY)
1450                         return -EINVAL;
1451
1452                 r = generic_array_bisect(f,
1453                                          le64toh(f->header->entry_array_offset),
1454                                          le64toh(f->header->n_entries),
1455                                          p,
1456                                          test_object_offset,
1457                                          DIRECTION_DOWN,
1458                                          NULL, NULL,
1459                                          &i);
1460                 if (r <= 0)
1461                         return r;
1462
1463                 if (direction == DIRECTION_DOWN) {
1464                         if (i >= n - 1)
1465                                 return 0;
1466
1467                         i++;
1468                 } else {
1469                         if (i <= 0)
1470                                 return 0;
1471
1472                         i--;
1473                 }
1474         }
1475
1476         /* And jump to it */
1477         return generic_array_get(f,
1478                                  le64toh(f->header->entry_array_offset),
1479                                  i,
1480                                  ret, offset);
1481 }
1482
1483 int journal_file_skip_entry(
1484                 JournalFile *f,
1485                 Object *o, uint64_t p,
1486                 int64_t skip,
1487                 Object **ret, uint64_t *offset) {
1488
1489         uint64_t i, n;
1490         int r;
1491
1492         assert(f);
1493         assert(o);
1494         assert(p > 0);
1495
1496         if (o->object.type != OBJECT_ENTRY)
1497                 return -EINVAL;
1498
1499         r = generic_array_bisect(f,
1500                                  le64toh(f->header->entry_array_offset),
1501                                  le64toh(f->header->n_entries),
1502                                  p,
1503                                  test_object_offset,
1504                                  DIRECTION_DOWN,
1505                                  NULL, NULL,
1506                                  &i);
1507         if (r <= 0)
1508                 return r;
1509
1510         /* Calculate new index */
1511         if (skip < 0) {
1512                 if ((uint64_t) -skip >= i)
1513                         i = 0;
1514                 else
1515                         i = i - (uint64_t) -skip;
1516         } else
1517                 i  += (uint64_t) skip;
1518
1519         n = le64toh(f->header->n_entries);
1520         if (n <= 0)
1521                 return -EBADMSG;
1522
1523         if (i >= n)
1524                 i = n-1;
1525
1526         return generic_array_get(f,
1527                                  le64toh(f->header->entry_array_offset),
1528                                  i,
1529                                  ret, offset);
1530 }
1531
1532 int journal_file_next_entry_for_data(
1533                 JournalFile *f,
1534                 Object *o, uint64_t p,
1535                 uint64_t data_offset,
1536                 direction_t direction,
1537                 Object **ret, uint64_t *offset) {
1538
1539         uint64_t n, i;
1540         int r;
1541         Object *d;
1542
1543         assert(f);
1544         assert(p > 0 || !o);
1545
1546         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1547         if (r < 0)
1548                 return r;
1549
1550         n = le64toh(d->data.n_entries);
1551         if (n <= 0)
1552                 return n;
1553
1554         if (!o)
1555                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1556         else {
1557                 if (o->object.type != OBJECT_ENTRY)
1558                         return -EINVAL;
1559
1560                 r = generic_array_bisect_plus_one(f,
1561                                                   le64toh(d->data.entry_offset),
1562                                                   le64toh(d->data.entry_array_offset),
1563                                                   le64toh(d->data.n_entries),
1564                                                   p,
1565                                                   test_object_offset,
1566                                                   DIRECTION_DOWN,
1567                                                   NULL, NULL,
1568                                                   &i);
1569
1570                 if (r <= 0)
1571                         return r;
1572
1573                 if (direction == DIRECTION_DOWN) {
1574                         if (i >= n - 1)
1575                                 return 0;
1576
1577                         i++;
1578                 } else {
1579                         if (i <= 0)
1580                                 return 0;
1581
1582                         i--;
1583                 }
1584
1585         }
1586
1587         return generic_array_get_plus_one(f,
1588                                           le64toh(d->data.entry_offset),
1589                                           le64toh(d->data.entry_array_offset),
1590                                           i,
1591                                           ret, offset);
1592 }
1593
1594 int journal_file_move_to_entry_by_seqnum_for_data(
1595                 JournalFile *f,
1596                 uint64_t data_offset,
1597                 uint64_t seqnum,
1598                 direction_t direction,
1599                 Object **ret, uint64_t *offset) {
1600
1601         Object *d;
1602         int r;
1603
1604         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1605         if (r <= 0)
1606                 return r;
1607
1608         return generic_array_bisect_plus_one(f,
1609                                              le64toh(d->data.entry_offset),
1610                                              le64toh(d->data.entry_array_offset),
1611                                              le64toh(d->data.n_entries),
1612                                              seqnum,
1613                                              test_object_seqnum,
1614                                              direction,
1615                                              ret, offset, NULL);
1616 }
1617
1618 int journal_file_move_to_entry_by_realtime_for_data(
1619                 JournalFile *f,
1620                 uint64_t data_offset,
1621                 uint64_t realtime,
1622                 direction_t direction,
1623                 Object **ret, uint64_t *offset) {
1624
1625         Object *d;
1626         int r;
1627
1628         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1629         if (r <= 0)
1630                 return r;
1631
1632         return generic_array_bisect_plus_one(f,
1633                                              le64toh(d->data.entry_offset),
1634                                              le64toh(d->data.entry_array_offset),
1635                                              le64toh(d->data.n_entries),
1636                                              realtime,
1637                                              test_object_realtime,
1638                                              direction,
1639                                              ret, offset, NULL);
1640 }
1641
1642 void journal_file_dump(JournalFile *f) {
1643         char a[33], b[33], c[33];
1644         Object *o;
1645         int r;
1646         uint64_t p;
1647
1648         assert(f);
1649
1650         printf("File Path: %s\n"
1651                "File ID: %s\n"
1652                "Machine ID: %s\n"
1653                "Boot ID: %s\n"
1654                "Arena size: %llu\n"
1655                "Objects: %lu\n"
1656                "Entries: %lu\n",
1657                f->path,
1658                sd_id128_to_string(f->header->file_id, a),
1659                sd_id128_to_string(f->header->machine_id, b),
1660                sd_id128_to_string(f->header->boot_id, c),
1661                (unsigned long long) le64toh(f->header->arena_size),
1662                (unsigned long) le64toh(f->header->n_objects),
1663                (unsigned long) le64toh(f->header->n_entries));
1664
1665         p = le64toh(f->header->arena_offset);
1666         while (p != 0) {
1667                 r = journal_file_move_to_object(f, -1, p, &o);
1668                 if (r < 0)
1669                         goto fail;
1670
1671                 switch (o->object.type) {
1672
1673                 case OBJECT_UNUSED:
1674                         printf("Type: OBJECT_UNUSED\n");
1675                         break;
1676
1677                 case OBJECT_DATA:
1678                         printf("Type: OBJECT_DATA\n");
1679                         break;
1680
1681                 case OBJECT_ENTRY:
1682                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1683                                (unsigned long long) le64toh(o->entry.seqnum),
1684                                (unsigned long long) le64toh(o->entry.monotonic),
1685                                (unsigned long long) le64toh(o->entry.realtime));
1686                         break;
1687
1688                 case OBJECT_FIELD_HASH_TABLE:
1689                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1690                         break;
1691
1692                 case OBJECT_DATA_HASH_TABLE:
1693                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1694                         break;
1695
1696                 case OBJECT_ENTRY_ARRAY:
1697                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1698                         break;
1699                 }
1700
1701                 if (o->object.flags & OBJECT_COMPRESSED)
1702                         printf("Flags: COMPRESSED\n");
1703
1704                 if (p == le64toh(f->header->tail_object_offset))
1705                         p = 0;
1706                 else
1707                         p = p + ALIGN64(le64toh(o->object.size));
1708         }
1709
1710         return;
1711 fail:
1712         log_error("File corrupt");
1713 }
1714
1715 int journal_file_open(
1716                 const char *fname,
1717                 int flags,
1718                 mode_t mode,
1719                 JournalFile *template,
1720                 JournalFile **ret) {
1721
1722         JournalFile *f;
1723         int r;
1724         bool newly_created = false;
1725
1726         assert(fname);
1727
1728         if ((flags & O_ACCMODE) != O_RDONLY &&
1729             (flags & O_ACCMODE) != O_RDWR)
1730                 return -EINVAL;
1731
1732         if (!endswith(fname, ".journal"))
1733                 return -EINVAL;
1734
1735         f = new0(JournalFile, 1);
1736         if (!f)
1737                 return -ENOMEM;
1738
1739         f->fd = -1;
1740         f->flags = flags;
1741         f->mode = mode;
1742         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1743         f->prot = prot_from_flags(flags);
1744
1745         if (template) {
1746                 f->metrics = template->metrics;
1747                 f->compress = template->compress;
1748         }
1749
1750         f->path = strdup(fname);
1751         if (!f->path) {
1752                 r = -ENOMEM;
1753                 goto fail;
1754         }
1755
1756         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1757         if (f->fd < 0) {
1758                 r = -errno;
1759                 goto fail;
1760         }
1761
1762         if (fstat(f->fd, &f->last_stat) < 0) {
1763                 r = -errno;
1764                 goto fail;
1765         }
1766
1767         if (f->last_stat.st_size == 0 && f->writable) {
1768                 newly_created = true;
1769
1770                 r = journal_file_init_header(f, template);
1771                 if (r < 0)
1772                         goto fail;
1773
1774                 if (fstat(f->fd, &f->last_stat) < 0) {
1775                         r = -errno;
1776                         goto fail;
1777                 }
1778         }
1779
1780         if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1781                 r = -EIO;
1782                 goto fail;
1783         }
1784
1785         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1786         if (f->header == MAP_FAILED) {
1787                 f->header = NULL;
1788                 r = -errno;
1789                 goto fail;
1790         }
1791
1792         if (!newly_created) {
1793                 r = journal_file_verify_header(f);
1794                 if (r < 0)
1795                         goto fail;
1796         }
1797
1798         if (f->writable) {
1799                 r = journal_file_refresh_header(f);
1800                 if (r < 0)
1801                         goto fail;
1802         }
1803
1804         if (newly_created) {
1805
1806                 r = journal_file_setup_field_hash_table(f);
1807                 if (r < 0)
1808                         goto fail;
1809
1810                 r = journal_file_setup_data_hash_table(f);
1811                 if (r < 0)
1812                         goto fail;
1813         }
1814
1815         r = journal_file_map_field_hash_table(f);
1816         if (r < 0)
1817                 goto fail;
1818
1819         r = journal_file_map_data_hash_table(f);
1820         if (r < 0)
1821                 goto fail;
1822
1823         if (ret)
1824                 *ret = f;
1825
1826         return 0;
1827
1828 fail:
1829         journal_file_close(f);
1830
1831         return r;
1832 }
1833
1834 int journal_file_rotate(JournalFile **f) {
1835         char *p;
1836         size_t l;
1837         JournalFile *old_file, *new_file = NULL;
1838         int r;
1839
1840         assert(f);
1841         assert(*f);
1842
1843         old_file = *f;
1844
1845         if (!old_file->writable)
1846                 return -EINVAL;
1847
1848         if (!endswith(old_file->path, ".journal"))
1849                 return -EINVAL;
1850
1851         l = strlen(old_file->path);
1852
1853         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
1854         if (!p)
1855                 return -ENOMEM;
1856
1857         memcpy(p, old_file->path, l - 8);
1858         p[l-8] = '@';
1859         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1860         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1861                  "-%016llx-%016llx.journal",
1862                  (unsigned long long) le64toh((*f)->header->seqnum),
1863                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1864
1865         r = rename(old_file->path, p);
1866         free(p);
1867
1868         if (r < 0)
1869                 return -errno;
1870
1871         old_file->header->state = STATE_ARCHIVED;
1872
1873         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1874         journal_file_close(old_file);
1875
1876         *f = new_file;
1877         return r;
1878 }
1879
1880 int journal_file_open_reliably(
1881                 const char *fname,
1882                 int flags,
1883                 mode_t mode,
1884                 JournalFile *template,
1885                 JournalFile **ret) {
1886
1887         int r;
1888         size_t l;
1889         char *p;
1890
1891         r = journal_file_open(fname, flags, mode, template, ret);
1892         if (r != -EBADMSG && /* corrupted */
1893             r != -ENODATA && /* truncated */
1894             r != -EHOSTDOWN && /* other machine */
1895             r != -EPROTONOSUPPORT) /* incompatible feature */
1896                 return r;
1897
1898         if ((flags & O_ACCMODE) == O_RDONLY)
1899                 return r;
1900
1901         if (!(flags & O_CREAT))
1902                 return r;
1903
1904         /* The file is corrupted. Rotate it away and try it again (but only once) */
1905
1906         l = strlen(fname);
1907         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
1908                      (int) (l-8), fname,
1909                      (unsigned long long) now(CLOCK_REALTIME),
1910                      random_ull()) < 0)
1911                 return -ENOMEM;
1912
1913         r = rename(fname, p);
1914         free(p);
1915         if (r < 0)
1916                 return -errno;
1917
1918         log_warning("File %s corrupted, renaming and replacing.", fname);
1919
1920         return journal_file_open(fname, flags, mode, template, ret);
1921 }
1922
1923 struct vacuum_info {
1924         off_t usage;
1925         char *filename;
1926
1927         uint64_t realtime;
1928         sd_id128_t seqnum_id;
1929         uint64_t seqnum;
1930
1931         bool have_seqnum;
1932 };
1933
1934 static int vacuum_compare(const void *_a, const void *_b) {
1935         const struct vacuum_info *a, *b;
1936
1937         a = _a;
1938         b = _b;
1939
1940         if (a->have_seqnum && b->have_seqnum &&
1941             sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1942                 if (a->seqnum < b->seqnum)
1943                         return -1;
1944                 else if (a->seqnum > b->seqnum)
1945                         return 1;
1946                 else
1947                         return 0;
1948         }
1949
1950         if (a->realtime < b->realtime)
1951                 return -1;
1952         else if (a->realtime > b->realtime)
1953                 return 1;
1954         else if (a->have_seqnum && b->have_seqnum)
1955                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1956         else
1957                 return strcmp(a->filename, b->filename);
1958 }
1959
1960 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1961         DIR *d;
1962         int r = 0;
1963         struct vacuum_info *list = NULL;
1964         unsigned n_list = 0, n_allocated = 0, i;
1965         uint64_t sum = 0;
1966
1967         assert(directory);
1968
1969         if (max_use <= 0)
1970                 return 0;
1971
1972         d = opendir(directory);
1973         if (!d)
1974                 return -errno;
1975
1976         for (;;) {
1977                 int k;
1978                 struct dirent buf, *de;
1979                 size_t q;
1980                 struct stat st;
1981                 char *p;
1982                 unsigned long long seqnum = 0, realtime;
1983                 sd_id128_t seqnum_id;
1984                 bool have_seqnum;
1985
1986                 k = readdir_r(d, &buf, &de);
1987                 if (k != 0) {
1988                         r = -k;
1989                         goto finish;
1990                 }
1991
1992                 if (!de)
1993                         break;
1994
1995                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
1996                         continue;
1997
1998                 if (!S_ISREG(st.st_mode))
1999                         continue;
2000
2001                 q = strlen(de->d_name);
2002
2003                 if (endswith(de->d_name, ".journal")) {
2004
2005                         /* Vacuum archived files */
2006
2007                         if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2008                                 continue;
2009
2010                         if (de->d_name[q-8-16-1] != '-' ||
2011                             de->d_name[q-8-16-1-16-1] != '-' ||
2012                             de->d_name[q-8-16-1-16-1-32-1] != '@')
2013                                 continue;
2014
2015                         p = strdup(de->d_name);
2016                         if (!p) {
2017                                 r = -ENOMEM;
2018                                 goto finish;
2019                         }
2020
2021                         de->d_name[q-8-16-1-16-1] = 0;
2022                         if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2023                                 free(p);
2024                                 continue;
2025                         }
2026
2027                         if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2028                                 free(p);
2029                                 continue;
2030                         }
2031
2032                         have_seqnum = true;
2033
2034                 } else if (endswith(de->d_name, ".journal~")) {
2035                         unsigned long long tmp;
2036
2037                         /* Vacuum corrupted files */
2038
2039                         if (q < 1 + 16 + 1 + 16 + 8 + 1)
2040                                 continue;
2041
2042                         if (de->d_name[q-1-8-16-1] != '-' ||
2043                             de->d_name[q-1-8-16-1-16-1] != '@')
2044                                 continue;
2045
2046                         p = strdup(de->d_name);
2047                         if (!p) {
2048                                 r = -ENOMEM;
2049                                 goto finish;
2050                         }
2051
2052                         if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2053                                 free(p);
2054                                 continue;
2055                         }
2056
2057                         have_seqnum = false;
2058                 } else
2059                         continue;
2060
2061                 if (n_list >= n_allocated) {
2062                         struct vacuum_info *j;
2063
2064                         n_allocated = MAX(n_allocated * 2U, 8U);
2065                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2066                         if (!j) {
2067                                 free(p);
2068                                 r = -ENOMEM;
2069                                 goto finish;
2070                         }
2071
2072                         list = j;
2073                 }
2074
2075                 list[n_list].filename = p;
2076                 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2077                 list[n_list].seqnum = seqnum;
2078                 list[n_list].realtime = realtime;
2079                 list[n_list].seqnum_id = seqnum_id;
2080                 list[n_list].have_seqnum = have_seqnum;
2081
2082                 sum += list[n_list].usage;
2083
2084                 n_list ++;
2085         }
2086
2087         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2088
2089         for(i = 0; i < n_list; i++) {
2090                 struct statvfs ss;
2091
2092                 if (fstatvfs(dirfd(d), &ss) < 0) {
2093                         r = -errno;
2094                         goto finish;
2095                 }
2096
2097                 if (sum <= max_use &&
2098                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2099                         break;
2100
2101                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2102                         log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2103                         sum -= list[i].usage;
2104                 } else if (errno != ENOENT)
2105                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2106         }
2107
2108 finish:
2109         for (i = 0; i < n_list; i++)
2110                 free(list[i].filename);
2111
2112         free(list);
2113
2114         if (d)
2115                 closedir(d);
2116
2117         return r;
2118 }
2119
2120 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2121         uint64_t i, n;
2122         uint64_t q, xor_hash = 0;
2123         int r;
2124         EntryItem *items;
2125         dual_timestamp ts;
2126
2127         assert(from);
2128         assert(to);
2129         assert(o);
2130         assert(p);
2131
2132         if (!to->writable)
2133                 return -EPERM;
2134
2135         ts.monotonic = le64toh(o->entry.monotonic);
2136         ts.realtime = le64toh(o->entry.realtime);
2137
2138         if (to->tail_entry_monotonic_valid &&
2139             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2140                 return -EINVAL;
2141
2142         if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2143                 return -EINVAL;
2144
2145         n = journal_file_entry_n_items(o);
2146         items = alloca(sizeof(EntryItem) * n);
2147
2148         for (i = 0; i < n; i++) {
2149                 uint64_t l, h;
2150                 le64_t le_hash;
2151                 size_t t;
2152                 void *data;
2153                 Object *u;
2154
2155                 q = le64toh(o->entry.items[i].object_offset);
2156                 le_hash = o->entry.items[i].hash;
2157
2158                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2159                 if (r < 0)
2160                         return r;
2161
2162                 if (le_hash != o->data.hash)
2163                         return -EBADMSG;
2164
2165                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2166                 t = (size_t) l;
2167
2168                 /* We hit the limit on 32bit machines */
2169                 if ((uint64_t) t != l)
2170                         return -E2BIG;
2171
2172                 if (o->object.flags & OBJECT_COMPRESSED) {
2173 #ifdef HAVE_XZ
2174                         uint64_t rsize;
2175
2176                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2177                                 return -EBADMSG;
2178
2179                         data = from->compress_buffer;
2180                         l = rsize;
2181 #else
2182                         return -EPROTONOSUPPORT;
2183 #endif
2184                 } else
2185                         data = o->data.payload;
2186
2187                 r = journal_file_append_data(to, data, l, &u, &h);
2188                 if (r < 0)
2189                         return r;
2190
2191                 xor_hash ^= le64toh(u->data.hash);
2192                 items[i].object_offset = htole64(h);
2193                 items[i].hash = u->data.hash;
2194
2195                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2196                 if (r < 0)
2197                         return r;
2198         }
2199
2200         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2201 }
2202
2203 void journal_default_metrics(JournalMetrics *m, int fd) {
2204         uint64_t fs_size = 0;
2205         struct statvfs ss;
2206         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2207
2208         assert(m);
2209         assert(fd >= 0);
2210
2211         if (fstatvfs(fd, &ss) >= 0)
2212                 fs_size = ss.f_frsize * ss.f_blocks;
2213
2214         if (m->max_use == (uint64_t) -1) {
2215
2216                 if (fs_size > 0) {
2217                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2218
2219                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2220                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2221
2222                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2223                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2224                 } else
2225                         m->max_use = DEFAULT_MAX_USE_LOWER;
2226         } else {
2227                 m->max_use = PAGE_ALIGN(m->max_use);
2228
2229                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2230                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2231         }
2232
2233         if (m->max_size == (uint64_t) -1) {
2234                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2235
2236                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2237                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2238         } else
2239                 m->max_size = PAGE_ALIGN(m->max_size);
2240
2241         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2242                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2243
2244         if (m->max_size*2 > m->max_use)
2245                 m->max_use = m->max_size*2;
2246
2247         if (m->min_size == (uint64_t) -1)
2248                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2249         else {
2250                 m->min_size = PAGE_ALIGN(m->min_size);
2251
2252                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2253                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2254
2255                 if (m->min_size > m->max_size)
2256                         m->max_size = m->min_size;
2257         }
2258
2259         if (m->keep_free == (uint64_t) -1) {
2260
2261                 if (fs_size > 0) {
2262                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2263
2264                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2265                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2266
2267                 } else
2268                         m->keep_free = DEFAULT_KEEP_FREE;
2269         }
2270
2271         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2272                  format_bytes(a, sizeof(a), m->max_use),
2273                  format_bytes(b, sizeof(b), m->max_size),
2274                  format_bytes(c, sizeof(c), m->min_size),
2275                  format_bytes(d, sizeof(d), m->keep_free));
2276 }