chiark / gitweb /
journal: correct list link up on hash collisions
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
/* Sizes, in bytes, of the hash tables set up in newly created files */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
#define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)

/* How much of the file we map at once when a smaller range is requested */
#define DEFAULT_WINDOW_SIZE (8ULL << 20)                       /* 8 MiB */

/* Payloads shorter than this are never compressed */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (64ULL << 10)                    /* 64 KiB */

/* Lower and upper limits applied when max_use is derived from the
 * size of the file system */
#define DEFAULT_MAX_USE_LOWER (1ULL << 20)                     /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL << 30)                     /* 4 GiB */

/* Upper limit applied when max_size is derived from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL << 20)                  /* 128 MiB */

/* Upper limit applied when keep_free is derived from the size of the
 * file system */
#define DEFAULT_KEEP_FREE_UPPER (4ULL << 30)                   /* 4 GiB */

/* keep_free value used when the size of the file system cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1ULL << 20)                         /* 1 MiB */

/* Magic bytes found at the very beginning of each journal file */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };

/* Rounds x up to the next multiple of 8 */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
64
65 void journal_file_close(JournalFile *f) {
66         int t;
67
68         assert(f);
69
70         if (f->header) {
71                 if (f->writable)
72                         f->header->state = STATE_OFFLINE;
73
74                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
75         }
76
77         for (t = 0; t < _WINDOW_MAX; t++)
78                 if (f->windows[t].ptr)
79                         munmap(f->windows[t].ptr, f->windows[t].size);
80
81         if (f->fd >= 0)
82                 close_nointr_nofail(f->fd);
83
84         free(f->path);
85
86 #ifdef HAVE_XZ
87         free(f->compress_buffer);
88 #endif
89
90         free(f);
91 }
92
93 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
94         Header h;
95         ssize_t k;
96         int r;
97
98         assert(f);
99
100         zero(h);
101         memcpy(h.signature, signature, 8);
102         h.header_size = htole64(ALIGN64(sizeof(h)));
103
104         r = sd_id128_randomize(&h.file_id);
105         if (r < 0)
106                 return r;
107
108         if (template) {
109                 h.seqnum_id = template->header->seqnum_id;
110                 h.seqnum = template->header->seqnum;
111         } else
112                 h.seqnum_id = h.file_id;
113
114         k = pwrite(f->fd, &h, sizeof(h), 0);
115         if (k < 0)
116                 return -errno;
117
118         if (k != sizeof(h))
119                 return -EIO;
120
121         return 0;
122 }
123
124 static int journal_file_refresh_header(JournalFile *f) {
125         int r;
126         sd_id128_t boot_id;
127
128         assert(f);
129
130         r = sd_id128_get_machine(&f->header->machine_id);
131         if (r < 0)
132                 return r;
133
134         r = sd_id128_get_boot(&boot_id);
135         if (r < 0)
136                 return r;
137
138         if (sd_id128_equal(boot_id, f->header->boot_id))
139                 f->tail_entry_monotonic_valid = true;
140
141         f->header->boot_id = boot_id;
142
143         f->header->state = STATE_ONLINE;
144
145         __sync_synchronize();
146
147         return 0;
148 }
149
150 static int journal_file_verify_header(JournalFile *f) {
151         assert(f);
152
153         if (memcmp(f->header, signature, 8))
154                 return -EBADMSG;
155
156 #ifdef HAVE_XZ
157         if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
158                 return -EPROTONOSUPPORT;
159 #else
160         if (f->header->incompatible_flags != 0)
161                 return -EPROTONOSUPPORT;
162 #endif
163
164         if (f->header->header_size != htole64(ALIGN64(sizeof(*(f->header)))))
165                 return -EBADMSG;
166
167         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
168                 return -ENODATA;
169
170         if (f->writable) {
171                 uint8_t state;
172                 sd_id128_t machine_id;
173                 int r;
174
175                 r = sd_id128_get_machine(&machine_id);
176                 if (r < 0)
177                         return r;
178
179                 if (!sd_id128_equal(machine_id, f->header->machine_id))
180                         return -EHOSTDOWN;
181
182                 state = f->header->state;
183
184                 if (state == STATE_ONLINE)
185                         log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
186                 else if (state == STATE_ARCHIVED)
187                         return -ESHUTDOWN;
188                 else if (state != STATE_OFFLINE)
189                         log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
190         }
191
192         return 0;
193 }
194
195 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
196         uint64_t old_size, new_size;
197         int r;
198
199         assert(f);
200
201         /* We assume that this file is not sparse, and we know that
202          * for sure, since we always call posix_fallocate()
203          * ourselves */
204
205         old_size =
206                 le64toh(f->header->header_size) +
207                 le64toh(f->header->arena_size);
208
209         new_size = PAGE_ALIGN(offset + size);
210         if (new_size < le64toh(f->header->header_size))
211                 new_size = le64toh(f->header->header_size);
212
213         if (new_size <= old_size)
214                 return 0;
215
216         if (f->metrics.max_size > 0 &&
217             new_size > f->metrics.max_size)
218                 return -E2BIG;
219
220         if (new_size > f->metrics.min_size &&
221             f->metrics.keep_free > 0) {
222                 struct statvfs svfs;
223
224                 if (fstatvfs(f->fd, &svfs) >= 0) {
225                         uint64_t available;
226
227                         available = svfs.f_bfree * svfs.f_bsize;
228
229                         if (available >= f->metrics.keep_free)
230                                 available -= f->metrics.keep_free;
231                         else
232                                 available = 0;
233
234                         if (new_size - old_size > available)
235                                 return -E2BIG;
236                 }
237         }
238
239         /* Note that the glibc fallocate() fallback is very
240            inefficient, hence we try to minimize the allocation area
241            as we can. */
242         r = posix_fallocate(f->fd, old_size, new_size - old_size);
243         if (r != 0)
244                 return -r;
245
246         if (fstat(f->fd, &f->last_stat) < 0)
247                 return -errno;
248
249         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
250
251         return 0;
252 }
253
254 static int journal_file_map(
255                 JournalFile *f,
256                 uint64_t offset,
257                 uint64_t size,
258                 void **_window,
259                 uint64_t *_woffset,
260                 uint64_t *_wsize,
261                 void **ret) {
262
263         uint64_t woffset, wsize;
264         void *window;
265
266         assert(f);
267         assert(size > 0);
268         assert(ret);
269
270         woffset = offset & ~((uint64_t) page_size() - 1ULL);
271         wsize = size + (offset - woffset);
272         wsize = PAGE_ALIGN(wsize);
273
274         /* Avoid SIGBUS on invalid accesses */
275         if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
276                 return -EADDRNOTAVAIL;
277
278         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
279         if (window == MAP_FAILED)
280                 return -errno;
281
282         if (_window)
283                 *_window = window;
284
285         if (_woffset)
286                 *_woffset = woffset;
287
288         if (_wsize)
289                 *_wsize = wsize;
290
291         *ret = (uint8_t*) window + (offset - woffset);
292
293         return 0;
294 }
295
296 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
297         void *p = NULL;
298         uint64_t delta;
299         int r;
300         Window *w;
301
302         assert(f);
303         assert(ret);
304         assert(wt >= 0);
305         assert(wt < _WINDOW_MAX);
306
307         if (offset + size > (uint64_t) f->last_stat.st_size) {
308                 /* Hmm, out of range? Let's refresh the fstat() data
309                  * first, before we trust that check. */
310
311                 if (fstat(f->fd, &f->last_stat) < 0 ||
312                     offset + size > (uint64_t) f->last_stat.st_size)
313                         return -EADDRNOTAVAIL;
314         }
315
316         w = f->windows + wt;
317
318         if (_likely_(w->ptr &&
319                      w->offset <= offset &&
320                      w->offset + w->size >= offset + size)) {
321
322                 *ret = (uint8_t*) w->ptr + (offset - w->offset);
323                 return 0;
324         }
325
326         if (w->ptr) {
327                 if (munmap(w->ptr, w->size) < 0)
328                         return -errno;
329
330                 w->ptr = NULL;
331                 w->size = w->offset = 0;
332         }
333
334         if (size < DEFAULT_WINDOW_SIZE) {
335                 /* If the default window size is larger then what was
336                  * asked for extend the mapping a bit in the hope to
337                  * minimize needed remappings later on. We add half
338                  * the window space before and half behind the
339                  * requested mapping */
340
341                 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
342
343                 if (delta > offset)
344                         delta = offset;
345
346                 offset -= delta;
347                 size = DEFAULT_WINDOW_SIZE;
348         } else
349                 delta = 0;
350
351         if (offset + size > (uint64_t) f->last_stat.st_size)
352                 size = (uint64_t) f->last_stat.st_size - offset;
353
354         if (size <= 0)
355                 return -EADDRNOTAVAIL;
356
357         r = journal_file_map(f,
358                              offset, size,
359                              &w->ptr, &w->offset, &w->size,
360                              &p);
361
362         if (r < 0)
363                 return r;
364
365         *ret = (uint8_t*) p + delta;
366         return 0;
367 }
368
369 static bool verify_hash(Object *o) {
370         uint64_t h1, h2;
371
372         assert(o);
373
374         if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
375                 h1 = le64toh(o->data.hash);
376                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
377         } else if (o->object.type == OBJECT_FIELD) {
378                 h1 = le64toh(o->field.hash);
379                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
380         } else
381                 return true;
382
383         return h1 == h2;
384 }
385
386 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
387         int r;
388         void *t;
389         Object *o;
390         uint64_t s;
391
392         assert(f);
393         assert(ret);
394         assert(type < _OBJECT_TYPE_MAX);
395
396         r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
397         if (r < 0)
398                 return r;
399
400         o = (Object*) t;
401         s = le64toh(o->object.size);
402
403         if (s < sizeof(ObjectHeader))
404                 return -EBADMSG;
405
406         if (type >= 0 && o->object.type != type)
407                 return -EBADMSG;
408
409         if (s > sizeof(ObjectHeader)) {
410                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
411                 if (r < 0)
412                         return r;
413
414                 o = (Object*) t;
415         }
416
417         if (!verify_hash(o))
418                 return -EBADMSG;
419
420         *ret = o;
421         return 0;
422 }
423
424 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
425         uint64_t r;
426
427         assert(f);
428
429         r = le64toh(f->header->seqnum) + 1;
430
431         if (seqnum) {
432                 /* If an external seqnum counter was passed, we update
433                  * both the local and the external one, and set it to
434                  * the maximum of both */
435
436                 if (*seqnum + 1 > r)
437                         r = *seqnum + 1;
438
439                 *seqnum = r;
440         }
441
442         f->header->seqnum = htole64(r);
443
444         if (f->header->first_seqnum == 0)
445                 f->header->first_seqnum = htole64(r);
446
447         return r;
448 }
449
450 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
451         int r;
452         uint64_t p;
453         Object *tail, *o;
454         void *t;
455
456         assert(f);
457         assert(size >= sizeof(ObjectHeader));
458         assert(offset);
459         assert(ret);
460
461         p = le64toh(f->header->tail_object_offset);
462         if (p == 0)
463                 p = le64toh(f->header->header_size);
464         else {
465                 r = journal_file_move_to_object(f, -1, p, &tail);
466                 if (r < 0)
467                         return r;
468
469                 p += ALIGN64(le64toh(tail->object.size));
470         }
471
472         r = journal_file_allocate(f, p, size);
473         if (r < 0)
474                 return r;
475
476         r = journal_file_move_to(f, type, p, size, &t);
477         if (r < 0)
478                 return r;
479
480         o = (Object*) t;
481
482         zero(o->object);
483         o->object.type = type;
484         o->object.size = htole64(size);
485
486         f->header->tail_object_offset = htole64(p);
487         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
488
489         *ret = o;
490         *offset = p;
491
492         return 0;
493 }
494
495 static int journal_file_setup_data_hash_table(JournalFile *f) {
496         uint64_t s, p;
497         Object *o;
498         int r;
499
500         assert(f);
501
502         s = DEFAULT_DATA_HASH_TABLE_SIZE;
503         r = journal_file_append_object(f,
504                                        OBJECT_DATA_HASH_TABLE,
505                                        offsetof(Object, hash_table.items) + s,
506                                        &o, &p);
507         if (r < 0)
508                 return r;
509
510         memset(o->hash_table.items, 0, s);
511
512         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
513         f->header->data_hash_table_size = htole64(s);
514
515         return 0;
516 }
517
518 static int journal_file_setup_field_hash_table(JournalFile *f) {
519         uint64_t s, p;
520         Object *o;
521         int r;
522
523         assert(f);
524
525         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
526         r = journal_file_append_object(f,
527                                        OBJECT_FIELD_HASH_TABLE,
528                                        offsetof(Object, hash_table.items) + s,
529                                        &o, &p);
530         if (r < 0)
531                 return r;
532
533         memset(o->hash_table.items, 0, s);
534
535         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
536         f->header->field_hash_table_size = htole64(s);
537
538         return 0;
539 }
540
541 static int journal_file_map_data_hash_table(JournalFile *f) {
542         uint64_t s, p;
543         void *t;
544         int r;
545
546         assert(f);
547
548         p = le64toh(f->header->data_hash_table_offset);
549         s = le64toh(f->header->data_hash_table_size);
550
551         r = journal_file_move_to(f,
552                                  WINDOW_DATA_HASH_TABLE,
553                                  p, s,
554                                  &t);
555         if (r < 0)
556                 return r;
557
558         f->data_hash_table = t;
559         return 0;
560 }
561
562 static int journal_file_map_field_hash_table(JournalFile *f) {
563         uint64_t s, p;
564         void *t;
565         int r;
566
567         assert(f);
568
569         p = le64toh(f->header->field_hash_table_offset);
570         s = le64toh(f->header->field_hash_table_size);
571
572         r = journal_file_move_to(f,
573                                  WINDOW_FIELD_HASH_TABLE,
574                                  p, s,
575                                  &t);
576         if (r < 0)
577                 return r;
578
579         f->field_hash_table = t;
580         return 0;
581 }
582
583 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
584         uint64_t p, h;
585         int r;
586
587         assert(f);
588         assert(o);
589         assert(offset > 0);
590         assert(o->object.type == OBJECT_DATA);
591
592         /* This might alter the window we are looking at */
593
594         o->data.next_hash_offset = o->data.next_field_offset = 0;
595         o->data.entry_offset = o->data.entry_array_offset = 0;
596         o->data.n_entries = 0;
597
598         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
599         p = le64toh(f->data_hash_table[h].tail_hash_offset);
600         if (p == 0) {
601                 /* Only entry in the hash table is easy */
602                 f->data_hash_table[h].head_hash_offset = htole64(offset);
603         } else {
604                 /* Move back to the previous data object, to patch in
605                  * pointer */
606
607                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
608                 if (r < 0)
609                         return r;
610
611                 o->data.next_hash_offset = htole64(offset);
612         }
613
614         f->data_hash_table[h].tail_hash_offset = htole64(offset);
615
616         return 0;
617 }
618
619 int journal_file_find_data_object_with_hash(
620                 JournalFile *f,
621                 const void *data, uint64_t size, uint64_t hash,
622                 Object **ret, uint64_t *offset) {
623
624         uint64_t p, osize, h;
625         int r;
626
627         assert(f);
628         assert(data || size == 0);
629
630         osize = offsetof(Object, data.payload) + size;
631
632         if (f->header->data_hash_table_size == 0)
633                 return -EBADMSG;
634
635         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
636         p = le64toh(f->data_hash_table[h].head_hash_offset);
637
638         while (p > 0) {
639                 Object *o;
640
641                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
642                 if (r < 0)
643                         return r;
644
645                 if (le64toh(o->data.hash) != hash)
646                         goto next;
647
648                 if (o->object.flags & OBJECT_COMPRESSED) {
649 #ifdef HAVE_XZ
650                         uint64_t l, rsize;
651
652                         l = le64toh(o->object.size);
653                         if (l <= offsetof(Object, data.payload))
654                                 return -EBADMSG;
655
656                         l -= offsetof(Object, data.payload);
657
658                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
659                                 return -EBADMSG;
660
661                         if (rsize == size &&
662                             memcmp(f->compress_buffer, data, size) == 0) {
663
664                                 if (ret)
665                                         *ret = o;
666
667                                 if (offset)
668                                         *offset = p;
669
670                                 return 1;
671                         }
672 #else
673                         return -EPROTONOSUPPORT;
674 #endif
675
676                 } else if (le64toh(o->object.size) == osize &&
677                            memcmp(o->data.payload, data, size) == 0) {
678
679                         if (ret)
680                                 *ret = o;
681
682                         if (offset)
683                                 *offset = p;
684
685                         return 1;
686                 }
687
688         next:
689                 p = le64toh(o->data.next_hash_offset);
690         }
691
692         return 0;
693 }
694
695 int journal_file_find_data_object(
696                 JournalFile *f,
697                 const void *data, uint64_t size,
698                 Object **ret, uint64_t *offset) {
699
700         uint64_t hash;
701
702         assert(f);
703         assert(data || size == 0);
704
705         hash = hash64(data, size);
706
707         return journal_file_find_data_object_with_hash(f,
708                                                        data, size, hash,
709                                                        ret, offset);
710 }
711
712 static int journal_file_append_data(
713                 JournalFile *f,
714                 const void *data, uint64_t size,
715                 Object **ret, uint64_t *offset) {
716
717         uint64_t hash, p;
718         uint64_t osize;
719         Object *o;
720         int r;
721         bool compressed = false;
722
723         assert(f);
724         assert(data || size == 0);
725
726         hash = hash64(data, size);
727
728         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
729         if (r < 0)
730                 return r;
731         else if (r > 0) {
732
733                 if (ret)
734                         *ret = o;
735
736                 if (offset)
737                         *offset = p;
738
739                 return 0;
740         }
741
742         osize = offsetof(Object, data.payload) + size;
743         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
744         if (r < 0)
745                 return r;
746
747         o->data.hash = htole64(hash);
748
749 #ifdef HAVE_XZ
750         if (f->compress &&
751             size >= COMPRESSION_SIZE_THRESHOLD) {
752                 uint64_t rsize;
753
754                 compressed = compress_blob(data, size, o->data.payload, &rsize);
755
756                 if (compressed) {
757                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
758                         o->object.flags |= OBJECT_COMPRESSED;
759
760                         f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
761
762                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
763                 }
764         }
765 #endif
766
767         if (!compressed)
768                 memcpy(o->data.payload, data, size);
769
770         r = journal_file_link_data(f, o, p, hash);
771         if (r < 0)
772                 return r;
773
774         /* The linking might have altered the window, so let's
775          * refresh our pointer */
776         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
777         if (r < 0)
778                 return r;
779
780         if (ret)
781                 *ret = o;
782
783         if (offset)
784                 *offset = p;
785
786         return 0;
787 }
788
789 uint64_t journal_file_entry_n_items(Object *o) {
790         assert(o);
791         assert(o->object.type == OBJECT_ENTRY);
792
793         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
794 }
795
796 static uint64_t journal_file_entry_array_n_items(Object *o) {
797         assert(o);
798         assert(o->object.type == OBJECT_ENTRY_ARRAY);
799
800         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
801 }
802
803 static int link_entry_into_array(JournalFile *f,
804                                  le64_t *first,
805                                  le64_t *idx,
806                                  uint64_t p) {
807         int r;
808         uint64_t n = 0, ap = 0, q, i, a, hidx;
809         Object *o;
810
811         assert(f);
812         assert(first);
813         assert(idx);
814         assert(p > 0);
815
816         a = le64toh(*first);
817         i = hidx = le64toh(*idx);
818         while (a > 0) {
819
820                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
821                 if (r < 0)
822                         return r;
823
824                 n = journal_file_entry_array_n_items(o);
825                 if (i < n) {
826                         o->entry_array.items[i] = htole64(p);
827                         *idx = htole64(hidx + 1);
828                         return 0;
829                 }
830
831                 i -= n;
832                 ap = a;
833                 a = le64toh(o->entry_array.next_entry_array_offset);
834         }
835
836         if (hidx > n)
837                 n = (hidx+1) * 2;
838         else
839                 n = n * 2;
840
841         if (n < 4)
842                 n = 4;
843
844         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
845                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
846                                        &o, &q);
847         if (r < 0)
848                 return r;
849
850         o->entry_array.items[i] = htole64(p);
851
852         if (ap == 0)
853                 *first = htole64(q);
854         else {
855                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
856                 if (r < 0)
857                         return r;
858
859                 o->entry_array.next_entry_array_offset = htole64(q);
860         }
861
862         *idx = htole64(hidx + 1);
863
864         return 0;
865 }
866
867 static int link_entry_into_array_plus_one(JournalFile *f,
868                                           le64_t *extra,
869                                           le64_t *first,
870                                           le64_t *idx,
871                                           uint64_t p) {
872
873         int r;
874
875         assert(f);
876         assert(extra);
877         assert(first);
878         assert(idx);
879         assert(p > 0);
880
881         if (*idx == 0)
882                 *extra = htole64(p);
883         else {
884                 le64_t i;
885
886                 i = htole64(le64toh(*idx) - 1);
887                 r = link_entry_into_array(f, first, &i, p);
888                 if (r < 0)
889                         return r;
890         }
891
892         *idx = htole64(le64toh(*idx) + 1);
893         return 0;
894 }
895
896 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
897         uint64_t p;
898         int r;
899         assert(f);
900         assert(o);
901         assert(offset > 0);
902
903         p = le64toh(o->entry.items[i].object_offset);
904         if (p == 0)
905                 return -EINVAL;
906
907         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
908         if (r < 0)
909                 return r;
910
911         return link_entry_into_array_plus_one(f,
912                                               &o->data.entry_offset,
913                                               &o->data.entry_array_offset,
914                                               &o->data.n_entries,
915                                               offset);
916 }
917
918 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
919         uint64_t n, i;
920         int r;
921
922         assert(f);
923         assert(o);
924         assert(offset > 0);
925         assert(o->object.type == OBJECT_ENTRY);
926
927         __sync_synchronize();
928
929         /* Link up the entry itself */
930         r = link_entry_into_array(f,
931                                   &f->header->entry_array_offset,
932                                   &f->header->n_entries,
933                                   offset);
934         if (r < 0)
935                 return r;
936
937         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
938
939         if (f->header->head_entry_realtime == 0)
940                 f->header->head_entry_realtime = o->entry.realtime;
941
942         f->header->tail_entry_realtime = o->entry.realtime;
943         f->header->tail_entry_monotonic = o->entry.monotonic;
944
945         f->tail_entry_monotonic_valid = true;
946
947         /* Link up the items */
948         n = journal_file_entry_n_items(o);
949         for (i = 0; i < n; i++) {
950                 r = journal_file_link_entry_item(f, o, offset, i);
951                 if (r < 0)
952                         return r;
953         }
954
955         return 0;
956 }
957
958 static int journal_file_append_entry_internal(
959                 JournalFile *f,
960                 const dual_timestamp *ts,
961                 uint64_t xor_hash,
962                 const EntryItem items[], unsigned n_items,
963                 uint64_t *seqnum,
964                 Object **ret, uint64_t *offset) {
965         uint64_t np;
966         uint64_t osize;
967         Object *o;
968         int r;
969
970         assert(f);
971         assert(items || n_items == 0);
972         assert(ts);
973
974         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
975
976         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
977         if (r < 0)
978                 return r;
979
980         o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
981         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
982         o->entry.realtime = htole64(ts->realtime);
983         o->entry.monotonic = htole64(ts->monotonic);
984         o->entry.xor_hash = htole64(xor_hash);
985         o->entry.boot_id = f->header->boot_id;
986
987         r = journal_file_link_entry(f, o, np);
988         if (r < 0)
989                 return r;
990
991         if (ret)
992                 *ret = o;
993
994         if (offset)
995                 *offset = np;
996
997         return 0;
998 }
999
1000 void journal_file_post_change(JournalFile *f) {
1001         assert(f);
1002
1003         /* inotify() does not receive IN_MODIFY events from file
1004          * accesses done via mmap(). After each access we hence
1005          * trigger IN_MODIFY by truncating the journal file to its
1006          * current size which triggers IN_MODIFY. */
1007
1008         __sync_synchronize();
1009
1010         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1011                 log_error("Failed to to truncate file to its own size: %m");
1012 }
1013
1014 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1015         unsigned i;
1016         EntryItem *items;
1017         int r;
1018         uint64_t xor_hash = 0;
1019         struct dual_timestamp _ts;
1020
1021         assert(f);
1022         assert(iovec || n_iovec == 0);
1023
1024         if (!f->writable)
1025                 return -EPERM;
1026
1027         if (!ts) {
1028                 dual_timestamp_get(&_ts);
1029                 ts = &_ts;
1030         }
1031
1032         if (f->tail_entry_monotonic_valid &&
1033             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1034                 return -EINVAL;
1035
1036         items = alloca(sizeof(EntryItem) * n_iovec);
1037
1038         for (i = 0; i < n_iovec; i++) {
1039                 uint64_t p;
1040                 Object *o;
1041
1042                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1043                 if (r < 0)
1044                         return r;
1045
1046                 xor_hash ^= le64toh(o->data.hash);
1047                 items[i].object_offset = htole64(p);
1048                 items[i].hash = o->data.hash;
1049         }
1050
1051         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1052
1053         journal_file_post_change(f);
1054
1055         return r;
1056 }
1057
1058 static int generic_array_get(JournalFile *f,
1059                              uint64_t first,
1060                              uint64_t i,
1061                              Object **ret, uint64_t *offset) {
1062
1063         Object *o;
1064         uint64_t p = 0, a;
1065         int r;
1066
1067         assert(f);
1068
1069         a = first;
1070         while (a > 0) {
1071                 uint64_t n;
1072
1073                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1074                 if (r < 0)
1075                         return r;
1076
1077                 n = journal_file_entry_array_n_items(o);
1078                 if (i < n) {
1079                         p = le64toh(o->entry_array.items[i]);
1080                         break;
1081                 }
1082
1083                 i -= n;
1084                 a = le64toh(o->entry_array.next_entry_array_offset);
1085         }
1086
1087         if (a <= 0 || p <= 0)
1088                 return 0;
1089
1090         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1091         if (r < 0)
1092                 return r;
1093
1094         if (ret)
1095                 *ret = o;
1096
1097         if (offset)
1098                 *offset = p;
1099
1100         return 1;
1101 }
1102
1103 static int generic_array_get_plus_one(JournalFile *f,
1104                                       uint64_t extra,
1105                                       uint64_t first,
1106                                       uint64_t i,
1107                                       Object **ret, uint64_t *offset) {
1108
1109         Object *o;
1110
1111         assert(f);
1112
1113         if (i == 0) {
1114                 int r;
1115
1116                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1117                 if (r < 0)
1118                         return r;
1119
1120                 if (ret)
1121                         *ret = o;
1122
1123                 if (offset)
1124                         *offset = extra;
1125
1126                 return 1;
1127         }
1128
1129         return generic_array_get(f, first, i-1, ret, offset);
1130 }
1131
/* Results of the test_object() callbacks used by the bisection helpers */
enum {
        TEST_FOUND = 0,  /* the tested entry matches the needle */
        TEST_LEFT  = 1,  /* the tested entry is smaller than the needle */
        TEST_RIGHT = 2   /* the tested entry is larger than the needle */
};
1137
1138 static int generic_array_bisect(JournalFile *f,
1139                                 uint64_t first,
1140                                 uint64_t n,
1141                                 uint64_t needle,
1142                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1143                                 direction_t direction,
1144                                 Object **ret,
1145                                 uint64_t *offset,
1146                                 uint64_t *idx) {
1147
1148         uint64_t a, p, t = 0, i = 0, last_p = 0;
1149         bool subtract_one = false;
1150         Object *o, *array = NULL;
1151         int r;
1152
1153         assert(f);
1154         assert(test_object);
1155
1156         a = first;
1157         while (a > 0) {
1158                 uint64_t left, right, k, lp;
1159
1160                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1161                 if (r < 0)
1162                         return r;
1163
1164                 k = journal_file_entry_array_n_items(array);
1165                 right = MIN(k, n);
1166                 if (right <= 0)
1167                         return 0;
1168
1169                 i = right - 1;
1170                 lp = p = le64toh(array->entry_array.items[i]);
1171                 if (p <= 0)
1172                         return -EBADMSG;
1173
1174                 r = test_object(f, p, needle);
1175                 if (r < 0)
1176                         return r;
1177
1178                 if (r == TEST_FOUND)
1179                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1180
1181                 if (r == TEST_RIGHT) {
1182                         left = 0;
1183                         right -= 1;
1184                         for (;;) {
1185                                 if (left == right) {
1186                                         if (direction == DIRECTION_UP)
1187                                                 subtract_one = true;
1188
1189                                         i = left;
1190                                         goto found;
1191                                 }
1192
1193                                 assert(left < right);
1194
1195                                 i = (left + right) / 2;
1196                                 p = le64toh(array->entry_array.items[i]);
1197                                 if (p <= 0)
1198                                         return -EBADMSG;
1199
1200                                 r = test_object(f, p, needle);
1201                                 if (r < 0)
1202                                         return r;
1203
1204                                 if (r == TEST_FOUND)
1205                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1206
1207                                 if (r == TEST_RIGHT)
1208                                         right = i;
1209                                 else
1210                                         left = i + 1;
1211                         }
1212                 }
1213
1214                 if (k > n)
1215                         return 0;
1216
1217                 last_p = lp;
1218
1219                 n -= k;
1220                 t += k;
1221                 a = le64toh(array->entry_array.next_entry_array_offset);
1222         }
1223
1224         return 0;
1225
1226 found:
1227         if (subtract_one && t == 0 && i == 0)
1228                 return 0;
1229
1230         if (subtract_one && i == 0)
1231                 p = last_p;
1232         else if (subtract_one)
1233                 p = le64toh(array->entry_array.items[i-1]);
1234         else
1235                 p = le64toh(array->entry_array.items[i]);
1236
1237         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1238         if (r < 0)
1239                 return r;
1240
1241         if (ret)
1242                 *ret = o;
1243
1244         if (offset)
1245                 *offset = p;
1246
1247         if (idx)
1248                 *idx = t + i - (subtract_one ? 1 : 0);
1249
1250         return 1;
1251 }
1252
1253 static int generic_array_bisect_plus_one(JournalFile *f,
1254                                          uint64_t extra,
1255                                          uint64_t first,
1256                                          uint64_t n,
1257                                          uint64_t needle,
1258                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1259                                          direction_t direction,
1260                                          Object **ret,
1261                                          uint64_t *offset,
1262                                          uint64_t *idx) {
1263
1264         int r;
1265
1266         assert(f);
1267         assert(test_object);
1268
1269         if (n <= 0)
1270                 return 0;
1271
1272         /* This bisects the array in object 'first', but first checks
1273          * an extra  */
1274         r = test_object(f, extra, needle);
1275         if (r < 0)
1276                 return r;
1277         else if (r == TEST_FOUND) {
1278                 Object *o;
1279
1280                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1281                 if (r < 0)
1282                         return r;
1283
1284                 if (ret)
1285                         *ret = o;
1286
1287                 if (offset)
1288                         *offset = extra;
1289
1290                 if (idx)
1291                         *idx = 0;
1292
1293                 return 1;
1294         } else if (r == TEST_RIGHT)
1295                 return 0;
1296
1297         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1298
1299         if (r > 0)
1300                 (*idx) ++;
1301
1302         return r;
1303 }
1304
1305 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1306         Object *o;
1307         int r;
1308
1309         assert(f);
1310         assert(p > 0);
1311
1312         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1313         if (r < 0)
1314                 return r;
1315
1316         if (le64toh(o->entry.seqnum) == needle)
1317                 return TEST_FOUND;
1318         else if (le64toh(o->entry.seqnum) < needle)
1319                 return TEST_LEFT;
1320         else
1321                 return TEST_RIGHT;
1322 }
1323
1324 int journal_file_move_to_entry_by_seqnum(
1325                 JournalFile *f,
1326                 uint64_t seqnum,
1327                 direction_t direction,
1328                 Object **ret,
1329                 uint64_t *offset) {
1330
1331         return generic_array_bisect(f,
1332                                     le64toh(f->header->entry_array_offset),
1333                                     le64toh(f->header->n_entries),
1334                                     seqnum,
1335                                     test_object_seqnum,
1336                                     direction,
1337                                     ret, offset, NULL);
1338 }
1339
1340 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1341         Object *o;
1342         int r;
1343
1344         assert(f);
1345         assert(p > 0);
1346
1347         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1348         if (r < 0)
1349                 return r;
1350
1351         if (le64toh(o->entry.realtime) == needle)
1352                 return TEST_FOUND;
1353         else if (le64toh(o->entry.realtime) < needle)
1354                 return TEST_LEFT;
1355         else
1356                 return TEST_RIGHT;
1357 }
1358
1359 int journal_file_move_to_entry_by_realtime(
1360                 JournalFile *f,
1361                 uint64_t realtime,
1362                 direction_t direction,
1363                 Object **ret,
1364                 uint64_t *offset) {
1365
1366         return generic_array_bisect(f,
1367                                     le64toh(f->header->entry_array_offset),
1368                                     le64toh(f->header->n_entries),
1369                                     realtime,
1370                                     test_object_realtime,
1371                                     direction,
1372                                     ret, offset, NULL);
1373 }
1374
1375 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1376         Object *o;
1377         int r;
1378
1379         assert(f);
1380         assert(p > 0);
1381
1382         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1383         if (r < 0)
1384                 return r;
1385
1386         if (le64toh(o->entry.monotonic) == needle)
1387                 return TEST_FOUND;
1388         else if (le64toh(o->entry.monotonic) < needle)
1389                 return TEST_LEFT;
1390         else
1391                 return TEST_RIGHT;
1392 }
1393
1394 int journal_file_move_to_entry_by_monotonic(
1395                 JournalFile *f,
1396                 sd_id128_t boot_id,
1397                 uint64_t monotonic,
1398                 direction_t direction,
1399                 Object **ret,
1400                 uint64_t *offset) {
1401
1402         char t[8+32+1] = "_BOOT_ID=";
1403         Object *o;
1404         int r;
1405
1406         sd_id128_to_string(boot_id, t + 8);
1407
1408         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1409         if (r < 0)
1410                 return r;
1411         else if (r == 0)
1412                 return -ENOENT;
1413
1414         return generic_array_bisect_plus_one(f,
1415                                              le64toh(o->data.entry_offset),
1416                                              le64toh(o->data.entry_array_offset),
1417                                              le64toh(o->data.n_entries),
1418                                              monotonic,
1419                                              test_object_monotonic,
1420                                              direction,
1421                                              ret, offset, NULL);
1422 }
1423
1424 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1425         assert(f);
1426         assert(p > 0);
1427
1428         if (p == needle)
1429                 return TEST_FOUND;
1430         else if (p < needle)
1431                 return TEST_LEFT;
1432         else
1433                 return TEST_RIGHT;
1434 }
1435
1436 int journal_file_next_entry(
1437                 JournalFile *f,
1438                 Object *o, uint64_t p,
1439                 direction_t direction,
1440                 Object **ret, uint64_t *offset) {
1441
1442         uint64_t i, n;
1443         int r;
1444
1445         assert(f);
1446         assert(p > 0 || !o);
1447
1448         n = le64toh(f->header->n_entries);
1449         if (n <= 0)
1450                 return 0;
1451
1452         if (!o)
1453                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1454         else {
1455                 if (o->object.type != OBJECT_ENTRY)
1456                         return -EINVAL;
1457
1458                 r = generic_array_bisect(f,
1459                                          le64toh(f->header->entry_array_offset),
1460                                          le64toh(f->header->n_entries),
1461                                          p,
1462                                          test_object_offset,
1463                                          DIRECTION_DOWN,
1464                                          NULL, NULL,
1465                                          &i);
1466                 if (r <= 0)
1467                         return r;
1468
1469                 if (direction == DIRECTION_DOWN) {
1470                         if (i >= n - 1)
1471                                 return 0;
1472
1473                         i++;
1474                 } else {
1475                         if (i <= 0)
1476                                 return 0;
1477
1478                         i--;
1479                 }
1480         }
1481
1482         /* And jump to it */
1483         return generic_array_get(f,
1484                                  le64toh(f->header->entry_array_offset),
1485                                  i,
1486                                  ret, offset);
1487 }
1488
1489 int journal_file_skip_entry(
1490                 JournalFile *f,
1491                 Object *o, uint64_t p,
1492                 int64_t skip,
1493                 Object **ret, uint64_t *offset) {
1494
1495         uint64_t i, n;
1496         int r;
1497
1498         assert(f);
1499         assert(o);
1500         assert(p > 0);
1501
1502         if (o->object.type != OBJECT_ENTRY)
1503                 return -EINVAL;
1504
1505         r = generic_array_bisect(f,
1506                                  le64toh(f->header->entry_array_offset),
1507                                  le64toh(f->header->n_entries),
1508                                  p,
1509                                  test_object_offset,
1510                                  DIRECTION_DOWN,
1511                                  NULL, NULL,
1512                                  &i);
1513         if (r <= 0)
1514                 return r;
1515
1516         /* Calculate new index */
1517         if (skip < 0) {
1518                 if ((uint64_t) -skip >= i)
1519                         i = 0;
1520                 else
1521                         i = i - (uint64_t) -skip;
1522         } else
1523                 i  += (uint64_t) skip;
1524
1525         n = le64toh(f->header->n_entries);
1526         if (n <= 0)
1527                 return -EBADMSG;
1528
1529         if (i >= n)
1530                 i = n-1;
1531
1532         return generic_array_get(f,
1533                                  le64toh(f->header->entry_array_offset),
1534                                  i,
1535                                  ret, offset);
1536 }
1537
1538 int journal_file_next_entry_for_data(
1539                 JournalFile *f,
1540                 Object *o, uint64_t p,
1541                 uint64_t data_offset,
1542                 direction_t direction,
1543                 Object **ret, uint64_t *offset) {
1544
1545         uint64_t n, i;
1546         int r;
1547         Object *d;
1548
1549         assert(f);
1550         assert(p > 0 || !o);
1551
1552         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1553         if (r < 0)
1554                 return r;
1555
1556         n = le64toh(d->data.n_entries);
1557         if (n <= 0)
1558                 return n;
1559
1560         if (!o)
1561                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1562         else {
1563                 if (o->object.type != OBJECT_ENTRY)
1564                         return -EINVAL;
1565
1566                 r = generic_array_bisect_plus_one(f,
1567                                                   le64toh(d->data.entry_offset),
1568                                                   le64toh(d->data.entry_array_offset),
1569                                                   le64toh(d->data.n_entries),
1570                                                   p,
1571                                                   test_object_offset,
1572                                                   DIRECTION_DOWN,
1573                                                   NULL, NULL,
1574                                                   &i);
1575
1576                 if (r <= 0)
1577                         return r;
1578
1579                 if (direction == DIRECTION_DOWN) {
1580                         if (i >= n - 1)
1581                                 return 0;
1582
1583                         i++;
1584                 } else {
1585                         if (i <= 0)
1586                                 return 0;
1587
1588                         i--;
1589                 }
1590
1591         }
1592
1593         return generic_array_get_plus_one(f,
1594                                           le64toh(d->data.entry_offset),
1595                                           le64toh(d->data.entry_array_offset),
1596                                           i,
1597                                           ret, offset);
1598 }
1599
1600 int journal_file_move_to_entry_by_seqnum_for_data(
1601                 JournalFile *f,
1602                 uint64_t data_offset,
1603                 uint64_t seqnum,
1604                 direction_t direction,
1605                 Object **ret, uint64_t *offset) {
1606
1607         Object *d;
1608         int r;
1609
1610         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1611         if (r <= 0)
1612                 return r;
1613
1614         return generic_array_bisect_plus_one(f,
1615                                              le64toh(d->data.entry_offset),
1616                                              le64toh(d->data.entry_array_offset),
1617                                              le64toh(d->data.n_entries),
1618                                              seqnum,
1619                                              test_object_seqnum,
1620                                              direction,
1621                                              ret, offset, NULL);
1622 }
1623
1624 int journal_file_move_to_entry_by_realtime_for_data(
1625                 JournalFile *f,
1626                 uint64_t data_offset,
1627                 uint64_t realtime,
1628                 direction_t direction,
1629                 Object **ret, uint64_t *offset) {
1630
1631         Object *d;
1632         int r;
1633
1634         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1635         if (r <= 0)
1636                 return r;
1637
1638         return generic_array_bisect_plus_one(f,
1639                                              le64toh(d->data.entry_offset),
1640                                              le64toh(d->data.entry_array_offset),
1641                                              le64toh(d->data.n_entries),
1642                                              realtime,
1643                                              test_object_realtime,
1644                                              direction,
1645                                              ret, offset, NULL);
1646 }
1647
1648 void journal_file_dump(JournalFile *f) {
1649         char a[33], b[33], c[33];
1650         Object *o;
1651         int r;
1652         uint64_t p;
1653
1654         assert(f);
1655
1656         printf("File Path: %s\n"
1657                "File ID: %s\n"
1658                "Machine ID: %s\n"
1659                "Boot ID: %s\n"
1660                "Arena size: %llu\n"
1661                "Objects: %lu\n"
1662                "Entries: %lu\n",
1663                f->path,
1664                sd_id128_to_string(f->header->file_id, a),
1665                sd_id128_to_string(f->header->machine_id, b),
1666                sd_id128_to_string(f->header->boot_id, c),
1667                (unsigned long long) le64toh(f->header->arena_size),
1668                (unsigned long) le64toh(f->header->n_objects),
1669                (unsigned long) le64toh(f->header->n_entries));
1670
1671         p = le64toh(f->header->header_size);
1672         while (p != 0) {
1673                 r = journal_file_move_to_object(f, -1, p, &o);
1674                 if (r < 0)
1675                         goto fail;
1676
1677                 switch (o->object.type) {
1678
1679                 case OBJECT_UNUSED:
1680                         printf("Type: OBJECT_UNUSED\n");
1681                         break;
1682
1683                 case OBJECT_DATA:
1684                         printf("Type: OBJECT_DATA\n");
1685                         break;
1686
1687                 case OBJECT_ENTRY:
1688                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1689                                (unsigned long long) le64toh(o->entry.seqnum),
1690                                (unsigned long long) le64toh(o->entry.monotonic),
1691                                (unsigned long long) le64toh(o->entry.realtime));
1692                         break;
1693
1694                 case OBJECT_FIELD_HASH_TABLE:
1695                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1696                         break;
1697
1698                 case OBJECT_DATA_HASH_TABLE:
1699                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1700                         break;
1701
1702                 case OBJECT_ENTRY_ARRAY:
1703                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1704                         break;
1705
1706                 case OBJECT_SIGNATURE:
1707                         printf("Type: OBJECT_SIGNATURE\n");
1708                         break;
1709                 }
1710
1711                 if (o->object.flags & OBJECT_COMPRESSED)
1712                         printf("Flags: COMPRESSED\n");
1713
1714                 if (p == le64toh(f->header->tail_object_offset))
1715                         p = 0;
1716                 else
1717                         p = p + ALIGN64(le64toh(o->object.size));
1718         }
1719
1720         return;
1721 fail:
1722         log_error("File corrupt");
1723 }
1724
1725 int journal_file_open(
1726                 const char *fname,
1727                 int flags,
1728                 mode_t mode,
1729                 JournalFile *template,
1730                 JournalFile **ret) {
1731
1732         JournalFile *f;
1733         int r;
1734         bool newly_created = false;
1735
1736         assert(fname);
1737
1738         if ((flags & O_ACCMODE) != O_RDONLY &&
1739             (flags & O_ACCMODE) != O_RDWR)
1740                 return -EINVAL;
1741
1742         if (!endswith(fname, ".journal"))
1743                 return -EINVAL;
1744
1745         f = new0(JournalFile, 1);
1746         if (!f)
1747                 return -ENOMEM;
1748
1749         f->fd = -1;
1750         f->flags = flags;
1751         f->mode = mode;
1752         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1753         f->prot = prot_from_flags(flags);
1754
1755         if (template) {
1756                 f->metrics = template->metrics;
1757                 f->compress = template->compress;
1758         }
1759
1760         f->path = strdup(fname);
1761         if (!f->path) {
1762                 r = -ENOMEM;
1763                 goto fail;
1764         }
1765
1766         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1767         if (f->fd < 0) {
1768                 r = -errno;
1769                 goto fail;
1770         }
1771
1772         if (fstat(f->fd, &f->last_stat) < 0) {
1773                 r = -errno;
1774                 goto fail;
1775         }
1776
1777         if (f->last_stat.st_size == 0 && f->writable) {
1778                 newly_created = true;
1779
1780                 r = journal_file_init_header(f, template);
1781                 if (r < 0)
1782                         goto fail;
1783
1784                 if (fstat(f->fd, &f->last_stat) < 0) {
1785                         r = -errno;
1786                         goto fail;
1787                 }
1788         }
1789
1790         if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1791                 r = -EIO;
1792                 goto fail;
1793         }
1794
1795         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1796         if (f->header == MAP_FAILED) {
1797                 f->header = NULL;
1798                 r = -errno;
1799                 goto fail;
1800         }
1801
1802         if (!newly_created) {
1803                 r = journal_file_verify_header(f);
1804                 if (r < 0)
1805                         goto fail;
1806         }
1807
1808         if (f->writable) {
1809                 r = journal_file_refresh_header(f);
1810                 if (r < 0)
1811                         goto fail;
1812         }
1813
1814         if (newly_created) {
1815
1816                 r = journal_file_setup_field_hash_table(f);
1817                 if (r < 0)
1818                         goto fail;
1819
1820                 r = journal_file_setup_data_hash_table(f);
1821                 if (r < 0)
1822                         goto fail;
1823         }
1824
1825         r = journal_file_map_field_hash_table(f);
1826         if (r < 0)
1827                 goto fail;
1828
1829         r = journal_file_map_data_hash_table(f);
1830         if (r < 0)
1831                 goto fail;
1832
1833         if (ret)
1834                 *ret = f;
1835
1836         return 0;
1837
1838 fail:
1839         journal_file_close(f);
1840
1841         return r;
1842 }
1843
1844 int journal_file_rotate(JournalFile **f) {
1845         char *p;
1846         size_t l;
1847         JournalFile *old_file, *new_file = NULL;
1848         int r;
1849
1850         assert(f);
1851         assert(*f);
1852
1853         old_file = *f;
1854
1855         if (!old_file->writable)
1856                 return -EINVAL;
1857
1858         if (!endswith(old_file->path, ".journal"))
1859                 return -EINVAL;
1860
1861         l = strlen(old_file->path);
1862
1863         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
1864         if (!p)
1865                 return -ENOMEM;
1866
1867         memcpy(p, old_file->path, l - 8);
1868         p[l-8] = '@';
1869         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1870         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1871                  "-%016llx-%016llx.journal",
1872                  (unsigned long long) le64toh((*f)->header->seqnum),
1873                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1874
1875         r = rename(old_file->path, p);
1876         free(p);
1877
1878         if (r < 0)
1879                 return -errno;
1880
1881         old_file->header->state = STATE_ARCHIVED;
1882
1883         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1884         journal_file_close(old_file);
1885
1886         *f = new_file;
1887         return r;
1888 }
1889
1890 int journal_file_open_reliably(
1891                 const char *fname,
1892                 int flags,
1893                 mode_t mode,
1894                 JournalFile *template,
1895                 JournalFile **ret) {
1896
1897         int r;
1898         size_t l;
1899         char *p;
1900
1901         r = journal_file_open(fname, flags, mode, template, ret);
1902         if (r != -EBADMSG && /* corrupted */
1903             r != -ENODATA && /* truncated */
1904             r != -EHOSTDOWN && /* other machine */
1905             r != -EPROTONOSUPPORT) /* incompatible feature */
1906                 return r;
1907
1908         if ((flags & O_ACCMODE) == O_RDONLY)
1909                 return r;
1910
1911         if (!(flags & O_CREAT))
1912                 return r;
1913
1914         /* The file is corrupted. Rotate it away and try it again (but only once) */
1915
1916         l = strlen(fname);
1917         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
1918                      (int) (l-8), fname,
1919                      (unsigned long long) now(CLOCK_REALTIME),
1920                      random_ull()) < 0)
1921                 return -ENOMEM;
1922
1923         r = rename(fname, p);
1924         free(p);
1925         if (r < 0)
1926                 return -errno;
1927
1928         log_warning("File %s corrupted, renaming and replacing.", fname);
1929
1930         return journal_file_open(fname, flags, mode, template, ret);
1931 }
1932
1933 struct vacuum_info {
1934         off_t usage;
1935         char *filename;
1936
1937         uint64_t realtime;
1938         sd_id128_t seqnum_id;
1939         uint64_t seqnum;
1940
1941         bool have_seqnum;
1942 };
1943
1944 static int vacuum_compare(const void *_a, const void *_b) {
1945         const struct vacuum_info *a, *b;
1946
1947         a = _a;
1948         b = _b;
1949
1950         if (a->have_seqnum && b->have_seqnum &&
1951             sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1952                 if (a->seqnum < b->seqnum)
1953                         return -1;
1954                 else if (a->seqnum > b->seqnum)
1955                         return 1;
1956                 else
1957                         return 0;
1958         }
1959
1960         if (a->realtime < b->realtime)
1961                 return -1;
1962         else if (a->realtime > b->realtime)
1963                 return 1;
1964         else if (a->have_seqnum && b->have_seqnum)
1965                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1966         else
1967                 return strcmp(a->filename, b->filename);
1968 }
1969
1970 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1971         DIR *d;
1972         int r = 0;
1973         struct vacuum_info *list = NULL;
1974         unsigned n_list = 0, n_allocated = 0, i;
1975         uint64_t sum = 0;
1976
1977         assert(directory);
1978
1979         if (max_use <= 0)
1980                 return 0;
1981
1982         d = opendir(directory);
1983         if (!d)
1984                 return -errno;
1985
1986         for (;;) {
1987                 int k;
1988                 struct dirent buf, *de;
1989                 size_t q;
1990                 struct stat st;
1991                 char *p;
1992                 unsigned long long seqnum = 0, realtime;
1993                 sd_id128_t seqnum_id;
1994                 bool have_seqnum;
1995
1996                 k = readdir_r(d, &buf, &de);
1997                 if (k != 0) {
1998                         r = -k;
1999                         goto finish;
2000                 }
2001
2002                 if (!de)
2003                         break;
2004
2005                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2006                         continue;
2007
2008                 if (!S_ISREG(st.st_mode))
2009                         continue;
2010
2011                 q = strlen(de->d_name);
2012
2013                 if (endswith(de->d_name, ".journal")) {
2014
2015                         /* Vacuum archived files */
2016
2017                         if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2018                                 continue;
2019
2020                         if (de->d_name[q-8-16-1] != '-' ||
2021                             de->d_name[q-8-16-1-16-1] != '-' ||
2022                             de->d_name[q-8-16-1-16-1-32-1] != '@')
2023                                 continue;
2024
2025                         p = strdup(de->d_name);
2026                         if (!p) {
2027                                 r = -ENOMEM;
2028                                 goto finish;
2029                         }
2030
2031                         de->d_name[q-8-16-1-16-1] = 0;
2032                         if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2033                                 free(p);
2034                                 continue;
2035                         }
2036
2037                         if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2038                                 free(p);
2039                                 continue;
2040                         }
2041
2042                         have_seqnum = true;
2043
2044                 } else if (endswith(de->d_name, ".journal~")) {
2045                         unsigned long long tmp;
2046
2047                         /* Vacuum corrupted files */
2048
2049                         if (q < 1 + 16 + 1 + 16 + 8 + 1)
2050                                 continue;
2051
2052                         if (de->d_name[q-1-8-16-1] != '-' ||
2053                             de->d_name[q-1-8-16-1-16-1] != '@')
2054                                 continue;
2055
2056                         p = strdup(de->d_name);
2057                         if (!p) {
2058                                 r = -ENOMEM;
2059                                 goto finish;
2060                         }
2061
2062                         if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2063                                 free(p);
2064                                 continue;
2065                         }
2066
2067                         have_seqnum = false;
2068                 } else
2069                         continue;
2070
2071                 if (n_list >= n_allocated) {
2072                         struct vacuum_info *j;
2073
2074                         n_allocated = MAX(n_allocated * 2U, 8U);
2075                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2076                         if (!j) {
2077                                 free(p);
2078                                 r = -ENOMEM;
2079                                 goto finish;
2080                         }
2081
2082                         list = j;
2083                 }
2084
2085                 list[n_list].filename = p;
2086                 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2087                 list[n_list].seqnum = seqnum;
2088                 list[n_list].realtime = realtime;
2089                 list[n_list].seqnum_id = seqnum_id;
2090                 list[n_list].have_seqnum = have_seqnum;
2091
2092                 sum += list[n_list].usage;
2093
2094                 n_list ++;
2095         }
2096
2097         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2098
2099         for(i = 0; i < n_list; i++) {
2100                 struct statvfs ss;
2101
2102                 if (fstatvfs(dirfd(d), &ss) < 0) {
2103                         r = -errno;
2104                         goto finish;
2105                 }
2106
2107                 if (sum <= max_use &&
2108                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2109                         break;
2110
2111                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2112                         log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2113                         sum -= list[i].usage;
2114                 } else if (errno != ENOENT)
2115                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2116         }
2117
2118 finish:
2119         for (i = 0; i < n_list; i++)
2120                 free(list[i].filename);
2121
2122         free(list);
2123
2124         if (d)
2125                 closedir(d);
2126
2127         return r;
2128 }
2129
2130 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2131         uint64_t i, n;
2132         uint64_t q, xor_hash = 0;
2133         int r;
2134         EntryItem *items;
2135         dual_timestamp ts;
2136
2137         assert(from);
2138         assert(to);
2139         assert(o);
2140         assert(p);
2141
2142         if (!to->writable)
2143                 return -EPERM;
2144
2145         ts.monotonic = le64toh(o->entry.monotonic);
2146         ts.realtime = le64toh(o->entry.realtime);
2147
2148         if (to->tail_entry_monotonic_valid &&
2149             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2150                 return -EINVAL;
2151
2152         if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2153                 return -EINVAL;
2154
2155         n = journal_file_entry_n_items(o);
2156         items = alloca(sizeof(EntryItem) * n);
2157
2158         for (i = 0; i < n; i++) {
2159                 uint64_t l, h;
2160                 le64_t le_hash;
2161                 size_t t;
2162                 void *data;
2163                 Object *u;
2164
2165                 q = le64toh(o->entry.items[i].object_offset);
2166                 le_hash = o->entry.items[i].hash;
2167
2168                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2169                 if (r < 0)
2170                         return r;
2171
2172                 if (le_hash != o->data.hash)
2173                         return -EBADMSG;
2174
2175                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2176                 t = (size_t) l;
2177
2178                 /* We hit the limit on 32bit machines */
2179                 if ((uint64_t) t != l)
2180                         return -E2BIG;
2181
2182                 if (o->object.flags & OBJECT_COMPRESSED) {
2183 #ifdef HAVE_XZ
2184                         uint64_t rsize;
2185
2186                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2187                                 return -EBADMSG;
2188
2189                         data = from->compress_buffer;
2190                         l = rsize;
2191 #else
2192                         return -EPROTONOSUPPORT;
2193 #endif
2194                 } else
2195                         data = o->data.payload;
2196
2197                 r = journal_file_append_data(to, data, l, &u, &h);
2198                 if (r < 0)
2199                         return r;
2200
2201                 xor_hash ^= le64toh(u->data.hash);
2202                 items[i].object_offset = htole64(h);
2203                 items[i].hash = u->data.hash;
2204
2205                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2206                 if (r < 0)
2207                         return r;
2208         }
2209
2210         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2211 }
2212
2213 void journal_default_metrics(JournalMetrics *m, int fd) {
2214         uint64_t fs_size = 0;
2215         struct statvfs ss;
2216         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2217
2218         assert(m);
2219         assert(fd >= 0);
2220
2221         if (fstatvfs(fd, &ss) >= 0)
2222                 fs_size = ss.f_frsize * ss.f_blocks;
2223
2224         if (m->max_use == (uint64_t) -1) {
2225
2226                 if (fs_size > 0) {
2227                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2228
2229                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2230                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2231
2232                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2233                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2234                 } else
2235                         m->max_use = DEFAULT_MAX_USE_LOWER;
2236         } else {
2237                 m->max_use = PAGE_ALIGN(m->max_use);
2238
2239                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2240                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2241         }
2242
2243         if (m->max_size == (uint64_t) -1) {
2244                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2245
2246                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2247                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2248         } else
2249                 m->max_size = PAGE_ALIGN(m->max_size);
2250
2251         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2252                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2253
2254         if (m->max_size*2 > m->max_use)
2255                 m->max_use = m->max_size*2;
2256
2257         if (m->min_size == (uint64_t) -1)
2258                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2259         else {
2260                 m->min_size = PAGE_ALIGN(m->min_size);
2261
2262                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2263                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2264
2265                 if (m->min_size > m->max_size)
2266                         m->max_size = m->min_size;
2267         }
2268
2269         if (m->keep_free == (uint64_t) -1) {
2270
2271                 if (fs_size > 0) {
2272                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2273
2274                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2275                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2276
2277                 } else
2278                         m->keep_free = DEFAULT_KEEP_FREE;
2279         }
2280
2281         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2282                  format_bytes(a, sizeof(a), m->max_use),
2283                  format_bytes(b, sizeof(b), m->max_size),
2284                  format_bytes(c, sizeof(c), m->min_size),
2285                  format_bytes(d, sizeof(d), m->keep_free));
2286 }