chiark / gitweb /
journal: replace arena offset by header size
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
/* Size in bytes of the item area of the hash table objects that are
 * created for a fresh journal file */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL * 16ULL)
#define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL * 16ULL)

/* Requests smaller than this are widened to a mapping of this size */
#define DEFAULT_WINDOW_SIZE (8ULL * 1024ULL * 1024ULL)         /* 8 MiB */

/* Data payloads of at least this many bytes are candidates for
 * compression */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* The smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (64ULL * 1024ULL)                /* 64 KiB */

/* Lower and upper bounds applied when the max_use value is deduced
 * from the size of the file system */
#define DEFAULT_MAX_USE_LOWER (1ULL * 1024ULL * 1024ULL)            /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)  /* 4 GiB */

/* Upper bound applied when max_size is deduced from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL * 1024ULL * 1024ULL)    /* 128 MiB */

/* Upper bound applied when the keep_free value is deduced from the
 * size of the file system */
#define DEFAULT_KEEP_FREE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL) /* 4 GiB */

/* The keep_free value used when the size of the file system cannot
 * be determined */
#define DEFAULT_KEEP_FREE (1024ULL * 1024ULL)                  /* 1 MiB */

/* Magic bytes found at the very beginning of every journal file */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };

/* Rounds x up to the next multiple of 8 */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
64
65 void journal_file_close(JournalFile *f) {
66         int t;
67
68         assert(f);
69
70         if (f->header) {
71                 if (f->writable)
72                         f->header->state = STATE_OFFLINE;
73
74                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
75         }
76
77         for (t = 0; t < _WINDOW_MAX; t++)
78                 if (f->windows[t].ptr)
79                         munmap(f->windows[t].ptr, f->windows[t].size);
80
81         if (f->fd >= 0)
82                 close_nointr_nofail(f->fd);
83
84         free(f->path);
85
86 #ifdef HAVE_XZ
87         free(f->compress_buffer);
88 #endif
89
90         free(f);
91 }
92
93 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
94         Header h;
95         ssize_t k;
96         int r;
97
98         assert(f);
99
100         zero(h);
101         memcpy(h.signature, signature, 8);
102         h.header_size = htole64(ALIGN64(sizeof(h)));
103
104         r = sd_id128_randomize(&h.file_id);
105         if (r < 0)
106                 return r;
107
108         if (template) {
109                 h.seqnum_id = template->header->seqnum_id;
110                 h.seqnum = template->header->seqnum;
111         } else
112                 h.seqnum_id = h.file_id;
113
114         k = pwrite(f->fd, &h, sizeof(h), 0);
115         if (k < 0)
116                 return -errno;
117
118         if (k != sizeof(h))
119                 return -EIO;
120
121         return 0;
122 }
123
124 static int journal_file_refresh_header(JournalFile *f) {
125         int r;
126         sd_id128_t boot_id;
127
128         assert(f);
129
130         r = sd_id128_get_machine(&f->header->machine_id);
131         if (r < 0)
132                 return r;
133
134         r = sd_id128_get_boot(&boot_id);
135         if (r < 0)
136                 return r;
137
138         if (sd_id128_equal(boot_id, f->header->boot_id))
139                 f->tail_entry_monotonic_valid = true;
140
141         f->header->boot_id = boot_id;
142
143         f->header->state = STATE_ONLINE;
144
145         __sync_synchronize();
146
147         return 0;
148 }
149
150 static int journal_file_verify_header(JournalFile *f) {
151         assert(f);
152
153         if (memcmp(f->header, signature, 8))
154                 return -EBADMSG;
155
156 #ifdef HAVE_XZ
157         if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
158                 return -EPROTONOSUPPORT;
159 #else
160         if (f->header->incompatible_flags != 0)
161                 return -EPROTONOSUPPORT;
162 #endif
163
164         if (f->header->header_size != htole64(ALIGN64(sizeof(*(f->header)))))
165                 return -EBADMSG;
166
167         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
168                 return -ENODATA;
169
170         if (f->writable) {
171                 uint8_t state;
172                 sd_id128_t machine_id;
173                 int r;
174
175                 r = sd_id128_get_machine(&machine_id);
176                 if (r < 0)
177                         return r;
178
179                 if (!sd_id128_equal(machine_id, f->header->machine_id))
180                         return -EHOSTDOWN;
181
182                 state = f->header->state;
183
184                 if (state == STATE_ONLINE)
185                         log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
186                 else if (state == STATE_ARCHIVED)
187                         return -ESHUTDOWN;
188                 else if (state != STATE_OFFLINE)
189                         log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
190         }
191
192         return 0;
193 }
194
195 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
196         uint64_t old_size, new_size;
197         int r;
198
199         assert(f);
200
201         /* We assume that this file is not sparse, and we know that
202          * for sure, since we always call posix_fallocate()
203          * ourselves */
204
205         old_size =
206                 le64toh(f->header->header_size) +
207                 le64toh(f->header->arena_size);
208
209         new_size = PAGE_ALIGN(offset + size);
210         if (new_size < le64toh(f->header->header_size))
211                 new_size = le64toh(f->header->header_size);
212
213         if (new_size <= old_size)
214                 return 0;
215
216         if (f->metrics.max_size > 0 &&
217             new_size > f->metrics.max_size)
218                 return -E2BIG;
219
220         if (new_size > f->metrics.min_size &&
221             f->metrics.keep_free > 0) {
222                 struct statvfs svfs;
223
224                 if (fstatvfs(f->fd, &svfs) >= 0) {
225                         uint64_t available;
226
227                         available = svfs.f_bfree * svfs.f_bsize;
228
229                         if (available >= f->metrics.keep_free)
230                                 available -= f->metrics.keep_free;
231                         else
232                                 available = 0;
233
234                         if (new_size - old_size > available)
235                                 return -E2BIG;
236                 }
237         }
238
239         /* Note that the glibc fallocate() fallback is very
240            inefficient, hence we try to minimize the allocation area
241            as we can. */
242         r = posix_fallocate(f->fd, old_size, new_size - old_size);
243         if (r != 0)
244                 return -r;
245
246         if (fstat(f->fd, &f->last_stat) < 0)
247                 return -errno;
248
249         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
250
251         return 0;
252 }
253
254 static int journal_file_map(
255                 JournalFile *f,
256                 uint64_t offset,
257                 uint64_t size,
258                 void **_window,
259                 uint64_t *_woffset,
260                 uint64_t *_wsize,
261                 void **ret) {
262
263         uint64_t woffset, wsize;
264         void *window;
265
266         assert(f);
267         assert(size > 0);
268         assert(ret);
269
270         woffset = offset & ~((uint64_t) page_size() - 1ULL);
271         wsize = size + (offset - woffset);
272         wsize = PAGE_ALIGN(wsize);
273
274         /* Avoid SIGBUS on invalid accesses */
275         if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
276                 return -EADDRNOTAVAIL;
277
278         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
279         if (window == MAP_FAILED)
280                 return -errno;
281
282         if (_window)
283                 *_window = window;
284
285         if (_woffset)
286                 *_woffset = woffset;
287
288         if (_wsize)
289                 *_wsize = wsize;
290
291         *ret = (uint8_t*) window + (offset - woffset);
292
293         return 0;
294 }
295
296 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
297         void *p = NULL;
298         uint64_t delta;
299         int r;
300         Window *w;
301
302         assert(f);
303         assert(ret);
304         assert(wt >= 0);
305         assert(wt < _WINDOW_MAX);
306
307         if (offset + size > (uint64_t) f->last_stat.st_size) {
308                 /* Hmm, out of range? Let's refresh the fstat() data
309                  * first, before we trust that check. */
310
311                 if (fstat(f->fd, &f->last_stat) < 0 ||
312                     offset + size > (uint64_t) f->last_stat.st_size)
313                         return -EADDRNOTAVAIL;
314         }
315
316         w = f->windows + wt;
317
318         if (_likely_(w->ptr &&
319                      w->offset <= offset &&
320                      w->offset + w->size >= offset + size)) {
321
322                 *ret = (uint8_t*) w->ptr + (offset - w->offset);
323                 return 0;
324         }
325
326         if (w->ptr) {
327                 if (munmap(w->ptr, w->size) < 0)
328                         return -errno;
329
330                 w->ptr = NULL;
331                 w->size = w->offset = 0;
332         }
333
334         if (size < DEFAULT_WINDOW_SIZE) {
335                 /* If the default window size is larger then what was
336                  * asked for extend the mapping a bit in the hope to
337                  * minimize needed remappings later on. We add half
338                  * the window space before and half behind the
339                  * requested mapping */
340
341                 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
342
343                 if (delta > offset)
344                         delta = offset;
345
346                 offset -= delta;
347                 size = DEFAULT_WINDOW_SIZE;
348         } else
349                 delta = 0;
350
351         if (offset + size > (uint64_t) f->last_stat.st_size)
352                 size = (uint64_t) f->last_stat.st_size - offset;
353
354         if (size <= 0)
355                 return -EADDRNOTAVAIL;
356
357         r = journal_file_map(f,
358                              offset, size,
359                              &w->ptr, &w->offset, &w->size,
360                              &p);
361
362         if (r < 0)
363                 return r;
364
365         *ret = (uint8_t*) p + delta;
366         return 0;
367 }
368
369 static bool verify_hash(Object *o) {
370         uint64_t h1, h2;
371
372         assert(o);
373
374         if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
375                 h1 = le64toh(o->data.hash);
376                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
377         } else if (o->object.type == OBJECT_FIELD) {
378                 h1 = le64toh(o->field.hash);
379                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
380         } else
381                 return true;
382
383         return h1 == h2;
384 }
385
386 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
387         int r;
388         void *t;
389         Object *o;
390         uint64_t s;
391
392         assert(f);
393         assert(ret);
394         assert(type < _OBJECT_TYPE_MAX);
395
396         r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
397         if (r < 0)
398                 return r;
399
400         o = (Object*) t;
401         s = le64toh(o->object.size);
402
403         if (s < sizeof(ObjectHeader))
404                 return -EBADMSG;
405
406         if (type >= 0 && o->object.type != type)
407                 return -EBADMSG;
408
409         if (s > sizeof(ObjectHeader)) {
410                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
411                 if (r < 0)
412                         return r;
413
414                 o = (Object*) t;
415         }
416
417         if (!verify_hash(o))
418                 return -EBADMSG;
419
420         *ret = o;
421         return 0;
422 }
423
424 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
425         uint64_t r;
426
427         assert(f);
428
429         r = le64toh(f->header->seqnum) + 1;
430
431         if (seqnum) {
432                 /* If an external seqnum counter was passed, we update
433                  * both the local and the external one, and set it to
434                  * the maximum of both */
435
436                 if (*seqnum + 1 > r)
437                         r = *seqnum + 1;
438
439                 *seqnum = r;
440         }
441
442         f->header->seqnum = htole64(r);
443
444         if (f->header->first_seqnum == 0)
445                 f->header->first_seqnum = htole64(r);
446
447         return r;
448 }
449
450 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
451         int r;
452         uint64_t p;
453         Object *tail, *o;
454         void *t;
455
456         assert(f);
457         assert(size >= sizeof(ObjectHeader));
458         assert(offset);
459         assert(ret);
460
461         p = le64toh(f->header->tail_object_offset);
462         if (p == 0)
463                 p = le64toh(f->header->header_size);
464         else {
465                 r = journal_file_move_to_object(f, -1, p, &tail);
466                 if (r < 0)
467                         return r;
468
469                 p += ALIGN64(le64toh(tail->object.size));
470         }
471
472         r = journal_file_allocate(f, p, size);
473         if (r < 0)
474                 return r;
475
476         r = journal_file_move_to(f, type, p, size, &t);
477         if (r < 0)
478                 return r;
479
480         o = (Object*) t;
481
482         zero(o->object);
483         o->object.type = type;
484         o->object.size = htole64(size);
485
486         f->header->tail_object_offset = htole64(p);
487         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
488
489         *ret = o;
490         *offset = p;
491
492         return 0;
493 }
494
495 static int journal_file_setup_data_hash_table(JournalFile *f) {
496         uint64_t s, p;
497         Object *o;
498         int r;
499
500         assert(f);
501
502         s = DEFAULT_DATA_HASH_TABLE_SIZE;
503         r = journal_file_append_object(f,
504                                        OBJECT_DATA_HASH_TABLE,
505                                        offsetof(Object, hash_table.items) + s,
506                                        &o, &p);
507         if (r < 0)
508                 return r;
509
510         memset(o->hash_table.items, 0, s);
511
512         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
513         f->header->data_hash_table_size = htole64(s);
514
515         return 0;
516 }
517
518 static int journal_file_setup_field_hash_table(JournalFile *f) {
519         uint64_t s, p;
520         Object *o;
521         int r;
522
523         assert(f);
524
525         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
526         r = journal_file_append_object(f,
527                                        OBJECT_FIELD_HASH_TABLE,
528                                        offsetof(Object, hash_table.items) + s,
529                                        &o, &p);
530         if (r < 0)
531                 return r;
532
533         memset(o->hash_table.items, 0, s);
534
535         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
536         f->header->field_hash_table_size = htole64(s);
537
538         return 0;
539 }
540
541 static int journal_file_map_data_hash_table(JournalFile *f) {
542         uint64_t s, p;
543         void *t;
544         int r;
545
546         assert(f);
547
548         p = le64toh(f->header->data_hash_table_offset);
549         s = le64toh(f->header->data_hash_table_size);
550
551         r = journal_file_move_to(f,
552                                  WINDOW_DATA_HASH_TABLE,
553                                  p, s,
554                                  &t);
555         if (r < 0)
556                 return r;
557
558         f->data_hash_table = t;
559         return 0;
560 }
561
562 static int journal_file_map_field_hash_table(JournalFile *f) {
563         uint64_t s, p;
564         void *t;
565         int r;
566
567         assert(f);
568
569         p = le64toh(f->header->field_hash_table_offset);
570         s = le64toh(f->header->field_hash_table_size);
571
572         r = journal_file_move_to(f,
573                                  WINDOW_FIELD_HASH_TABLE,
574                                  p, s,
575                                  &t);
576         if (r < 0)
577                 return r;
578
579         f->field_hash_table = t;
580         return 0;
581 }
582
583 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
584         uint64_t p, h;
585         int r;
586
587         assert(f);
588         assert(o);
589         assert(offset > 0);
590         assert(o->object.type == OBJECT_DATA);
591
592         /* This might alter the window we are looking at */
593
594         o->data.next_hash_offset = o->data.next_field_offset = 0;
595         o->data.entry_offset = o->data.entry_array_offset = 0;
596         o->data.n_entries = 0;
597
598         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
599         p = le64toh(f->data_hash_table[h].head_hash_offset);
600         if (p == 0) {
601                 /* Only entry in the hash table is easy */
602                 f->data_hash_table[h].head_hash_offset = htole64(offset);
603         } else {
604                 /* Move back to the previous data object, to patch in
605                  * pointer */
606
607                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
608                 if (r < 0)
609                         return r;
610
611                 o->data.next_hash_offset = htole64(offset);
612         }
613
614         f->data_hash_table[h].tail_hash_offset = htole64(offset);
615
616         return 0;
617 }
618
619 int journal_file_find_data_object_with_hash(
620                 JournalFile *f,
621                 const void *data, uint64_t size, uint64_t hash,
622                 Object **ret, uint64_t *offset) {
623
624         uint64_t p, osize, h;
625         int r;
626
627         assert(f);
628         assert(data || size == 0);
629
630         osize = offsetof(Object, data.payload) + size;
631
632         if (f->header->data_hash_table_size == 0)
633                 return -EBADMSG;
634
635         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
636         p = le64toh(f->data_hash_table[h].head_hash_offset);
637
638         while (p > 0) {
639                 Object *o;
640
641                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
642                 if (r < 0)
643                         return r;
644
645                 if (le64toh(o->data.hash) != hash)
646                         goto next;
647
648                 if (o->object.flags & OBJECT_COMPRESSED) {
649 #ifdef HAVE_XZ
650                         uint64_t l, rsize;
651
652                         l = le64toh(o->object.size);
653                         if (l <= offsetof(Object, data.payload))
654                                 return -EBADMSG;
655
656                         l -= offsetof(Object, data.payload);
657
658                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
659                                 return -EBADMSG;
660
661                         if (rsize == size &&
662                             memcmp(f->compress_buffer, data, size) == 0) {
663
664                                 if (ret)
665                                         *ret = o;
666
667                                 if (offset)
668                                         *offset = p;
669
670                                 return 1;
671                         }
672 #else
673                         return -EPROTONOSUPPORT;
674 #endif
675
676                 } else if (le64toh(o->object.size) == osize &&
677                            memcmp(o->data.payload, data, size) == 0) {
678
679                         if (ret)
680                                 *ret = o;
681
682                         if (offset)
683                                 *offset = p;
684
685                         return 1;
686                 }
687
688         next:
689                 p = le64toh(o->data.next_hash_offset);
690         }
691
692         return 0;
693 }
694
695 int journal_file_find_data_object(
696                 JournalFile *f,
697                 const void *data, uint64_t size,
698                 Object **ret, uint64_t *offset) {
699
700         uint64_t hash;
701
702         assert(f);
703         assert(data || size == 0);
704
705         hash = hash64(data, size);
706
707         return journal_file_find_data_object_with_hash(f,
708                                                        data, size, hash,
709                                                        ret, offset);
710 }
711
712 static int journal_file_append_data(
713                 JournalFile *f,
714                 const void *data, uint64_t size,
715                 Object **ret, uint64_t *offset) {
716
717         uint64_t hash, p;
718         uint64_t osize;
719         Object *o;
720         int r;
721         bool compressed = false;
722
723         assert(f);
724         assert(data || size == 0);
725
726         hash = hash64(data, size);
727
728         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
729         if (r < 0)
730                 return r;
731         else if (r > 0) {
732
733                 if (ret)
734                         *ret = o;
735
736                 if (offset)
737                         *offset = p;
738
739                 return 0;
740         }
741
742         osize = offsetof(Object, data.payload) + size;
743         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
744         if (r < 0)
745                 return r;
746
747         o->data.hash = htole64(hash);
748
749 #ifdef HAVE_XZ
750         if (f->compress &&
751             size >= COMPRESSION_SIZE_THRESHOLD) {
752                 uint64_t rsize;
753
754                 compressed = compress_blob(data, size, o->data.payload, &rsize);
755
756                 if (compressed) {
757                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
758                         o->object.flags |= OBJECT_COMPRESSED;
759
760                         f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
761
762                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
763                 }
764         }
765 #endif
766
767         if (!compressed)
768                 memcpy(o->data.payload, data, size);
769
770         r = journal_file_link_data(f, o, p, hash);
771         if (r < 0)
772                 return r;
773
774         /* The linking might have altered the window, so let's
775          * refresh our pointer */
776         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
777         if (r < 0)
778                 return r;
779
780         if (ret)
781                 *ret = o;
782
783         if (offset)
784                 *offset = p;
785
786         return 0;
787 }
788
789 uint64_t journal_file_entry_n_items(Object *o) {
790         assert(o);
791         assert(o->object.type == OBJECT_ENTRY);
792
793         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
794 }
795
796 static uint64_t journal_file_entry_array_n_items(Object *o) {
797         assert(o);
798         assert(o->object.type == OBJECT_ENTRY_ARRAY);
799
800         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
801 }
802
803 static int link_entry_into_array(JournalFile *f,
804                                  le64_t *first,
805                                  le64_t *idx,
806                                  uint64_t p) {
807         int r;
808         uint64_t n = 0, ap = 0, q, i, a, hidx;
809         Object *o;
810
811         assert(f);
812         assert(first);
813         assert(idx);
814         assert(p > 0);
815
816         a = le64toh(*first);
817         i = hidx = le64toh(*idx);
818         while (a > 0) {
819
820                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
821                 if (r < 0)
822                         return r;
823
824                 n = journal_file_entry_array_n_items(o);
825                 if (i < n) {
826                         o->entry_array.items[i] = htole64(p);
827                         *idx = htole64(hidx + 1);
828                         return 0;
829                 }
830
831                 i -= n;
832                 ap = a;
833                 a = le64toh(o->entry_array.next_entry_array_offset);
834         }
835
836         if (hidx > n)
837                 n = (hidx+1) * 2;
838         else
839                 n = n * 2;
840
841         if (n < 4)
842                 n = 4;
843
844         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
845                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
846                                        &o, &q);
847         if (r < 0)
848                 return r;
849
850         o->entry_array.items[i] = htole64(p);
851
852         if (ap == 0)
853                 *first = htole64(q);
854         else {
855                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
856                 if (r < 0)
857                         return r;
858
859                 o->entry_array.next_entry_array_offset = htole64(q);
860         }
861
862         *idx = htole64(hidx + 1);
863
864         return 0;
865 }
866
867 static int link_entry_into_array_plus_one(JournalFile *f,
868                                           le64_t *extra,
869                                           le64_t *first,
870                                           le64_t *idx,
871                                           uint64_t p) {
872
873         int r;
874
875         assert(f);
876         assert(extra);
877         assert(first);
878         assert(idx);
879         assert(p > 0);
880
881         if (*idx == 0)
882                 *extra = htole64(p);
883         else {
884                 le64_t i;
885
886                 i = htole64(le64toh(*idx) - 1);
887                 r = link_entry_into_array(f, first, &i, p);
888                 if (r < 0)
889                         return r;
890         }
891
892         *idx = htole64(le64toh(*idx) + 1);
893         return 0;
894 }
895
896 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
897         uint64_t p;
898         int r;
899         assert(f);
900         assert(o);
901         assert(offset > 0);
902
903         p = le64toh(o->entry.items[i].object_offset);
904         if (p == 0)
905                 return -EINVAL;
906
907         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
908         if (r < 0)
909                 return r;
910
911         return link_entry_into_array_plus_one(f,
912                                               &o->data.entry_offset,
913                                               &o->data.entry_array_offset,
914                                               &o->data.n_entries,
915                                               offset);
916 }
917
918 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
919         uint64_t n, i;
920         int r;
921
922         assert(f);
923         assert(o);
924         assert(offset > 0);
925         assert(o->object.type == OBJECT_ENTRY);
926
927         __sync_synchronize();
928
929         /* Link up the entry itself */
930         r = link_entry_into_array(f,
931                                   &f->header->entry_array_offset,
932                                   &f->header->n_entries,
933                                   offset);
934         if (r < 0)
935                 return r;
936
937         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
938
939         if (f->header->head_entry_realtime == 0)
940                 f->header->head_entry_realtime = o->entry.realtime;
941
942         f->header->tail_entry_realtime = o->entry.realtime;
943         f->header->tail_entry_monotonic = o->entry.monotonic;
944
945         f->tail_entry_monotonic_valid = true;
946
947         /* Link up the items */
948         n = journal_file_entry_n_items(o);
949         for (i = 0; i < n; i++) {
950                 r = journal_file_link_entry_item(f, o, offset, i);
951                 if (r < 0)
952                         return r;
953         }
954
955         return 0;
956 }
957
958 static int journal_file_append_entry_internal(
959                 JournalFile *f,
960                 const dual_timestamp *ts,
961                 uint64_t xor_hash,
962                 const EntryItem items[], unsigned n_items,
963                 uint64_t *seqnum,
964                 Object **ret, uint64_t *offset) {
965         uint64_t np;
966         uint64_t osize;
967         Object *o;
968         int r;
969
970         assert(f);
971         assert(items || n_items == 0);
972         assert(ts);
973
974         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
975
976         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
977         if (r < 0)
978                 return r;
979
980         o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
981         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
982         o->entry.realtime = htole64(ts->realtime);
983         o->entry.monotonic = htole64(ts->monotonic);
984         o->entry.xor_hash = htole64(xor_hash);
985         o->entry.boot_id = f->header->boot_id;
986
987         r = journal_file_link_entry(f, o, np);
988         if (r < 0)
989                 return r;
990
991         if (ret)
992                 *ret = o;
993
994         if (offset)
995                 *offset = np;
996
997         return 0;
998 }
999
1000 void journal_file_post_change(JournalFile *f) {
1001         assert(f);
1002
1003         /* inotify() does not receive IN_MODIFY events from file
1004          * accesses done via mmap(). After each access we hence
1005          * trigger IN_MODIFY by truncating the journal file to its
1006          * current size which triggers IN_MODIFY. */
1007
1008         __sync_synchronize();
1009
1010         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1011                 log_error("Failed to to truncate file to its own size: %m");
1012 }
1013
1014 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1015         unsigned i;
1016         EntryItem *items;
1017         int r;
1018         uint64_t xor_hash = 0;
1019         struct dual_timestamp _ts;
1020
1021         assert(f);
1022         assert(iovec || n_iovec == 0);
1023
1024         if (!f->writable)
1025                 return -EPERM;
1026
1027         if (!ts) {
1028                 dual_timestamp_get(&_ts);
1029                 ts = &_ts;
1030         }
1031
1032         if (f->tail_entry_monotonic_valid &&
1033             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1034                 return -EINVAL;
1035
1036         items = alloca(sizeof(EntryItem) * n_iovec);
1037
1038         for (i = 0; i < n_iovec; i++) {
1039                 uint64_t p;
1040                 Object *o;
1041
1042                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1043                 if (r < 0)
1044                         return r;
1045
1046                 xor_hash ^= le64toh(o->data.hash);
1047                 items[i].object_offset = htole64(p);
1048                 items[i].hash = o->data.hash;
1049         }
1050
1051         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1052
1053         journal_file_post_change(f);
1054
1055         return r;
1056 }
1057
1058 static int generic_array_get(JournalFile *f,
1059                              uint64_t first,
1060                              uint64_t i,
1061                              Object **ret, uint64_t *offset) {
1062
1063         Object *o;
1064         uint64_t p = 0, a;
1065         int r;
1066
1067         assert(f);
1068
1069         a = first;
1070         while (a > 0) {
1071                 uint64_t n;
1072
1073                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1074                 if (r < 0)
1075                         return r;
1076
1077                 n = journal_file_entry_array_n_items(o);
1078                 if (i < n) {
1079                         p = le64toh(o->entry_array.items[i]);
1080                         break;
1081                 }
1082
1083                 i -= n;
1084                 a = le64toh(o->entry_array.next_entry_array_offset);
1085         }
1086
1087         if (a <= 0 || p <= 0)
1088                 return 0;
1089
1090         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1091         if (r < 0)
1092                 return r;
1093
1094         if (ret)
1095                 *ret = o;
1096
1097         if (offset)
1098                 *offset = p;
1099
1100         return 1;
1101 }
1102
1103 static int generic_array_get_plus_one(JournalFile *f,
1104                                       uint64_t extra,
1105                                       uint64_t first,
1106                                       uint64_t i,
1107                                       Object **ret, uint64_t *offset) {
1108
1109         Object *o;
1110
1111         assert(f);
1112
1113         if (i == 0) {
1114                 int r;
1115
1116                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1117                 if (r < 0)
1118                         return r;
1119
1120                 if (ret)
1121                         *ret = o;
1122
1123                 if (offset)
1124                         *offset = extra;
1125
1126                 return 1;
1127         }
1128
1129         return generic_array_get(f, first, i-1, ret, offset);
1130 }
1131
/* Verdicts returned by the test_object_*() callbacks that drive the
 * bisection helpers. */
enum {
        TEST_FOUND = 0, /* the tested object's value equals the needle */
        TEST_LEFT  = 1, /* the tested object's value is smaller than the needle */
        TEST_RIGHT = 2  /* the tested object's value is larger than the needle */
};
1137
1138 static int generic_array_bisect(JournalFile *f,
1139                                 uint64_t first,
1140                                 uint64_t n,
1141                                 uint64_t needle,
1142                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1143                                 direction_t direction,
1144                                 Object **ret,
1145                                 uint64_t *offset,
1146                                 uint64_t *idx) {
1147
1148         uint64_t a, p, t = 0, i = 0, last_p = 0;
1149         bool subtract_one = false;
1150         Object *o, *array = NULL;
1151         int r;
1152
1153         assert(f);
1154         assert(test_object);
1155
1156         a = first;
1157         while (a > 0) {
1158                 uint64_t left, right, k, lp;
1159
1160                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1161                 if (r < 0)
1162                         return r;
1163
1164                 k = journal_file_entry_array_n_items(array);
1165                 right = MIN(k, n);
1166                 if (right <= 0)
1167                         return 0;
1168
1169                 i = right - 1;
1170                 lp = p = le64toh(array->entry_array.items[i]);
1171                 if (p <= 0)
1172                         return -EBADMSG;
1173
1174                 r = test_object(f, p, needle);
1175                 if (r < 0)
1176                         return r;
1177
1178                 if (r == TEST_FOUND)
1179                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1180
1181                 if (r == TEST_RIGHT) {
1182                         left = 0;
1183                         right -= 1;
1184                         for (;;) {
1185                                 if (left == right) {
1186                                         if (direction == DIRECTION_UP)
1187                                                 subtract_one = true;
1188
1189                                         i = left;
1190                                         goto found;
1191                                 }
1192
1193                                 assert(left < right);
1194
1195                                 i = (left + right) / 2;
1196                                 p = le64toh(array->entry_array.items[i]);
1197                                 if (p <= 0)
1198                                         return -EBADMSG;
1199
1200                                 r = test_object(f, p, needle);
1201                                 if (r < 0)
1202                                         return r;
1203
1204                                 if (r == TEST_FOUND)
1205                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1206
1207                                 if (r == TEST_RIGHT)
1208                                         right = i;
1209                                 else
1210                                         left = i + 1;
1211                         }
1212                 }
1213
1214                 if (k > n)
1215                         return 0;
1216
1217                 last_p = lp;
1218
1219                 n -= k;
1220                 t += k;
1221                 a = le64toh(array->entry_array.next_entry_array_offset);
1222         }
1223
1224         return 0;
1225
1226 found:
1227         if (subtract_one && t == 0 && i == 0)
1228                 return 0;
1229
1230         if (subtract_one && i == 0)
1231                 p = last_p;
1232         else if (subtract_one)
1233                 p = le64toh(array->entry_array.items[i-1]);
1234         else
1235                 p = le64toh(array->entry_array.items[i]);
1236
1237         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1238         if (r < 0)
1239                 return r;
1240
1241         if (ret)
1242                 *ret = o;
1243
1244         if (offset)
1245                 *offset = p;
1246
1247         if (idx)
1248                 *idx = t + i - (subtract_one ? 1 : 0);
1249
1250         return 1;
1251 }
1252
1253 static int generic_array_bisect_plus_one(JournalFile *f,
1254                                          uint64_t extra,
1255                                          uint64_t first,
1256                                          uint64_t n,
1257                                          uint64_t needle,
1258                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1259                                          direction_t direction,
1260                                          Object **ret,
1261                                          uint64_t *offset,
1262                                          uint64_t *idx) {
1263
1264         int r;
1265
1266         assert(f);
1267         assert(test_object);
1268
1269         if (n <= 0)
1270                 return 0;
1271
1272         /* This bisects the array in object 'first', but first checks
1273          * an extra  */
1274         r = test_object(f, extra, needle);
1275         if (r < 0)
1276                 return r;
1277         else if (r == TEST_FOUND) {
1278                 Object *o;
1279
1280                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1281                 if (r < 0)
1282                         return r;
1283
1284                 if (ret)
1285                         *ret = o;
1286
1287                 if (offset)
1288                         *offset = extra;
1289
1290                 if (idx)
1291                         *idx = 0;
1292
1293                 return 1;
1294         } else if (r == TEST_RIGHT)
1295                 return 0;
1296
1297         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1298
1299         if (r > 0)
1300                 (*idx) ++;
1301
1302         return r;
1303 }
1304
1305 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1306         Object *o;
1307         int r;
1308
1309         assert(f);
1310         assert(p > 0);
1311
1312         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1313         if (r < 0)
1314                 return r;
1315
1316         if (le64toh(o->entry.seqnum) == needle)
1317                 return TEST_FOUND;
1318         else if (le64toh(o->entry.seqnum) < needle)
1319                 return TEST_LEFT;
1320         else
1321                 return TEST_RIGHT;
1322 }
1323
1324 int journal_file_move_to_entry_by_seqnum(
1325                 JournalFile *f,
1326                 uint64_t seqnum,
1327                 direction_t direction,
1328                 Object **ret,
1329                 uint64_t *offset) {
1330
1331         return generic_array_bisect(f,
1332                                     le64toh(f->header->entry_array_offset),
1333                                     le64toh(f->header->n_entries),
1334                                     seqnum,
1335                                     test_object_seqnum,
1336                                     direction,
1337                                     ret, offset, NULL);
1338 }
1339
1340 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1341         Object *o;
1342         int r;
1343
1344         assert(f);
1345         assert(p > 0);
1346
1347         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1348         if (r < 0)
1349                 return r;
1350
1351         if (le64toh(o->entry.realtime) == needle)
1352                 return TEST_FOUND;
1353         else if (le64toh(o->entry.realtime) < needle)
1354                 return TEST_LEFT;
1355         else
1356                 return TEST_RIGHT;
1357 }
1358
1359 int journal_file_move_to_entry_by_realtime(
1360                 JournalFile *f,
1361                 uint64_t realtime,
1362                 direction_t direction,
1363                 Object **ret,
1364                 uint64_t *offset) {
1365
1366         return generic_array_bisect(f,
1367                                     le64toh(f->header->entry_array_offset),
1368                                     le64toh(f->header->n_entries),
1369                                     realtime,
1370                                     test_object_realtime,
1371                                     direction,
1372                                     ret, offset, NULL);
1373 }
1374
1375 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1376         Object *o;
1377         int r;
1378
1379         assert(f);
1380         assert(p > 0);
1381
1382         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1383         if (r < 0)
1384                 return r;
1385
1386         if (le64toh(o->entry.monotonic) == needle)
1387                 return TEST_FOUND;
1388         else if (le64toh(o->entry.monotonic) < needle)
1389                 return TEST_LEFT;
1390         else
1391                 return TEST_RIGHT;
1392 }
1393
1394 int journal_file_move_to_entry_by_monotonic(
1395                 JournalFile *f,
1396                 sd_id128_t boot_id,
1397                 uint64_t monotonic,
1398                 direction_t direction,
1399                 Object **ret,
1400                 uint64_t *offset) {
1401
1402         char t[8+32+1] = "_BOOT_ID=";
1403         Object *o;
1404         int r;
1405
1406         sd_id128_to_string(boot_id, t + 8);
1407
1408         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1409         if (r < 0)
1410                 return r;
1411         else if (r == 0)
1412                 return -ENOENT;
1413
1414         return generic_array_bisect_plus_one(f,
1415                                              le64toh(o->data.entry_offset),
1416                                              le64toh(o->data.entry_array_offset),
1417                                              le64toh(o->data.n_entries),
1418                                              monotonic,
1419                                              test_object_monotonic,
1420                                              direction,
1421                                              ret, offset, NULL);
1422 }
1423
1424 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1425         assert(f);
1426         assert(p > 0);
1427
1428         if (p == needle)
1429                 return TEST_FOUND;
1430         else if (p < needle)
1431                 return TEST_LEFT;
1432         else
1433                 return TEST_RIGHT;
1434 }
1435
1436 int journal_file_next_entry(
1437                 JournalFile *f,
1438                 Object *o, uint64_t p,
1439                 direction_t direction,
1440                 Object **ret, uint64_t *offset) {
1441
1442         uint64_t i, n;
1443         int r;
1444
1445         assert(f);
1446         assert(p > 0 || !o);
1447
1448         n = le64toh(f->header->n_entries);
1449         if (n <= 0)
1450                 return 0;
1451
1452         if (!o)
1453                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1454         else {
1455                 if (o->object.type != OBJECT_ENTRY)
1456                         return -EINVAL;
1457
1458                 r = generic_array_bisect(f,
1459                                          le64toh(f->header->entry_array_offset),
1460                                          le64toh(f->header->n_entries),
1461                                          p,
1462                                          test_object_offset,
1463                                          DIRECTION_DOWN,
1464                                          NULL, NULL,
1465                                          &i);
1466                 if (r <= 0)
1467                         return r;
1468
1469                 if (direction == DIRECTION_DOWN) {
1470                         if (i >= n - 1)
1471                                 return 0;
1472
1473                         i++;
1474                 } else {
1475                         if (i <= 0)
1476                                 return 0;
1477
1478                         i--;
1479                 }
1480         }
1481
1482         /* And jump to it */
1483         return generic_array_get(f,
1484                                  le64toh(f->header->entry_array_offset),
1485                                  i,
1486                                  ret, offset);
1487 }
1488
1489 int journal_file_skip_entry(
1490                 JournalFile *f,
1491                 Object *o, uint64_t p,
1492                 int64_t skip,
1493                 Object **ret, uint64_t *offset) {
1494
1495         uint64_t i, n;
1496         int r;
1497
1498         assert(f);
1499         assert(o);
1500         assert(p > 0);
1501
1502         if (o->object.type != OBJECT_ENTRY)
1503                 return -EINVAL;
1504
1505         r = generic_array_bisect(f,
1506                                  le64toh(f->header->entry_array_offset),
1507                                  le64toh(f->header->n_entries),
1508                                  p,
1509                                  test_object_offset,
1510                                  DIRECTION_DOWN,
1511                                  NULL, NULL,
1512                                  &i);
1513         if (r <= 0)
1514                 return r;
1515
1516         /* Calculate new index */
1517         if (skip < 0) {
1518                 if ((uint64_t) -skip >= i)
1519                         i = 0;
1520                 else
1521                         i = i - (uint64_t) -skip;
1522         } else
1523                 i  += (uint64_t) skip;
1524
1525         n = le64toh(f->header->n_entries);
1526         if (n <= 0)
1527                 return -EBADMSG;
1528
1529         if (i >= n)
1530                 i = n-1;
1531
1532         return generic_array_get(f,
1533                                  le64toh(f->header->entry_array_offset),
1534                                  i,
1535                                  ret, offset);
1536 }
1537
1538 int journal_file_next_entry_for_data(
1539                 JournalFile *f,
1540                 Object *o, uint64_t p,
1541                 uint64_t data_offset,
1542                 direction_t direction,
1543                 Object **ret, uint64_t *offset) {
1544
1545         uint64_t n, i;
1546         int r;
1547         Object *d;
1548
1549         assert(f);
1550         assert(p > 0 || !o);
1551
1552         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1553         if (r < 0)
1554                 return r;
1555
1556         n = le64toh(d->data.n_entries);
1557         if (n <= 0)
1558                 return n;
1559
1560         if (!o)
1561                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1562         else {
1563                 if (o->object.type != OBJECT_ENTRY)
1564                         return -EINVAL;
1565
1566                 r = generic_array_bisect_plus_one(f,
1567                                                   le64toh(d->data.entry_offset),
1568                                                   le64toh(d->data.entry_array_offset),
1569                                                   le64toh(d->data.n_entries),
1570                                                   p,
1571                                                   test_object_offset,
1572                                                   DIRECTION_DOWN,
1573                                                   NULL, NULL,
1574                                                   &i);
1575
1576                 if (r <= 0)
1577                         return r;
1578
1579                 if (direction == DIRECTION_DOWN) {
1580                         if (i >= n - 1)
1581                                 return 0;
1582
1583                         i++;
1584                 } else {
1585                         if (i <= 0)
1586                                 return 0;
1587
1588                         i--;
1589                 }
1590
1591         }
1592
1593         return generic_array_get_plus_one(f,
1594                                           le64toh(d->data.entry_offset),
1595                                           le64toh(d->data.entry_array_offset),
1596                                           i,
1597                                           ret, offset);
1598 }
1599
1600 int journal_file_move_to_entry_by_seqnum_for_data(
1601                 JournalFile *f,
1602                 uint64_t data_offset,
1603                 uint64_t seqnum,
1604                 direction_t direction,
1605                 Object **ret, uint64_t *offset) {
1606
1607         Object *d;
1608         int r;
1609
1610         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1611         if (r <= 0)
1612                 return r;
1613
1614         return generic_array_bisect_plus_one(f,
1615                                              le64toh(d->data.entry_offset),
1616                                              le64toh(d->data.entry_array_offset),
1617                                              le64toh(d->data.n_entries),
1618                                              seqnum,
1619                                              test_object_seqnum,
1620                                              direction,
1621                                              ret, offset, NULL);
1622 }
1623
1624 int journal_file_move_to_entry_by_realtime_for_data(
1625                 JournalFile *f,
1626                 uint64_t data_offset,
1627                 uint64_t realtime,
1628                 direction_t direction,
1629                 Object **ret, uint64_t *offset) {
1630
1631         Object *d;
1632         int r;
1633
1634         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1635         if (r <= 0)
1636                 return r;
1637
1638         return generic_array_bisect_plus_one(f,
1639                                              le64toh(d->data.entry_offset),
1640                                              le64toh(d->data.entry_array_offset),
1641                                              le64toh(d->data.n_entries),
1642                                              realtime,
1643                                              test_object_realtime,
1644                                              direction,
1645                                              ret, offset, NULL);
1646 }
1647
1648 void journal_file_dump(JournalFile *f) {
1649         char a[33], b[33], c[33];
1650         Object *o;
1651         int r;
1652         uint64_t p;
1653
1654         assert(f);
1655
1656         printf("File Path: %s\n"
1657                "File ID: %s\n"
1658                "Machine ID: %s\n"
1659                "Boot ID: %s\n"
1660                "Arena size: %llu\n"
1661                "Objects: %lu\n"
1662                "Entries: %lu\n",
1663                f->path,
1664                sd_id128_to_string(f->header->file_id, a),
1665                sd_id128_to_string(f->header->machine_id, b),
1666                sd_id128_to_string(f->header->boot_id, c),
1667                (unsigned long long) le64toh(f->header->arena_size),
1668                (unsigned long) le64toh(f->header->n_objects),
1669                (unsigned long) le64toh(f->header->n_entries));
1670
1671         p = le64toh(f->header->header_size);
1672         while (p != 0) {
1673                 r = journal_file_move_to_object(f, -1, p, &o);
1674                 if (r < 0)
1675                         goto fail;
1676
1677                 switch (o->object.type) {
1678
1679                 case OBJECT_UNUSED:
1680                         printf("Type: OBJECT_UNUSED\n");
1681                         break;
1682
1683                 case OBJECT_DATA:
1684                         printf("Type: OBJECT_DATA\n");
1685                         break;
1686
1687                 case OBJECT_ENTRY:
1688                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1689                                (unsigned long long) le64toh(o->entry.seqnum),
1690                                (unsigned long long) le64toh(o->entry.monotonic),
1691                                (unsigned long long) le64toh(o->entry.realtime));
1692                         break;
1693
1694                 case OBJECT_FIELD_HASH_TABLE:
1695                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1696                         break;
1697
1698                 case OBJECT_DATA_HASH_TABLE:
1699                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1700                         break;
1701
1702                 case OBJECT_ENTRY_ARRAY:
1703                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1704                         break;
1705                 }
1706
1707                 if (o->object.flags & OBJECT_COMPRESSED)
1708                         printf("Flags: COMPRESSED\n");
1709
1710                 if (p == le64toh(f->header->tail_object_offset))
1711                         p = 0;
1712                 else
1713                         p = p + ALIGN64(le64toh(o->object.size));
1714         }
1715
1716         return;
1717 fail:
1718         log_error("File corrupt");
1719 }
1720
1721 int journal_file_open(
1722                 const char *fname,
1723                 int flags,
1724                 mode_t mode,
1725                 JournalFile *template,
1726                 JournalFile **ret) {
1727
1728         JournalFile *f;
1729         int r;
1730         bool newly_created = false;
1731
1732         assert(fname);
1733
1734         if ((flags & O_ACCMODE) != O_RDONLY &&
1735             (flags & O_ACCMODE) != O_RDWR)
1736                 return -EINVAL;
1737
1738         if (!endswith(fname, ".journal"))
1739                 return -EINVAL;
1740
1741         f = new0(JournalFile, 1);
1742         if (!f)
1743                 return -ENOMEM;
1744
1745         f->fd = -1;
1746         f->flags = flags;
1747         f->mode = mode;
1748         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1749         f->prot = prot_from_flags(flags);
1750
1751         if (template) {
1752                 f->metrics = template->metrics;
1753                 f->compress = template->compress;
1754         }
1755
1756         f->path = strdup(fname);
1757         if (!f->path) {
1758                 r = -ENOMEM;
1759                 goto fail;
1760         }
1761
1762         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1763         if (f->fd < 0) {
1764                 r = -errno;
1765                 goto fail;
1766         }
1767
1768         if (fstat(f->fd, &f->last_stat) < 0) {
1769                 r = -errno;
1770                 goto fail;
1771         }
1772
1773         if (f->last_stat.st_size == 0 && f->writable) {
1774                 newly_created = true;
1775
1776                 r = journal_file_init_header(f, template);
1777                 if (r < 0)
1778                         goto fail;
1779
1780                 if (fstat(f->fd, &f->last_stat) < 0) {
1781                         r = -errno;
1782                         goto fail;
1783                 }
1784         }
1785
1786         if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1787                 r = -EIO;
1788                 goto fail;
1789         }
1790
1791         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1792         if (f->header == MAP_FAILED) {
1793                 f->header = NULL;
1794                 r = -errno;
1795                 goto fail;
1796         }
1797
1798         if (!newly_created) {
1799                 r = journal_file_verify_header(f);
1800                 if (r < 0)
1801                         goto fail;
1802         }
1803
1804         if (f->writable) {
1805                 r = journal_file_refresh_header(f);
1806                 if (r < 0)
1807                         goto fail;
1808         }
1809
1810         if (newly_created) {
1811
1812                 r = journal_file_setup_field_hash_table(f);
1813                 if (r < 0)
1814                         goto fail;
1815
1816                 r = journal_file_setup_data_hash_table(f);
1817                 if (r < 0)
1818                         goto fail;
1819         }
1820
1821         r = journal_file_map_field_hash_table(f);
1822         if (r < 0)
1823                 goto fail;
1824
1825         r = journal_file_map_data_hash_table(f);
1826         if (r < 0)
1827                 goto fail;
1828
1829         if (ret)
1830                 *ret = f;
1831
1832         return 0;
1833
1834 fail:
1835         journal_file_close(f);
1836
1837         return r;
1838 }
1839
1840 int journal_file_rotate(JournalFile **f) {
1841         char *p;
1842         size_t l;
1843         JournalFile *old_file, *new_file = NULL;
1844         int r;
1845
1846         assert(f);
1847         assert(*f);
1848
1849         old_file = *f;
1850
1851         if (!old_file->writable)
1852                 return -EINVAL;
1853
1854         if (!endswith(old_file->path, ".journal"))
1855                 return -EINVAL;
1856
1857         l = strlen(old_file->path);
1858
1859         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
1860         if (!p)
1861                 return -ENOMEM;
1862
1863         memcpy(p, old_file->path, l - 8);
1864         p[l-8] = '@';
1865         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1866         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1867                  "-%016llx-%016llx.journal",
1868                  (unsigned long long) le64toh((*f)->header->seqnum),
1869                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1870
1871         r = rename(old_file->path, p);
1872         free(p);
1873
1874         if (r < 0)
1875                 return -errno;
1876
1877         old_file->header->state = STATE_ARCHIVED;
1878
1879         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1880         journal_file_close(old_file);
1881
1882         *f = new_file;
1883         return r;
1884 }
1885
1886 int journal_file_open_reliably(
1887                 const char *fname,
1888                 int flags,
1889                 mode_t mode,
1890                 JournalFile *template,
1891                 JournalFile **ret) {
1892
1893         int r;
1894         size_t l;
1895         char *p;
1896
1897         r = journal_file_open(fname, flags, mode, template, ret);
1898         if (r != -EBADMSG && /* corrupted */
1899             r != -ENODATA && /* truncated */
1900             r != -EHOSTDOWN && /* other machine */
1901             r != -EPROTONOSUPPORT) /* incompatible feature */
1902                 return r;
1903
1904         if ((flags & O_ACCMODE) == O_RDONLY)
1905                 return r;
1906
1907         if (!(flags & O_CREAT))
1908                 return r;
1909
1910         /* The file is corrupted. Rotate it away and try it again (but only once) */
1911
1912         l = strlen(fname);
1913         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
1914                      (int) (l-8), fname,
1915                      (unsigned long long) now(CLOCK_REALTIME),
1916                      random_ull()) < 0)
1917                 return -ENOMEM;
1918
1919         r = rename(fname, p);
1920         free(p);
1921         if (r < 0)
1922                 return -errno;
1923
1924         log_warning("File %s corrupted, renaming and replacing.", fname);
1925
1926         return journal_file_open(fname, flags, mode, template, ret);
1927 }
1928
1929 struct vacuum_info {
1930         off_t usage;
1931         char *filename;
1932
1933         uint64_t realtime;
1934         sd_id128_t seqnum_id;
1935         uint64_t seqnum;
1936
1937         bool have_seqnum;
1938 };
1939
1940 static int vacuum_compare(const void *_a, const void *_b) {
1941         const struct vacuum_info *a, *b;
1942
1943         a = _a;
1944         b = _b;
1945
1946         if (a->have_seqnum && b->have_seqnum &&
1947             sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1948                 if (a->seqnum < b->seqnum)
1949                         return -1;
1950                 else if (a->seqnum > b->seqnum)
1951                         return 1;
1952                 else
1953                         return 0;
1954         }
1955
1956         if (a->realtime < b->realtime)
1957                 return -1;
1958         else if (a->realtime > b->realtime)
1959                 return 1;
1960         else if (a->have_seqnum && b->have_seqnum)
1961                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1962         else
1963                 return strcmp(a->filename, b->filename);
1964 }
1965
1966 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1967         DIR *d;
1968         int r = 0;
1969         struct vacuum_info *list = NULL;
1970         unsigned n_list = 0, n_allocated = 0, i;
1971         uint64_t sum = 0;
1972
1973         assert(directory);
1974
1975         if (max_use <= 0)
1976                 return 0;
1977
1978         d = opendir(directory);
1979         if (!d)
1980                 return -errno;
1981
1982         for (;;) {
1983                 int k;
1984                 struct dirent buf, *de;
1985                 size_t q;
1986                 struct stat st;
1987                 char *p;
1988                 unsigned long long seqnum = 0, realtime;
1989                 sd_id128_t seqnum_id;
1990                 bool have_seqnum;
1991
1992                 k = readdir_r(d, &buf, &de);
1993                 if (k != 0) {
1994                         r = -k;
1995                         goto finish;
1996                 }
1997
1998                 if (!de)
1999                         break;
2000
2001                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2002                         continue;
2003
2004                 if (!S_ISREG(st.st_mode))
2005                         continue;
2006
2007                 q = strlen(de->d_name);
2008
2009                 if (endswith(de->d_name, ".journal")) {
2010
2011                         /* Vacuum archived files */
2012
2013                         if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2014                                 continue;
2015
2016                         if (de->d_name[q-8-16-1] != '-' ||
2017                             de->d_name[q-8-16-1-16-1] != '-' ||
2018                             de->d_name[q-8-16-1-16-1-32-1] != '@')
2019                                 continue;
2020
2021                         p = strdup(de->d_name);
2022                         if (!p) {
2023                                 r = -ENOMEM;
2024                                 goto finish;
2025                         }
2026
2027                         de->d_name[q-8-16-1-16-1] = 0;
2028                         if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2029                                 free(p);
2030                                 continue;
2031                         }
2032
2033                         if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2034                                 free(p);
2035                                 continue;
2036                         }
2037
2038                         have_seqnum = true;
2039
2040                 } else if (endswith(de->d_name, ".journal~")) {
2041                         unsigned long long tmp;
2042
2043                         /* Vacuum corrupted files */
2044
2045                         if (q < 1 + 16 + 1 + 16 + 8 + 1)
2046                                 continue;
2047
2048                         if (de->d_name[q-1-8-16-1] != '-' ||
2049                             de->d_name[q-1-8-16-1-16-1] != '@')
2050                                 continue;
2051
2052                         p = strdup(de->d_name);
2053                         if (!p) {
2054                                 r = -ENOMEM;
2055                                 goto finish;
2056                         }
2057
2058                         if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2059                                 free(p);
2060                                 continue;
2061                         }
2062
2063                         have_seqnum = false;
2064                 } else
2065                         continue;
2066
2067                 if (n_list >= n_allocated) {
2068                         struct vacuum_info *j;
2069
2070                         n_allocated = MAX(n_allocated * 2U, 8U);
2071                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2072                         if (!j) {
2073                                 free(p);
2074                                 r = -ENOMEM;
2075                                 goto finish;
2076                         }
2077
2078                         list = j;
2079                 }
2080
2081                 list[n_list].filename = p;
2082                 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2083                 list[n_list].seqnum = seqnum;
2084                 list[n_list].realtime = realtime;
2085                 list[n_list].seqnum_id = seqnum_id;
2086                 list[n_list].have_seqnum = have_seqnum;
2087
2088                 sum += list[n_list].usage;
2089
2090                 n_list ++;
2091         }
2092
2093         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2094
2095         for(i = 0; i < n_list; i++) {
2096                 struct statvfs ss;
2097
2098                 if (fstatvfs(dirfd(d), &ss) < 0) {
2099                         r = -errno;
2100                         goto finish;
2101                 }
2102
2103                 if (sum <= max_use &&
2104                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2105                         break;
2106
2107                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2108                         log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2109                         sum -= list[i].usage;
2110                 } else if (errno != ENOENT)
2111                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2112         }
2113
2114 finish:
2115         for (i = 0; i < n_list; i++)
2116                 free(list[i].filename);
2117
2118         free(list);
2119
2120         if (d)
2121                 closedir(d);
2122
2123         return r;
2124 }
2125
2126 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2127         uint64_t i, n;
2128         uint64_t q, xor_hash = 0;
2129         int r;
2130         EntryItem *items;
2131         dual_timestamp ts;
2132
2133         assert(from);
2134         assert(to);
2135         assert(o);
2136         assert(p);
2137
2138         if (!to->writable)
2139                 return -EPERM;
2140
2141         ts.monotonic = le64toh(o->entry.monotonic);
2142         ts.realtime = le64toh(o->entry.realtime);
2143
2144         if (to->tail_entry_monotonic_valid &&
2145             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2146                 return -EINVAL;
2147
2148         if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2149                 return -EINVAL;
2150
2151         n = journal_file_entry_n_items(o);
2152         items = alloca(sizeof(EntryItem) * n);
2153
2154         for (i = 0; i < n; i++) {
2155                 uint64_t l, h;
2156                 le64_t le_hash;
2157                 size_t t;
2158                 void *data;
2159                 Object *u;
2160
2161                 q = le64toh(o->entry.items[i].object_offset);
2162                 le_hash = o->entry.items[i].hash;
2163
2164                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2165                 if (r < 0)
2166                         return r;
2167
2168                 if (le_hash != o->data.hash)
2169                         return -EBADMSG;
2170
2171                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2172                 t = (size_t) l;
2173
2174                 /* We hit the limit on 32bit machines */
2175                 if ((uint64_t) t != l)
2176                         return -E2BIG;
2177
2178                 if (o->object.flags & OBJECT_COMPRESSED) {
2179 #ifdef HAVE_XZ
2180                         uint64_t rsize;
2181
2182                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2183                                 return -EBADMSG;
2184
2185                         data = from->compress_buffer;
2186                         l = rsize;
2187 #else
2188                         return -EPROTONOSUPPORT;
2189 #endif
2190                 } else
2191                         data = o->data.payload;
2192
2193                 r = journal_file_append_data(to, data, l, &u, &h);
2194                 if (r < 0)
2195                         return r;
2196
2197                 xor_hash ^= le64toh(u->data.hash);
2198                 items[i].object_offset = htole64(h);
2199                 items[i].hash = u->data.hash;
2200
2201                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2202                 if (r < 0)
2203                         return r;
2204         }
2205
2206         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2207 }
2208
2209 void journal_default_metrics(JournalMetrics *m, int fd) {
2210         uint64_t fs_size = 0;
2211         struct statvfs ss;
2212         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2213
2214         assert(m);
2215         assert(fd >= 0);
2216
2217         if (fstatvfs(fd, &ss) >= 0)
2218                 fs_size = ss.f_frsize * ss.f_blocks;
2219
2220         if (m->max_use == (uint64_t) -1) {
2221
2222                 if (fs_size > 0) {
2223                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2224
2225                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2226                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2227
2228                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2229                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2230                 } else
2231                         m->max_use = DEFAULT_MAX_USE_LOWER;
2232         } else {
2233                 m->max_use = PAGE_ALIGN(m->max_use);
2234
2235                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2236                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2237         }
2238
2239         if (m->max_size == (uint64_t) -1) {
2240                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2241
2242                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2243                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2244         } else
2245                 m->max_size = PAGE_ALIGN(m->max_size);
2246
2247         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2248                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2249
2250         if (m->max_size*2 > m->max_use)
2251                 m->max_use = m->max_size*2;
2252
2253         if (m->min_size == (uint64_t) -1)
2254                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2255         else {
2256                 m->min_size = PAGE_ALIGN(m->min_size);
2257
2258                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2259                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2260
2261                 if (m->min_size > m->max_size)
2262                         m->max_size = m->min_size;
2263         }
2264
2265         if (m->keep_free == (uint64_t) -1) {
2266
2267                 if (fs_size > 0) {
2268                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2269
2270                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2271                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2272
2273                 } else
2274                         m->keep_free = DEFAULT_KEEP_FREE;
2275         }
2276
2277         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2278                  format_bytes(a, sizeof(a), m->max_use),
2279                  format_bytes(b, sizeof(b), m->max_size),
2280                  format_bytes(c, sizeof(c), m->min_size),
2281                  format_bytes(d, sizeof(d), m->keep_free));
2282 }