chiark / gitweb /
journal: fix bisection logic for first entry
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
/* Default sizes, in bytes, of the item area of newly created data and
 * field hash table objects. The number of buckets is this size divided
 * by sizeof(HashItem). */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
#define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)

/* Requests smaller than this are widened to a mapping of this size,
 * to reduce the number of remappings needed later on */
#define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)

/* Data payloads of at least this many bytes are candidates for
 * compression when compression is enabled for the file */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* The eight magic bytes every journal file header starts with */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };

/* Rounds x up to the next multiple of 8 */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
64
65 void journal_file_close(JournalFile *f) {
66         int t;
67
68         assert(f);
69
70         if (f->header) {
71                 if (f->writable)
72                         f->header->state = STATE_OFFLINE;
73
74                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
75         }
76
77         for (t = 0; t < _WINDOW_MAX; t++)
78                 if (f->windows[t].ptr)
79                         munmap(f->windows[t].ptr, f->windows[t].size);
80
81         if (f->fd >= 0)
82                 close_nointr_nofail(f->fd);
83
84         free(f->path);
85
86 #ifdef HAVE_XZ
87         free(f->compress_buffer);
88 #endif
89
90         free(f);
91 }
92
93 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
94         Header h;
95         ssize_t k;
96         int r;
97
98         assert(f);
99
100         zero(h);
101         memcpy(h.signature, signature, 8);
102         h.header_size = htole64(ALIGN64(sizeof(h)));
103
104         r = sd_id128_randomize(&h.file_id);
105         if (r < 0)
106                 return r;
107
108         if (template) {
109                 h.seqnum_id = template->header->seqnum_id;
110                 h.seqnum = template->header->seqnum;
111         } else
112                 h.seqnum_id = h.file_id;
113
114         k = pwrite(f->fd, &h, sizeof(h), 0);
115         if (k < 0)
116                 return -errno;
117
118         if (k != sizeof(h))
119                 return -EIO;
120
121         return 0;
122 }
123
124 static int journal_file_refresh_header(JournalFile *f) {
125         int r;
126         sd_id128_t boot_id;
127
128         assert(f);
129
130         r = sd_id128_get_machine(&f->header->machine_id);
131         if (r < 0)
132                 return r;
133
134         r = sd_id128_get_boot(&boot_id);
135         if (r < 0)
136                 return r;
137
138         if (sd_id128_equal(boot_id, f->header->boot_id))
139                 f->tail_entry_monotonic_valid = true;
140
141         f->header->boot_id = boot_id;
142
143         f->header->state = STATE_ONLINE;
144
145         __sync_synchronize();
146
147         return 0;
148 }
149
150 static int journal_file_verify_header(JournalFile *f) {
151         assert(f);
152
153         if (memcmp(f->header, signature, 8))
154                 return -EBADMSG;
155
156 #ifdef HAVE_XZ
157         if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
158                 return -EPROTONOSUPPORT;
159 #else
160         if (f->header->incompatible_flags != 0)
161                 return -EPROTONOSUPPORT;
162 #endif
163
164         if (f->header->header_size != htole64(ALIGN64(sizeof(*(f->header)))))
165                 return -EBADMSG;
166
167         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
168                 return -ENODATA;
169
170         if (f->writable) {
171                 uint8_t state;
172                 sd_id128_t machine_id;
173                 int r;
174
175                 r = sd_id128_get_machine(&machine_id);
176                 if (r < 0)
177                         return r;
178
179                 if (!sd_id128_equal(machine_id, f->header->machine_id))
180                         return -EHOSTDOWN;
181
182                 state = f->header->state;
183
184                 if (state == STATE_ONLINE)
185                         log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
186                         /* FIXME: immediately rotate */
187                 else if (state == STATE_ARCHIVED)
188                         return -ESHUTDOWN;
189                 else if (state != STATE_OFFLINE)
190                         log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
191         }
192
193         return 0;
194 }
195
196 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
197         uint64_t old_size, new_size;
198         int r;
199
200         assert(f);
201
202         /* We assume that this file is not sparse, and we know that
203          * for sure, since we always call posix_fallocate()
204          * ourselves */
205
206         old_size =
207                 le64toh(f->header->header_size) +
208                 le64toh(f->header->arena_size);
209
210         new_size = PAGE_ALIGN(offset + size);
211         if (new_size < le64toh(f->header->header_size))
212                 new_size = le64toh(f->header->header_size);
213
214         if (new_size <= old_size)
215                 return 0;
216
217         if (f->metrics.max_size > 0 &&
218             new_size > f->metrics.max_size)
219                 return -E2BIG;
220
221         if (new_size > f->metrics.min_size &&
222             f->metrics.keep_free > 0) {
223                 struct statvfs svfs;
224
225                 if (fstatvfs(f->fd, &svfs) >= 0) {
226                         uint64_t available;
227
228                         available = svfs.f_bfree * svfs.f_bsize;
229
230                         if (available >= f->metrics.keep_free)
231                                 available -= f->metrics.keep_free;
232                         else
233                                 available = 0;
234
235                         if (new_size - old_size > available)
236                                 return -E2BIG;
237                 }
238         }
239
240         /* Note that the glibc fallocate() fallback is very
241            inefficient, hence we try to minimize the allocation area
242            as we can. */
243         r = posix_fallocate(f->fd, old_size, new_size - old_size);
244         if (r != 0)
245                 return -r;
246
247         if (fstat(f->fd, &f->last_stat) < 0)
248                 return -errno;
249
250         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
251
252         return 0;
253 }
254
255 static int journal_file_map(
256                 JournalFile *f,
257                 uint64_t offset,
258                 uint64_t size,
259                 void **_window,
260                 uint64_t *_woffset,
261                 uint64_t *_wsize,
262                 void **ret) {
263
264         uint64_t woffset, wsize;
265         void *window;
266
267         assert(f);
268         assert(size > 0);
269         assert(ret);
270
271         woffset = offset & ~((uint64_t) page_size() - 1ULL);
272         wsize = size + (offset - woffset);
273         wsize = PAGE_ALIGN(wsize);
274
275         /* Avoid SIGBUS on invalid accesses */
276         if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
277                 return -EADDRNOTAVAIL;
278
279         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
280         if (window == MAP_FAILED)
281                 return -errno;
282
283         if (_window)
284                 *_window = window;
285
286         if (_woffset)
287                 *_woffset = woffset;
288
289         if (_wsize)
290                 *_wsize = wsize;
291
292         *ret = (uint8_t*) window + (offset - woffset);
293
294         return 0;
295 }
296
297 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
298         void *p = NULL;
299         uint64_t delta;
300         int r;
301         Window *w;
302
303         assert(f);
304         assert(ret);
305         assert(wt >= 0);
306         assert(wt < _WINDOW_MAX);
307
308         if (offset + size > (uint64_t) f->last_stat.st_size) {
309                 /* Hmm, out of range? Let's refresh the fstat() data
310                  * first, before we trust that check. */
311
312                 if (fstat(f->fd, &f->last_stat) < 0 ||
313                     offset + size > (uint64_t) f->last_stat.st_size)
314                         return -EADDRNOTAVAIL;
315         }
316
317         w = f->windows + wt;
318
319         if (_likely_(w->ptr &&
320                      w->offset <= offset &&
321                      w->offset + w->size >= offset + size)) {
322
323                 *ret = (uint8_t*) w->ptr + (offset - w->offset);
324                 return 0;
325         }
326
327         if (w->ptr) {
328                 if (munmap(w->ptr, w->size) < 0)
329                         return -errno;
330
331                 w->ptr = NULL;
332                 w->size = w->offset = 0;
333         }
334
335         if (size < DEFAULT_WINDOW_SIZE) {
336                 /* If the default window size is larger then what was
337                  * asked for extend the mapping a bit in the hope to
338                  * minimize needed remappings later on. We add half
339                  * the window space before and half behind the
340                  * requested mapping */
341
342                 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
343
344                 if (delta > offset)
345                         delta = offset;
346
347                 offset -= delta;
348                 size = DEFAULT_WINDOW_SIZE;
349         } else
350                 delta = 0;
351
352         if (offset + size > (uint64_t) f->last_stat.st_size)
353                 size = (uint64_t) f->last_stat.st_size - offset;
354
355         if (size <= 0)
356                 return -EADDRNOTAVAIL;
357
358         r = journal_file_map(f,
359                              offset, size,
360                              &w->ptr, &w->offset, &w->size,
361                              &p);
362
363         if (r < 0)
364                 return r;
365
366         *ret = (uint8_t*) p + delta;
367         return 0;
368 }
369
370 static bool verify_hash(Object *o) {
371         uint64_t h1, h2;
372
373         assert(o);
374
375         if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
376                 h1 = le64toh(o->data.hash);
377                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
378         } else if (o->object.type == OBJECT_FIELD) {
379                 h1 = le64toh(o->field.hash);
380                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
381         } else
382                 return true;
383
384         return h1 == h2;
385 }
386
387 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
388         int r;
389         void *t;
390         Object *o;
391         uint64_t s;
392
393         assert(f);
394         assert(ret);
395         assert(type < _OBJECT_TYPE_MAX);
396
397         r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
398         if (r < 0)
399                 return r;
400
401         o = (Object*) t;
402         s = le64toh(o->object.size);
403
404         if (s < sizeof(ObjectHeader))
405                 return -EBADMSG;
406
407         if (type >= 0 && o->object.type != type)
408                 return -EBADMSG;
409
410         if (s > sizeof(ObjectHeader)) {
411                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
412                 if (r < 0)
413                         return r;
414
415                 o = (Object*) t;
416         }
417
418         if (!verify_hash(o))
419                 return -EBADMSG;
420
421         *ret = o;
422         return 0;
423 }
424
425 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
426         uint64_t r;
427
428         assert(f);
429
430         r = le64toh(f->header->seqnum) + 1;
431
432         if (seqnum) {
433                 /* If an external seqnum counter was passed, we update
434                  * both the local and the external one, and set it to
435                  * the maximum of both */
436
437                 if (*seqnum + 1 > r)
438                         r = *seqnum + 1;
439
440                 *seqnum = r;
441         }
442
443         f->header->seqnum = htole64(r);
444
445         if (f->header->first_seqnum == 0)
446                 f->header->first_seqnum = htole64(r);
447
448         return r;
449 }
450
451 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
452         int r;
453         uint64_t p;
454         Object *tail, *o;
455         void *t;
456
457         assert(f);
458         assert(size >= sizeof(ObjectHeader));
459         assert(offset);
460         assert(ret);
461
462         p = le64toh(f->header->tail_object_offset);
463         if (p == 0)
464                 p = le64toh(f->header->header_size);
465         else {
466                 r = journal_file_move_to_object(f, -1, p, &tail);
467                 if (r < 0)
468                         return r;
469
470                 p += ALIGN64(le64toh(tail->object.size));
471         }
472
473         r = journal_file_allocate(f, p, size);
474         if (r < 0)
475                 return r;
476
477         r = journal_file_move_to(f, type, p, size, &t);
478         if (r < 0)
479                 return r;
480
481         o = (Object*) t;
482
483         zero(o->object);
484         o->object.type = type;
485         o->object.size = htole64(size);
486
487         f->header->tail_object_offset = htole64(p);
488         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
489
490         *ret = o;
491         *offset = p;
492
493         return 0;
494 }
495
496 static int journal_file_setup_data_hash_table(JournalFile *f) {
497         uint64_t s, p;
498         Object *o;
499         int r;
500
501         assert(f);
502
503         s = DEFAULT_DATA_HASH_TABLE_SIZE;
504         r = journal_file_append_object(f,
505                                        OBJECT_DATA_HASH_TABLE,
506                                        offsetof(Object, hash_table.items) + s,
507                                        &o, &p);
508         if (r < 0)
509                 return r;
510
511         memset(o->hash_table.items, 0, s);
512
513         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
514         f->header->data_hash_table_size = htole64(s);
515
516         return 0;
517 }
518
519 static int journal_file_setup_field_hash_table(JournalFile *f) {
520         uint64_t s, p;
521         Object *o;
522         int r;
523
524         assert(f);
525
526         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
527         r = journal_file_append_object(f,
528                                        OBJECT_FIELD_HASH_TABLE,
529                                        offsetof(Object, hash_table.items) + s,
530                                        &o, &p);
531         if (r < 0)
532                 return r;
533
534         memset(o->hash_table.items, 0, s);
535
536         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
537         f->header->field_hash_table_size = htole64(s);
538
539         return 0;
540 }
541
542 static int journal_file_map_data_hash_table(JournalFile *f) {
543         uint64_t s, p;
544         void *t;
545         int r;
546
547         assert(f);
548
549         p = le64toh(f->header->data_hash_table_offset);
550         s = le64toh(f->header->data_hash_table_size);
551
552         r = journal_file_move_to(f,
553                                  WINDOW_DATA_HASH_TABLE,
554                                  p, s,
555                                  &t);
556         if (r < 0)
557                 return r;
558
559         f->data_hash_table = t;
560         return 0;
561 }
562
563 static int journal_file_map_field_hash_table(JournalFile *f) {
564         uint64_t s, p;
565         void *t;
566         int r;
567
568         assert(f);
569
570         p = le64toh(f->header->field_hash_table_offset);
571         s = le64toh(f->header->field_hash_table_size);
572
573         r = journal_file_move_to(f,
574                                  WINDOW_FIELD_HASH_TABLE,
575                                  p, s,
576                                  &t);
577         if (r < 0)
578                 return r;
579
580         f->field_hash_table = t;
581         return 0;
582 }
583
584 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
585         uint64_t p, h;
586         int r;
587
588         assert(f);
589         assert(o);
590         assert(offset > 0);
591         assert(o->object.type == OBJECT_DATA);
592
593         /* This might alter the window we are looking at */
594
595         o->data.next_hash_offset = o->data.next_field_offset = 0;
596         o->data.entry_offset = o->data.entry_array_offset = 0;
597         o->data.n_entries = 0;
598
599         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
600         p = le64toh(f->data_hash_table[h].tail_hash_offset);
601         if (p == 0) {
602                 /* Only entry in the hash table is easy */
603                 f->data_hash_table[h].head_hash_offset = htole64(offset);
604         } else {
605                 /* Move back to the previous data object, to patch in
606                  * pointer */
607
608                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
609                 if (r < 0)
610                         return r;
611
612                 o->data.next_hash_offset = htole64(offset);
613         }
614
615         f->data_hash_table[h].tail_hash_offset = htole64(offset);
616
617         return 0;
618 }
619
620 int journal_file_find_data_object_with_hash(
621                 JournalFile *f,
622                 const void *data, uint64_t size, uint64_t hash,
623                 Object **ret, uint64_t *offset) {
624
625         uint64_t p, osize, h;
626         int r;
627
628         assert(f);
629         assert(data || size == 0);
630
631         osize = offsetof(Object, data.payload) + size;
632
633         if (f->header->data_hash_table_size == 0)
634                 return -EBADMSG;
635
636         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
637         p = le64toh(f->data_hash_table[h].head_hash_offset);
638
639         while (p > 0) {
640                 Object *o;
641
642                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
643                 if (r < 0)
644                         return r;
645
646                 if (le64toh(o->data.hash) != hash)
647                         goto next;
648
649                 if (o->object.flags & OBJECT_COMPRESSED) {
650 #ifdef HAVE_XZ
651                         uint64_t l, rsize;
652
653                         l = le64toh(o->object.size);
654                         if (l <= offsetof(Object, data.payload))
655                                 return -EBADMSG;
656
657                         l -= offsetof(Object, data.payload);
658
659                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
660                                 return -EBADMSG;
661
662                         if (rsize == size &&
663                             memcmp(f->compress_buffer, data, size) == 0) {
664
665                                 if (ret)
666                                         *ret = o;
667
668                                 if (offset)
669                                         *offset = p;
670
671                                 return 1;
672                         }
673 #else
674                         return -EPROTONOSUPPORT;
675 #endif
676
677                 } else if (le64toh(o->object.size) == osize &&
678                            memcmp(o->data.payload, data, size) == 0) {
679
680                         if (ret)
681                                 *ret = o;
682
683                         if (offset)
684                                 *offset = p;
685
686                         return 1;
687                 }
688
689         next:
690                 p = le64toh(o->data.next_hash_offset);
691         }
692
693         return 0;
694 }
695
696 int journal_file_find_data_object(
697                 JournalFile *f,
698                 const void *data, uint64_t size,
699                 Object **ret, uint64_t *offset) {
700
701         uint64_t hash;
702
703         assert(f);
704         assert(data || size == 0);
705
706         hash = hash64(data, size);
707
708         return journal_file_find_data_object_with_hash(f,
709                                                        data, size, hash,
710                                                        ret, offset);
711 }
712
713 static int journal_file_append_data(
714                 JournalFile *f,
715                 const void *data, uint64_t size,
716                 Object **ret, uint64_t *offset) {
717
718         uint64_t hash, p;
719         uint64_t osize;
720         Object *o;
721         int r;
722         bool compressed = false;
723
724         assert(f);
725         assert(data || size == 0);
726
727         hash = hash64(data, size);
728
729         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
730         if (r < 0)
731                 return r;
732         else if (r > 0) {
733
734                 if (ret)
735                         *ret = o;
736
737                 if (offset)
738                         *offset = p;
739
740                 return 0;
741         }
742
743         osize = offsetof(Object, data.payload) + size;
744         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
745         if (r < 0)
746                 return r;
747
748         o->data.hash = htole64(hash);
749
750 #ifdef HAVE_XZ
751         if (f->compress &&
752             size >= COMPRESSION_SIZE_THRESHOLD) {
753                 uint64_t rsize;
754
755                 compressed = compress_blob(data, size, o->data.payload, &rsize);
756
757                 if (compressed) {
758                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
759                         o->object.flags |= OBJECT_COMPRESSED;
760
761                         f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
762
763                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
764                 }
765         }
766 #endif
767
768         if (!compressed)
769                 memcpy(o->data.payload, data, size);
770
771         r = journal_file_link_data(f, o, p, hash);
772         if (r < 0)
773                 return r;
774
775         /* The linking might have altered the window, so let's
776          * refresh our pointer */
777         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
778         if (r < 0)
779                 return r;
780
781         if (ret)
782                 *ret = o;
783
784         if (offset)
785                 *offset = p;
786
787         return 0;
788 }
789
790 uint64_t journal_file_entry_n_items(Object *o) {
791         assert(o);
792         assert(o->object.type == OBJECT_ENTRY);
793
794         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
795 }
796
797 static uint64_t journal_file_entry_array_n_items(Object *o) {
798         assert(o);
799         assert(o->object.type == OBJECT_ENTRY_ARRAY);
800
801         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
802 }
803
804 static int link_entry_into_array(JournalFile *f,
805                                  le64_t *first,
806                                  le64_t *idx,
807                                  uint64_t p) {
808         int r;
809         uint64_t n = 0, ap = 0, q, i, a, hidx;
810         Object *o;
811
812         assert(f);
813         assert(first);
814         assert(idx);
815         assert(p > 0);
816
817         a = le64toh(*first);
818         i = hidx = le64toh(*idx);
819         while (a > 0) {
820
821                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
822                 if (r < 0)
823                         return r;
824
825                 n = journal_file_entry_array_n_items(o);
826                 if (i < n) {
827                         o->entry_array.items[i] = htole64(p);
828                         *idx = htole64(hidx + 1);
829                         return 0;
830                 }
831
832                 i -= n;
833                 ap = a;
834                 a = le64toh(o->entry_array.next_entry_array_offset);
835         }
836
837         if (hidx > n)
838                 n = (hidx+1) * 2;
839         else
840                 n = n * 2;
841
842         if (n < 4)
843                 n = 4;
844
845         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
846                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
847                                        &o, &q);
848         if (r < 0)
849                 return r;
850
851         o->entry_array.items[i] = htole64(p);
852
853         if (ap == 0)
854                 *first = htole64(q);
855         else {
856                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
857                 if (r < 0)
858                         return r;
859
860                 o->entry_array.next_entry_array_offset = htole64(q);
861         }
862
863         *idx = htole64(hidx + 1);
864
865         return 0;
866 }
867
868 static int link_entry_into_array_plus_one(JournalFile *f,
869                                           le64_t *extra,
870                                           le64_t *first,
871                                           le64_t *idx,
872                                           uint64_t p) {
873
874         int r;
875
876         assert(f);
877         assert(extra);
878         assert(first);
879         assert(idx);
880         assert(p > 0);
881
882         if (*idx == 0)
883                 *extra = htole64(p);
884         else {
885                 le64_t i;
886
887                 i = htole64(le64toh(*idx) - 1);
888                 r = link_entry_into_array(f, first, &i, p);
889                 if (r < 0)
890                         return r;
891         }
892
893         *idx = htole64(le64toh(*idx) + 1);
894         return 0;
895 }
896
897 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
898         uint64_t p;
899         int r;
900         assert(f);
901         assert(o);
902         assert(offset > 0);
903
904         p = le64toh(o->entry.items[i].object_offset);
905         if (p == 0)
906                 return -EINVAL;
907
908         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
909         if (r < 0)
910                 return r;
911
912         return link_entry_into_array_plus_one(f,
913                                               &o->data.entry_offset,
914                                               &o->data.entry_array_offset,
915                                               &o->data.n_entries,
916                                               offset);
917 }
918
919 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
920         uint64_t n, i;
921         int r;
922
923         assert(f);
924         assert(o);
925         assert(offset > 0);
926         assert(o->object.type == OBJECT_ENTRY);
927
928         __sync_synchronize();
929
930         /* Link up the entry itself */
931         r = link_entry_into_array(f,
932                                   &f->header->entry_array_offset,
933                                   &f->header->n_entries,
934                                   offset);
935         if (r < 0)
936                 return r;
937
938         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
939
940         if (f->header->head_entry_realtime == 0)
941                 f->header->head_entry_realtime = o->entry.realtime;
942
943         f->header->tail_entry_realtime = o->entry.realtime;
944         f->header->tail_entry_monotonic = o->entry.monotonic;
945
946         f->tail_entry_monotonic_valid = true;
947
948         /* Link up the items */
949         n = journal_file_entry_n_items(o);
950         for (i = 0; i < n; i++) {
951                 r = journal_file_link_entry_item(f, o, offset, i);
952                 if (r < 0)
953                         return r;
954         }
955
956         return 0;
957 }
958
959 static int journal_file_append_entry_internal(
960                 JournalFile *f,
961                 const dual_timestamp *ts,
962                 uint64_t xor_hash,
963                 const EntryItem items[], unsigned n_items,
964                 uint64_t *seqnum,
965                 Object **ret, uint64_t *offset) {
966         uint64_t np;
967         uint64_t osize;
968         Object *o;
969         int r;
970
971         assert(f);
972         assert(items || n_items == 0);
973         assert(ts);
974
975         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
976
977         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
978         if (r < 0)
979                 return r;
980
981         o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
982         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
983         o->entry.realtime = htole64(ts->realtime);
984         o->entry.monotonic = htole64(ts->monotonic);
985         o->entry.xor_hash = htole64(xor_hash);
986         o->entry.boot_id = f->header->boot_id;
987
988         r = journal_file_link_entry(f, o, np);
989         if (r < 0)
990                 return r;
991
992         if (ret)
993                 *ret = o;
994
995         if (offset)
996                 *offset = np;
997
998         return 0;
999 }
1000
1001 void journal_file_post_change(JournalFile *f) {
1002         assert(f);
1003
1004         /* inotify() does not receive IN_MODIFY events from file
1005          * accesses done via mmap(). After each access we hence
1006          * trigger IN_MODIFY by truncating the journal file to its
1007          * current size which triggers IN_MODIFY. */
1008
1009         __sync_synchronize();
1010
1011         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1012                 log_error("Failed to to truncate file to its own size: %m");
1013 }
1014
1015 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1016         unsigned i;
1017         EntryItem *items;
1018         int r;
1019         uint64_t xor_hash = 0;
1020         struct dual_timestamp _ts;
1021
1022         assert(f);
1023         assert(iovec || n_iovec == 0);
1024
1025         if (!f->writable)
1026                 return -EPERM;
1027
1028         if (!ts) {
1029                 dual_timestamp_get(&_ts);
1030                 ts = &_ts;
1031         }
1032
1033         if (f->tail_entry_monotonic_valid &&
1034             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1035                 return -EINVAL;
1036
1037         items = alloca(sizeof(EntryItem) * n_iovec);
1038
1039         for (i = 0; i < n_iovec; i++) {
1040                 uint64_t p;
1041                 Object *o;
1042
1043                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1044                 if (r < 0)
1045                         return r;
1046
1047                 xor_hash ^= le64toh(o->data.hash);
1048                 items[i].object_offset = htole64(p);
1049                 items[i].hash = o->data.hash;
1050         }
1051
1052         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1053
1054         journal_file_post_change(f);
1055
1056         return r;
1057 }
1058
1059 static int generic_array_get(JournalFile *f,
1060                              uint64_t first,
1061                              uint64_t i,
1062                              Object **ret, uint64_t *offset) {
1063
1064         Object *o;
1065         uint64_t p = 0, a;
1066         int r;
1067
1068         assert(f);
1069
1070         a = first;
1071         while (a > 0) {
1072                 uint64_t n;
1073
1074                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1075                 if (r < 0)
1076                         return r;
1077
1078                 n = journal_file_entry_array_n_items(o);
1079                 if (i < n) {
1080                         p = le64toh(o->entry_array.items[i]);
1081                         break;
1082                 }
1083
1084                 i -= n;
1085                 a = le64toh(o->entry_array.next_entry_array_offset);
1086         }
1087
1088         if (a <= 0 || p <= 0)
1089                 return 0;
1090
1091         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1092         if (r < 0)
1093                 return r;
1094
1095         if (ret)
1096                 *ret = o;
1097
1098         if (offset)
1099                 *offset = p;
1100
1101         return 1;
1102 }
1103
1104 static int generic_array_get_plus_one(JournalFile *f,
1105                                       uint64_t extra,
1106                                       uint64_t first,
1107                                       uint64_t i,
1108                                       Object **ret, uint64_t *offset) {
1109
1110         Object *o;
1111
1112         assert(f);
1113
1114         if (i == 0) {
1115                 int r;
1116
1117                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1118                 if (r < 0)
1119                         return r;
1120
1121                 if (ret)
1122                         *ret = o;
1123
1124                 if (offset)
1125                         *offset = extra;
1126
1127                 return 1;
1128         }
1129
1130         return generic_array_get(f, first, i-1, ret, offset);
1131 }
1132
/* Verdicts returned by the test_object_*() callbacks used for bisection.
 * Negative return values of those callbacks are reserved for errors. */
enum {
        TEST_FOUND = 0, /* the tested entry matches the needle exactly */
        TEST_LEFT  = 1, /* the tested entry sorts before the needle */
        TEST_RIGHT = 2  /* the tested entry sorts after the needle */
};
1138
1139 static int generic_array_bisect(JournalFile *f,
1140                                 uint64_t first,
1141                                 uint64_t n,
1142                                 uint64_t needle,
1143                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1144                                 direction_t direction,
1145                                 Object **ret,
1146                                 uint64_t *offset,
1147                                 uint64_t *idx) {
1148
1149         uint64_t a, p, t = 0, i = 0, last_p = 0;
1150         bool subtract_one = false;
1151         Object *o, *array = NULL;
1152         int r;
1153
1154         assert(f);
1155         assert(test_object);
1156
1157         a = first;
1158         while (a > 0) {
1159                 uint64_t left, right, k, lp;
1160
1161                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1162                 if (r < 0)
1163                         return r;
1164
1165                 k = journal_file_entry_array_n_items(array);
1166                 right = MIN(k, n);
1167                 if (right <= 0)
1168                         return 0;
1169
1170                 i = right - 1;
1171                 lp = p = le64toh(array->entry_array.items[i]);
1172                 if (p <= 0)
1173                         return -EBADMSG;
1174
1175                 r = test_object(f, p, needle);
1176                 if (r < 0)
1177                         return r;
1178
1179                 if (r == TEST_FOUND)
1180                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1181
1182                 if (r == TEST_RIGHT) {
1183                         left = 0;
1184                         right -= 1;
1185                         for (;;) {
1186                                 if (left == right) {
1187                                         if (direction == DIRECTION_UP)
1188                                                 subtract_one = true;
1189
1190                                         i = left;
1191                                         goto found;
1192                                 }
1193
1194                                 assert(left < right);
1195
1196                                 i = (left + right) / 2;
1197                                 p = le64toh(array->entry_array.items[i]);
1198                                 if (p <= 0)
1199                                         return -EBADMSG;
1200
1201                                 r = test_object(f, p, needle);
1202                                 if (r < 0)
1203                                         return r;
1204
1205                                 if (r == TEST_FOUND)
1206                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1207
1208                                 if (r == TEST_RIGHT)
1209                                         right = i;
1210                                 else
1211                                         left = i + 1;
1212                         }
1213                 }
1214
1215                 if (k > n)
1216                         return 0;
1217
1218                 last_p = lp;
1219
1220                 n -= k;
1221                 t += k;
1222                 a = le64toh(array->entry_array.next_entry_array_offset);
1223         }
1224
1225         return 0;
1226
1227 found:
1228         if (subtract_one && t == 0 && i == 0)
1229                 return 0;
1230
1231         if (subtract_one && i == 0)
1232                 p = last_p;
1233         else if (subtract_one)
1234                 p = le64toh(array->entry_array.items[i-1]);
1235         else
1236                 p = le64toh(array->entry_array.items[i]);
1237
1238         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1239         if (r < 0)
1240                 return r;
1241
1242         if (ret)
1243                 *ret = o;
1244
1245         if (offset)
1246                 *offset = p;
1247
1248         if (idx)
1249                 *idx = t + i - (subtract_one ? 1 : 0);
1250
1251         return 1;
1252 }
1253
1254 static int generic_array_bisect_plus_one(JournalFile *f,
1255                                          uint64_t extra,
1256                                          uint64_t first,
1257                                          uint64_t n,
1258                                          uint64_t needle,
1259                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1260                                          direction_t direction,
1261                                          Object **ret,
1262                                          uint64_t *offset,
1263                                          uint64_t *idx) {
1264
1265         int r;
1266
1267         assert(f);
1268         assert(test_object);
1269
1270         if (n <= 0)
1271                 return 0;
1272
1273         /* This bisects the array in object 'first', but first checks
1274          * an extra  */
1275         r = test_object(f, extra, needle);
1276         if (r < 0)
1277                 return r;
1278
1279         if (r == TEST_FOUND)
1280                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1281
1282         if (r == TEST_RIGHT) {
1283                 Object *o;
1284
1285                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1286                 if (r < 0)
1287                         return r;
1288
1289                 if (ret)
1290                         *ret = o;
1291
1292                 if (offset)
1293                         *offset = extra;
1294
1295                 if (idx)
1296                         *idx = 0;
1297
1298                 return 1;
1299         }
1300
1301         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1302
1303         if (r > 0)
1304                 (*idx) ++;
1305
1306         return r;
1307 }
1308
1309 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1310         Object *o;
1311         int r;
1312
1313         assert(f);
1314         assert(p > 0);
1315
1316         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1317         if (r < 0)
1318                 return r;
1319
1320         if (le64toh(o->entry.seqnum) == needle)
1321                 return TEST_FOUND;
1322         else if (le64toh(o->entry.seqnum) < needle)
1323                 return TEST_LEFT;
1324         else
1325                 return TEST_RIGHT;
1326 }
1327
1328 int journal_file_move_to_entry_by_seqnum(
1329                 JournalFile *f,
1330                 uint64_t seqnum,
1331                 direction_t direction,
1332                 Object **ret,
1333                 uint64_t *offset) {
1334
1335         return generic_array_bisect(f,
1336                                     le64toh(f->header->entry_array_offset),
1337                                     le64toh(f->header->n_entries),
1338                                     seqnum,
1339                                     test_object_seqnum,
1340                                     direction,
1341                                     ret, offset, NULL);
1342 }
1343
1344 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1345         Object *o;
1346         int r;
1347
1348         assert(f);
1349         assert(p > 0);
1350
1351         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1352         if (r < 0)
1353                 return r;
1354
1355         if (le64toh(o->entry.realtime) == needle)
1356                 return TEST_FOUND;
1357         else if (le64toh(o->entry.realtime) < needle)
1358                 return TEST_LEFT;
1359         else
1360                 return TEST_RIGHT;
1361 }
1362
1363 int journal_file_move_to_entry_by_realtime(
1364                 JournalFile *f,
1365                 uint64_t realtime,
1366                 direction_t direction,
1367                 Object **ret,
1368                 uint64_t *offset) {
1369
1370         return generic_array_bisect(f,
1371                                     le64toh(f->header->entry_array_offset),
1372                                     le64toh(f->header->n_entries),
1373                                     realtime,
1374                                     test_object_realtime,
1375                                     direction,
1376                                     ret, offset, NULL);
1377 }
1378
1379 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1380         Object *o;
1381         int r;
1382
1383         assert(f);
1384         assert(p > 0);
1385
1386         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1387         if (r < 0)
1388                 return r;
1389
1390         if (le64toh(o->entry.monotonic) == needle)
1391                 return TEST_FOUND;
1392         else if (le64toh(o->entry.monotonic) < needle)
1393                 return TEST_LEFT;
1394         else
1395                 return TEST_RIGHT;
1396 }
1397
1398 int journal_file_move_to_entry_by_monotonic(
1399                 JournalFile *f,
1400                 sd_id128_t boot_id,
1401                 uint64_t monotonic,
1402                 direction_t direction,
1403                 Object **ret,
1404                 uint64_t *offset) {
1405
1406         char t[9+32+1] = "_BOOT_ID=";
1407         Object *o;
1408         int r;
1409
1410         sd_id128_to_string(boot_id, t + 9);
1411
1412         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1413         if (r < 0)
1414                 return r;
1415         else if (r == 0)
1416                 return -ENOENT;
1417
1418         return generic_array_bisect_plus_one(f,
1419                                              le64toh(o->data.entry_offset),
1420                                              le64toh(o->data.entry_array_offset),
1421                                              le64toh(o->data.n_entries),
1422                                              monotonic,
1423                                              test_object_monotonic,
1424                                              direction,
1425                                              ret, offset, NULL);
1426 }
1427
1428 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1429         assert(f);
1430         assert(p > 0);
1431
1432         if (p == needle)
1433                 return TEST_FOUND;
1434         else if (p < needle)
1435                 return TEST_LEFT;
1436         else
1437                 return TEST_RIGHT;
1438 }
1439
1440 int journal_file_next_entry(
1441                 JournalFile *f,
1442                 Object *o, uint64_t p,
1443                 direction_t direction,
1444                 Object **ret, uint64_t *offset) {
1445
1446         uint64_t i, n;
1447         int r;
1448
1449         assert(f);
1450         assert(p > 0 || !o);
1451
1452         n = le64toh(f->header->n_entries);
1453         if (n <= 0)
1454                 return 0;
1455
1456         if (!o)
1457                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1458         else {
1459                 if (o->object.type != OBJECT_ENTRY)
1460                         return -EINVAL;
1461
1462                 r = generic_array_bisect(f,
1463                                          le64toh(f->header->entry_array_offset),
1464                                          le64toh(f->header->n_entries),
1465                                          p,
1466                                          test_object_offset,
1467                                          DIRECTION_DOWN,
1468                                          NULL, NULL,
1469                                          &i);
1470                 if (r <= 0)
1471                         return r;
1472
1473                 if (direction == DIRECTION_DOWN) {
1474                         if (i >= n - 1)
1475                                 return 0;
1476
1477                         i++;
1478                 } else {
1479                         if (i <= 0)
1480                                 return 0;
1481
1482                         i--;
1483                 }
1484         }
1485
1486         /* And jump to it */
1487         return generic_array_get(f,
1488                                  le64toh(f->header->entry_array_offset),
1489                                  i,
1490                                  ret, offset);
1491 }
1492
1493 int journal_file_skip_entry(
1494                 JournalFile *f,
1495                 Object *o, uint64_t p,
1496                 int64_t skip,
1497                 Object **ret, uint64_t *offset) {
1498
1499         uint64_t i, n;
1500         int r;
1501
1502         assert(f);
1503         assert(o);
1504         assert(p > 0);
1505
1506         if (o->object.type != OBJECT_ENTRY)
1507                 return -EINVAL;
1508
1509         r = generic_array_bisect(f,
1510                                  le64toh(f->header->entry_array_offset),
1511                                  le64toh(f->header->n_entries),
1512                                  p,
1513                                  test_object_offset,
1514                                  DIRECTION_DOWN,
1515                                  NULL, NULL,
1516                                  &i);
1517         if (r <= 0)
1518                 return r;
1519
1520         /* Calculate new index */
1521         if (skip < 0) {
1522                 if ((uint64_t) -skip >= i)
1523                         i = 0;
1524                 else
1525                         i = i - (uint64_t) -skip;
1526         } else
1527                 i  += (uint64_t) skip;
1528
1529         n = le64toh(f->header->n_entries);
1530         if (n <= 0)
1531                 return -EBADMSG;
1532
1533         if (i >= n)
1534                 i = n-1;
1535
1536         return generic_array_get(f,
1537                                  le64toh(f->header->entry_array_offset),
1538                                  i,
1539                                  ret, offset);
1540 }
1541
1542 int journal_file_next_entry_for_data(
1543                 JournalFile *f,
1544                 Object *o, uint64_t p,
1545                 uint64_t data_offset,
1546                 direction_t direction,
1547                 Object **ret, uint64_t *offset) {
1548
1549         uint64_t n, i;
1550         int r;
1551         Object *d;
1552
1553         assert(f);
1554         assert(p > 0 || !o);
1555
1556         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1557         if (r < 0)
1558                 return r;
1559
1560         n = le64toh(d->data.n_entries);
1561         if (n <= 0)
1562                 return n;
1563
1564         if (!o)
1565                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1566         else {
1567                 if (o->object.type != OBJECT_ENTRY)
1568                         return -EINVAL;
1569
1570                 r = generic_array_bisect_plus_one(f,
1571                                                   le64toh(d->data.entry_offset),
1572                                                   le64toh(d->data.entry_array_offset),
1573                                                   le64toh(d->data.n_entries),
1574                                                   p,
1575                                                   test_object_offset,
1576                                                   DIRECTION_DOWN,
1577                                                   NULL, NULL,
1578                                                   &i);
1579
1580                 if (r <= 0)
1581                         return r;
1582
1583                 if (direction == DIRECTION_DOWN) {
1584                         if (i >= n - 1)
1585                                 return 0;
1586
1587                         i++;
1588                 } else {
1589                         if (i <= 0)
1590                                 return 0;
1591
1592                         i--;
1593                 }
1594
1595         }
1596
1597         return generic_array_get_plus_one(f,
1598                                           le64toh(d->data.entry_offset),
1599                                           le64toh(d->data.entry_array_offset),
1600                                           i,
1601                                           ret, offset);
1602 }
1603
1604 int journal_file_move_to_entry_by_seqnum_for_data(
1605                 JournalFile *f,
1606                 uint64_t data_offset,
1607                 uint64_t seqnum,
1608                 direction_t direction,
1609                 Object **ret, uint64_t *offset) {
1610
1611         Object *d;
1612         int r;
1613
1614         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1615         if (r <= 0)
1616                 return r;
1617
1618         return generic_array_bisect_plus_one(f,
1619                                              le64toh(d->data.entry_offset),
1620                                              le64toh(d->data.entry_array_offset),
1621                                              le64toh(d->data.n_entries),
1622                                              seqnum,
1623                                              test_object_seqnum,
1624                                              direction,
1625                                              ret, offset, NULL);
1626 }
1627
1628 int journal_file_move_to_entry_by_realtime_for_data(
1629                 JournalFile *f,
1630                 uint64_t data_offset,
1631                 uint64_t realtime,
1632                 direction_t direction,
1633                 Object **ret, uint64_t *offset) {
1634
1635         Object *d;
1636         int r;
1637
1638         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1639         if (r <= 0)
1640                 return r;
1641
1642         return generic_array_bisect_plus_one(f,
1643                                              le64toh(d->data.entry_offset),
1644                                              le64toh(d->data.entry_array_offset),
1645                                              le64toh(d->data.n_entries),
1646                                              realtime,
1647                                              test_object_realtime,
1648                                              direction,
1649                                              ret, offset, NULL);
1650 }
1651
1652 void journal_file_dump(JournalFile *f) {
1653         char a[33], b[33], c[33];
1654         Object *o;
1655         int r;
1656         uint64_t p;
1657
1658         assert(f);
1659
1660         printf("File Path: %s\n"
1661                "File ID: %s\n"
1662                "Machine ID: %s\n"
1663                "Boot ID: %s\n"
1664                "Arena size: %llu\n"
1665                "Objects: %lu\n"
1666                "Entries: %lu\n",
1667                f->path,
1668                sd_id128_to_string(f->header->file_id, a),
1669                sd_id128_to_string(f->header->machine_id, b),
1670                sd_id128_to_string(f->header->boot_id, c),
1671                (unsigned long long) le64toh(f->header->arena_size),
1672                (unsigned long) le64toh(f->header->n_objects),
1673                (unsigned long) le64toh(f->header->n_entries));
1674
1675         p = le64toh(f->header->header_size);
1676         while (p != 0) {
1677                 r = journal_file_move_to_object(f, -1, p, &o);
1678                 if (r < 0)
1679                         goto fail;
1680
1681                 switch (o->object.type) {
1682
1683                 case OBJECT_UNUSED:
1684                         printf("Type: OBJECT_UNUSED\n");
1685                         break;
1686
1687                 case OBJECT_DATA:
1688                         printf("Type: OBJECT_DATA\n");
1689                         break;
1690
1691                 case OBJECT_ENTRY:
1692                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1693                                (unsigned long long) le64toh(o->entry.seqnum),
1694                                (unsigned long long) le64toh(o->entry.monotonic),
1695                                (unsigned long long) le64toh(o->entry.realtime));
1696                         break;
1697
1698                 case OBJECT_FIELD_HASH_TABLE:
1699                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1700                         break;
1701
1702                 case OBJECT_DATA_HASH_TABLE:
1703                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1704                         break;
1705
1706                 case OBJECT_ENTRY_ARRAY:
1707                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1708                         break;
1709
1710                 case OBJECT_SIGNATURE:
1711                         printf("Type: OBJECT_SIGNATURE\n");
1712                         break;
1713                 }
1714
1715                 if (o->object.flags & OBJECT_COMPRESSED)
1716                         printf("Flags: COMPRESSED\n");
1717
1718                 if (p == le64toh(f->header->tail_object_offset))
1719                         p = 0;
1720                 else
1721                         p = p + ALIGN64(le64toh(o->object.size));
1722         }
1723
1724         return;
1725 fail:
1726         log_error("File corrupt");
1727 }
1728
1729 int journal_file_open(
1730                 const char *fname,
1731                 int flags,
1732                 mode_t mode,
1733                 JournalFile *template,
1734                 JournalFile **ret) {
1735
1736         JournalFile *f;
1737         int r;
1738         bool newly_created = false;
1739
1740         assert(fname);
1741
1742         if ((flags & O_ACCMODE) != O_RDONLY &&
1743             (flags & O_ACCMODE) != O_RDWR)
1744                 return -EINVAL;
1745
1746         if (!endswith(fname, ".journal"))
1747                 return -EINVAL;
1748
1749         f = new0(JournalFile, 1);
1750         if (!f)
1751                 return -ENOMEM;
1752
1753         f->fd = -1;
1754         f->flags = flags;
1755         f->mode = mode;
1756         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1757         f->prot = prot_from_flags(flags);
1758
1759         if (template) {
1760                 f->metrics = template->metrics;
1761                 f->compress = template->compress;
1762         }
1763
1764         f->path = strdup(fname);
1765         if (!f->path) {
1766                 r = -ENOMEM;
1767                 goto fail;
1768         }
1769
1770         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1771         if (f->fd < 0) {
1772                 r = -errno;
1773                 goto fail;
1774         }
1775
1776         if (fstat(f->fd, &f->last_stat) < 0) {
1777                 r = -errno;
1778                 goto fail;
1779         }
1780
1781         if (f->last_stat.st_size == 0 && f->writable) {
1782                 newly_created = true;
1783
1784                 r = journal_file_init_header(f, template);
1785                 if (r < 0)
1786                         goto fail;
1787
1788                 if (fstat(f->fd, &f->last_stat) < 0) {
1789                         r = -errno;
1790                         goto fail;
1791                 }
1792         }
1793
1794         if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1795                 r = -EIO;
1796                 goto fail;
1797         }
1798
1799         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1800         if (f->header == MAP_FAILED) {
1801                 f->header = NULL;
1802                 r = -errno;
1803                 goto fail;
1804         }
1805
1806         if (!newly_created) {
1807                 r = journal_file_verify_header(f);
1808                 if (r < 0)
1809                         goto fail;
1810         }
1811
1812         if (f->writable) {
1813                 r = journal_file_refresh_header(f);
1814                 if (r < 0)
1815                         goto fail;
1816         }
1817
1818         if (newly_created) {
1819
1820                 r = journal_file_setup_field_hash_table(f);
1821                 if (r < 0)
1822                         goto fail;
1823
1824                 r = journal_file_setup_data_hash_table(f);
1825                 if (r < 0)
1826                         goto fail;
1827         }
1828
1829         r = journal_file_map_field_hash_table(f);
1830         if (r < 0)
1831                 goto fail;
1832
1833         r = journal_file_map_data_hash_table(f);
1834         if (r < 0)
1835                 goto fail;
1836
1837         if (ret)
1838                 *ret = f;
1839
1840         return 0;
1841
1842 fail:
1843         journal_file_close(f);
1844
1845         return r;
1846 }
1847
1848 int journal_file_rotate(JournalFile **f) {
1849         char *p;
1850         size_t l;
1851         JournalFile *old_file, *new_file = NULL;
1852         int r;
1853
1854         assert(f);
1855         assert(*f);
1856
1857         old_file = *f;
1858
1859         if (!old_file->writable)
1860                 return -EINVAL;
1861
1862         if (!endswith(old_file->path, ".journal"))
1863                 return -EINVAL;
1864
1865         l = strlen(old_file->path);
1866
1867         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
1868         if (!p)
1869                 return -ENOMEM;
1870
1871         memcpy(p, old_file->path, l - 8);
1872         p[l-8] = '@';
1873         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1874         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1875                  "-%016llx-%016llx.journal",
1876                  (unsigned long long) le64toh((*f)->header->seqnum),
1877                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1878
1879         r = rename(old_file->path, p);
1880         free(p);
1881
1882         if (r < 0)
1883                 return -errno;
1884
1885         old_file->header->state = STATE_ARCHIVED;
1886
1887         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1888         journal_file_close(old_file);
1889
1890         *f = new_file;
1891         return r;
1892 }
1893
1894 int journal_file_open_reliably(
1895                 const char *fname,
1896                 int flags,
1897                 mode_t mode,
1898                 JournalFile *template,
1899                 JournalFile **ret) {
1900
1901         int r;
1902         size_t l;
1903         char *p;
1904
1905         r = journal_file_open(fname, flags, mode, template, ret);
1906         if (r != -EBADMSG && /* corrupted */
1907             r != -ENODATA && /* truncated */
1908             r != -EHOSTDOWN && /* other machine */
1909             r != -EPROTONOSUPPORT) /* incompatible feature */
1910                 return r;
1911
1912         if ((flags & O_ACCMODE) == O_RDONLY)
1913                 return r;
1914
1915         if (!(flags & O_CREAT))
1916                 return r;
1917
1918         /* The file is corrupted. Rotate it away and try it again (but only once) */
1919
1920         l = strlen(fname);
1921         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
1922                      (int) (l-8), fname,
1923                      (unsigned long long) now(CLOCK_REALTIME),
1924                      random_ull()) < 0)
1925                 return -ENOMEM;
1926
1927         r = rename(fname, p);
1928         free(p);
1929         if (r < 0)
1930                 return -errno;
1931
1932         log_warning("File %s corrupted, renaming and replacing.", fname);
1933
1934         return journal_file_open(fname, flags, mode, template, ret);
1935 }
1936
1937 struct vacuum_info {
1938         off_t usage;
1939         char *filename;
1940
1941         uint64_t realtime;
1942         sd_id128_t seqnum_id;
1943         uint64_t seqnum;
1944
1945         bool have_seqnum;
1946 };
1947
1948 static int vacuum_compare(const void *_a, const void *_b) {
1949         const struct vacuum_info *a, *b;
1950
1951         a = _a;
1952         b = _b;
1953
1954         if (a->have_seqnum && b->have_seqnum &&
1955             sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1956                 if (a->seqnum < b->seqnum)
1957                         return -1;
1958                 else if (a->seqnum > b->seqnum)
1959                         return 1;
1960                 else
1961                         return 0;
1962         }
1963
1964         if (a->realtime < b->realtime)
1965                 return -1;
1966         else if (a->realtime > b->realtime)
1967                 return 1;
1968         else if (a->have_seqnum && b->have_seqnum)
1969                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1970         else
1971                 return strcmp(a->filename, b->filename);
1972 }
1973
1974 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1975         DIR *d;
1976         int r = 0;
1977         struct vacuum_info *list = NULL;
1978         unsigned n_list = 0, n_allocated = 0, i;
1979         uint64_t sum = 0;
1980
1981         assert(directory);
1982
1983         if (max_use <= 0)
1984                 return 0;
1985
1986         d = opendir(directory);
1987         if (!d)
1988                 return -errno;
1989
1990         for (;;) {
1991                 int k;
1992                 struct dirent buf, *de;
1993                 size_t q;
1994                 struct stat st;
1995                 char *p;
1996                 unsigned long long seqnum = 0, realtime;
1997                 sd_id128_t seqnum_id;
1998                 bool have_seqnum;
1999
2000                 k = readdir_r(d, &buf, &de);
2001                 if (k != 0) {
2002                         r = -k;
2003                         goto finish;
2004                 }
2005
2006                 if (!de)
2007                         break;
2008
2009                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2010                         continue;
2011
2012                 if (!S_ISREG(st.st_mode))
2013                         continue;
2014
2015                 q = strlen(de->d_name);
2016
2017                 if (endswith(de->d_name, ".journal")) {
2018
2019                         /* Vacuum archived files */
2020
2021                         if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2022                                 continue;
2023
2024                         if (de->d_name[q-8-16-1] != '-' ||
2025                             de->d_name[q-8-16-1-16-1] != '-' ||
2026                             de->d_name[q-8-16-1-16-1-32-1] != '@')
2027                                 continue;
2028
2029                         p = strdup(de->d_name);
2030                         if (!p) {
2031                                 r = -ENOMEM;
2032                                 goto finish;
2033                         }
2034
2035                         de->d_name[q-8-16-1-16-1] = 0;
2036                         if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2037                                 free(p);
2038                                 continue;
2039                         }
2040
2041                         if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2042                                 free(p);
2043                                 continue;
2044                         }
2045
2046                         have_seqnum = true;
2047
2048                 } else if (endswith(de->d_name, ".journal~")) {
2049                         unsigned long long tmp;
2050
2051                         /* Vacuum corrupted files */
2052
2053                         if (q < 1 + 16 + 1 + 16 + 8 + 1)
2054                                 continue;
2055
2056                         if (de->d_name[q-1-8-16-1] != '-' ||
2057                             de->d_name[q-1-8-16-1-16-1] != '@')
2058                                 continue;
2059
2060                         p = strdup(de->d_name);
2061                         if (!p) {
2062                                 r = -ENOMEM;
2063                                 goto finish;
2064                         }
2065
2066                         if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2067                                 free(p);
2068                                 continue;
2069                         }
2070
2071                         have_seqnum = false;
2072                 } else
2073                         continue;
2074
2075                 if (n_list >= n_allocated) {
2076                         struct vacuum_info *j;
2077
2078                         n_allocated = MAX(n_allocated * 2U, 8U);
2079                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2080                         if (!j) {
2081                                 free(p);
2082                                 r = -ENOMEM;
2083                                 goto finish;
2084                         }
2085
2086                         list = j;
2087                 }
2088
2089                 list[n_list].filename = p;
2090                 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2091                 list[n_list].seqnum = seqnum;
2092                 list[n_list].realtime = realtime;
2093                 list[n_list].seqnum_id = seqnum_id;
2094                 list[n_list].have_seqnum = have_seqnum;
2095
2096                 sum += list[n_list].usage;
2097
2098                 n_list ++;
2099         }
2100
2101         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2102
2103         for(i = 0; i < n_list; i++) {
2104                 struct statvfs ss;
2105
2106                 if (fstatvfs(dirfd(d), &ss) < 0) {
2107                         r = -errno;
2108                         goto finish;
2109                 }
2110
2111                 if (sum <= max_use &&
2112                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2113                         break;
2114
2115                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2116                         log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2117                         sum -= list[i].usage;
2118                 } else if (errno != ENOENT)
2119                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2120         }
2121
2122 finish:
2123         for (i = 0; i < n_list; i++)
2124                 free(list[i].filename);
2125
2126         free(list);
2127
2128         if (d)
2129                 closedir(d);
2130
2131         return r;
2132 }
2133
2134 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2135         uint64_t i, n;
2136         uint64_t q, xor_hash = 0;
2137         int r;
2138         EntryItem *items;
2139         dual_timestamp ts;
2140
2141         assert(from);
2142         assert(to);
2143         assert(o);
2144         assert(p);
2145
2146         if (!to->writable)
2147                 return -EPERM;
2148
2149         ts.monotonic = le64toh(o->entry.monotonic);
2150         ts.realtime = le64toh(o->entry.realtime);
2151
2152         if (to->tail_entry_monotonic_valid &&
2153             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2154                 return -EINVAL;
2155
2156         if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2157                 return -EINVAL;
2158
2159         n = journal_file_entry_n_items(o);
2160         items = alloca(sizeof(EntryItem) * n);
2161
2162         for (i = 0; i < n; i++) {
2163                 uint64_t l, h;
2164                 le64_t le_hash;
2165                 size_t t;
2166                 void *data;
2167                 Object *u;
2168
2169                 q = le64toh(o->entry.items[i].object_offset);
2170                 le_hash = o->entry.items[i].hash;
2171
2172                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2173                 if (r < 0)
2174                         return r;
2175
2176                 if (le_hash != o->data.hash)
2177                         return -EBADMSG;
2178
2179                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2180                 t = (size_t) l;
2181
2182                 /* We hit the limit on 32bit machines */
2183                 if ((uint64_t) t != l)
2184                         return -E2BIG;
2185
2186                 if (o->object.flags & OBJECT_COMPRESSED) {
2187 #ifdef HAVE_XZ
2188                         uint64_t rsize;
2189
2190                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2191                                 return -EBADMSG;
2192
2193                         data = from->compress_buffer;
2194                         l = rsize;
2195 #else
2196                         return -EPROTONOSUPPORT;
2197 #endif
2198                 } else
2199                         data = o->data.payload;
2200
2201                 r = journal_file_append_data(to, data, l, &u, &h);
2202                 if (r < 0)
2203                         return r;
2204
2205                 xor_hash ^= le64toh(u->data.hash);
2206                 items[i].object_offset = htole64(h);
2207                 items[i].hash = u->data.hash;
2208
2209                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2210                 if (r < 0)
2211                         return r;
2212         }
2213
2214         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2215 }
2216
2217 void journal_default_metrics(JournalMetrics *m, int fd) {
2218         uint64_t fs_size = 0;
2219         struct statvfs ss;
2220         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2221
2222         assert(m);
2223         assert(fd >= 0);
2224
2225         if (fstatvfs(fd, &ss) >= 0)
2226                 fs_size = ss.f_frsize * ss.f_blocks;
2227
2228         if (m->max_use == (uint64_t) -1) {
2229
2230                 if (fs_size > 0) {
2231                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2232
2233                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2234                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2235
2236                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2237                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2238                 } else
2239                         m->max_use = DEFAULT_MAX_USE_LOWER;
2240         } else {
2241                 m->max_use = PAGE_ALIGN(m->max_use);
2242
2243                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2244                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2245         }
2246
2247         if (m->max_size == (uint64_t) -1) {
2248                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2249
2250                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2251                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2252         } else
2253                 m->max_size = PAGE_ALIGN(m->max_size);
2254
2255         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2256                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2257
2258         if (m->max_size*2 > m->max_use)
2259                 m->max_use = m->max_size*2;
2260
2261         if (m->min_size == (uint64_t) -1)
2262                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2263         else {
2264                 m->min_size = PAGE_ALIGN(m->min_size);
2265
2266                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2267                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2268
2269                 if (m->min_size > m->max_size)
2270                         m->max_size = m->min_size;
2271         }
2272
2273         if (m->keep_free == (uint64_t) -1) {
2274
2275                 if (fs_size > 0) {
2276                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2277
2278                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2279                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2280
2281                 } else
2282                         m->keep_free = DEFAULT_KEEP_FREE;
2283         }
2284
2285         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2286                  format_bytes(a, sizeof(a), m->max_use),
2287                  format_bytes(b, sizeof(b), m->max_size),
2288                  format_bytes(c, sizeof(c), m->min_size),
2289                  format_bytes(d, sizeof(d), m->keep_free));
2290 }
2291
2292 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2293         Object *o;
2294         int r;
2295
2296         assert(f);
2297         assert(from || to);
2298
2299         if (from) {
2300                 r = journal_file_next_entry(f, NULL, 0, DIRECTION_DOWN, &o, NULL);
2301                 if (r <= 0)
2302                         return r;
2303
2304                 *from = le64toh(o->entry.realtime);
2305         }
2306
2307         if (to) {
2308                 r = journal_file_next_entry(f, NULL, 0, DIRECTION_UP, &o, NULL);
2309                 if (r <= 0)
2310                         return r;
2311
2312                 *to = le64toh(o->entry.realtime);
2313         }
2314
2315         return 1;
2316 }
2317
2318 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2319         char t[9+32+1] = "_BOOT_ID=";
2320         Object *o;
2321         uint64_t p;
2322         int r;
2323
2324         assert(f);
2325         assert(from || to);
2326
2327         sd_id128_to_string(boot_id, t + 9);
2328
2329         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2330         if (r <= 0)
2331                 return r;
2332
2333         if (le64toh(o->data.n_entries) <= 0)
2334                 return 0;
2335
2336         if (from) {
2337                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2338                 if (r < 0)
2339                         return r;
2340
2341                 *from = le64toh(o->entry.monotonic);
2342         }
2343
2344         if (to) {
2345                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2346                 if (r < 0)
2347                         return r;
2348
2349                 r = generic_array_get_plus_one(f,
2350                                                le64toh(o->data.entry_offset),
2351                                                le64toh(o->data.entry_array_offset),
2352                                                le64toh(o->data.n_entries)-1,
2353                                                &o, NULL);
2354                 if (r <= 0)
2355                         return r;
2356
2357                 *to = le64toh(o->entry.monotonic);
2358         }
2359
2360         return 1;
2361 }