chiark / gitweb /
journal: expose and make use of cutoff times of journal
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
/* Default payload sizes, in bytes, of the data and field hash table
 * objects created for a new journal file */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
#define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)

/* Size of the mmap() window journal_file_move_to() establishes when
 * less than this is requested */
#define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)

/* Data payloads of at least this many bytes are candidates for
 * compression */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* The eight magic bytes every journal file header starts with */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };

/* Rounds x up to the next multiple of 8 */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
64
65 void journal_file_close(JournalFile *f) {
66         int t;
67
68         assert(f);
69
70         if (f->header) {
71                 if (f->writable)
72                         f->header->state = STATE_OFFLINE;
73
74                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
75         }
76
77         for (t = 0; t < _WINDOW_MAX; t++)
78                 if (f->windows[t].ptr)
79                         munmap(f->windows[t].ptr, f->windows[t].size);
80
81         if (f->fd >= 0)
82                 close_nointr_nofail(f->fd);
83
84         free(f->path);
85
86 #ifdef HAVE_XZ
87         free(f->compress_buffer);
88 #endif
89
90         free(f);
91 }
92
93 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
94         Header h;
95         ssize_t k;
96         int r;
97
98         assert(f);
99
100         zero(h);
101         memcpy(h.signature, signature, 8);
102         h.header_size = htole64(ALIGN64(sizeof(h)));
103
104         r = sd_id128_randomize(&h.file_id);
105         if (r < 0)
106                 return r;
107
108         if (template) {
109                 h.seqnum_id = template->header->seqnum_id;
110                 h.seqnum = template->header->seqnum;
111         } else
112                 h.seqnum_id = h.file_id;
113
114         k = pwrite(f->fd, &h, sizeof(h), 0);
115         if (k < 0)
116                 return -errno;
117
118         if (k != sizeof(h))
119                 return -EIO;
120
121         return 0;
122 }
123
124 static int journal_file_refresh_header(JournalFile *f) {
125         int r;
126         sd_id128_t boot_id;
127
128         assert(f);
129
130         r = sd_id128_get_machine(&f->header->machine_id);
131         if (r < 0)
132                 return r;
133
134         r = sd_id128_get_boot(&boot_id);
135         if (r < 0)
136                 return r;
137
138         if (sd_id128_equal(boot_id, f->header->boot_id))
139                 f->tail_entry_monotonic_valid = true;
140
141         f->header->boot_id = boot_id;
142
143         f->header->state = STATE_ONLINE;
144
145         __sync_synchronize();
146
147         return 0;
148 }
149
150 static int journal_file_verify_header(JournalFile *f) {
151         assert(f);
152
153         if (memcmp(f->header, signature, 8))
154                 return -EBADMSG;
155
156 #ifdef HAVE_XZ
157         if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
158                 return -EPROTONOSUPPORT;
159 #else
160         if (f->header->incompatible_flags != 0)
161                 return -EPROTONOSUPPORT;
162 #endif
163
164         if (f->header->header_size != htole64(ALIGN64(sizeof(*(f->header)))))
165                 return -EBADMSG;
166
167         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
168                 return -ENODATA;
169
170         if (f->writable) {
171                 uint8_t state;
172                 sd_id128_t machine_id;
173                 int r;
174
175                 r = sd_id128_get_machine(&machine_id);
176                 if (r < 0)
177                         return r;
178
179                 if (!sd_id128_equal(machine_id, f->header->machine_id))
180                         return -EHOSTDOWN;
181
182                 state = f->header->state;
183
184                 if (state == STATE_ONLINE)
185                         log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
186                         /* FIXME: immediately rotate */
187                 else if (state == STATE_ARCHIVED)
188                         return -ESHUTDOWN;
189                 else if (state != STATE_OFFLINE)
190                         log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
191         }
192
193         return 0;
194 }
195
196 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
197         uint64_t old_size, new_size;
198         int r;
199
200         assert(f);
201
202         /* We assume that this file is not sparse, and we know that
203          * for sure, since we always call posix_fallocate()
204          * ourselves */
205
206         old_size =
207                 le64toh(f->header->header_size) +
208                 le64toh(f->header->arena_size);
209
210         new_size = PAGE_ALIGN(offset + size);
211         if (new_size < le64toh(f->header->header_size))
212                 new_size = le64toh(f->header->header_size);
213
214         if (new_size <= old_size)
215                 return 0;
216
217         if (f->metrics.max_size > 0 &&
218             new_size > f->metrics.max_size)
219                 return -E2BIG;
220
221         if (new_size > f->metrics.min_size &&
222             f->metrics.keep_free > 0) {
223                 struct statvfs svfs;
224
225                 if (fstatvfs(f->fd, &svfs) >= 0) {
226                         uint64_t available;
227
228                         available = svfs.f_bfree * svfs.f_bsize;
229
230                         if (available >= f->metrics.keep_free)
231                                 available -= f->metrics.keep_free;
232                         else
233                                 available = 0;
234
235                         if (new_size - old_size > available)
236                                 return -E2BIG;
237                 }
238         }
239
240         /* Note that the glibc fallocate() fallback is very
241            inefficient, hence we try to minimize the allocation area
242            as we can. */
243         r = posix_fallocate(f->fd, old_size, new_size - old_size);
244         if (r != 0)
245                 return -r;
246
247         if (fstat(f->fd, &f->last_stat) < 0)
248                 return -errno;
249
250         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
251
252         return 0;
253 }
254
255 static int journal_file_map(
256                 JournalFile *f,
257                 uint64_t offset,
258                 uint64_t size,
259                 void **_window,
260                 uint64_t *_woffset,
261                 uint64_t *_wsize,
262                 void **ret) {
263
264         uint64_t woffset, wsize;
265         void *window;
266
267         assert(f);
268         assert(size > 0);
269         assert(ret);
270
271         woffset = offset & ~((uint64_t) page_size() - 1ULL);
272         wsize = size + (offset - woffset);
273         wsize = PAGE_ALIGN(wsize);
274
275         /* Avoid SIGBUS on invalid accesses */
276         if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
277                 return -EADDRNOTAVAIL;
278
279         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
280         if (window == MAP_FAILED)
281                 return -errno;
282
283         if (_window)
284                 *_window = window;
285
286         if (_woffset)
287                 *_woffset = woffset;
288
289         if (_wsize)
290                 *_wsize = wsize;
291
292         *ret = (uint8_t*) window + (offset - woffset);
293
294         return 0;
295 }
296
297 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
298         void *p = NULL;
299         uint64_t delta;
300         int r;
301         Window *w;
302
303         assert(f);
304         assert(ret);
305         assert(wt >= 0);
306         assert(wt < _WINDOW_MAX);
307
308         if (offset + size > (uint64_t) f->last_stat.st_size) {
309                 /* Hmm, out of range? Let's refresh the fstat() data
310                  * first, before we trust that check. */
311
312                 if (fstat(f->fd, &f->last_stat) < 0 ||
313                     offset + size > (uint64_t) f->last_stat.st_size)
314                         return -EADDRNOTAVAIL;
315         }
316
317         w = f->windows + wt;
318
319         if (_likely_(w->ptr &&
320                      w->offset <= offset &&
321                      w->offset + w->size >= offset + size)) {
322
323                 *ret = (uint8_t*) w->ptr + (offset - w->offset);
324                 return 0;
325         }
326
327         if (w->ptr) {
328                 if (munmap(w->ptr, w->size) < 0)
329                         return -errno;
330
331                 w->ptr = NULL;
332                 w->size = w->offset = 0;
333         }
334
335         if (size < DEFAULT_WINDOW_SIZE) {
336                 /* If the default window size is larger then what was
337                  * asked for extend the mapping a bit in the hope to
338                  * minimize needed remappings later on. We add half
339                  * the window space before and half behind the
340                  * requested mapping */
341
342                 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
343
344                 if (delta > offset)
345                         delta = offset;
346
347                 offset -= delta;
348                 size = DEFAULT_WINDOW_SIZE;
349         } else
350                 delta = 0;
351
352         if (offset + size > (uint64_t) f->last_stat.st_size)
353                 size = (uint64_t) f->last_stat.st_size - offset;
354
355         if (size <= 0)
356                 return -EADDRNOTAVAIL;
357
358         r = journal_file_map(f,
359                              offset, size,
360                              &w->ptr, &w->offset, &w->size,
361                              &p);
362
363         if (r < 0)
364                 return r;
365
366         *ret = (uint8_t*) p + delta;
367         return 0;
368 }
369
370 static bool verify_hash(Object *o) {
371         uint64_t h1, h2;
372
373         assert(o);
374
375         if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
376                 h1 = le64toh(o->data.hash);
377                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
378         } else if (o->object.type == OBJECT_FIELD) {
379                 h1 = le64toh(o->field.hash);
380                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
381         } else
382                 return true;
383
384         return h1 == h2;
385 }
386
387 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
388         int r;
389         void *t;
390         Object *o;
391         uint64_t s;
392
393         assert(f);
394         assert(ret);
395         assert(type < _OBJECT_TYPE_MAX);
396
397         r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
398         if (r < 0)
399                 return r;
400
401         o = (Object*) t;
402         s = le64toh(o->object.size);
403
404         if (s < sizeof(ObjectHeader))
405                 return -EBADMSG;
406
407         if (type >= 0 && o->object.type != type)
408                 return -EBADMSG;
409
410         if (s > sizeof(ObjectHeader)) {
411                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
412                 if (r < 0)
413                         return r;
414
415                 o = (Object*) t;
416         }
417
418         if (!verify_hash(o))
419                 return -EBADMSG;
420
421         *ret = o;
422         return 0;
423 }
424
425 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
426         uint64_t r;
427
428         assert(f);
429
430         r = le64toh(f->header->seqnum) + 1;
431
432         if (seqnum) {
433                 /* If an external seqnum counter was passed, we update
434                  * both the local and the external one, and set it to
435                  * the maximum of both */
436
437                 if (*seqnum + 1 > r)
438                         r = *seqnum + 1;
439
440                 *seqnum = r;
441         }
442
443         f->header->seqnum = htole64(r);
444
445         if (f->header->first_seqnum == 0)
446                 f->header->first_seqnum = htole64(r);
447
448         return r;
449 }
450
451 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
452         int r;
453         uint64_t p;
454         Object *tail, *o;
455         void *t;
456
457         assert(f);
458         assert(size >= sizeof(ObjectHeader));
459         assert(offset);
460         assert(ret);
461
462         p = le64toh(f->header->tail_object_offset);
463         if (p == 0)
464                 p = le64toh(f->header->header_size);
465         else {
466                 r = journal_file_move_to_object(f, -1, p, &tail);
467                 if (r < 0)
468                         return r;
469
470                 p += ALIGN64(le64toh(tail->object.size));
471         }
472
473         r = journal_file_allocate(f, p, size);
474         if (r < 0)
475                 return r;
476
477         r = journal_file_move_to(f, type, p, size, &t);
478         if (r < 0)
479                 return r;
480
481         o = (Object*) t;
482
483         zero(o->object);
484         o->object.type = type;
485         o->object.size = htole64(size);
486
487         f->header->tail_object_offset = htole64(p);
488         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
489
490         *ret = o;
491         *offset = p;
492
493         return 0;
494 }
495
496 static int journal_file_setup_data_hash_table(JournalFile *f) {
497         uint64_t s, p;
498         Object *o;
499         int r;
500
501         assert(f);
502
503         s = DEFAULT_DATA_HASH_TABLE_SIZE;
504         r = journal_file_append_object(f,
505                                        OBJECT_DATA_HASH_TABLE,
506                                        offsetof(Object, hash_table.items) + s,
507                                        &o, &p);
508         if (r < 0)
509                 return r;
510
511         memset(o->hash_table.items, 0, s);
512
513         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
514         f->header->data_hash_table_size = htole64(s);
515
516         return 0;
517 }
518
519 static int journal_file_setup_field_hash_table(JournalFile *f) {
520         uint64_t s, p;
521         Object *o;
522         int r;
523
524         assert(f);
525
526         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
527         r = journal_file_append_object(f,
528                                        OBJECT_FIELD_HASH_TABLE,
529                                        offsetof(Object, hash_table.items) + s,
530                                        &o, &p);
531         if (r < 0)
532                 return r;
533
534         memset(o->hash_table.items, 0, s);
535
536         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
537         f->header->field_hash_table_size = htole64(s);
538
539         return 0;
540 }
541
542 static int journal_file_map_data_hash_table(JournalFile *f) {
543         uint64_t s, p;
544         void *t;
545         int r;
546
547         assert(f);
548
549         p = le64toh(f->header->data_hash_table_offset);
550         s = le64toh(f->header->data_hash_table_size);
551
552         r = journal_file_move_to(f,
553                                  WINDOW_DATA_HASH_TABLE,
554                                  p, s,
555                                  &t);
556         if (r < 0)
557                 return r;
558
559         f->data_hash_table = t;
560         return 0;
561 }
562
563 static int journal_file_map_field_hash_table(JournalFile *f) {
564         uint64_t s, p;
565         void *t;
566         int r;
567
568         assert(f);
569
570         p = le64toh(f->header->field_hash_table_offset);
571         s = le64toh(f->header->field_hash_table_size);
572
573         r = journal_file_move_to(f,
574                                  WINDOW_FIELD_HASH_TABLE,
575                                  p, s,
576                                  &t);
577         if (r < 0)
578                 return r;
579
580         f->field_hash_table = t;
581         return 0;
582 }
583
584 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
585         uint64_t p, h;
586         int r;
587
588         assert(f);
589         assert(o);
590         assert(offset > 0);
591         assert(o->object.type == OBJECT_DATA);
592
593         /* This might alter the window we are looking at */
594
595         o->data.next_hash_offset = o->data.next_field_offset = 0;
596         o->data.entry_offset = o->data.entry_array_offset = 0;
597         o->data.n_entries = 0;
598
599         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
600         p = le64toh(f->data_hash_table[h].tail_hash_offset);
601         if (p == 0) {
602                 /* Only entry in the hash table is easy */
603                 f->data_hash_table[h].head_hash_offset = htole64(offset);
604         } else {
605                 /* Move back to the previous data object, to patch in
606                  * pointer */
607
608                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
609                 if (r < 0)
610                         return r;
611
612                 o->data.next_hash_offset = htole64(offset);
613         }
614
615         f->data_hash_table[h].tail_hash_offset = htole64(offset);
616
617         return 0;
618 }
619
620 int journal_file_find_data_object_with_hash(
621                 JournalFile *f,
622                 const void *data, uint64_t size, uint64_t hash,
623                 Object **ret, uint64_t *offset) {
624
625         uint64_t p, osize, h;
626         int r;
627
628         assert(f);
629         assert(data || size == 0);
630
631         osize = offsetof(Object, data.payload) + size;
632
633         if (f->header->data_hash_table_size == 0)
634                 return -EBADMSG;
635
636         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
637         p = le64toh(f->data_hash_table[h].head_hash_offset);
638
639         while (p > 0) {
640                 Object *o;
641
642                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
643                 if (r < 0)
644                         return r;
645
646                 if (le64toh(o->data.hash) != hash)
647                         goto next;
648
649                 if (o->object.flags & OBJECT_COMPRESSED) {
650 #ifdef HAVE_XZ
651                         uint64_t l, rsize;
652
653                         l = le64toh(o->object.size);
654                         if (l <= offsetof(Object, data.payload))
655                                 return -EBADMSG;
656
657                         l -= offsetof(Object, data.payload);
658
659                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
660                                 return -EBADMSG;
661
662                         if (rsize == size &&
663                             memcmp(f->compress_buffer, data, size) == 0) {
664
665                                 if (ret)
666                                         *ret = o;
667
668                                 if (offset)
669                                         *offset = p;
670
671                                 return 1;
672                         }
673 #else
674                         return -EPROTONOSUPPORT;
675 #endif
676
677                 } else if (le64toh(o->object.size) == osize &&
678                            memcmp(o->data.payload, data, size) == 0) {
679
680                         if (ret)
681                                 *ret = o;
682
683                         if (offset)
684                                 *offset = p;
685
686                         return 1;
687                 }
688
689         next:
690                 p = le64toh(o->data.next_hash_offset);
691         }
692
693         return 0;
694 }
695
696 int journal_file_find_data_object(
697                 JournalFile *f,
698                 const void *data, uint64_t size,
699                 Object **ret, uint64_t *offset) {
700
701         uint64_t hash;
702
703         assert(f);
704         assert(data || size == 0);
705
706         hash = hash64(data, size);
707
708         return journal_file_find_data_object_with_hash(f,
709                                                        data, size, hash,
710                                                        ret, offset);
711 }
712
713 static int journal_file_append_data(
714                 JournalFile *f,
715                 const void *data, uint64_t size,
716                 Object **ret, uint64_t *offset) {
717
718         uint64_t hash, p;
719         uint64_t osize;
720         Object *o;
721         int r;
722         bool compressed = false;
723
724         assert(f);
725         assert(data || size == 0);
726
727         hash = hash64(data, size);
728
729         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
730         if (r < 0)
731                 return r;
732         else if (r > 0) {
733
734                 if (ret)
735                         *ret = o;
736
737                 if (offset)
738                         *offset = p;
739
740                 return 0;
741         }
742
743         osize = offsetof(Object, data.payload) + size;
744         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
745         if (r < 0)
746                 return r;
747
748         o->data.hash = htole64(hash);
749
750 #ifdef HAVE_XZ
751         if (f->compress &&
752             size >= COMPRESSION_SIZE_THRESHOLD) {
753                 uint64_t rsize;
754
755                 compressed = compress_blob(data, size, o->data.payload, &rsize);
756
757                 if (compressed) {
758                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
759                         o->object.flags |= OBJECT_COMPRESSED;
760
761                         f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
762
763                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
764                 }
765         }
766 #endif
767
768         if (!compressed)
769                 memcpy(o->data.payload, data, size);
770
771         r = journal_file_link_data(f, o, p, hash);
772         if (r < 0)
773                 return r;
774
775         /* The linking might have altered the window, so let's
776          * refresh our pointer */
777         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
778         if (r < 0)
779                 return r;
780
781         if (ret)
782                 *ret = o;
783
784         if (offset)
785                 *offset = p;
786
787         return 0;
788 }
789
790 uint64_t journal_file_entry_n_items(Object *o) {
791         assert(o);
792         assert(o->object.type == OBJECT_ENTRY);
793
794         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
795 }
796
797 static uint64_t journal_file_entry_array_n_items(Object *o) {
798         assert(o);
799         assert(o->object.type == OBJECT_ENTRY_ARRAY);
800
801         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
802 }
803
804 static int link_entry_into_array(JournalFile *f,
805                                  le64_t *first,
806                                  le64_t *idx,
807                                  uint64_t p) {
808         int r;
809         uint64_t n = 0, ap = 0, q, i, a, hidx;
810         Object *o;
811
812         assert(f);
813         assert(first);
814         assert(idx);
815         assert(p > 0);
816
817         a = le64toh(*first);
818         i = hidx = le64toh(*idx);
819         while (a > 0) {
820
821                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
822                 if (r < 0)
823                         return r;
824
825                 n = journal_file_entry_array_n_items(o);
826                 if (i < n) {
827                         o->entry_array.items[i] = htole64(p);
828                         *idx = htole64(hidx + 1);
829                         return 0;
830                 }
831
832                 i -= n;
833                 ap = a;
834                 a = le64toh(o->entry_array.next_entry_array_offset);
835         }
836
837         if (hidx > n)
838                 n = (hidx+1) * 2;
839         else
840                 n = n * 2;
841
842         if (n < 4)
843                 n = 4;
844
845         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
846                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
847                                        &o, &q);
848         if (r < 0)
849                 return r;
850
851         o->entry_array.items[i] = htole64(p);
852
853         if (ap == 0)
854                 *first = htole64(q);
855         else {
856                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
857                 if (r < 0)
858                         return r;
859
860                 o->entry_array.next_entry_array_offset = htole64(q);
861         }
862
863         *idx = htole64(hidx + 1);
864
865         return 0;
866 }
867
868 static int link_entry_into_array_plus_one(JournalFile *f,
869                                           le64_t *extra,
870                                           le64_t *first,
871                                           le64_t *idx,
872                                           uint64_t p) {
873
874         int r;
875
876         assert(f);
877         assert(extra);
878         assert(first);
879         assert(idx);
880         assert(p > 0);
881
882         if (*idx == 0)
883                 *extra = htole64(p);
884         else {
885                 le64_t i;
886
887                 i = htole64(le64toh(*idx) - 1);
888                 r = link_entry_into_array(f, first, &i, p);
889                 if (r < 0)
890                         return r;
891         }
892
893         *idx = htole64(le64toh(*idx) + 1);
894         return 0;
895 }
896
897 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
898         uint64_t p;
899         int r;
900         assert(f);
901         assert(o);
902         assert(offset > 0);
903
904         p = le64toh(o->entry.items[i].object_offset);
905         if (p == 0)
906                 return -EINVAL;
907
908         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
909         if (r < 0)
910                 return r;
911
912         return link_entry_into_array_plus_one(f,
913                                               &o->data.entry_offset,
914                                               &o->data.entry_array_offset,
915                                               &o->data.n_entries,
916                                               offset);
917 }
918
919 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
920         uint64_t n, i;
921         int r;
922
923         assert(f);
924         assert(o);
925         assert(offset > 0);
926         assert(o->object.type == OBJECT_ENTRY);
927
928         __sync_synchronize();
929
930         /* Link up the entry itself */
931         r = link_entry_into_array(f,
932                                   &f->header->entry_array_offset,
933                                   &f->header->n_entries,
934                                   offset);
935         if (r < 0)
936                 return r;
937
938         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
939
940         if (f->header->head_entry_realtime == 0)
941                 f->header->head_entry_realtime = o->entry.realtime;
942
943         f->header->tail_entry_realtime = o->entry.realtime;
944         f->header->tail_entry_monotonic = o->entry.monotonic;
945
946         f->tail_entry_monotonic_valid = true;
947
948         /* Link up the items */
949         n = journal_file_entry_n_items(o);
950         for (i = 0; i < n; i++) {
951                 r = journal_file_link_entry_item(f, o, offset, i);
952                 if (r < 0)
953                         return r;
954         }
955
956         return 0;
957 }
958
959 static int journal_file_append_entry_internal(
960                 JournalFile *f,
961                 const dual_timestamp *ts,
962                 uint64_t xor_hash,
963                 const EntryItem items[], unsigned n_items,
964                 uint64_t *seqnum,
965                 Object **ret, uint64_t *offset) {
966         uint64_t np;
967         uint64_t osize;
968         Object *o;
969         int r;
970
971         assert(f);
972         assert(items || n_items == 0);
973         assert(ts);
974
975         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
976
977         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
978         if (r < 0)
979                 return r;
980
981         o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
982         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
983         o->entry.realtime = htole64(ts->realtime);
984         o->entry.monotonic = htole64(ts->monotonic);
985         o->entry.xor_hash = htole64(xor_hash);
986         o->entry.boot_id = f->header->boot_id;
987
988         r = journal_file_link_entry(f, o, np);
989         if (r < 0)
990                 return r;
991
992         if (ret)
993                 *ret = o;
994
995         if (offset)
996                 *offset = np;
997
998         return 0;
999 }
1000
1001 void journal_file_post_change(JournalFile *f) {
1002         assert(f);
1003
1004         /* inotify() does not receive IN_MODIFY events from file
1005          * accesses done via mmap(). After each access we hence
1006          * trigger IN_MODIFY by truncating the journal file to its
1007          * current size which triggers IN_MODIFY. */
1008
1009         __sync_synchronize();
1010
1011         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1012                 log_error("Failed to to truncate file to its own size: %m");
1013 }
1014
1015 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1016         unsigned i;
1017         EntryItem *items;
1018         int r;
1019         uint64_t xor_hash = 0;
1020         struct dual_timestamp _ts;
1021
1022         assert(f);
1023         assert(iovec || n_iovec == 0);
1024
1025         if (!f->writable)
1026                 return -EPERM;
1027
1028         if (!ts) {
1029                 dual_timestamp_get(&_ts);
1030                 ts = &_ts;
1031         }
1032
1033         if (f->tail_entry_monotonic_valid &&
1034             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1035                 return -EINVAL;
1036
1037         items = alloca(sizeof(EntryItem) * n_iovec);
1038
1039         for (i = 0; i < n_iovec; i++) {
1040                 uint64_t p;
1041                 Object *o;
1042
1043                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1044                 if (r < 0)
1045                         return r;
1046
1047                 xor_hash ^= le64toh(o->data.hash);
1048                 items[i].object_offset = htole64(p);
1049                 items[i].hash = o->data.hash;
1050         }
1051
1052         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1053
1054         journal_file_post_change(f);
1055
1056         return r;
1057 }
1058
1059 static int generic_array_get(JournalFile *f,
1060                              uint64_t first,
1061                              uint64_t i,
1062                              Object **ret, uint64_t *offset) {
1063
1064         Object *o;
1065         uint64_t p = 0, a;
1066         int r;
1067
1068         assert(f);
1069
1070         a = first;
1071         while (a > 0) {
1072                 uint64_t n;
1073
1074                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1075                 if (r < 0)
1076                         return r;
1077
1078                 n = journal_file_entry_array_n_items(o);
1079                 if (i < n) {
1080                         p = le64toh(o->entry_array.items[i]);
1081                         break;
1082                 }
1083
1084                 i -= n;
1085                 a = le64toh(o->entry_array.next_entry_array_offset);
1086         }
1087
1088         if (a <= 0 || p <= 0)
1089                 return 0;
1090
1091         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1092         if (r < 0)
1093                 return r;
1094
1095         if (ret)
1096                 *ret = o;
1097
1098         if (offset)
1099                 *offset = p;
1100
1101         return 1;
1102 }
1103
1104 static int generic_array_get_plus_one(JournalFile *f,
1105                                       uint64_t extra,
1106                                       uint64_t first,
1107                                       uint64_t i,
1108                                       Object **ret, uint64_t *offset) {
1109
1110         Object *o;
1111
1112         assert(f);
1113
1114         if (i == 0) {
1115                 int r;
1116
1117                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1118                 if (r < 0)
1119                         return r;
1120
1121                 if (ret)
1122                         *ret = o;
1123
1124                 if (offset)
1125                         *offset = extra;
1126
1127                 return 1;
1128         }
1129
1130         return generic_array_get(f, first, i-1, ret, offset);
1131 }
1132
/* Results of the test_object_*() callbacks used for bisection. Each
 * callback compares the object at a given offset against a needle. */
enum {
        TEST_FOUND,     /* the object's value equals the needle */
        TEST_LEFT,      /* the object's value is smaller than the needle */
        TEST_RIGHT      /* the object's value is larger than the needle */
};
1138
/* Bisects the chain of entry array objects starting at offset 'first',
 * which holds 'n' items in total, for 'needle'.
 *
 * test_object() is called with the offset stored in an array item and
 * the needle; it returns one of TEST_FOUND, TEST_LEFT, TEST_RIGHT, or
 * a negative error. An exact match is treated as TEST_RIGHT for
 * DIRECTION_DOWN and as TEST_LEFT otherwise.
 *
 * Returns 1 if an entry was located, in which case the entry object,
 * its offset and its index within the whole chain are stored in *ret,
 * *offset and *idx (each only if non-NULL). Returns 0 if nothing was
 * located, -EBADMSG if an item with offset 0 is hit, or the negative
 * error of a failing helper. */
static int generic_array_bisect(JournalFile *f,
                                uint64_t first,
                                uint64_t n,
                                uint64_t needle,
                                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                                direction_t direction,
                                Object **ret,
                                uint64_t *offset,
                                uint64_t *idx) {

        /* t counts the items of the arrays already skipped, i is the
         * index within the current array, last_p is the offset held
         * by the last item of the previously skipped array */
        uint64_t a, p, t = 0, i = 0, last_p = 0;
        bool subtract_one = false;
        Object *o, *array = NULL;
        int r;

        assert(f);
        assert(test_object);

        a = first;
        while (a > 0) {
                uint64_t left, right, k, lp;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
                if (r < 0)
                        return r;

                /* Only look at as many items of this array as are
                 * still left of the n we were asked to search */
                k = journal_file_entry_array_n_items(array);
                right = MIN(k, n);
                if (right <= 0)
                        return 0;

                /* First check the last item: this tells us whether
                 * the match is in this array or in a later one */
                i = right - 1;
                lp = p = le64toh(array->entry_array.items[i]);
                if (p <= 0)
                        return -EBADMSG;

                r = test_object(f, p, needle);
                if (r < 0)
                        return r;

                if (r == TEST_FOUND)
                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                if (r == TEST_RIGHT) {
                        /* The match is in this array, bisect it */
                        left = 0;
                        right -= 1;
                        for (;;) {
                                if (left == right) {
                                        /* For DIRECTION_UP we report
                                         * the item before this one */
                                        if (direction == DIRECTION_UP)
                                                subtract_one = true;

                                        i = left;
                                        goto found;
                                }

                                assert(left < right);

                                i = (left + right) / 2;
                                p = le64toh(array->entry_array.items[i]);
                                if (p <= 0)
                                        return -EBADMSG;

                                r = test_object(f, p, needle);
                                if (r < 0)
                                        return r;

                                if (r == TEST_FOUND)
                                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                if (r == TEST_RIGHT)
                                        right = i;
                                else
                                        left = i + 1;
                        }
                }

                /* This array has more slots than items were left to
                 * search, so there is nothing further to look at */
                if (k > n)
                        return 0;

                /* Remember the last item of this array, in case the
                 * result turns out to be the predecessor of the first
                 * item of the next array */
                last_p = lp;

                n -= k;
                t += k;
                a = le64toh(array->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        /* There is no item before the very first one */
        if (subtract_one && t == 0 && i == 0)
                return 0;

        if (subtract_one && i == 0)
                p = last_p;
        else if (subtract_one)
                p = le64toh(array->entry_array.items[i-1]);
        else
                p = le64toh(array->entry_array.items[i]);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (offset)
                *offset = p;

        if (idx)
                *idx = t + i - (subtract_one ? 1 : 0);

        return 1;
}
1253
1254 static int generic_array_bisect_plus_one(JournalFile *f,
1255                                          uint64_t extra,
1256                                          uint64_t first,
1257                                          uint64_t n,
1258                                          uint64_t needle,
1259                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1260                                          direction_t direction,
1261                                          Object **ret,
1262                                          uint64_t *offset,
1263                                          uint64_t *idx) {
1264
1265         int r;
1266
1267         assert(f);
1268         assert(test_object);
1269
1270         if (n <= 0)
1271                 return 0;
1272
1273         /* This bisects the array in object 'first', but first checks
1274          * an extra  */
1275         r = test_object(f, extra, needle);
1276         if (r < 0)
1277                 return r;
1278         else if (r == TEST_FOUND) {
1279                 Object *o;
1280
1281                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1282                 if (r < 0)
1283                         return r;
1284
1285                 if (ret)
1286                         *ret = o;
1287
1288                 if (offset)
1289                         *offset = extra;
1290
1291                 if (idx)
1292                         *idx = 0;
1293
1294                 return 1;
1295         } else if (r == TEST_RIGHT)
1296                 return 0;
1297
1298         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1299
1300         if (r > 0)
1301                 (*idx) ++;
1302
1303         return r;
1304 }
1305
1306 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1307         Object *o;
1308         int r;
1309
1310         assert(f);
1311         assert(p > 0);
1312
1313         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1314         if (r < 0)
1315                 return r;
1316
1317         if (le64toh(o->entry.seqnum) == needle)
1318                 return TEST_FOUND;
1319         else if (le64toh(o->entry.seqnum) < needle)
1320                 return TEST_LEFT;
1321         else
1322                 return TEST_RIGHT;
1323 }
1324
1325 int journal_file_move_to_entry_by_seqnum(
1326                 JournalFile *f,
1327                 uint64_t seqnum,
1328                 direction_t direction,
1329                 Object **ret,
1330                 uint64_t *offset) {
1331
1332         return generic_array_bisect(f,
1333                                     le64toh(f->header->entry_array_offset),
1334                                     le64toh(f->header->n_entries),
1335                                     seqnum,
1336                                     test_object_seqnum,
1337                                     direction,
1338                                     ret, offset, NULL);
1339 }
1340
1341 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1342         Object *o;
1343         int r;
1344
1345         assert(f);
1346         assert(p > 0);
1347
1348         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1349         if (r < 0)
1350                 return r;
1351
1352         if (le64toh(o->entry.realtime) == needle)
1353                 return TEST_FOUND;
1354         else if (le64toh(o->entry.realtime) < needle)
1355                 return TEST_LEFT;
1356         else
1357                 return TEST_RIGHT;
1358 }
1359
1360 int journal_file_move_to_entry_by_realtime(
1361                 JournalFile *f,
1362                 uint64_t realtime,
1363                 direction_t direction,
1364                 Object **ret,
1365                 uint64_t *offset) {
1366
1367         return generic_array_bisect(f,
1368                                     le64toh(f->header->entry_array_offset),
1369                                     le64toh(f->header->n_entries),
1370                                     realtime,
1371                                     test_object_realtime,
1372                                     direction,
1373                                     ret, offset, NULL);
1374 }
1375
1376 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1377         Object *o;
1378         int r;
1379
1380         assert(f);
1381         assert(p > 0);
1382
1383         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1384         if (r < 0)
1385                 return r;
1386
1387         if (le64toh(o->entry.monotonic) == needle)
1388                 return TEST_FOUND;
1389         else if (le64toh(o->entry.monotonic) < needle)
1390                 return TEST_LEFT;
1391         else
1392                 return TEST_RIGHT;
1393 }
1394
1395 int journal_file_move_to_entry_by_monotonic(
1396                 JournalFile *f,
1397                 sd_id128_t boot_id,
1398                 uint64_t monotonic,
1399                 direction_t direction,
1400                 Object **ret,
1401                 uint64_t *offset) {
1402
1403         char t[8+32+1] = "_BOOT_ID=";
1404         Object *o;
1405         int r;
1406
1407         sd_id128_to_string(boot_id, t + 8);
1408
1409         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1410         if (r < 0)
1411                 return r;
1412         else if (r == 0)
1413                 return -ENOENT;
1414
1415         return generic_array_bisect_plus_one(f,
1416                                              le64toh(o->data.entry_offset),
1417                                              le64toh(o->data.entry_array_offset),
1418                                              le64toh(o->data.n_entries),
1419                                              monotonic,
1420                                              test_object_monotonic,
1421                                              direction,
1422                                              ret, offset, NULL);
1423 }
1424
1425 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1426         assert(f);
1427         assert(p > 0);
1428
1429         if (p == needle)
1430                 return TEST_FOUND;
1431         else if (p < needle)
1432                 return TEST_LEFT;
1433         else
1434                 return TEST_RIGHT;
1435 }
1436
1437 int journal_file_next_entry(
1438                 JournalFile *f,
1439                 Object *o, uint64_t p,
1440                 direction_t direction,
1441                 Object **ret, uint64_t *offset) {
1442
1443         uint64_t i, n;
1444         int r;
1445
1446         assert(f);
1447         assert(p > 0 || !o);
1448
1449         n = le64toh(f->header->n_entries);
1450         if (n <= 0)
1451                 return 0;
1452
1453         if (!o)
1454                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1455         else {
1456                 if (o->object.type != OBJECT_ENTRY)
1457                         return -EINVAL;
1458
1459                 r = generic_array_bisect(f,
1460                                          le64toh(f->header->entry_array_offset),
1461                                          le64toh(f->header->n_entries),
1462                                          p,
1463                                          test_object_offset,
1464                                          DIRECTION_DOWN,
1465                                          NULL, NULL,
1466                                          &i);
1467                 if (r <= 0)
1468                         return r;
1469
1470                 if (direction == DIRECTION_DOWN) {
1471                         if (i >= n - 1)
1472                                 return 0;
1473
1474                         i++;
1475                 } else {
1476                         if (i <= 0)
1477                                 return 0;
1478
1479                         i--;
1480                 }
1481         }
1482
1483         /* And jump to it */
1484         return generic_array_get(f,
1485                                  le64toh(f->header->entry_array_offset),
1486                                  i,
1487                                  ret, offset);
1488 }
1489
1490 int journal_file_skip_entry(
1491                 JournalFile *f,
1492                 Object *o, uint64_t p,
1493                 int64_t skip,
1494                 Object **ret, uint64_t *offset) {
1495
1496         uint64_t i, n;
1497         int r;
1498
1499         assert(f);
1500         assert(o);
1501         assert(p > 0);
1502
1503         if (o->object.type != OBJECT_ENTRY)
1504                 return -EINVAL;
1505
1506         r = generic_array_bisect(f,
1507                                  le64toh(f->header->entry_array_offset),
1508                                  le64toh(f->header->n_entries),
1509                                  p,
1510                                  test_object_offset,
1511                                  DIRECTION_DOWN,
1512                                  NULL, NULL,
1513                                  &i);
1514         if (r <= 0)
1515                 return r;
1516
1517         /* Calculate new index */
1518         if (skip < 0) {
1519                 if ((uint64_t) -skip >= i)
1520                         i = 0;
1521                 else
1522                         i = i - (uint64_t) -skip;
1523         } else
1524                 i  += (uint64_t) skip;
1525
1526         n = le64toh(f->header->n_entries);
1527         if (n <= 0)
1528                 return -EBADMSG;
1529
1530         if (i >= n)
1531                 i = n-1;
1532
1533         return generic_array_get(f,
1534                                  le64toh(f->header->entry_array_offset),
1535                                  i,
1536                                  ret, offset);
1537 }
1538
1539 int journal_file_next_entry_for_data(
1540                 JournalFile *f,
1541                 Object *o, uint64_t p,
1542                 uint64_t data_offset,
1543                 direction_t direction,
1544                 Object **ret, uint64_t *offset) {
1545
1546         uint64_t n, i;
1547         int r;
1548         Object *d;
1549
1550         assert(f);
1551         assert(p > 0 || !o);
1552
1553         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1554         if (r < 0)
1555                 return r;
1556
1557         n = le64toh(d->data.n_entries);
1558         if (n <= 0)
1559                 return n;
1560
1561         if (!o)
1562                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1563         else {
1564                 if (o->object.type != OBJECT_ENTRY)
1565                         return -EINVAL;
1566
1567                 r = generic_array_bisect_plus_one(f,
1568                                                   le64toh(d->data.entry_offset),
1569                                                   le64toh(d->data.entry_array_offset),
1570                                                   le64toh(d->data.n_entries),
1571                                                   p,
1572                                                   test_object_offset,
1573                                                   DIRECTION_DOWN,
1574                                                   NULL, NULL,
1575                                                   &i);
1576
1577                 if (r <= 0)
1578                         return r;
1579
1580                 if (direction == DIRECTION_DOWN) {
1581                         if (i >= n - 1)
1582                                 return 0;
1583
1584                         i++;
1585                 } else {
1586                         if (i <= 0)
1587                                 return 0;
1588
1589                         i--;
1590                 }
1591
1592         }
1593
1594         return generic_array_get_plus_one(f,
1595                                           le64toh(d->data.entry_offset),
1596                                           le64toh(d->data.entry_array_offset),
1597                                           i,
1598                                           ret, offset);
1599 }
1600
1601 int journal_file_move_to_entry_by_seqnum_for_data(
1602                 JournalFile *f,
1603                 uint64_t data_offset,
1604                 uint64_t seqnum,
1605                 direction_t direction,
1606                 Object **ret, uint64_t *offset) {
1607
1608         Object *d;
1609         int r;
1610
1611         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1612         if (r <= 0)
1613                 return r;
1614
1615         return generic_array_bisect_plus_one(f,
1616                                              le64toh(d->data.entry_offset),
1617                                              le64toh(d->data.entry_array_offset),
1618                                              le64toh(d->data.n_entries),
1619                                              seqnum,
1620                                              test_object_seqnum,
1621                                              direction,
1622                                              ret, offset, NULL);
1623 }
1624
1625 int journal_file_move_to_entry_by_realtime_for_data(
1626                 JournalFile *f,
1627                 uint64_t data_offset,
1628                 uint64_t realtime,
1629                 direction_t direction,
1630                 Object **ret, uint64_t *offset) {
1631
1632         Object *d;
1633         int r;
1634
1635         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1636         if (r <= 0)
1637                 return r;
1638
1639         return generic_array_bisect_plus_one(f,
1640                                              le64toh(d->data.entry_offset),
1641                                              le64toh(d->data.entry_array_offset),
1642                                              le64toh(d->data.n_entries),
1643                                              realtime,
1644                                              test_object_realtime,
1645                                              direction,
1646                                              ret, offset, NULL);
1647 }
1648
1649 void journal_file_dump(JournalFile *f) {
1650         char a[33], b[33], c[33];
1651         Object *o;
1652         int r;
1653         uint64_t p;
1654
1655         assert(f);
1656
1657         printf("File Path: %s\n"
1658                "File ID: %s\n"
1659                "Machine ID: %s\n"
1660                "Boot ID: %s\n"
1661                "Arena size: %llu\n"
1662                "Objects: %lu\n"
1663                "Entries: %lu\n",
1664                f->path,
1665                sd_id128_to_string(f->header->file_id, a),
1666                sd_id128_to_string(f->header->machine_id, b),
1667                sd_id128_to_string(f->header->boot_id, c),
1668                (unsigned long long) le64toh(f->header->arena_size),
1669                (unsigned long) le64toh(f->header->n_objects),
1670                (unsigned long) le64toh(f->header->n_entries));
1671
1672         p = le64toh(f->header->header_size);
1673         while (p != 0) {
1674                 r = journal_file_move_to_object(f, -1, p, &o);
1675                 if (r < 0)
1676                         goto fail;
1677
1678                 switch (o->object.type) {
1679
1680                 case OBJECT_UNUSED:
1681                         printf("Type: OBJECT_UNUSED\n");
1682                         break;
1683
1684                 case OBJECT_DATA:
1685                         printf("Type: OBJECT_DATA\n");
1686                         break;
1687
1688                 case OBJECT_ENTRY:
1689                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1690                                (unsigned long long) le64toh(o->entry.seqnum),
1691                                (unsigned long long) le64toh(o->entry.monotonic),
1692                                (unsigned long long) le64toh(o->entry.realtime));
1693                         break;
1694
1695                 case OBJECT_FIELD_HASH_TABLE:
1696                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1697                         break;
1698
1699                 case OBJECT_DATA_HASH_TABLE:
1700                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1701                         break;
1702
1703                 case OBJECT_ENTRY_ARRAY:
1704                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1705                         break;
1706
1707                 case OBJECT_SIGNATURE:
1708                         printf("Type: OBJECT_SIGNATURE\n");
1709                         break;
1710                 }
1711
1712                 if (o->object.flags & OBJECT_COMPRESSED)
1713                         printf("Flags: COMPRESSED\n");
1714
1715                 if (p == le64toh(f->header->tail_object_offset))
1716                         p = 0;
1717                 else
1718                         p = p + ALIGN64(le64toh(o->object.size));
1719         }
1720
1721         return;
1722 fail:
1723         log_error("File corrupt");
1724 }
1725
1726 int journal_file_open(
1727                 const char *fname,
1728                 int flags,
1729                 mode_t mode,
1730                 JournalFile *template,
1731                 JournalFile **ret) {
1732
1733         JournalFile *f;
1734         int r;
1735         bool newly_created = false;
1736
1737         assert(fname);
1738
1739         if ((flags & O_ACCMODE) != O_RDONLY &&
1740             (flags & O_ACCMODE) != O_RDWR)
1741                 return -EINVAL;
1742
1743         if (!endswith(fname, ".journal"))
1744                 return -EINVAL;
1745
1746         f = new0(JournalFile, 1);
1747         if (!f)
1748                 return -ENOMEM;
1749
1750         f->fd = -1;
1751         f->flags = flags;
1752         f->mode = mode;
1753         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1754         f->prot = prot_from_flags(flags);
1755
1756         if (template) {
1757                 f->metrics = template->metrics;
1758                 f->compress = template->compress;
1759         }
1760
1761         f->path = strdup(fname);
1762         if (!f->path) {
1763                 r = -ENOMEM;
1764                 goto fail;
1765         }
1766
1767         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1768         if (f->fd < 0) {
1769                 r = -errno;
1770                 goto fail;
1771         }
1772
1773         if (fstat(f->fd, &f->last_stat) < 0) {
1774                 r = -errno;
1775                 goto fail;
1776         }
1777
1778         if (f->last_stat.st_size == 0 && f->writable) {
1779                 newly_created = true;
1780
1781                 r = journal_file_init_header(f, template);
1782                 if (r < 0)
1783                         goto fail;
1784
1785                 if (fstat(f->fd, &f->last_stat) < 0) {
1786                         r = -errno;
1787                         goto fail;
1788                 }
1789         }
1790
1791         if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1792                 r = -EIO;
1793                 goto fail;
1794         }
1795
1796         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1797         if (f->header == MAP_FAILED) {
1798                 f->header = NULL;
1799                 r = -errno;
1800                 goto fail;
1801         }
1802
1803         if (!newly_created) {
1804                 r = journal_file_verify_header(f);
1805                 if (r < 0)
1806                         goto fail;
1807         }
1808
1809         if (f->writable) {
1810                 r = journal_file_refresh_header(f);
1811                 if (r < 0)
1812                         goto fail;
1813         }
1814
1815         if (newly_created) {
1816
1817                 r = journal_file_setup_field_hash_table(f);
1818                 if (r < 0)
1819                         goto fail;
1820
1821                 r = journal_file_setup_data_hash_table(f);
1822                 if (r < 0)
1823                         goto fail;
1824         }
1825
1826         r = journal_file_map_field_hash_table(f);
1827         if (r < 0)
1828                 goto fail;
1829
1830         r = journal_file_map_data_hash_table(f);
1831         if (r < 0)
1832                 goto fail;
1833
1834         if (ret)
1835                 *ret = f;
1836
1837         return 0;
1838
1839 fail:
1840         journal_file_close(f);
1841
1842         return r;
1843 }
1844
1845 int journal_file_rotate(JournalFile **f) {
1846         char *p;
1847         size_t l;
1848         JournalFile *old_file, *new_file = NULL;
1849         int r;
1850
1851         assert(f);
1852         assert(*f);
1853
1854         old_file = *f;
1855
1856         if (!old_file->writable)
1857                 return -EINVAL;
1858
1859         if (!endswith(old_file->path, ".journal"))
1860                 return -EINVAL;
1861
1862         l = strlen(old_file->path);
1863
1864         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
1865         if (!p)
1866                 return -ENOMEM;
1867
1868         memcpy(p, old_file->path, l - 8);
1869         p[l-8] = '@';
1870         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1871         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1872                  "-%016llx-%016llx.journal",
1873                  (unsigned long long) le64toh((*f)->header->seqnum),
1874                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1875
1876         r = rename(old_file->path, p);
1877         free(p);
1878
1879         if (r < 0)
1880                 return -errno;
1881
1882         old_file->header->state = STATE_ARCHIVED;
1883
1884         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1885         journal_file_close(old_file);
1886
1887         *f = new_file;
1888         return r;
1889 }
1890
1891 int journal_file_open_reliably(
1892                 const char *fname,
1893                 int flags,
1894                 mode_t mode,
1895                 JournalFile *template,
1896                 JournalFile **ret) {
1897
1898         int r;
1899         size_t l;
1900         char *p;
1901
1902         r = journal_file_open(fname, flags, mode, template, ret);
1903         if (r != -EBADMSG && /* corrupted */
1904             r != -ENODATA && /* truncated */
1905             r != -EHOSTDOWN && /* other machine */
1906             r != -EPROTONOSUPPORT) /* incompatible feature */
1907                 return r;
1908
1909         if ((flags & O_ACCMODE) == O_RDONLY)
1910                 return r;
1911
1912         if (!(flags & O_CREAT))
1913                 return r;
1914
1915         /* The file is corrupted. Rotate it away and try it again (but only once) */
1916
1917         l = strlen(fname);
1918         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
1919                      (int) (l-8), fname,
1920                      (unsigned long long) now(CLOCK_REALTIME),
1921                      random_ull()) < 0)
1922                 return -ENOMEM;
1923
1924         r = rename(fname, p);
1925         free(p);
1926         if (r < 0)
1927                 return -errno;
1928
1929         log_warning("File %s corrupted, renaming and replacing.", fname);
1930
1931         return journal_file_open(fname, flags, mode, template, ret);
1932 }
1933
1934 struct vacuum_info {
1935         off_t usage;
1936         char *filename;
1937
1938         uint64_t realtime;
1939         sd_id128_t seqnum_id;
1940         uint64_t seqnum;
1941
1942         bool have_seqnum;
1943 };
1944
1945 static int vacuum_compare(const void *_a, const void *_b) {
1946         const struct vacuum_info *a, *b;
1947
1948         a = _a;
1949         b = _b;
1950
1951         if (a->have_seqnum && b->have_seqnum &&
1952             sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1953                 if (a->seqnum < b->seqnum)
1954                         return -1;
1955                 else if (a->seqnum > b->seqnum)
1956                         return 1;
1957                 else
1958                         return 0;
1959         }
1960
1961         if (a->realtime < b->realtime)
1962                 return -1;
1963         else if (a->realtime > b->realtime)
1964                 return 1;
1965         else if (a->have_seqnum && b->have_seqnum)
1966                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1967         else
1968                 return strcmp(a->filename, b->filename);
1969 }
1970
1971 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1972         DIR *d;
1973         int r = 0;
1974         struct vacuum_info *list = NULL;
1975         unsigned n_list = 0, n_allocated = 0, i;
1976         uint64_t sum = 0;
1977
1978         assert(directory);
1979
1980         if (max_use <= 0)
1981                 return 0;
1982
1983         d = opendir(directory);
1984         if (!d)
1985                 return -errno;
1986
1987         for (;;) {
1988                 int k;
1989                 struct dirent buf, *de;
1990                 size_t q;
1991                 struct stat st;
1992                 char *p;
1993                 unsigned long long seqnum = 0, realtime;
1994                 sd_id128_t seqnum_id;
1995                 bool have_seqnum;
1996
1997                 k = readdir_r(d, &buf, &de);
1998                 if (k != 0) {
1999                         r = -k;
2000                         goto finish;
2001                 }
2002
2003                 if (!de)
2004                         break;
2005
2006                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2007                         continue;
2008
2009                 if (!S_ISREG(st.st_mode))
2010                         continue;
2011
2012                 q = strlen(de->d_name);
2013
2014                 if (endswith(de->d_name, ".journal")) {
2015
2016                         /* Vacuum archived files */
2017
2018                         if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2019                                 continue;
2020
2021                         if (de->d_name[q-8-16-1] != '-' ||
2022                             de->d_name[q-8-16-1-16-1] != '-' ||
2023                             de->d_name[q-8-16-1-16-1-32-1] != '@')
2024                                 continue;
2025
2026                         p = strdup(de->d_name);
2027                         if (!p) {
2028                                 r = -ENOMEM;
2029                                 goto finish;
2030                         }
2031
2032                         de->d_name[q-8-16-1-16-1] = 0;
2033                         if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2034                                 free(p);
2035                                 continue;
2036                         }
2037
2038                         if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2039                                 free(p);
2040                                 continue;
2041                         }
2042
2043                         have_seqnum = true;
2044
2045                 } else if (endswith(de->d_name, ".journal~")) {
2046                         unsigned long long tmp;
2047
2048                         /* Vacuum corrupted files */
2049
2050                         if (q < 1 + 16 + 1 + 16 + 8 + 1)
2051                                 continue;
2052
2053                         if (de->d_name[q-1-8-16-1] != '-' ||
2054                             de->d_name[q-1-8-16-1-16-1] != '@')
2055                                 continue;
2056
2057                         p = strdup(de->d_name);
2058                         if (!p) {
2059                                 r = -ENOMEM;
2060                                 goto finish;
2061                         }
2062
2063                         if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2064                                 free(p);
2065                                 continue;
2066                         }
2067
2068                         have_seqnum = false;
2069                 } else
2070                         continue;
2071
2072                 if (n_list >= n_allocated) {
2073                         struct vacuum_info *j;
2074
2075                         n_allocated = MAX(n_allocated * 2U, 8U);
2076                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2077                         if (!j) {
2078                                 free(p);
2079                                 r = -ENOMEM;
2080                                 goto finish;
2081                         }
2082
2083                         list = j;
2084                 }
2085
2086                 list[n_list].filename = p;
2087                 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2088                 list[n_list].seqnum = seqnum;
2089                 list[n_list].realtime = realtime;
2090                 list[n_list].seqnum_id = seqnum_id;
2091                 list[n_list].have_seqnum = have_seqnum;
2092
2093                 sum += list[n_list].usage;
2094
2095                 n_list ++;
2096         }
2097
2098         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2099
2100         for(i = 0; i < n_list; i++) {
2101                 struct statvfs ss;
2102
2103                 if (fstatvfs(dirfd(d), &ss) < 0) {
2104                         r = -errno;
2105                         goto finish;
2106                 }
2107
2108                 if (sum <= max_use &&
2109                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2110                         break;
2111
2112                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2113                         log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2114                         sum -= list[i].usage;
2115                 } else if (errno != ENOENT)
2116                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2117         }
2118
2119 finish:
2120         for (i = 0; i < n_list; i++)
2121                 free(list[i].filename);
2122
2123         free(list);
2124
2125         if (d)
2126                 closedir(d);
2127
2128         return r;
2129 }
2130
2131 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2132         uint64_t i, n;
2133         uint64_t q, xor_hash = 0;
2134         int r;
2135         EntryItem *items;
2136         dual_timestamp ts;
2137
2138         assert(from);
2139         assert(to);
2140         assert(o);
2141         assert(p);
2142
2143         if (!to->writable)
2144                 return -EPERM;
2145
2146         ts.monotonic = le64toh(o->entry.monotonic);
2147         ts.realtime = le64toh(o->entry.realtime);
2148
2149         if (to->tail_entry_monotonic_valid &&
2150             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2151                 return -EINVAL;
2152
2153         if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2154                 return -EINVAL;
2155
2156         n = journal_file_entry_n_items(o);
2157         items = alloca(sizeof(EntryItem) * n);
2158
2159         for (i = 0; i < n; i++) {
2160                 uint64_t l, h;
2161                 le64_t le_hash;
2162                 size_t t;
2163                 void *data;
2164                 Object *u;
2165
2166                 q = le64toh(o->entry.items[i].object_offset);
2167                 le_hash = o->entry.items[i].hash;
2168
2169                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2170                 if (r < 0)
2171                         return r;
2172
2173                 if (le_hash != o->data.hash)
2174                         return -EBADMSG;
2175
2176                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2177                 t = (size_t) l;
2178
2179                 /* We hit the limit on 32bit machines */
2180                 if ((uint64_t) t != l)
2181                         return -E2BIG;
2182
2183                 if (o->object.flags & OBJECT_COMPRESSED) {
2184 #ifdef HAVE_XZ
2185                         uint64_t rsize;
2186
2187                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2188                                 return -EBADMSG;
2189
2190                         data = from->compress_buffer;
2191                         l = rsize;
2192 #else
2193                         return -EPROTONOSUPPORT;
2194 #endif
2195                 } else
2196                         data = o->data.payload;
2197
2198                 r = journal_file_append_data(to, data, l, &u, &h);
2199                 if (r < 0)
2200                         return r;
2201
2202                 xor_hash ^= le64toh(u->data.hash);
2203                 items[i].object_offset = htole64(h);
2204                 items[i].hash = u->data.hash;
2205
2206                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2207                 if (r < 0)
2208                         return r;
2209         }
2210
2211         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2212 }
2213
2214 void journal_default_metrics(JournalMetrics *m, int fd) {
2215         uint64_t fs_size = 0;
2216         struct statvfs ss;
2217         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2218
2219         assert(m);
2220         assert(fd >= 0);
2221
2222         if (fstatvfs(fd, &ss) >= 0)
2223                 fs_size = ss.f_frsize * ss.f_blocks;
2224
2225         if (m->max_use == (uint64_t) -1) {
2226
2227                 if (fs_size > 0) {
2228                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2229
2230                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2231                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2232
2233                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2234                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2235                 } else
2236                         m->max_use = DEFAULT_MAX_USE_LOWER;
2237         } else {
2238                 m->max_use = PAGE_ALIGN(m->max_use);
2239
2240                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2241                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2242         }
2243
2244         if (m->max_size == (uint64_t) -1) {
2245                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2246
2247                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2248                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2249         } else
2250                 m->max_size = PAGE_ALIGN(m->max_size);
2251
2252         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2253                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2254
2255         if (m->max_size*2 > m->max_use)
2256                 m->max_use = m->max_size*2;
2257
2258         if (m->min_size == (uint64_t) -1)
2259                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2260         else {
2261                 m->min_size = PAGE_ALIGN(m->min_size);
2262
2263                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2264                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2265
2266                 if (m->min_size > m->max_size)
2267                         m->max_size = m->min_size;
2268         }
2269
2270         if (m->keep_free == (uint64_t) -1) {
2271
2272                 if (fs_size > 0) {
2273                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2274
2275                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2276                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2277
2278                 } else
2279                         m->keep_free = DEFAULT_KEEP_FREE;
2280         }
2281
2282         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2283                  format_bytes(a, sizeof(a), m->max_use),
2284                  format_bytes(b, sizeof(b), m->max_size),
2285                  format_bytes(c, sizeof(c), m->min_size),
2286                  format_bytes(d, sizeof(d), m->keep_free));
2287 }
2288
2289 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2290         Object *o;
2291         int r;
2292
2293         assert(f);
2294         assert(from || to);
2295
2296         if (from) {
2297                 r = journal_file_next_entry(f, NULL, 0, DIRECTION_DOWN, &o, NULL);
2298                 if (r <= 0)
2299                         return r;
2300
2301                 *from = le64toh(o->entry.realtime);
2302         }
2303
2304         if (to) {
2305                 r = journal_file_next_entry(f, NULL, 0, DIRECTION_UP, &o, NULL);
2306                 if (r <= 0)
2307                         return r;
2308
2309                 *to = le64toh(o->entry.realtime);
2310         }
2311
2312         return 1;
2313 }
2314
2315 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2316         char t[9+32+1] = "_BOOT_ID=";
2317         Object *o;
2318         uint64_t p;
2319         int r;
2320
2321         assert(f);
2322         assert(from || to);
2323
2324         sd_id128_to_string(boot_id, t + 9);
2325
2326         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2327         if (r <= 0)
2328                 return r;
2329
2330         if (le64toh(o->data.n_entries) <= 0)
2331                 return 0;
2332
2333         if (from) {
2334                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2335                 if (r < 0)
2336                         return r;
2337
2338                 *from = le64toh(o->entry.monotonic);
2339         }
2340
2341         if (to) {
2342                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2343                 if (r < 0)
2344                         return r;
2345
2346                 r = generic_array_get_plus_one(f,
2347                                                le64toh(o->data.entry_offset),
2348                                                le64toh(o->data.entry_array_offset),
2349                                                le64toh(o->data.n_entries)-1,
2350                                                &o, NULL);
2351                 if (r <= 0)
2352                         return r;
2353
2354                 *to = le64toh(o->entry.monotonic);
2355         }
2356
2357         return 1;
2358 }