chiark / gitweb /
journal: fix seeking by realtime/seqnum
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
/* Size in bytes of the item area of the data hash table object
 * that is appended to a freshly created journal file */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
/* Size in bytes of the item area of the field hash table object
 * that is appended to a freshly created journal file */
#define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)

/* Requests smaller than this are widened to this many bytes when a
 * window is (re)mapped, to reduce the number of remappings */
#define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)

/* Payloads of at least this many bytes are candidates for
 * compression when compression is enabled for the file */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* The eight magic bytes written at the start of every journal file
 * header and compared against when a header is verified */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };

/* Rounds x up to the next multiple of 8 */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
64
65 void journal_file_close(JournalFile *f) {
66         int t;
67
68         assert(f);
69
70         if (f->header) {
71                 if (f->writable)
72                         f->header->state = STATE_OFFLINE;
73
74                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
75         }
76
77         for (t = 0; t < _WINDOW_MAX; t++)
78                 if (f->windows[t].ptr)
79                         munmap(f->windows[t].ptr, f->windows[t].size);
80
81         if (f->fd >= 0)
82                 close_nointr_nofail(f->fd);
83
84         free(f->path);
85
86 #ifdef HAVE_XZ
87         free(f->compress_buffer);
88 #endif
89
90         free(f);
91 }
92
93 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
94         Header h;
95         ssize_t k;
96         int r;
97
98         assert(f);
99
100         zero(h);
101         memcpy(h.signature, signature, 8);
102         h.header_size = htole64(ALIGN64(sizeof(h)));
103
104         r = sd_id128_randomize(&h.file_id);
105         if (r < 0)
106                 return r;
107
108         if (template) {
109                 h.seqnum_id = template->header->seqnum_id;
110                 h.seqnum = template->header->seqnum;
111         } else
112                 h.seqnum_id = h.file_id;
113
114         k = pwrite(f->fd, &h, sizeof(h), 0);
115         if (k < 0)
116                 return -errno;
117
118         if (k != sizeof(h))
119                 return -EIO;
120
121         return 0;
122 }
123
124 static int journal_file_refresh_header(JournalFile *f) {
125         int r;
126         sd_id128_t boot_id;
127
128         assert(f);
129
130         r = sd_id128_get_machine(&f->header->machine_id);
131         if (r < 0)
132                 return r;
133
134         r = sd_id128_get_boot(&boot_id);
135         if (r < 0)
136                 return r;
137
138         if (sd_id128_equal(boot_id, f->header->boot_id))
139                 f->tail_entry_monotonic_valid = true;
140
141         f->header->boot_id = boot_id;
142
143         f->header->state = STATE_ONLINE;
144
145         __sync_synchronize();
146
147         return 0;
148 }
149
150 static int journal_file_verify_header(JournalFile *f) {
151         assert(f);
152
153         if (memcmp(f->header, signature, 8))
154                 return -EBADMSG;
155
156 #ifdef HAVE_XZ
157         if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
158                 return -EPROTONOSUPPORT;
159 #else
160         if (f->header->incompatible_flags != 0)
161                 return -EPROTONOSUPPORT;
162 #endif
163
164         if (f->header->header_size != htole64(ALIGN64(sizeof(*(f->header)))))
165                 return -EBADMSG;
166
167         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
168                 return -ENODATA;
169
170         if (f->writable) {
171                 uint8_t state;
172                 sd_id128_t machine_id;
173                 int r;
174
175                 r = sd_id128_get_machine(&machine_id);
176                 if (r < 0)
177                         return r;
178
179                 if (!sd_id128_equal(machine_id, f->header->machine_id))
180                         return -EHOSTDOWN;
181
182                 state = f->header->state;
183
184                 if (state == STATE_ONLINE)
185                         log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
186                         /* FIXME: immediately rotate */
187                 else if (state == STATE_ARCHIVED)
188                         return -ESHUTDOWN;
189                 else if (state != STATE_OFFLINE)
190                         log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
191         }
192
193         return 0;
194 }
195
196 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
197         uint64_t old_size, new_size;
198         int r;
199
200         assert(f);
201
202         /* We assume that this file is not sparse, and we know that
203          * for sure, since we always call posix_fallocate()
204          * ourselves */
205
206         old_size =
207                 le64toh(f->header->header_size) +
208                 le64toh(f->header->arena_size);
209
210         new_size = PAGE_ALIGN(offset + size);
211         if (new_size < le64toh(f->header->header_size))
212                 new_size = le64toh(f->header->header_size);
213
214         if (new_size <= old_size)
215                 return 0;
216
217         if (f->metrics.max_size > 0 &&
218             new_size > f->metrics.max_size)
219                 return -E2BIG;
220
221         if (new_size > f->metrics.min_size &&
222             f->metrics.keep_free > 0) {
223                 struct statvfs svfs;
224
225                 if (fstatvfs(f->fd, &svfs) >= 0) {
226                         uint64_t available;
227
228                         available = svfs.f_bfree * svfs.f_bsize;
229
230                         if (available >= f->metrics.keep_free)
231                                 available -= f->metrics.keep_free;
232                         else
233                                 available = 0;
234
235                         if (new_size - old_size > available)
236                                 return -E2BIG;
237                 }
238         }
239
240         /* Note that the glibc fallocate() fallback is very
241            inefficient, hence we try to minimize the allocation area
242            as we can. */
243         r = posix_fallocate(f->fd, old_size, new_size - old_size);
244         if (r != 0)
245                 return -r;
246
247         if (fstat(f->fd, &f->last_stat) < 0)
248                 return -errno;
249
250         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
251
252         return 0;
253 }
254
255 static int journal_file_map(
256                 JournalFile *f,
257                 uint64_t offset,
258                 uint64_t size,
259                 void **_window,
260                 uint64_t *_woffset,
261                 uint64_t *_wsize,
262                 void **ret) {
263
264         uint64_t woffset, wsize;
265         void *window;
266
267         assert(f);
268         assert(size > 0);
269         assert(ret);
270
271         woffset = offset & ~((uint64_t) page_size() - 1ULL);
272         wsize = size + (offset - woffset);
273         wsize = PAGE_ALIGN(wsize);
274
275         /* Avoid SIGBUS on invalid accesses */
276         if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
277                 return -EADDRNOTAVAIL;
278
279         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
280         if (window == MAP_FAILED)
281                 return -errno;
282
283         if (_window)
284                 *_window = window;
285
286         if (_woffset)
287                 *_woffset = woffset;
288
289         if (_wsize)
290                 *_wsize = wsize;
291
292         *ret = (uint8_t*) window + (offset - woffset);
293
294         return 0;
295 }
296
297 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
298         void *p = NULL;
299         uint64_t delta;
300         int r;
301         Window *w;
302
303         assert(f);
304         assert(ret);
305         assert(wt >= 0);
306         assert(wt < _WINDOW_MAX);
307
308         if (offset + size > (uint64_t) f->last_stat.st_size) {
309                 /* Hmm, out of range? Let's refresh the fstat() data
310                  * first, before we trust that check. */
311
312                 if (fstat(f->fd, &f->last_stat) < 0 ||
313                     offset + size > (uint64_t) f->last_stat.st_size)
314                         return -EADDRNOTAVAIL;
315         }
316
317         w = f->windows + wt;
318
319         if (_likely_(w->ptr &&
320                      w->offset <= offset &&
321                      w->offset + w->size >= offset + size)) {
322
323                 *ret = (uint8_t*) w->ptr + (offset - w->offset);
324                 return 0;
325         }
326
327         if (w->ptr) {
328                 if (munmap(w->ptr, w->size) < 0)
329                         return -errno;
330
331                 w->ptr = NULL;
332                 w->size = w->offset = 0;
333         }
334
335         if (size < DEFAULT_WINDOW_SIZE) {
336                 /* If the default window size is larger then what was
337                  * asked for extend the mapping a bit in the hope to
338                  * minimize needed remappings later on. We add half
339                  * the window space before and half behind the
340                  * requested mapping */
341
342                 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
343
344                 if (delta > offset)
345                         delta = offset;
346
347                 offset -= delta;
348                 size = DEFAULT_WINDOW_SIZE;
349         } else
350                 delta = 0;
351
352         if (offset + size > (uint64_t) f->last_stat.st_size)
353                 size = (uint64_t) f->last_stat.st_size - offset;
354
355         if (size <= 0)
356                 return -EADDRNOTAVAIL;
357
358         r = journal_file_map(f,
359                              offset, size,
360                              &w->ptr, &w->offset, &w->size,
361                              &p);
362
363         if (r < 0)
364                 return r;
365
366         *ret = (uint8_t*) p + delta;
367         return 0;
368 }
369
370 static bool verify_hash(Object *o) {
371         uint64_t h1, h2;
372
373         assert(o);
374
375         if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
376                 h1 = le64toh(o->data.hash);
377                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
378         } else if (o->object.type == OBJECT_FIELD) {
379                 h1 = le64toh(o->field.hash);
380                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
381         } else
382                 return true;
383
384         return h1 == h2;
385 }
386
387 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
388         int r;
389         void *t;
390         Object *o;
391         uint64_t s;
392
393         assert(f);
394         assert(ret);
395         assert(type < _OBJECT_TYPE_MAX);
396
397         r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
398         if (r < 0)
399                 return r;
400
401         o = (Object*) t;
402         s = le64toh(o->object.size);
403
404         if (s < sizeof(ObjectHeader))
405                 return -EBADMSG;
406
407         if (type >= 0 && o->object.type != type)
408                 return -EBADMSG;
409
410         if (s > sizeof(ObjectHeader)) {
411                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
412                 if (r < 0)
413                         return r;
414
415                 o = (Object*) t;
416         }
417
418         if (!verify_hash(o))
419                 return -EBADMSG;
420
421         *ret = o;
422         return 0;
423 }
424
425 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
426         uint64_t r;
427
428         assert(f);
429
430         r = le64toh(f->header->seqnum) + 1;
431
432         if (seqnum) {
433                 /* If an external seqnum counter was passed, we update
434                  * both the local and the external one, and set it to
435                  * the maximum of both */
436
437                 if (*seqnum + 1 > r)
438                         r = *seqnum + 1;
439
440                 *seqnum = r;
441         }
442
443         f->header->seqnum = htole64(r);
444
445         if (f->header->first_seqnum == 0)
446                 f->header->first_seqnum = htole64(r);
447
448         return r;
449 }
450
451 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
452         int r;
453         uint64_t p;
454         Object *tail, *o;
455         void *t;
456
457         assert(f);
458         assert(size >= sizeof(ObjectHeader));
459         assert(offset);
460         assert(ret);
461
462         p = le64toh(f->header->tail_object_offset);
463         if (p == 0)
464                 p = le64toh(f->header->header_size);
465         else {
466                 r = journal_file_move_to_object(f, -1, p, &tail);
467                 if (r < 0)
468                         return r;
469
470                 p += ALIGN64(le64toh(tail->object.size));
471         }
472
473         r = journal_file_allocate(f, p, size);
474         if (r < 0)
475                 return r;
476
477         r = journal_file_move_to(f, type, p, size, &t);
478         if (r < 0)
479                 return r;
480
481         o = (Object*) t;
482
483         zero(o->object);
484         o->object.type = type;
485         o->object.size = htole64(size);
486
487         f->header->tail_object_offset = htole64(p);
488         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
489
490         *ret = o;
491         *offset = p;
492
493         return 0;
494 }
495
496 static int journal_file_setup_data_hash_table(JournalFile *f) {
497         uint64_t s, p;
498         Object *o;
499         int r;
500
501         assert(f);
502
503         s = DEFAULT_DATA_HASH_TABLE_SIZE;
504         r = journal_file_append_object(f,
505                                        OBJECT_DATA_HASH_TABLE,
506                                        offsetof(Object, hash_table.items) + s,
507                                        &o, &p);
508         if (r < 0)
509                 return r;
510
511         memset(o->hash_table.items, 0, s);
512
513         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
514         f->header->data_hash_table_size = htole64(s);
515
516         return 0;
517 }
518
519 static int journal_file_setup_field_hash_table(JournalFile *f) {
520         uint64_t s, p;
521         Object *o;
522         int r;
523
524         assert(f);
525
526         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
527         r = journal_file_append_object(f,
528                                        OBJECT_FIELD_HASH_TABLE,
529                                        offsetof(Object, hash_table.items) + s,
530                                        &o, &p);
531         if (r < 0)
532                 return r;
533
534         memset(o->hash_table.items, 0, s);
535
536         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
537         f->header->field_hash_table_size = htole64(s);
538
539         return 0;
540 }
541
542 static int journal_file_map_data_hash_table(JournalFile *f) {
543         uint64_t s, p;
544         void *t;
545         int r;
546
547         assert(f);
548
549         p = le64toh(f->header->data_hash_table_offset);
550         s = le64toh(f->header->data_hash_table_size);
551
552         r = journal_file_move_to(f,
553                                  WINDOW_DATA_HASH_TABLE,
554                                  p, s,
555                                  &t);
556         if (r < 0)
557                 return r;
558
559         f->data_hash_table = t;
560         return 0;
561 }
562
563 static int journal_file_map_field_hash_table(JournalFile *f) {
564         uint64_t s, p;
565         void *t;
566         int r;
567
568         assert(f);
569
570         p = le64toh(f->header->field_hash_table_offset);
571         s = le64toh(f->header->field_hash_table_size);
572
573         r = journal_file_move_to(f,
574                                  WINDOW_FIELD_HASH_TABLE,
575                                  p, s,
576                                  &t);
577         if (r < 0)
578                 return r;
579
580         f->field_hash_table = t;
581         return 0;
582 }
583
584 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
585         uint64_t p, h;
586         int r;
587
588         assert(f);
589         assert(o);
590         assert(offset > 0);
591         assert(o->object.type == OBJECT_DATA);
592
593         /* This might alter the window we are looking at */
594
595         o->data.next_hash_offset = o->data.next_field_offset = 0;
596         o->data.entry_offset = o->data.entry_array_offset = 0;
597         o->data.n_entries = 0;
598
599         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
600         p = le64toh(f->data_hash_table[h].tail_hash_offset);
601         if (p == 0) {
602                 /* Only entry in the hash table is easy */
603                 f->data_hash_table[h].head_hash_offset = htole64(offset);
604         } else {
605                 /* Move back to the previous data object, to patch in
606                  * pointer */
607
608                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
609                 if (r < 0)
610                         return r;
611
612                 o->data.next_hash_offset = htole64(offset);
613         }
614
615         f->data_hash_table[h].tail_hash_offset = htole64(offset);
616
617         return 0;
618 }
619
620 int journal_file_find_data_object_with_hash(
621                 JournalFile *f,
622                 const void *data, uint64_t size, uint64_t hash,
623                 Object **ret, uint64_t *offset) {
624
625         uint64_t p, osize, h;
626         int r;
627
628         assert(f);
629         assert(data || size == 0);
630
631         osize = offsetof(Object, data.payload) + size;
632
633         if (f->header->data_hash_table_size == 0)
634                 return -EBADMSG;
635
636         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
637         p = le64toh(f->data_hash_table[h].head_hash_offset);
638
639         while (p > 0) {
640                 Object *o;
641
642                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
643                 if (r < 0)
644                         return r;
645
646                 if (le64toh(o->data.hash) != hash)
647                         goto next;
648
649                 if (o->object.flags & OBJECT_COMPRESSED) {
650 #ifdef HAVE_XZ
651                         uint64_t l, rsize;
652
653                         l = le64toh(o->object.size);
654                         if (l <= offsetof(Object, data.payload))
655                                 return -EBADMSG;
656
657                         l -= offsetof(Object, data.payload);
658
659                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
660                                 return -EBADMSG;
661
662                         if (rsize == size &&
663                             memcmp(f->compress_buffer, data, size) == 0) {
664
665                                 if (ret)
666                                         *ret = o;
667
668                                 if (offset)
669                                         *offset = p;
670
671                                 return 1;
672                         }
673 #else
674                         return -EPROTONOSUPPORT;
675 #endif
676
677                 } else if (le64toh(o->object.size) == osize &&
678                            memcmp(o->data.payload, data, size) == 0) {
679
680                         if (ret)
681                                 *ret = o;
682
683                         if (offset)
684                                 *offset = p;
685
686                         return 1;
687                 }
688
689         next:
690                 p = le64toh(o->data.next_hash_offset);
691         }
692
693         return 0;
694 }
695
696 int journal_file_find_data_object(
697                 JournalFile *f,
698                 const void *data, uint64_t size,
699                 Object **ret, uint64_t *offset) {
700
701         uint64_t hash;
702
703         assert(f);
704         assert(data || size == 0);
705
706         hash = hash64(data, size);
707
708         return journal_file_find_data_object_with_hash(f,
709                                                        data, size, hash,
710                                                        ret, offset);
711 }
712
713 static int journal_file_append_data(
714                 JournalFile *f,
715                 const void *data, uint64_t size,
716                 Object **ret, uint64_t *offset) {
717
718         uint64_t hash, p;
719         uint64_t osize;
720         Object *o;
721         int r;
722         bool compressed = false;
723
724         assert(f);
725         assert(data || size == 0);
726
727         hash = hash64(data, size);
728
729         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
730         if (r < 0)
731                 return r;
732         else if (r > 0) {
733
734                 if (ret)
735                         *ret = o;
736
737                 if (offset)
738                         *offset = p;
739
740                 return 0;
741         }
742
743         osize = offsetof(Object, data.payload) + size;
744         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
745         if (r < 0)
746                 return r;
747
748         o->data.hash = htole64(hash);
749
750 #ifdef HAVE_XZ
751         if (f->compress &&
752             size >= COMPRESSION_SIZE_THRESHOLD) {
753                 uint64_t rsize;
754
755                 compressed = compress_blob(data, size, o->data.payload, &rsize);
756
757                 if (compressed) {
758                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
759                         o->object.flags |= OBJECT_COMPRESSED;
760
761                         f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
762
763                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
764                 }
765         }
766 #endif
767
768         if (!compressed)
769                 memcpy(o->data.payload, data, size);
770
771         r = journal_file_link_data(f, o, p, hash);
772         if (r < 0)
773                 return r;
774
775         /* The linking might have altered the window, so let's
776          * refresh our pointer */
777         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
778         if (r < 0)
779                 return r;
780
781         if (ret)
782                 *ret = o;
783
784         if (offset)
785                 *offset = p;
786
787         return 0;
788 }
789
790 uint64_t journal_file_entry_n_items(Object *o) {
791         assert(o);
792         assert(o->object.type == OBJECT_ENTRY);
793
794         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
795 }
796
797 static uint64_t journal_file_entry_array_n_items(Object *o) {
798         assert(o);
799         assert(o->object.type == OBJECT_ENTRY_ARRAY);
800
801         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
802 }
803
804 static int link_entry_into_array(JournalFile *f,
805                                  le64_t *first,
806                                  le64_t *idx,
807                                  uint64_t p) {
808         int r;
809         uint64_t n = 0, ap = 0, q, i, a, hidx;
810         Object *o;
811
812         assert(f);
813         assert(first);
814         assert(idx);
815         assert(p > 0);
816
817         a = le64toh(*first);
818         i = hidx = le64toh(*idx);
819         while (a > 0) {
820
821                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
822                 if (r < 0)
823                         return r;
824
825                 n = journal_file_entry_array_n_items(o);
826                 if (i < n) {
827                         o->entry_array.items[i] = htole64(p);
828                         *idx = htole64(hidx + 1);
829                         return 0;
830                 }
831
832                 i -= n;
833                 ap = a;
834                 a = le64toh(o->entry_array.next_entry_array_offset);
835         }
836
837         if (hidx > n)
838                 n = (hidx+1) * 2;
839         else
840                 n = n * 2;
841
842         if (n < 4)
843                 n = 4;
844
845         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
846                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
847                                        &o, &q);
848         if (r < 0)
849                 return r;
850
851         o->entry_array.items[i] = htole64(p);
852
853         if (ap == 0)
854                 *first = htole64(q);
855         else {
856                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
857                 if (r < 0)
858                         return r;
859
860                 o->entry_array.next_entry_array_offset = htole64(q);
861         }
862
863         *idx = htole64(hidx + 1);
864
865         return 0;
866 }
867
868 static int link_entry_into_array_plus_one(JournalFile *f,
869                                           le64_t *extra,
870                                           le64_t *first,
871                                           le64_t *idx,
872                                           uint64_t p) {
873
874         int r;
875
876         assert(f);
877         assert(extra);
878         assert(first);
879         assert(idx);
880         assert(p > 0);
881
882         if (*idx == 0)
883                 *extra = htole64(p);
884         else {
885                 le64_t i;
886
887                 i = htole64(le64toh(*idx) - 1);
888                 r = link_entry_into_array(f, first, &i, p);
889                 if (r < 0)
890                         return r;
891         }
892
893         *idx = htole64(le64toh(*idx) + 1);
894         return 0;
895 }
896
897 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
898         uint64_t p;
899         int r;
900         assert(f);
901         assert(o);
902         assert(offset > 0);
903
904         p = le64toh(o->entry.items[i].object_offset);
905         if (p == 0)
906                 return -EINVAL;
907
908         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
909         if (r < 0)
910                 return r;
911
912         return link_entry_into_array_plus_one(f,
913                                               &o->data.entry_offset,
914                                               &o->data.entry_array_offset,
915                                               &o->data.n_entries,
916                                               offset);
917 }
918
919 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
920         uint64_t n, i;
921         int r;
922
923         assert(f);
924         assert(o);
925         assert(offset > 0);
926         assert(o->object.type == OBJECT_ENTRY);
927
928         __sync_synchronize();
929
930         /* Link up the entry itself */
931         r = link_entry_into_array(f,
932                                   &f->header->entry_array_offset,
933                                   &f->header->n_entries,
934                                   offset);
935         if (r < 0)
936                 return r;
937
938         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
939
940         if (f->header->head_entry_realtime == 0)
941                 f->header->head_entry_realtime = o->entry.realtime;
942
943         f->header->tail_entry_realtime = o->entry.realtime;
944         f->header->tail_entry_monotonic = o->entry.monotonic;
945
946         f->tail_entry_monotonic_valid = true;
947
948         /* Link up the items */
949         n = journal_file_entry_n_items(o);
950         for (i = 0; i < n; i++) {
951                 r = journal_file_link_entry_item(f, o, offset, i);
952                 if (r < 0)
953                         return r;
954         }
955
956         return 0;
957 }
958
959 static int journal_file_append_entry_internal(
960                 JournalFile *f,
961                 const dual_timestamp *ts,
962                 uint64_t xor_hash,
963                 const EntryItem items[], unsigned n_items,
964                 uint64_t *seqnum,
965                 Object **ret, uint64_t *offset) {
966         uint64_t np;
967         uint64_t osize;
968         Object *o;
969         int r;
970
971         assert(f);
972         assert(items || n_items == 0);
973         assert(ts);
974
975         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
976
977         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
978         if (r < 0)
979                 return r;
980
981         o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
982         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
983         o->entry.realtime = htole64(ts->realtime);
984         o->entry.monotonic = htole64(ts->monotonic);
985         o->entry.xor_hash = htole64(xor_hash);
986         o->entry.boot_id = f->header->boot_id;
987
988         r = journal_file_link_entry(f, o, np);
989         if (r < 0)
990                 return r;
991
992         if (ret)
993                 *ret = o;
994
995         if (offset)
996                 *offset = np;
997
998         return 0;
999 }
1000
1001 void journal_file_post_change(JournalFile *f) {
1002         assert(f);
1003
1004         /* inotify() does not receive IN_MODIFY events from file
1005          * accesses done via mmap(). After each access we hence
1006          * trigger IN_MODIFY by truncating the journal file to its
1007          * current size which triggers IN_MODIFY. */
1008
1009         __sync_synchronize();
1010
1011         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1012                 log_error("Failed to to truncate file to its own size: %m");
1013 }
1014
1015 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1016         unsigned i;
1017         EntryItem *items;
1018         int r;
1019         uint64_t xor_hash = 0;
1020         struct dual_timestamp _ts;
1021
1022         assert(f);
1023         assert(iovec || n_iovec == 0);
1024
1025         if (!f->writable)
1026                 return -EPERM;
1027
1028         if (!ts) {
1029                 dual_timestamp_get(&_ts);
1030                 ts = &_ts;
1031         }
1032
1033         if (f->tail_entry_monotonic_valid &&
1034             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1035                 return -EINVAL;
1036
1037         items = alloca(sizeof(EntryItem) * n_iovec);
1038
1039         for (i = 0; i < n_iovec; i++) {
1040                 uint64_t p;
1041                 Object *o;
1042
1043                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1044                 if (r < 0)
1045                         return r;
1046
1047                 xor_hash ^= le64toh(o->data.hash);
1048                 items[i].object_offset = htole64(p);
1049                 items[i].hash = o->data.hash;
1050         }
1051
1052         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1053
1054         journal_file_post_change(f);
1055
1056         return r;
1057 }
1058
1059 static int generic_array_get(JournalFile *f,
1060                              uint64_t first,
1061                              uint64_t i,
1062                              Object **ret, uint64_t *offset) {
1063
1064         Object *o;
1065         uint64_t p = 0, a;
1066         int r;
1067
1068         assert(f);
1069
1070         a = first;
1071         while (a > 0) {
1072                 uint64_t n;
1073
1074                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1075                 if (r < 0)
1076                         return r;
1077
1078                 n = journal_file_entry_array_n_items(o);
1079                 if (i < n) {
1080                         p = le64toh(o->entry_array.items[i]);
1081                         break;
1082                 }
1083
1084                 i -= n;
1085                 a = le64toh(o->entry_array.next_entry_array_offset);
1086         }
1087
1088         if (a <= 0 || p <= 0)
1089                 return 0;
1090
1091         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1092         if (r < 0)
1093                 return r;
1094
1095         if (ret)
1096                 *ret = o;
1097
1098         if (offset)
1099                 *offset = p;
1100
1101         return 1;
1102 }
1103
1104 static int generic_array_get_plus_one(JournalFile *f,
1105                                       uint64_t extra,
1106                                       uint64_t first,
1107                                       uint64_t i,
1108                                       Object **ret, uint64_t *offset) {
1109
1110         Object *o;
1111
1112         assert(f);
1113
1114         if (i == 0) {
1115                 int r;
1116
1117                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1118                 if (r < 0)
1119                         return r;
1120
1121                 if (ret)
1122                         *ret = o;
1123
1124                 if (offset)
1125                         *offset = extra;
1126
1127                 return 1;
1128         }
1129
1130         return generic_array_get(f, first, i-1, ret, offset);
1131 }
1132
/* Results of the test_object callbacks used by the bisection code */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object sorts before the needle */
        TEST_RIGHT = 2  /* the tested object sorts after the needle */
};
1138
1139 static int generic_array_bisect(JournalFile *f,
1140                                 uint64_t first,
1141                                 uint64_t n,
1142                                 uint64_t needle,
1143                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1144                                 direction_t direction,
1145                                 Object **ret,
1146                                 uint64_t *offset,
1147                                 uint64_t *idx) {
1148
1149         uint64_t a, p, t = 0, i = 0, last_p = 0;
1150         bool subtract_one = false;
1151         Object *o, *array = NULL;
1152         int r;
1153
1154         assert(f);
1155         assert(test_object);
1156
1157         a = first;
1158         while (a > 0) {
1159                 uint64_t left, right, k, lp;
1160
1161                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1162                 if (r < 0)
1163                         return r;
1164
1165                 k = journal_file_entry_array_n_items(array);
1166                 right = MIN(k, n);
1167                 if (right <= 0)
1168                         return 0;
1169
1170                 i = right - 1;
1171                 lp = p = le64toh(array->entry_array.items[i]);
1172                 if (p <= 0)
1173                         return -EBADMSG;
1174
1175                 r = test_object(f, p, needle);
1176                 if (r < 0)
1177                         return r;
1178
1179                 if (r == TEST_FOUND)
1180                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1181
1182                 if (r == TEST_RIGHT) {
1183                         left = 0;
1184                         right -= 1;
1185                         for (;;) {
1186                                 if (left == right) {
1187                                         if (direction == DIRECTION_UP)
1188                                                 subtract_one = true;
1189
1190                                         i = left;
1191                                         goto found;
1192                                 }
1193
1194                                 assert(left < right);
1195
1196                                 i = (left + right) / 2;
1197                                 p = le64toh(array->entry_array.items[i]);
1198                                 if (p <= 0)
1199                                         return -EBADMSG;
1200
1201                                 r = test_object(f, p, needle);
1202                                 if (r < 0)
1203                                         return r;
1204
1205                                 if (r == TEST_FOUND)
1206                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1207
1208                                 if (r == TEST_RIGHT)
1209                                         right = i;
1210                                 else
1211                                         left = i + 1;
1212                         }
1213                 }
1214
1215                 if (k > n)
1216                         return 0;
1217
1218                 last_p = lp;
1219
1220                 n -= k;
1221                 t += k;
1222                 a = le64toh(array->entry_array.next_entry_array_offset);
1223         }
1224
1225         return 0;
1226
1227 found:
1228         if (subtract_one && t == 0 && i == 0)
1229                 return 0;
1230
1231         if (subtract_one && i == 0)
1232                 p = last_p;
1233         else if (subtract_one)
1234                 p = le64toh(array->entry_array.items[i-1]);
1235         else
1236                 p = le64toh(array->entry_array.items[i]);
1237
1238         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1239         if (r < 0)
1240                 return r;
1241
1242         if (ret)
1243                 *ret = o;
1244
1245         if (offset)
1246                 *offset = p;
1247
1248         if (idx)
1249                 *idx = t + i - (subtract_one ? 1 : 0);
1250
1251         return 1;
1252 }
1253
1254 static int generic_array_bisect_plus_one(JournalFile *f,
1255                                          uint64_t extra,
1256                                          uint64_t first,
1257                                          uint64_t n,
1258                                          uint64_t needle,
1259                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1260                                          direction_t direction,
1261                                          Object **ret,
1262                                          uint64_t *offset,
1263                                          uint64_t *idx) {
1264
1265         int r;
1266
1267         assert(f);
1268         assert(test_object);
1269
1270         if (n <= 0)
1271                 return 0;
1272
1273         /* This bisects the array in object 'first', but first checks
1274          * an extra  */
1275         r = test_object(f, extra, needle);
1276         if (r < 0)
1277                 return r;
1278
1279         if (r == TEST_FOUND)
1280                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1281
1282         if (r == TEST_RIGHT) {
1283                 Object *o;
1284
1285                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1286                 if (r < 0)
1287                         return r;
1288
1289                 if (ret)
1290                         *ret = o;
1291
1292                 if (offset)
1293                         *offset = extra;
1294
1295                 if (idx)
1296                         *idx = 0;
1297
1298                 return 1;
1299         }
1300
1301         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1302
1303         if (r > 0 && idx)
1304                 (*idx) ++;
1305
1306         return r;
1307 }
1308
1309 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1310         Object *o;
1311         int r;
1312
1313         assert(f);
1314         assert(p > 0);
1315
1316         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1317         if (r < 0)
1318                 return r;
1319
1320         if (le64toh(o->entry.seqnum) == needle)
1321                 return TEST_FOUND;
1322         else if (le64toh(o->entry.seqnum) < needle)
1323                 return TEST_LEFT;
1324         else
1325                 return TEST_RIGHT;
1326 }
1327
1328 int journal_file_move_to_entry_by_seqnum(
1329                 JournalFile *f,
1330                 uint64_t seqnum,
1331                 direction_t direction,
1332                 Object **ret,
1333                 uint64_t *offset) {
1334
1335         return generic_array_bisect(f,
1336                                     le64toh(f->header->entry_array_offset),
1337                                     le64toh(f->header->n_entries),
1338                                     seqnum,
1339                                     test_object_seqnum,
1340                                     direction,
1341                                     ret, offset, NULL);
1342 }
1343
1344 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1345         Object *o;
1346         int r;
1347
1348         assert(f);
1349         assert(p > 0);
1350
1351         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1352         if (r < 0)
1353                 return r;
1354
1355         if (le64toh(o->entry.realtime) == needle)
1356                 return TEST_FOUND;
1357         else if (le64toh(o->entry.realtime) < needle)
1358                 return TEST_LEFT;
1359         else
1360                 return TEST_RIGHT;
1361 }
1362
1363 int journal_file_move_to_entry_by_realtime(
1364                 JournalFile *f,
1365                 uint64_t realtime,
1366                 direction_t direction,
1367                 Object **ret,
1368                 uint64_t *offset) {
1369
1370         return generic_array_bisect(f,
1371                                     le64toh(f->header->entry_array_offset),
1372                                     le64toh(f->header->n_entries),
1373                                     realtime,
1374                                     test_object_realtime,
1375                                     direction,
1376                                     ret, offset, NULL);
1377 }
1378
1379 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1380         Object *o;
1381         int r;
1382
1383         assert(f);
1384         assert(p > 0);
1385
1386         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1387         if (r < 0)
1388                 return r;
1389
1390         if (le64toh(o->entry.monotonic) == needle)
1391                 return TEST_FOUND;
1392         else if (le64toh(o->entry.monotonic) < needle)
1393                 return TEST_LEFT;
1394         else
1395                 return TEST_RIGHT;
1396 }
1397
1398 int journal_file_move_to_entry_by_monotonic(
1399                 JournalFile *f,
1400                 sd_id128_t boot_id,
1401                 uint64_t monotonic,
1402                 direction_t direction,
1403                 Object **ret,
1404                 uint64_t *offset) {
1405
1406         char t[9+32+1] = "_BOOT_ID=";
1407         Object *o;
1408         int r;
1409
1410         sd_id128_to_string(boot_id, t + 9);
1411
1412         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1413         if (r < 0)
1414                 return r;
1415         else if (r == 0)
1416                 return -ENOENT;
1417
1418         return generic_array_bisect_plus_one(f,
1419                                              le64toh(o->data.entry_offset),
1420                                              le64toh(o->data.entry_array_offset),
1421                                              le64toh(o->data.n_entries),
1422                                              monotonic,
1423                                              test_object_monotonic,
1424                                              direction,
1425                                              ret, offset, NULL);
1426 }
1427
1428 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1429         assert(f);
1430         assert(p > 0);
1431
1432         if (p == needle)
1433                 return TEST_FOUND;
1434         else if (p < needle)
1435                 return TEST_LEFT;
1436         else
1437                 return TEST_RIGHT;
1438 }
1439
1440 int journal_file_next_entry(
1441                 JournalFile *f,
1442                 Object *o, uint64_t p,
1443                 direction_t direction,
1444                 Object **ret, uint64_t *offset) {
1445
1446         uint64_t i, n;
1447         int r;
1448
1449         assert(f);
1450         assert(p > 0 || !o);
1451
1452         n = le64toh(f->header->n_entries);
1453         if (n <= 0)
1454                 return 0;
1455
1456         if (!o)
1457                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1458         else {
1459                 if (o->object.type != OBJECT_ENTRY)
1460                         return -EINVAL;
1461
1462                 r = generic_array_bisect(f,
1463                                          le64toh(f->header->entry_array_offset),
1464                                          le64toh(f->header->n_entries),
1465                                          p,
1466                                          test_object_offset,
1467                                          DIRECTION_DOWN,
1468                                          NULL, NULL,
1469                                          &i);
1470                 if (r <= 0)
1471                         return r;
1472
1473                 if (direction == DIRECTION_DOWN) {
1474                         if (i >= n - 1)
1475                                 return 0;
1476
1477                         i++;
1478                 } else {
1479                         if (i <= 0)
1480                                 return 0;
1481
1482                         i--;
1483                 }
1484         }
1485
1486         /* And jump to it */
1487         return generic_array_get(f,
1488                                  le64toh(f->header->entry_array_offset),
1489                                  i,
1490                                  ret, offset);
1491 }
1492
1493 int journal_file_skip_entry(
1494                 JournalFile *f,
1495                 Object *o, uint64_t p,
1496                 int64_t skip,
1497                 Object **ret, uint64_t *offset) {
1498
1499         uint64_t i, n;
1500         int r;
1501
1502         assert(f);
1503         assert(o);
1504         assert(p > 0);
1505
1506         if (o->object.type != OBJECT_ENTRY)
1507                 return -EINVAL;
1508
1509         r = generic_array_bisect(f,
1510                                  le64toh(f->header->entry_array_offset),
1511                                  le64toh(f->header->n_entries),
1512                                  p,
1513                                  test_object_offset,
1514                                  DIRECTION_DOWN,
1515                                  NULL, NULL,
1516                                  &i);
1517         if (r <= 0)
1518                 return r;
1519
1520         /* Calculate new index */
1521         if (skip < 0) {
1522                 if ((uint64_t) -skip >= i)
1523                         i = 0;
1524                 else
1525                         i = i - (uint64_t) -skip;
1526         } else
1527                 i  += (uint64_t) skip;
1528
1529         n = le64toh(f->header->n_entries);
1530         if (n <= 0)
1531                 return -EBADMSG;
1532
1533         if (i >= n)
1534                 i = n-1;
1535
1536         return generic_array_get(f,
1537                                  le64toh(f->header->entry_array_offset),
1538                                  i,
1539                                  ret, offset);
1540 }
1541
1542 int journal_file_next_entry_for_data(
1543                 JournalFile *f,
1544                 Object *o, uint64_t p,
1545                 uint64_t data_offset,
1546                 direction_t direction,
1547                 Object **ret, uint64_t *offset) {
1548
1549         uint64_t n, i;
1550         int r;
1551         Object *d;
1552
1553         assert(f);
1554         assert(p > 0 || !o);
1555
1556         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1557         if (r < 0)
1558                 return r;
1559
1560         n = le64toh(d->data.n_entries);
1561         if (n <= 0)
1562                 return n;
1563
1564         if (!o)
1565                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1566         else {
1567                 if (o->object.type != OBJECT_ENTRY)
1568                         return -EINVAL;
1569
1570                 r = generic_array_bisect_plus_one(f,
1571                                                   le64toh(d->data.entry_offset),
1572                                                   le64toh(d->data.entry_array_offset),
1573                                                   le64toh(d->data.n_entries),
1574                                                   p,
1575                                                   test_object_offset,
1576                                                   DIRECTION_DOWN,
1577                                                   NULL, NULL,
1578                                                   &i);
1579
1580                 if (r <= 0)
1581                         return r;
1582
1583                 if (direction == DIRECTION_DOWN) {
1584                         if (i >= n - 1)
1585                                 return 0;
1586
1587                         i++;
1588                 } else {
1589                         if (i <= 0)
1590                                 return 0;
1591
1592                         i--;
1593                 }
1594
1595         }
1596
1597         return generic_array_get_plus_one(f,
1598                                           le64toh(d->data.entry_offset),
1599                                           le64toh(d->data.entry_array_offset),
1600                                           i,
1601                                           ret, offset);
1602 }
1603
1604 int journal_file_move_to_entry_by_seqnum_for_data(
1605                 JournalFile *f,
1606                 uint64_t data_offset,
1607                 uint64_t seqnum,
1608                 direction_t direction,
1609                 Object **ret, uint64_t *offset) {
1610
1611         Object *d;
1612         int r;
1613
1614         assert(f);
1615
1616         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1617         if (r < 0)
1618                 return r;
1619
1620         return generic_array_bisect_plus_one(f,
1621                                              le64toh(d->data.entry_offset),
1622                                              le64toh(d->data.entry_array_offset),
1623                                              le64toh(d->data.n_entries),
1624                                              seqnum,
1625                                              test_object_seqnum,
1626                                              direction,
1627                                              ret, offset, NULL);
1628 }
1629
1630 int journal_file_move_to_entry_by_realtime_for_data(
1631                 JournalFile *f,
1632                 uint64_t data_offset,
1633                 uint64_t realtime,
1634                 direction_t direction,
1635                 Object **ret, uint64_t *offset) {
1636
1637         Object *d;
1638         int r;
1639
1640         assert(f);
1641
1642         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1643         if (r < 0)
1644                 return r;
1645
1646         return generic_array_bisect_plus_one(f,
1647                                              le64toh(d->data.entry_offset),
1648                                              le64toh(d->data.entry_array_offset),
1649                                              le64toh(d->data.n_entries),
1650                                              realtime,
1651                                              test_object_realtime,
1652                                              direction,
1653                                              ret, offset, NULL);
1654 }
1655
1656 void journal_file_dump(JournalFile *f) {
1657         char a[33], b[33], c[33];
1658         Object *o;
1659         int r;
1660         uint64_t p;
1661
1662         assert(f);
1663
1664         printf("File Path: %s\n"
1665                "File ID: %s\n"
1666                "Machine ID: %s\n"
1667                "Boot ID: %s\n"
1668                "Arena size: %llu\n"
1669                "Objects: %lu\n"
1670                "Entries: %lu\n",
1671                f->path,
1672                sd_id128_to_string(f->header->file_id, a),
1673                sd_id128_to_string(f->header->machine_id, b),
1674                sd_id128_to_string(f->header->boot_id, c),
1675                (unsigned long long) le64toh(f->header->arena_size),
1676                (unsigned long) le64toh(f->header->n_objects),
1677                (unsigned long) le64toh(f->header->n_entries));
1678
1679         p = le64toh(f->header->header_size);
1680         while (p != 0) {
1681                 r = journal_file_move_to_object(f, -1, p, &o);
1682                 if (r < 0)
1683                         goto fail;
1684
1685                 switch (o->object.type) {
1686
1687                 case OBJECT_UNUSED:
1688                         printf("Type: OBJECT_UNUSED\n");
1689                         break;
1690
1691                 case OBJECT_DATA:
1692                         printf("Type: OBJECT_DATA\n");
1693                         break;
1694
1695                 case OBJECT_ENTRY:
1696                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1697                                (unsigned long long) le64toh(o->entry.seqnum),
1698                                (unsigned long long) le64toh(o->entry.monotonic),
1699                                (unsigned long long) le64toh(o->entry.realtime));
1700                         break;
1701
1702                 case OBJECT_FIELD_HASH_TABLE:
1703                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1704                         break;
1705
1706                 case OBJECT_DATA_HASH_TABLE:
1707                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1708                         break;
1709
1710                 case OBJECT_ENTRY_ARRAY:
1711                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1712                         break;
1713
1714                 case OBJECT_SIGNATURE:
1715                         printf("Type: OBJECT_SIGNATURE\n");
1716                         break;
1717                 }
1718
1719                 if (o->object.flags & OBJECT_COMPRESSED)
1720                         printf("Flags: COMPRESSED\n");
1721
1722                 if (p == le64toh(f->header->tail_object_offset))
1723                         p = 0;
1724                 else
1725                         p = p + ALIGN64(le64toh(o->object.size));
1726         }
1727
1728         return;
1729 fail:
1730         log_error("File corrupt");
1731 }
1732
1733 int journal_file_open(
1734                 const char *fname,
1735                 int flags,
1736                 mode_t mode,
1737                 JournalFile *template,
1738                 JournalFile **ret) {
1739
1740         JournalFile *f;
1741         int r;
1742         bool newly_created = false;
1743
1744         assert(fname);
1745
1746         if ((flags & O_ACCMODE) != O_RDONLY &&
1747             (flags & O_ACCMODE) != O_RDWR)
1748                 return -EINVAL;
1749
1750         if (!endswith(fname, ".journal"))
1751                 return -EINVAL;
1752
1753         f = new0(JournalFile, 1);
1754         if (!f)
1755                 return -ENOMEM;
1756
1757         f->fd = -1;
1758         f->flags = flags;
1759         f->mode = mode;
1760         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1761         f->prot = prot_from_flags(flags);
1762
1763         if (template) {
1764                 f->metrics = template->metrics;
1765                 f->compress = template->compress;
1766         }
1767
1768         f->path = strdup(fname);
1769         if (!f->path) {
1770                 r = -ENOMEM;
1771                 goto fail;
1772         }
1773
1774         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1775         if (f->fd < 0) {
1776                 r = -errno;
1777                 goto fail;
1778         }
1779
1780         if (fstat(f->fd, &f->last_stat) < 0) {
1781                 r = -errno;
1782                 goto fail;
1783         }
1784
1785         if (f->last_stat.st_size == 0 && f->writable) {
1786                 newly_created = true;
1787
1788                 r = journal_file_init_header(f, template);
1789                 if (r < 0)
1790                         goto fail;
1791
1792                 if (fstat(f->fd, &f->last_stat) < 0) {
1793                         r = -errno;
1794                         goto fail;
1795                 }
1796         }
1797
1798         if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1799                 r = -EIO;
1800                 goto fail;
1801         }
1802
1803         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1804         if (f->header == MAP_FAILED) {
1805                 f->header = NULL;
1806                 r = -errno;
1807                 goto fail;
1808         }
1809
1810         if (!newly_created) {
1811                 r = journal_file_verify_header(f);
1812                 if (r < 0)
1813                         goto fail;
1814         }
1815
1816         if (f->writable) {
1817                 r = journal_file_refresh_header(f);
1818                 if (r < 0)
1819                         goto fail;
1820         }
1821
1822         if (newly_created) {
1823
1824                 r = journal_file_setup_field_hash_table(f);
1825                 if (r < 0)
1826                         goto fail;
1827
1828                 r = journal_file_setup_data_hash_table(f);
1829                 if (r < 0)
1830                         goto fail;
1831         }
1832
1833         r = journal_file_map_field_hash_table(f);
1834         if (r < 0)
1835                 goto fail;
1836
1837         r = journal_file_map_data_hash_table(f);
1838         if (r < 0)
1839                 goto fail;
1840
1841         if (ret)
1842                 *ret = f;
1843
1844         return 0;
1845
1846 fail:
1847         journal_file_close(f);
1848
1849         return r;
1850 }
1851
1852 int journal_file_rotate(JournalFile **f) {
1853         char *p;
1854         size_t l;
1855         JournalFile *old_file, *new_file = NULL;
1856         int r;
1857
1858         assert(f);
1859         assert(*f);
1860
1861         old_file = *f;
1862
1863         if (!old_file->writable)
1864                 return -EINVAL;
1865
1866         if (!endswith(old_file->path, ".journal"))
1867                 return -EINVAL;
1868
1869         l = strlen(old_file->path);
1870
1871         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
1872         if (!p)
1873                 return -ENOMEM;
1874
1875         memcpy(p, old_file->path, l - 8);
1876         p[l-8] = '@';
1877         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1878         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1879                  "-%016llx-%016llx.journal",
1880                  (unsigned long long) le64toh((*f)->header->seqnum),
1881                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1882
1883         r = rename(old_file->path, p);
1884         free(p);
1885
1886         if (r < 0)
1887                 return -errno;
1888
1889         old_file->header->state = STATE_ARCHIVED;
1890
1891         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1892         journal_file_close(old_file);
1893
1894         *f = new_file;
1895         return r;
1896 }
1897
1898 int journal_file_open_reliably(
1899                 const char *fname,
1900                 int flags,
1901                 mode_t mode,
1902                 JournalFile *template,
1903                 JournalFile **ret) {
1904
1905         int r;
1906         size_t l;
1907         char *p;
1908
1909         r = journal_file_open(fname, flags, mode, template, ret);
1910         if (r != -EBADMSG && /* corrupted */
1911             r != -ENODATA && /* truncated */
1912             r != -EHOSTDOWN && /* other machine */
1913             r != -EPROTONOSUPPORT) /* incompatible feature */
1914                 return r;
1915
1916         if ((flags & O_ACCMODE) == O_RDONLY)
1917                 return r;
1918
1919         if (!(flags & O_CREAT))
1920                 return r;
1921
1922         /* The file is corrupted. Rotate it away and try it again (but only once) */
1923
1924         l = strlen(fname);
1925         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
1926                      (int) (l-8), fname,
1927                      (unsigned long long) now(CLOCK_REALTIME),
1928                      random_ull()) < 0)
1929                 return -ENOMEM;
1930
1931         r = rename(fname, p);
1932         free(p);
1933         if (r < 0)
1934                 return -errno;
1935
1936         log_warning("File %s corrupted, renaming and replacing.", fname);
1937
1938         return journal_file_open(fname, flags, mode, template, ret);
1939 }
1940
1941 struct vacuum_info {
1942         off_t usage;
1943         char *filename;
1944
1945         uint64_t realtime;
1946         sd_id128_t seqnum_id;
1947         uint64_t seqnum;
1948
1949         bool have_seqnum;
1950 };
1951
1952 static int vacuum_compare(const void *_a, const void *_b) {
1953         const struct vacuum_info *a, *b;
1954
1955         a = _a;
1956         b = _b;
1957
1958         if (a->have_seqnum && b->have_seqnum &&
1959             sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1960                 if (a->seqnum < b->seqnum)
1961                         return -1;
1962                 else if (a->seqnum > b->seqnum)
1963                         return 1;
1964                 else
1965                         return 0;
1966         }
1967
1968         if (a->realtime < b->realtime)
1969                 return -1;
1970         else if (a->realtime > b->realtime)
1971                 return 1;
1972         else if (a->have_seqnum && b->have_seqnum)
1973                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1974         else
1975                 return strcmp(a->filename, b->filename);
1976 }
1977
1978 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1979         DIR *d;
1980         int r = 0;
1981         struct vacuum_info *list = NULL;
1982         unsigned n_list = 0, n_allocated = 0, i;
1983         uint64_t sum = 0;
1984
1985         assert(directory);
1986
1987         if (max_use <= 0)
1988                 return 0;
1989
1990         d = opendir(directory);
1991         if (!d)
1992                 return -errno;
1993
1994         for (;;) {
1995                 int k;
1996                 struct dirent buf, *de;
1997                 size_t q;
1998                 struct stat st;
1999                 char *p;
2000                 unsigned long long seqnum = 0, realtime;
2001                 sd_id128_t seqnum_id;
2002                 bool have_seqnum;
2003
2004                 k = readdir_r(d, &buf, &de);
2005                 if (k != 0) {
2006                         r = -k;
2007                         goto finish;
2008                 }
2009
2010                 if (!de)
2011                         break;
2012
2013                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2014                         continue;
2015
2016                 if (!S_ISREG(st.st_mode))
2017                         continue;
2018
2019                 q = strlen(de->d_name);
2020
2021                 if (endswith(de->d_name, ".journal")) {
2022
2023                         /* Vacuum archived files */
2024
2025                         if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2026                                 continue;
2027
2028                         if (de->d_name[q-8-16-1] != '-' ||
2029                             de->d_name[q-8-16-1-16-1] != '-' ||
2030                             de->d_name[q-8-16-1-16-1-32-1] != '@')
2031                                 continue;
2032
2033                         p = strdup(de->d_name);
2034                         if (!p) {
2035                                 r = -ENOMEM;
2036                                 goto finish;
2037                         }
2038
2039                         de->d_name[q-8-16-1-16-1] = 0;
2040                         if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2041                                 free(p);
2042                                 continue;
2043                         }
2044
2045                         if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2046                                 free(p);
2047                                 continue;
2048                         }
2049
2050                         have_seqnum = true;
2051
2052                 } else if (endswith(de->d_name, ".journal~")) {
2053                         unsigned long long tmp;
2054
2055                         /* Vacuum corrupted files */
2056
2057                         if (q < 1 + 16 + 1 + 16 + 8 + 1)
2058                                 continue;
2059
2060                         if (de->d_name[q-1-8-16-1] != '-' ||
2061                             de->d_name[q-1-8-16-1-16-1] != '@')
2062                                 continue;
2063
2064                         p = strdup(de->d_name);
2065                         if (!p) {
2066                                 r = -ENOMEM;
2067                                 goto finish;
2068                         }
2069
2070                         if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2071                                 free(p);
2072                                 continue;
2073                         }
2074
2075                         have_seqnum = false;
2076                 } else
2077                         continue;
2078
2079                 if (n_list >= n_allocated) {
2080                         struct vacuum_info *j;
2081
2082                         n_allocated = MAX(n_allocated * 2U, 8U);
2083                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2084                         if (!j) {
2085                                 free(p);
2086                                 r = -ENOMEM;
2087                                 goto finish;
2088                         }
2089
2090                         list = j;
2091                 }
2092
2093                 list[n_list].filename = p;
2094                 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2095                 list[n_list].seqnum = seqnum;
2096                 list[n_list].realtime = realtime;
2097                 list[n_list].seqnum_id = seqnum_id;
2098                 list[n_list].have_seqnum = have_seqnum;
2099
2100                 sum += list[n_list].usage;
2101
2102                 n_list ++;
2103         }
2104
2105         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2106
2107         for(i = 0; i < n_list; i++) {
2108                 struct statvfs ss;
2109
2110                 if (fstatvfs(dirfd(d), &ss) < 0) {
2111                         r = -errno;
2112                         goto finish;
2113                 }
2114
2115                 if (sum <= max_use &&
2116                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2117                         break;
2118
2119                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2120                         log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2121                         sum -= list[i].usage;
2122                 } else if (errno != ENOENT)
2123                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2124         }
2125
2126 finish:
2127         for (i = 0; i < n_list; i++)
2128                 free(list[i].filename);
2129
2130         free(list);
2131
2132         if (d)
2133                 closedir(d);
2134
2135         return r;
2136 }
2137
2138 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2139         uint64_t i, n;
2140         uint64_t q, xor_hash = 0;
2141         int r;
2142         EntryItem *items;
2143         dual_timestamp ts;
2144
2145         assert(from);
2146         assert(to);
2147         assert(o);
2148         assert(p);
2149
2150         if (!to->writable)
2151                 return -EPERM;
2152
2153         ts.monotonic = le64toh(o->entry.monotonic);
2154         ts.realtime = le64toh(o->entry.realtime);
2155
2156         if (to->tail_entry_monotonic_valid &&
2157             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2158                 return -EINVAL;
2159
2160         if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2161                 return -EINVAL;
2162
2163         n = journal_file_entry_n_items(o);
2164         items = alloca(sizeof(EntryItem) * n);
2165
2166         for (i = 0; i < n; i++) {
2167                 uint64_t l, h;
2168                 le64_t le_hash;
2169                 size_t t;
2170                 void *data;
2171                 Object *u;
2172
2173                 q = le64toh(o->entry.items[i].object_offset);
2174                 le_hash = o->entry.items[i].hash;
2175
2176                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2177                 if (r < 0)
2178                         return r;
2179
2180                 if (le_hash != o->data.hash)
2181                         return -EBADMSG;
2182
2183                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2184                 t = (size_t) l;
2185
2186                 /* We hit the limit on 32bit machines */
2187                 if ((uint64_t) t != l)
2188                         return -E2BIG;
2189
2190                 if (o->object.flags & OBJECT_COMPRESSED) {
2191 #ifdef HAVE_XZ
2192                         uint64_t rsize;
2193
2194                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2195                                 return -EBADMSG;
2196
2197                         data = from->compress_buffer;
2198                         l = rsize;
2199 #else
2200                         return -EPROTONOSUPPORT;
2201 #endif
2202                 } else
2203                         data = o->data.payload;
2204
2205                 r = journal_file_append_data(to, data, l, &u, &h);
2206                 if (r < 0)
2207                         return r;
2208
2209                 xor_hash ^= le64toh(u->data.hash);
2210                 items[i].object_offset = htole64(h);
2211                 items[i].hash = u->data.hash;
2212
2213                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2214                 if (r < 0)
2215                         return r;
2216         }
2217
2218         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2219 }
2220
2221 void journal_default_metrics(JournalMetrics *m, int fd) {
2222         uint64_t fs_size = 0;
2223         struct statvfs ss;
2224         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2225
2226         assert(m);
2227         assert(fd >= 0);
2228
2229         if (fstatvfs(fd, &ss) >= 0)
2230                 fs_size = ss.f_frsize * ss.f_blocks;
2231
2232         if (m->max_use == (uint64_t) -1) {
2233
2234                 if (fs_size > 0) {
2235                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2236
2237                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2238                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2239
2240                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2241                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2242                 } else
2243                         m->max_use = DEFAULT_MAX_USE_LOWER;
2244         } else {
2245                 m->max_use = PAGE_ALIGN(m->max_use);
2246
2247                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2248                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2249         }
2250
2251         if (m->max_size == (uint64_t) -1) {
2252                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2253
2254                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2255                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2256         } else
2257                 m->max_size = PAGE_ALIGN(m->max_size);
2258
2259         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2260                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2261
2262         if (m->max_size*2 > m->max_use)
2263                 m->max_use = m->max_size*2;
2264
2265         if (m->min_size == (uint64_t) -1)
2266                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2267         else {
2268                 m->min_size = PAGE_ALIGN(m->min_size);
2269
2270                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2271                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2272
2273                 if (m->min_size > m->max_size)
2274                         m->max_size = m->min_size;
2275         }
2276
2277         if (m->keep_free == (uint64_t) -1) {
2278
2279                 if (fs_size > 0) {
2280                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2281
2282                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2283                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2284
2285                 } else
2286                         m->keep_free = DEFAULT_KEEP_FREE;
2287         }
2288
2289         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2290                  format_bytes(a, sizeof(a), m->max_use),
2291                  format_bytes(b, sizeof(b), m->max_size),
2292                  format_bytes(c, sizeof(c), m->min_size),
2293                  format_bytes(d, sizeof(d), m->keep_free));
2294 }
2295
2296 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2297         Object *o;
2298         int r;
2299
2300         assert(f);
2301         assert(from || to);
2302
2303         if (from) {
2304                 r = journal_file_next_entry(f, NULL, 0, DIRECTION_DOWN, &o, NULL);
2305                 if (r <= 0)
2306                         return r;
2307
2308                 *from = le64toh(o->entry.realtime);
2309         }
2310
2311         if (to) {
2312                 r = journal_file_next_entry(f, NULL, 0, DIRECTION_UP, &o, NULL);
2313                 if (r <= 0)
2314                         return r;
2315
2316                 *to = le64toh(o->entry.realtime);
2317         }
2318
2319         return 1;
2320 }
2321
2322 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2323         char t[9+32+1] = "_BOOT_ID=";
2324         Object *o;
2325         uint64_t p;
2326         int r;
2327
2328         assert(f);
2329         assert(from || to);
2330
2331         sd_id128_to_string(boot_id, t + 9);
2332
2333         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2334         if (r <= 0)
2335                 return r;
2336
2337         if (le64toh(o->data.n_entries) <= 0)
2338                 return 0;
2339
2340         if (from) {
2341                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2342                 if (r < 0)
2343                         return r;
2344
2345                 *from = le64toh(o->entry.monotonic);
2346         }
2347
2348         if (to) {
2349                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2350                 if (r < 0)
2351                         return r;
2352
2353                 r = generic_array_get_plus_one(f,
2354                                                le64toh(o->data.entry_offset),
2355                                                le64toh(o->data.entry_array_offset),
2356                                                le64toh(o->data.n_entries)-1,
2357                                                &o, NULL);
2358                 if (r <= 0)
2359                         return r;
2360
2361                 *to = le64toh(o->entry.monotonic);
2362         }
2363
2364         return 1;
2365 }