chiark / gitweb /
a60a896c2f3ad7370f43abf8ba97514d0c9f20d2
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
35 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
36 #define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)
37
38 #define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)
39
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
41
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */
44
45 /* These are the lower and upper bounds if we deduce the max_use value
46  * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
49
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
52
53 /* This is the upper bound if we deduce the keep_free value from the
54  * file system size */
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
56
57 /* This is the keep_free value when we can't determine the system
58  * size */
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
60
61 static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
62
63 #define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
64
65 void journal_file_close(JournalFile *f) {
66         int t;
67
68         assert(f);
69
70         if (f->header && f->writable)
71                 f->header->state = STATE_OFFLINE;
72
73
74         for (t = 0; t < _WINDOW_MAX; t++)
75                 if (f->windows[t].ptr)
76                         munmap(f->windows[t].ptr, f->windows[t].size);
77
78         if (f->fd >= 0)
79                 close_nointr_nofail(f->fd);
80
81         free(f->path);
82
83 #ifdef HAVE_XZ
84         free(f->compress_buffer);
85 #endif
86
87         free(f);
88 }
89
90 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
91         Header h;
92         ssize_t k;
93         int r;
94
95         assert(f);
96
97         zero(h);
98         memcpy(h.signature, signature, 8);
99         h.arena_offset = htole64(ALIGN64(sizeof(h)));
100
101         r = sd_id128_randomize(&h.file_id);
102         if (r < 0)
103                 return r;
104
105         if (template) {
106                 h.seqnum_id = template->header->seqnum_id;
107                 h.seqnum = template->header->seqnum;
108         } else
109                 h.seqnum_id = h.file_id;
110
111         k = pwrite(f->fd, &h, sizeof(h), 0);
112         if (k < 0)
113                 return -errno;
114
115         if (k != sizeof(h))
116                 return -EIO;
117
118         return 0;
119 }
120
121 static int journal_file_refresh_header(JournalFile *f) {
122         int r;
123         sd_id128_t boot_id;
124
125         assert(f);
126
127         r = sd_id128_get_machine(&f->header->machine_id);
128         if (r < 0)
129                 return r;
130
131         r = sd_id128_get_boot(&boot_id);
132         if (r < 0)
133                 return r;
134
135         if (sd_id128_equal(boot_id, f->header->boot_id))
136                 f->tail_entry_monotonic_valid = true;
137
138         f->header->boot_id = boot_id;
139
140         f->header->state = STATE_ONLINE;
141
142         __sync_synchronize();
143
144         return 0;
145 }
146
147 static int journal_file_verify_header(JournalFile *f) {
148         assert(f);
149
150         if (memcmp(f->header, signature, 8))
151                 return -EBADMSG;
152
153 #ifdef HAVE_XZ
154         if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
155                 return -EPROTONOSUPPORT;
156 #else
157         if (f->header->incompatible_flags != 0)
158                 return -EPROTONOSUPPORT;
159 #endif
160
161         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
162                 return -ENODATA;
163
164         if (f->writable) {
165                 uint8_t state;
166                 sd_id128_t machine_id;
167                 int r;
168
169                 r = sd_id128_get_machine(&machine_id);
170                 if (r < 0)
171                         return r;
172
173                 if (!sd_id128_equal(machine_id, f->header->machine_id))
174                         return -EHOSTDOWN;
175
176                 state = f->header->state;
177
178                 if (state == STATE_ONLINE)
179                         log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
180                 else if (state == STATE_ARCHIVED)
181                         return -ESHUTDOWN;
182                 else if (state != STATE_OFFLINE)
183                         log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
184         }
185
186         return 0;
187 }
188
189 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
190         uint64_t old_size, new_size;
191
192         assert(f);
193
194         /* We assume that this file is not sparse, and we know that
195          * for sure, since we always call posix_fallocate()
196          * ourselves */
197
198         old_size =
199                 le64toh(f->header->arena_offset) +
200                 le64toh(f->header->arena_size);
201
202         new_size = PAGE_ALIGN(offset + size);
203         if (new_size < le64toh(f->header->arena_offset))
204                 new_size = le64toh(f->header->arena_offset);
205
206         if (new_size <= old_size)
207                 return 0;
208
209         if (f->metrics.max_size > 0 &&
210             new_size > f->metrics.max_size)
211                 return -E2BIG;
212
213         if (new_size > f->metrics.min_size &&
214             f->metrics.keep_free > 0) {
215                 struct statvfs svfs;
216
217                 if (fstatvfs(f->fd, &svfs) >= 0) {
218                         uint64_t available;
219
220                         available = svfs.f_bfree * svfs.f_bsize;
221
222                         if (available >= f->metrics.keep_free)
223                                 available -= f->metrics.keep_free;
224                         else
225                                 available = 0;
226
227                         if (new_size - old_size > available)
228                                 return -E2BIG;
229                 }
230         }
231
232         /* Note that the glibc fallocate() fallback is very
233            inefficient, hence we try to minimize the allocation area
234            as we can. */
235         if (posix_fallocate(f->fd, old_size, new_size - old_size) < 0)
236                 return -errno;
237
238         if (fstat(f->fd, &f->last_stat) < 0)
239                 return -errno;
240
241         f->header->arena_size = htole64(new_size - le64toh(f->header->arena_offset));
242
243         return 0;
244 }
245
246 static int journal_file_map(
247                 JournalFile *f,
248                 uint64_t offset,
249                 uint64_t size,
250                 void **_window,
251                 uint64_t *_woffset,
252                 uint64_t *_wsize,
253                 void **ret) {
254
255         uint64_t woffset, wsize;
256         void *window;
257
258         assert(f);
259         assert(size > 0);
260         assert(ret);
261
262         woffset = offset & ~((uint64_t) page_size() - 1ULL);
263         wsize = size + (offset - woffset);
264         wsize = PAGE_ALIGN(wsize);
265
266         /* Avoid SIGBUS on invalid accesses */
267         if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
268                 return -EADDRNOTAVAIL;
269
270         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
271         if (window == MAP_FAILED)
272                 return -errno;
273
274         if (_window)
275                 *_window = window;
276
277         if (_woffset)
278                 *_woffset = woffset;
279
280         if (_wsize)
281                 *_wsize = wsize;
282
283         *ret = (uint8_t*) window + (offset - woffset);
284
285         return 0;
286 }
287
288 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
289         void *p = NULL;
290         uint64_t delta;
291         int r;
292         Window *w;
293
294         assert(f);
295         assert(ret);
296         assert(wt >= 0);
297         assert(wt < _WINDOW_MAX);
298
299         if (offset + size > (uint64_t) f->last_stat.st_size) {
300                 /* Hmm, out of range? Let's refresh the fstat() data
301                  * first, before we trust that check. */
302
303                 if (fstat(f->fd, &f->last_stat) < 0 ||
304                     offset + size > (uint64_t) f->last_stat.st_size)
305                         return -EADDRNOTAVAIL;
306         }
307
308         w = f->windows + wt;
309
310         if (_likely_(w->ptr &&
311                      w->offset <= offset &&
312                      w->offset + w->size >= offset + size)) {
313
314                 *ret = (uint8_t*) w->ptr + (offset - w->offset);
315                 return 0;
316         }
317
318         if (w->ptr) {
319                 if (munmap(w->ptr, w->size) < 0)
320                         return -errno;
321
322                 w->ptr = NULL;
323                 w->size = w->offset = 0;
324         }
325
326         if (size < DEFAULT_WINDOW_SIZE) {
327                 /* If the default window size is larger then what was
328                  * asked for extend the mapping a bit in the hope to
329                  * minimize needed remappings later on. We add half
330                  * the window space before and half behind the
331                  * requested mapping */
332
333                 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
334
335                 if (delta > offset)
336                         delta = offset;
337
338                 offset -= delta;
339                 size = DEFAULT_WINDOW_SIZE;
340         } else
341                 delta = 0;
342
343         if (offset + size > (uint64_t) f->last_stat.st_size)
344                 size = (uint64_t) f->last_stat.st_size - offset;
345
346         if (size <= 0)
347                 return -EADDRNOTAVAIL;
348
349         r = journal_file_map(f,
350                              offset, size,
351                              &w->ptr, &w->offset, &w->size,
352                              &p);
353
354         if (r < 0)
355                 return r;
356
357         *ret = (uint8_t*) p + delta;
358         return 0;
359 }
360
361 static bool verify_hash(Object *o) {
362         uint64_t h1, h2;
363
364         assert(o);
365
366         if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
367                 h1 = le64toh(o->data.hash);
368                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
369         } else if (o->object.type == OBJECT_FIELD) {
370                 h1 = le64toh(o->field.hash);
371                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
372         } else
373                 return true;
374
375         return h1 == h2;
376 }
377
378 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
379         int r;
380         void *t;
381         Object *o;
382         uint64_t s;
383
384         assert(f);
385         assert(ret);
386         assert(type < _OBJECT_TYPE_MAX);
387
388         r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
389         if (r < 0)
390                 return r;
391
392         o = (Object*) t;
393         s = le64toh(o->object.size);
394
395         if (s < sizeof(ObjectHeader))
396                 return -EBADMSG;
397
398         if (type >= 0 && o->object.type != type)
399                 return -EBADMSG;
400
401         if (s > sizeof(ObjectHeader)) {
402                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
403                 if (r < 0)
404                         return r;
405
406                 o = (Object*) t;
407         }
408
409         if (!verify_hash(o))
410                 return -EBADMSG;
411
412         *ret = o;
413         return 0;
414 }
415
416 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
417         uint64_t r;
418
419         assert(f);
420
421         r = le64toh(f->header->seqnum) + 1;
422
423         if (seqnum) {
424                 /* If an external seqnum counter was passed, we update
425                  * both the local and the external one, and set it to
426                  * the maximum of both */
427
428                 if (*seqnum + 1 > r)
429                         r = *seqnum + 1;
430
431                 *seqnum = r;
432         }
433
434         f->header->seqnum = htole64(r);
435
436         if (f->header->first_seqnum == 0)
437                 f->header->first_seqnum = htole64(r);
438
439         return r;
440 }
441
442 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
443         int r;
444         uint64_t p;
445         Object *tail, *o;
446         void *t;
447
448         assert(f);
449         assert(size >= sizeof(ObjectHeader));
450         assert(offset);
451         assert(ret);
452
453         p = le64toh(f->header->tail_object_offset);
454         if (p == 0)
455                 p = le64toh(f->header->arena_offset);
456         else {
457                 r = journal_file_move_to_object(f, -1, p, &tail);
458                 if (r < 0)
459                         return r;
460
461                 p += ALIGN64(le64toh(tail->object.size));
462         }
463
464         r = journal_file_allocate(f, p, size);
465         if (r < 0)
466                 return r;
467
468         r = journal_file_move_to(f, type, p, size, &t);
469         if (r < 0)
470                 return r;
471
472         o = (Object*) t;
473
474         zero(o->object);
475         o->object.type = type;
476         o->object.size = htole64(size);
477
478         f->header->tail_object_offset = htole64(p);
479         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
480
481         *ret = o;
482         *offset = p;
483
484         return 0;
485 }
486
487 static int journal_file_setup_data_hash_table(JournalFile *f) {
488         uint64_t s, p;
489         Object *o;
490         int r;
491
492         assert(f);
493
494         s = DEFAULT_DATA_HASH_TABLE_SIZE;
495         r = journal_file_append_object(f,
496                                        OBJECT_DATA_HASH_TABLE,
497                                        offsetof(Object, hash_table.items) + s,
498                                        &o, &p);
499         if (r < 0)
500                 return r;
501
502         memset(o->hash_table.items, 0, s);
503
504         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
505         f->header->data_hash_table_size = htole64(s);
506
507         return 0;
508 }
509
510 static int journal_file_setup_field_hash_table(JournalFile *f) {
511         uint64_t s, p;
512         Object *o;
513         int r;
514
515         assert(f);
516
517         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
518         r = journal_file_append_object(f,
519                                        OBJECT_FIELD_HASH_TABLE,
520                                        offsetof(Object, hash_table.items) + s,
521                                        &o, &p);
522         if (r < 0)
523                 return r;
524
525         memset(o->hash_table.items, 0, s);
526
527         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
528         f->header->field_hash_table_size = htole64(s);
529
530         return 0;
531 }
532
533 static int journal_file_map_data_hash_table(JournalFile *f) {
534         uint64_t s, p;
535         void *t;
536         int r;
537
538         assert(f);
539
540         p = le64toh(f->header->data_hash_table_offset);
541         s = le64toh(f->header->data_hash_table_size);
542
543         r = journal_file_move_to(f,
544                                  WINDOW_DATA_HASH_TABLE,
545                                  p, s,
546                                  &t);
547         if (r < 0)
548                 return r;
549
550         f->data_hash_table = t;
551         return 0;
552 }
553
554 static int journal_file_map_field_hash_table(JournalFile *f) {
555         uint64_t s, p;
556         void *t;
557         int r;
558
559         assert(f);
560
561         p = le64toh(f->header->field_hash_table_offset);
562         s = le64toh(f->header->field_hash_table_size);
563
564         r = journal_file_move_to(f,
565                                  WINDOW_FIELD_HASH_TABLE,
566                                  p, s,
567                                  &t);
568         if (r < 0)
569                 return r;
570
571         f->field_hash_table = t;
572         return 0;
573 }
574
575 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
576         uint64_t p, h;
577         int r;
578
579         assert(f);
580         assert(o);
581         assert(offset > 0);
582         assert(o->object.type == OBJECT_DATA);
583
584         /* This might alter the window we are looking at */
585
586         o->data.next_hash_offset = o->data.next_field_offset = 0;
587         o->data.entry_offset = o->data.entry_array_offset = 0;
588         o->data.n_entries = 0;
589
590         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
591         p = le64toh(f->data_hash_table[h].head_hash_offset);
592         if (p == 0) {
593                 /* Only entry in the hash table is easy */
594                 f->data_hash_table[h].head_hash_offset = htole64(offset);
595         } else {
596                 /* Move back to the previous data object, to patch in
597                  * pointer */
598
599                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
600                 if (r < 0)
601                         return r;
602
603                 o->data.next_hash_offset = htole64(offset);
604         }
605
606         f->data_hash_table[h].tail_hash_offset = htole64(offset);
607
608         return 0;
609 }
610
611 int journal_file_find_data_object_with_hash(
612                 JournalFile *f,
613                 const void *data, uint64_t size, uint64_t hash,
614                 Object **ret, uint64_t *offset) {
615
616         uint64_t p, osize, h;
617         int r;
618
619         assert(f);
620         assert(data || size == 0);
621
622         osize = offsetof(Object, data.payload) + size;
623
624         if (f->header->data_hash_table_size == 0)
625                 return -EBADMSG;
626
627         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
628         p = le64toh(f->data_hash_table[h].head_hash_offset);
629
630         while (p > 0) {
631                 Object *o;
632
633                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
634                 if (r < 0)
635                         return r;
636
637                 if (le64toh(o->data.hash) != hash)
638                         goto next;
639
640                 if (o->object.flags & OBJECT_COMPRESSED) {
641 #ifdef HAVE_XZ
642                         uint64_t l, rsize;
643
644                         l = le64toh(o->object.size);
645                         if (l <= offsetof(Object, data.payload))
646                                 return -EBADMSG;
647
648                         l -= offsetof(Object, data.payload);
649
650                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
651                                 return -EBADMSG;
652
653                         if (rsize == size &&
654                             memcmp(f->compress_buffer, data, size) == 0) {
655
656                                 if (ret)
657                                         *ret = o;
658
659                                 if (offset)
660                                         *offset = p;
661
662                                 return 1;
663                         }
664 #else
665                         return -EPROTONOSUPPORT;
666 #endif
667
668                 } else if (le64toh(o->object.size) == osize &&
669                            memcmp(o->data.payload, data, size) == 0) {
670
671                         if (ret)
672                                 *ret = o;
673
674                         if (offset)
675                                 *offset = p;
676
677                         return 1;
678                 }
679
680         next:
681                 p = le64toh(o->data.next_hash_offset);
682         }
683
684         return 0;
685 }
686
687 int journal_file_find_data_object(
688                 JournalFile *f,
689                 const void *data, uint64_t size,
690                 Object **ret, uint64_t *offset) {
691
692         uint64_t hash;
693
694         assert(f);
695         assert(data || size == 0);
696
697         hash = hash64(data, size);
698
699         return journal_file_find_data_object_with_hash(f,
700                                                        data, size, hash,
701                                                        ret, offset);
702 }
703
704 static int journal_file_append_data(
705                 JournalFile *f,
706                 const void *data, uint64_t size,
707                 Object **ret, uint64_t *offset) {
708
709         uint64_t hash, p;
710         uint64_t osize;
711         Object *o;
712         int r;
713         bool compressed = false;
714
715         assert(f);
716         assert(data || size == 0);
717
718         hash = hash64(data, size);
719
720         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
721         if (r < 0)
722                 return r;
723         else if (r > 0) {
724
725                 if (ret)
726                         *ret = o;
727
728                 if (offset)
729                         *offset = p;
730
731                 return 0;
732         }
733
734         osize = offsetof(Object, data.payload) + size;
735         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
736         if (r < 0)
737                 return r;
738
739         o->data.hash = htole64(hash);
740
741 #ifdef HAVE_XZ
742         if (f->compress &&
743             size >= COMPRESSION_SIZE_THRESHOLD) {
744                 uint64_t rsize;
745
746                 compressed = compress_blob(data, size, o->data.payload, &rsize);
747
748                 if (compressed) {
749                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
750                         o->object.flags |= OBJECT_COMPRESSED;
751
752                         f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
753
754                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
755                 }
756         }
757 #endif
758
759         if (!compressed)
760                 memcpy(o->data.payload, data, size);
761
762         r = journal_file_link_data(f, o, p, hash);
763         if (r < 0)
764                 return r;
765
766         /* The linking might have altered the window, so let's
767          * refresh our pointer */
768         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
769         if (r < 0)
770                 return r;
771
772         if (ret)
773                 *ret = o;
774
775         if (offset)
776                 *offset = p;
777
778         return 0;
779 }
780
781 uint64_t journal_file_entry_n_items(Object *o) {
782         assert(o);
783         assert(o->object.type == OBJECT_ENTRY);
784
785         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
786 }
787
788 static uint64_t journal_file_entry_array_n_items(Object *o) {
789         assert(o);
790         assert(o->object.type == OBJECT_ENTRY_ARRAY);
791
792         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
793 }
794
795 static int link_entry_into_array(JournalFile *f,
796                                  le64_t *first,
797                                  le64_t *idx,
798                                  uint64_t p) {
799         int r;
800         uint64_t n = 0, ap = 0, q, i, a, hidx;
801         Object *o;
802
803         assert(f);
804         assert(first);
805         assert(idx);
806         assert(p > 0);
807
808         a = le64toh(*first);
809         i = hidx = le64toh(*idx);
810         while (a > 0) {
811
812                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
813                 if (r < 0)
814                         return r;
815
816                 n = journal_file_entry_array_n_items(o);
817                 if (i < n) {
818                         o->entry_array.items[i] = htole64(p);
819                         *idx = htole64(hidx + 1);
820                         return 0;
821                 }
822
823                 i -= n;
824                 ap = a;
825                 a = le64toh(o->entry_array.next_entry_array_offset);
826         }
827
828         if (hidx > n)
829                 n = (hidx+1) * 2;
830         else
831                 n = n * 2;
832
833         if (n < 4)
834                 n = 4;
835
836         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
837                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
838                                        &o, &q);
839         if (r < 0)
840                 return r;
841
842         o->entry_array.items[i] = htole64(p);
843
844         if (ap == 0)
845                 *first = htole64(q);
846         else {
847                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
848                 if (r < 0)
849                         return r;
850
851                 o->entry_array.next_entry_array_offset = htole64(q);
852         }
853
854         *idx = htole64(hidx + 1);
855
856         return 0;
857 }
858
859 static int link_entry_into_array_plus_one(JournalFile *f,
860                                           le64_t *extra,
861                                           le64_t *first,
862                                           le64_t *idx,
863                                           uint64_t p) {
864
865         int r;
866
867         assert(f);
868         assert(extra);
869         assert(first);
870         assert(idx);
871         assert(p > 0);
872
873         if (*idx == 0)
874                 *extra = htole64(p);
875         else {
876                 le64_t i;
877
878                 i = htole64(le64toh(*idx) - 1);
879                 r = link_entry_into_array(f, first, &i, p);
880                 if (r < 0)
881                         return r;
882         }
883
884         *idx = htole64(le64toh(*idx) + 1);
885         return 0;
886 }
887
888 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
889         uint64_t p;
890         int r;
891         assert(f);
892         assert(o);
893         assert(offset > 0);
894
895         p = le64toh(o->entry.items[i].object_offset);
896         if (p == 0)
897                 return -EINVAL;
898
899         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
900         if (r < 0)
901                 return r;
902
903         return link_entry_into_array_plus_one(f,
904                                               &o->data.entry_offset,
905                                               &o->data.entry_array_offset,
906                                               &o->data.n_entries,
907                                               offset);
908 }
909
910 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
911         uint64_t n, i;
912         int r;
913
914         assert(f);
915         assert(o);
916         assert(offset > 0);
917         assert(o->object.type == OBJECT_ENTRY);
918
919         __sync_synchronize();
920
921         /* Link up the entry itself */
922         r = link_entry_into_array(f,
923                                   &f->header->entry_array_offset,
924                                   &f->header->n_entries,
925                                   offset);
926         if (r < 0)
927                 return r;
928
929         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
930
931         if (f->header->head_entry_realtime == 0)
932                 f->header->head_entry_realtime = o->entry.realtime;
933
934         f->header->tail_entry_realtime = o->entry.realtime;
935         f->header->tail_entry_monotonic = o->entry.monotonic;
936
937         f->tail_entry_monotonic_valid = true;
938
939         /* Link up the items */
940         n = journal_file_entry_n_items(o);
941         for (i = 0; i < n; i++) {
942                 r = journal_file_link_entry_item(f, o, offset, i);
943                 if (r < 0)
944                         return r;
945         }
946
947         return 0;
948 }
949
950 static int journal_file_append_entry_internal(
951                 JournalFile *f,
952                 const dual_timestamp *ts,
953                 uint64_t xor_hash,
954                 const EntryItem items[], unsigned n_items,
955                 uint64_t *seqnum,
956                 Object **ret, uint64_t *offset) {
957         uint64_t np;
958         uint64_t osize;
959         Object *o;
960         int r;
961
962         assert(f);
963         assert(items || n_items == 0);
964         assert(ts);
965
966         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
967
968         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
969         if (r < 0)
970                 return r;
971
972         o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
973         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
974         o->entry.realtime = htole64(ts->realtime);
975         o->entry.monotonic = htole64(ts->monotonic);
976         o->entry.xor_hash = htole64(xor_hash);
977         o->entry.boot_id = f->header->boot_id;
978
979         r = journal_file_link_entry(f, o, np);
980         if (r < 0)
981                 return r;
982
983         if (ret)
984                 *ret = o;
985
986         if (offset)
987                 *offset = np;
988
989         return 0;
990 }
991
992 void journal_file_post_change(JournalFile *f) {
993         assert(f);
994
995         /* inotify() does not receive IN_MODIFY events from file
996          * accesses done via mmap(). After each access we hence
997          * trigger IN_MODIFY by truncating the journal file to its
998          * current size which triggers IN_MODIFY. */
999
1000         __sync_synchronize();
1001
1002         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1003                 log_error("Failed to to truncate file to its own size: %m");
1004 }
1005
1006 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1007         unsigned i;
1008         EntryItem *items;
1009         int r;
1010         uint64_t xor_hash = 0;
1011         struct dual_timestamp _ts;
1012
1013         assert(f);
1014         assert(iovec || n_iovec == 0);
1015
1016         if (!f->writable)
1017                 return -EPERM;
1018
1019         if (!ts) {
1020                 dual_timestamp_get(&_ts);
1021                 ts = &_ts;
1022         }
1023
1024         if (f->tail_entry_monotonic_valid &&
1025             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1026                 return -EINVAL;
1027
1028         items = alloca(sizeof(EntryItem) * n_iovec);
1029
1030         for (i = 0; i < n_iovec; i++) {
1031                 uint64_t p;
1032                 Object *o;
1033
1034                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1035                 if (r < 0)
1036                         return r;
1037
1038                 xor_hash ^= le64toh(o->data.hash);
1039                 items[i].object_offset = htole64(p);
1040                 items[i].hash = o->data.hash;
1041         }
1042
1043         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1044
1045         journal_file_post_change(f);
1046
1047         return r;
1048 }
1049
1050 static int generic_array_get(JournalFile *f,
1051                              uint64_t first,
1052                              uint64_t i,
1053                              Object **ret, uint64_t *offset) {
1054
1055         Object *o;
1056         uint64_t p = 0, a;
1057         int r;
1058
1059         assert(f);
1060
1061         a = first;
1062         while (a > 0) {
1063                 uint64_t n;
1064
1065                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1066                 if (r < 0)
1067                         return r;
1068
1069                 n = journal_file_entry_array_n_items(o);
1070                 if (i < n) {
1071                         p = le64toh(o->entry_array.items[i]);
1072                         break;
1073                 }
1074
1075                 i -= n;
1076                 a = le64toh(o->entry_array.next_entry_array_offset);
1077         }
1078
1079         if (a <= 0 || p <= 0)
1080                 return 0;
1081
1082         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1083         if (r < 0)
1084                 return r;
1085
1086         if (ret)
1087                 *ret = o;
1088
1089         if (offset)
1090                 *offset = p;
1091
1092         return 1;
1093 }
1094
1095 static int generic_array_get_plus_one(JournalFile *f,
1096                                       uint64_t extra,
1097                                       uint64_t first,
1098                                       uint64_t i,
1099                                       Object **ret, uint64_t *offset) {
1100
1101         Object *o;
1102
1103         assert(f);
1104
1105         if (i == 0) {
1106                 int r;
1107
1108                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1109                 if (r < 0)
1110                         return r;
1111
1112                 if (ret)
1113                         *ret = o;
1114
1115                 if (offset)
1116                         *offset = extra;
1117
1118                 return 1;
1119         }
1120
1121         return generic_array_get(f, first, i-1, ret, offset);
1122 }
1123
/* Results of the test_object callbacks used when bisecting */
enum {
        TEST_FOUND = 0,  /* the tested object matches the needle */
        TEST_LEFT  = 1,  /* the tested object sorts before the needle */
        TEST_RIGHT = 2   /* the tested object sorts after the needle */
};
1129
1130 static int generic_array_bisect(JournalFile *f,
1131                                 uint64_t first,
1132                                 uint64_t n,
1133                                 uint64_t needle,
1134                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1135                                 direction_t direction,
1136                                 Object **ret,
1137                                 uint64_t *offset,
1138                                 uint64_t *idx) {
1139
1140         uint64_t a, p, t = 0, i = 0, last_p = 0;
1141         bool subtract_one = false;
1142         Object *o, *array = NULL;
1143         int r;
1144
1145         assert(f);
1146         assert(test_object);
1147
1148         a = first;
1149         while (a > 0) {
1150                 uint64_t left, right, k, lp;
1151
1152                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1153                 if (r < 0)
1154                         return r;
1155
1156                 k = journal_file_entry_array_n_items(array);
1157                 right = MIN(k, n);
1158                 if (right <= 0)
1159                         return 0;
1160
1161                 i = right - 1;
1162                 lp = p = le64toh(array->entry_array.items[i]);
1163                 if (p <= 0)
1164                         return -EBADMSG;
1165
1166                 r = test_object(f, p, needle);
1167                 if (r < 0)
1168                         return r;
1169
1170                 if (r == TEST_FOUND)
1171                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1172
1173                 if (r == TEST_RIGHT) {
1174                         left = 0;
1175                         right -= 1;
1176                         for (;;) {
1177                                 if (left == right) {
1178                                         if (direction == DIRECTION_UP)
1179                                                 subtract_one = true;
1180
1181                                         i = left;
1182                                         goto found;
1183                                 }
1184
1185                                 assert(left < right);
1186
1187                                 i = (left + right) / 2;
1188                                 p = le64toh(array->entry_array.items[i]);
1189                                 if (p <= 0)
1190                                         return -EBADMSG;
1191
1192                                 r = test_object(f, p, needle);
1193                                 if (r < 0)
1194                                         return r;
1195
1196                                 if (r == TEST_FOUND)
1197                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1198
1199                                 if (r == TEST_RIGHT)
1200                                         right = i;
1201                                 else
1202                                         left = i + 1;
1203                         }
1204                 }
1205
1206                 if (k > n)
1207                         return 0;
1208
1209                 last_p = lp;
1210
1211                 n -= k;
1212                 t += k;
1213                 a = le64toh(array->entry_array.next_entry_array_offset);
1214         }
1215
1216         return 0;
1217
1218 found:
1219         if (subtract_one && t == 0 && i == 0)
1220                 return 0;
1221
1222         if (subtract_one && i == 0)
1223                 p = last_p;
1224         else if (subtract_one)
1225                 p = le64toh(array->entry_array.items[i-1]);
1226         else
1227                 p = le64toh(array->entry_array.items[i]);
1228
1229         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1230         if (r < 0)
1231                 return r;
1232
1233         if (ret)
1234                 *ret = o;
1235
1236         if (offset)
1237                 *offset = p;
1238
1239         if (idx)
1240                 *idx = t + i - (subtract_one ? 1 : 0);
1241
1242         return 1;
1243 }
1244
1245 static int generic_array_bisect_plus_one(JournalFile *f,
1246                                          uint64_t extra,
1247                                          uint64_t first,
1248                                          uint64_t n,
1249                                          uint64_t needle,
1250                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1251                                          direction_t direction,
1252                                          Object **ret,
1253                                          uint64_t *offset,
1254                                          uint64_t *idx) {
1255
1256         int r;
1257
1258         assert(f);
1259         assert(test_object);
1260
1261         if (n <= 0)
1262                 return 0;
1263
1264         /* This bisects the array in object 'first', but first checks
1265          * an extra  */
1266         r = test_object(f, extra, needle);
1267         if (r < 0)
1268                 return r;
1269         else if (r == TEST_FOUND) {
1270                 Object *o;
1271
1272                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1273                 if (r < 0)
1274                         return r;
1275
1276                 if (ret)
1277                         *ret = o;
1278
1279                 if (offset)
1280                         *offset = extra;
1281
1282                 if (idx)
1283                         *idx = 0;
1284
1285                 return 1;
1286         } else if (r == TEST_RIGHT)
1287                 return 0;
1288
1289         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1290
1291         if (r > 0)
1292                 (*idx) ++;
1293
1294         return r;
1295 }
1296
1297 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1298         Object *o;
1299         int r;
1300
1301         assert(f);
1302         assert(p > 0);
1303
1304         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1305         if (r < 0)
1306                 return r;
1307
1308         if (le64toh(o->entry.seqnum) == needle)
1309                 return TEST_FOUND;
1310         else if (le64toh(o->entry.seqnum) < needle)
1311                 return TEST_LEFT;
1312         else
1313                 return TEST_RIGHT;
1314 }
1315
1316 int journal_file_move_to_entry_by_seqnum(
1317                 JournalFile *f,
1318                 uint64_t seqnum,
1319                 direction_t direction,
1320                 Object **ret,
1321                 uint64_t *offset) {
1322
1323         return generic_array_bisect(f,
1324                                     le64toh(f->header->entry_array_offset),
1325                                     le64toh(f->header->n_entries),
1326                                     seqnum,
1327                                     test_object_seqnum,
1328                                     direction,
1329                                     ret, offset, NULL);
1330 }
1331
1332 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1333         Object *o;
1334         int r;
1335
1336         assert(f);
1337         assert(p > 0);
1338
1339         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1340         if (r < 0)
1341                 return r;
1342
1343         if (le64toh(o->entry.realtime) == needle)
1344                 return TEST_FOUND;
1345         else if (le64toh(o->entry.realtime) < needle)
1346                 return TEST_LEFT;
1347         else
1348                 return TEST_RIGHT;
1349 }
1350
1351 int journal_file_move_to_entry_by_realtime(
1352                 JournalFile *f,
1353                 uint64_t realtime,
1354                 direction_t direction,
1355                 Object **ret,
1356                 uint64_t *offset) {
1357
1358         return generic_array_bisect(f,
1359                                     le64toh(f->header->entry_array_offset),
1360                                     le64toh(f->header->n_entries),
1361                                     realtime,
1362                                     test_object_realtime,
1363                                     direction,
1364                                     ret, offset, NULL);
1365 }
1366
1367 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1368         Object *o;
1369         int r;
1370
1371         assert(f);
1372         assert(p > 0);
1373
1374         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1375         if (r < 0)
1376                 return r;
1377
1378         if (le64toh(o->entry.monotonic) == needle)
1379                 return TEST_FOUND;
1380         else if (le64toh(o->entry.monotonic) < needle)
1381                 return TEST_LEFT;
1382         else
1383                 return TEST_RIGHT;
1384 }
1385
1386 int journal_file_move_to_entry_by_monotonic(
1387                 JournalFile *f,
1388                 sd_id128_t boot_id,
1389                 uint64_t monotonic,
1390                 direction_t direction,
1391                 Object **ret,
1392                 uint64_t *offset) {
1393
1394         char t[8+32+1] = "_BOOT_ID=";
1395         Object *o;
1396         int r;
1397
1398         sd_id128_to_string(boot_id, t + 8);
1399
1400         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1401         if (r < 0)
1402                 return r;
1403         else if (r == 0)
1404                 return -ENOENT;
1405
1406         return generic_array_bisect_plus_one(f,
1407                                              le64toh(o->data.entry_offset),
1408                                              le64toh(o->data.entry_array_offset),
1409                                              le64toh(o->data.n_entries),
1410                                              monotonic,
1411                                              test_object_monotonic,
1412                                              direction,
1413                                              ret, offset, NULL);
1414 }
1415
1416 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1417         assert(f);
1418         assert(p > 0);
1419
1420         if (p == needle)
1421                 return TEST_FOUND;
1422         else if (p < needle)
1423                 return TEST_LEFT;
1424         else
1425                 return TEST_RIGHT;
1426 }
1427
1428 int journal_file_next_entry(
1429                 JournalFile *f,
1430                 Object *o, uint64_t p,
1431                 direction_t direction,
1432                 Object **ret, uint64_t *offset) {
1433
1434         uint64_t i, n;
1435         int r;
1436
1437         assert(f);
1438         assert(p > 0 || !o);
1439
1440         n = le64toh(f->header->n_entries);
1441         if (n <= 0)
1442                 return 0;
1443
1444         if (!o)
1445                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1446         else {
1447                 if (o->object.type != OBJECT_ENTRY)
1448                         return -EINVAL;
1449
1450                 r = generic_array_bisect(f,
1451                                          le64toh(f->header->entry_array_offset),
1452                                          le64toh(f->header->n_entries),
1453                                          p,
1454                                          test_object_offset,
1455                                          DIRECTION_DOWN,
1456                                          NULL, NULL,
1457                                          &i);
1458                 if (r <= 0)
1459                         return r;
1460
1461                 if (direction == DIRECTION_DOWN) {
1462                         if (i >= n - 1)
1463                                 return 0;
1464
1465                         i++;
1466                 } else {
1467                         if (i <= 0)
1468                                 return 0;
1469
1470                         i--;
1471                 }
1472         }
1473
1474         /* And jump to it */
1475         return generic_array_get(f,
1476                                  le64toh(f->header->entry_array_offset),
1477                                  i,
1478                                  ret, offset);
1479 }
1480
1481 int journal_file_skip_entry(
1482                 JournalFile *f,
1483                 Object *o, uint64_t p,
1484                 int64_t skip,
1485                 Object **ret, uint64_t *offset) {
1486
1487         uint64_t i, n;
1488         int r;
1489
1490         assert(f);
1491         assert(o);
1492         assert(p > 0);
1493
1494         if (o->object.type != OBJECT_ENTRY)
1495                 return -EINVAL;
1496
1497         r = generic_array_bisect(f,
1498                                  le64toh(f->header->entry_array_offset),
1499                                  le64toh(f->header->n_entries),
1500                                  p,
1501                                  test_object_offset,
1502                                  DIRECTION_DOWN,
1503                                  NULL, NULL,
1504                                  &i);
1505         if (r <= 0)
1506                 return r;
1507
1508         /* Calculate new index */
1509         if (skip < 0) {
1510                 if ((uint64_t) -skip >= i)
1511                         i = 0;
1512                 else
1513                         i = i - (uint64_t) -skip;
1514         } else
1515                 i  += (uint64_t) skip;
1516
1517         n = le64toh(f->header->n_entries);
1518         if (n <= 0)
1519                 return -EBADMSG;
1520
1521         if (i >= n)
1522                 i = n-1;
1523
1524         return generic_array_get(f,
1525                                  le64toh(f->header->entry_array_offset),
1526                                  i,
1527                                  ret, offset);
1528 }
1529
1530 int journal_file_next_entry_for_data(
1531                 JournalFile *f,
1532                 Object *o, uint64_t p,
1533                 uint64_t data_offset,
1534                 direction_t direction,
1535                 Object **ret, uint64_t *offset) {
1536
1537         uint64_t n, i;
1538         int r;
1539         Object *d;
1540
1541         assert(f);
1542         assert(p > 0 || !o);
1543
1544         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1545         if (r < 0)
1546                 return r;
1547
1548         n = le64toh(d->data.n_entries);
1549         if (n <= 0)
1550                 return n;
1551
1552         if (!o)
1553                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1554         else {
1555                 if (o->object.type != OBJECT_ENTRY)
1556                         return -EINVAL;
1557
1558                 r = generic_array_bisect_plus_one(f,
1559                                                   le64toh(d->data.entry_offset),
1560                                                   le64toh(d->data.entry_array_offset),
1561                                                   le64toh(d->data.n_entries),
1562                                                   p,
1563                                                   test_object_offset,
1564                                                   DIRECTION_DOWN,
1565                                                   NULL, NULL,
1566                                                   &i);
1567
1568                 if (r <= 0)
1569                         return r;
1570
1571                 if (direction == DIRECTION_DOWN) {
1572                         if (i >= n - 1)
1573                                 return 0;
1574
1575                         i++;
1576                 } else {
1577                         if (i <= 0)
1578                                 return 0;
1579
1580                         i--;
1581                 }
1582
1583         }
1584
1585         return generic_array_get_plus_one(f,
1586                                           le64toh(d->data.entry_offset),
1587                                           le64toh(d->data.entry_array_offset),
1588                                           i,
1589                                           ret, offset);
1590 }
1591
1592 int journal_file_move_to_entry_by_seqnum_for_data(
1593                 JournalFile *f,
1594                 uint64_t data_offset,
1595                 uint64_t seqnum,
1596                 direction_t direction,
1597                 Object **ret, uint64_t *offset) {
1598
1599         Object *d;
1600         int r;
1601
1602         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1603         if (r <= 0)
1604                 return r;
1605
1606         return generic_array_bisect_plus_one(f,
1607                                              le64toh(d->data.entry_offset),
1608                                              le64toh(d->data.entry_array_offset),
1609                                              le64toh(d->data.n_entries),
1610                                              seqnum,
1611                                              test_object_seqnum,
1612                                              direction,
1613                                              ret, offset, NULL);
1614 }
1615
1616 int journal_file_move_to_entry_by_realtime_for_data(
1617                 JournalFile *f,
1618                 uint64_t data_offset,
1619                 uint64_t realtime,
1620                 direction_t direction,
1621                 Object **ret, uint64_t *offset) {
1622
1623         Object *d;
1624         int r;
1625
1626         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1627         if (r <= 0)
1628                 return r;
1629
1630         return generic_array_bisect_plus_one(f,
1631                                              le64toh(d->data.entry_offset),
1632                                              le64toh(d->data.entry_array_offset),
1633                                              le64toh(d->data.n_entries),
1634                                              realtime,
1635                                              test_object_realtime,
1636                                              direction,
1637                                              ret, offset, NULL);
1638 }
1639
1640 void journal_file_dump(JournalFile *f) {
1641         char a[33], b[33], c[33];
1642         Object *o;
1643         int r;
1644         uint64_t p;
1645
1646         assert(f);
1647
1648         printf("File Path: %s\n"
1649                "File ID: %s\n"
1650                "Machine ID: %s\n"
1651                "Boot ID: %s\n"
1652                "Arena size: %llu\n"
1653                "Objects: %lu\n"
1654                "Entries: %lu\n",
1655                f->path,
1656                sd_id128_to_string(f->header->file_id, a),
1657                sd_id128_to_string(f->header->machine_id, b),
1658                sd_id128_to_string(f->header->boot_id, c),
1659                (unsigned long long) le64toh(f->header->arena_size),
1660                (unsigned long) le64toh(f->header->n_objects),
1661                (unsigned long) le64toh(f->header->n_entries));
1662
1663         p = le64toh(f->header->arena_offset);
1664         while (p != 0) {
1665                 r = journal_file_move_to_object(f, -1, p, &o);
1666                 if (r < 0)
1667                         goto fail;
1668
1669                 switch (o->object.type) {
1670
1671                 case OBJECT_UNUSED:
1672                         printf("Type: OBJECT_UNUSED\n");
1673                         break;
1674
1675                 case OBJECT_DATA:
1676                         printf("Type: OBJECT_DATA\n");
1677                         break;
1678
1679                 case OBJECT_ENTRY:
1680                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1681                                (unsigned long long) le64toh(o->entry.seqnum),
1682                                (unsigned long long) le64toh(o->entry.monotonic),
1683                                (unsigned long long) le64toh(o->entry.realtime));
1684                         break;
1685
1686                 case OBJECT_FIELD_HASH_TABLE:
1687                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1688                         break;
1689
1690                 case OBJECT_DATA_HASH_TABLE:
1691                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1692                         break;
1693
1694                 case OBJECT_ENTRY_ARRAY:
1695                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1696                         break;
1697                 }
1698
1699                 if (o->object.flags & OBJECT_COMPRESSED)
1700                         printf("Flags: COMPRESSED\n");
1701
1702                 if (p == le64toh(f->header->tail_object_offset))
1703                         p = 0;
1704                 else
1705                         p = p + ALIGN64(le64toh(o->object.size));
1706         }
1707
1708         return;
1709 fail:
1710         log_error("File corrupt");
1711 }
1712
1713 int journal_file_open(
1714                 const char *fname,
1715                 int flags,
1716                 mode_t mode,
1717                 JournalFile *template,
1718                 JournalFile **ret) {
1719
1720         JournalFile *f;
1721         int r;
1722         bool newly_created = false;
1723
1724         assert(fname);
1725
1726         if ((flags & O_ACCMODE) != O_RDONLY &&
1727             (flags & O_ACCMODE) != O_RDWR)
1728                 return -EINVAL;
1729
1730         if (!endswith(fname, ".journal"))
1731                 return -EINVAL;
1732
1733         f = new0(JournalFile, 1);
1734         if (!f)
1735                 return -ENOMEM;
1736
1737         f->fd = -1;
1738         f->flags = flags;
1739         f->mode = mode;
1740         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1741         f->prot = prot_from_flags(flags);
1742
1743         if (template) {
1744                 f->metrics = template->metrics;
1745                 f->compress = template->compress;
1746         }
1747
1748         f->path = strdup(fname);
1749         if (!f->path) {
1750                 r = -ENOMEM;
1751                 goto fail;
1752         }
1753
1754         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1755         if (f->fd < 0) {
1756                 r = -errno;
1757                 goto fail;
1758         }
1759
1760         if (fstat(f->fd, &f->last_stat) < 0) {
1761                 r = -errno;
1762                 goto fail;
1763         }
1764
1765         if (f->last_stat.st_size == 0 && f->writable) {
1766                 newly_created = true;
1767
1768                 r = journal_file_init_header(f, template);
1769                 if (r < 0)
1770                         goto fail;
1771
1772                 if (fstat(f->fd, &f->last_stat) < 0) {
1773                         r = -errno;
1774                         goto fail;
1775                 }
1776         }
1777
1778         if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1779                 r = -EIO;
1780                 goto fail;
1781         }
1782
1783         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1784         if (f->header == MAP_FAILED) {
1785                 f->header = NULL;
1786                 r = -errno;
1787                 goto fail;
1788         }
1789
1790         if (!newly_created) {
1791                 r = journal_file_verify_header(f);
1792                 if (r < 0)
1793                         goto fail;
1794         }
1795
1796         if (f->writable) {
1797                 r = journal_file_refresh_header(f);
1798                 if (r < 0)
1799                         goto fail;
1800         }
1801
1802         if (newly_created) {
1803
1804                 r = journal_file_setup_field_hash_table(f);
1805                 if (r < 0)
1806                         goto fail;
1807
1808                 r = journal_file_setup_data_hash_table(f);
1809                 if (r < 0)
1810                         goto fail;
1811         }
1812
1813         r = journal_file_map_field_hash_table(f);
1814         if (r < 0)
1815                 goto fail;
1816
1817         r = journal_file_map_data_hash_table(f);
1818         if (r < 0)
1819                 goto fail;
1820
1821         if (ret)
1822                 *ret = f;
1823
1824         return 0;
1825
1826 fail:
1827         journal_file_close(f);
1828
1829         return r;
1830 }
1831
1832 int journal_file_rotate(JournalFile **f) {
1833         char *p;
1834         size_t l;
1835         JournalFile *old_file, *new_file = NULL;
1836         int r;
1837
1838         assert(f);
1839         assert(*f);
1840
1841         old_file = *f;
1842
1843         if (!old_file->writable)
1844                 return -EINVAL;
1845
1846         if (!endswith(old_file->path, ".journal"))
1847                 return -EINVAL;
1848
1849         l = strlen(old_file->path);
1850
1851         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
1852         if (!p)
1853                 return -ENOMEM;
1854
1855         memcpy(p, old_file->path, l - 8);
1856         p[l-8] = '@';
1857         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1858         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1859                  "-%016llx-%016llx.journal",
1860                  (unsigned long long) le64toh((*f)->header->seqnum),
1861                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1862
1863         r = rename(old_file->path, p);
1864         free(p);
1865
1866         if (r < 0)
1867                 return -errno;
1868
1869         old_file->header->state = STATE_ARCHIVED;
1870
1871         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1872         journal_file_close(old_file);
1873
1874         *f = new_file;
1875         return r;
1876 }
1877
1878 int journal_file_open_reliably(
1879                 const char *fname,
1880                 int flags,
1881                 mode_t mode,
1882                 JournalFile *template,
1883                 JournalFile **ret) {
1884
1885         int r;
1886         size_t l;
1887         char *p;
1888
1889         r = journal_file_open(fname, flags, mode, template, ret);
1890         if (r != -EBADMSG && /* corrupted */
1891             r != -ENODATA && /* truncated */
1892             r != -EHOSTDOWN && /* other machine */
1893             r != -EPROTONOSUPPORT) /* incompatible feature */
1894                 return r;
1895
1896         if ((flags & O_ACCMODE) == O_RDONLY)
1897                 return r;
1898
1899         if (!(flags & O_CREAT))
1900                 return r;
1901
1902         /* The file is corrupted. Rotate it away and try it again (but only once) */
1903
1904         l = strlen(fname);
1905         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
1906                      (int) (l-8), fname,
1907                      (unsigned long long) now(CLOCK_REALTIME),
1908                      random_ull()) < 0)
1909                 return -ENOMEM;
1910
1911         r = rename(fname, p);
1912         free(p);
1913         if (r < 0)
1914                 return -errno;
1915
1916         log_warning("File %s corrupted, renaming and replacing.", fname);
1917
1918         return journal_file_open(fname, flags, mode, template, ret);
1919 }
1920
1921 struct vacuum_info {
1922         off_t usage;
1923         char *filename;
1924
1925         uint64_t realtime;
1926         sd_id128_t seqnum_id;
1927         uint64_t seqnum;
1928
1929         bool have_seqnum;
1930 };
1931
1932 static int vacuum_compare(const void *_a, const void *_b) {
1933         const struct vacuum_info *a, *b;
1934
1935         a = _a;
1936         b = _b;
1937
1938         if (a->have_seqnum && b->have_seqnum &&
1939             sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1940                 if (a->seqnum < b->seqnum)
1941                         return -1;
1942                 else if (a->seqnum > b->seqnum)
1943                         return 1;
1944                 else
1945                         return 0;
1946         }
1947
1948         if (a->realtime < b->realtime)
1949                 return -1;
1950         else if (a->realtime > b->realtime)
1951                 return 1;
1952         else if (a->have_seqnum && b->have_seqnum)
1953                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1954         else
1955                 return strcmp(a->filename, b->filename);
1956 }
1957
1958 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1959         DIR *d;
1960         int r = 0;
1961         struct vacuum_info *list = NULL;
1962         unsigned n_list = 0, n_allocated = 0, i;
1963         uint64_t sum = 0;
1964
1965         assert(directory);
1966
1967         if (max_use <= 0)
1968                 return 0;
1969
1970         d = opendir(directory);
1971         if (!d)
1972                 return -errno;
1973
1974         for (;;) {
1975                 int k;
1976                 struct dirent buf, *de;
1977                 size_t q;
1978                 struct stat st;
1979                 char *p;
1980                 unsigned long long seqnum = 0, realtime;
1981                 sd_id128_t seqnum_id;
1982                 bool have_seqnum;
1983
1984                 k = readdir_r(d, &buf, &de);
1985                 if (k != 0) {
1986                         r = -k;
1987                         goto finish;
1988                 }
1989
1990                 if (!de)
1991                         break;
1992
1993                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
1994                         continue;
1995
1996                 if (!S_ISREG(st.st_mode))
1997                         continue;
1998
1999                 q = strlen(de->d_name);
2000
2001                 if (endswith(de->d_name, ".journal")) {
2002
2003                         /* Vacuum archived files */
2004
2005                         if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2006                                 continue;
2007
2008                         if (de->d_name[q-8-16-1] != '-' ||
2009                             de->d_name[q-8-16-1-16-1] != '-' ||
2010                             de->d_name[q-8-16-1-16-1-32-1] != '@')
2011                                 continue;
2012
2013                         p = strdup(de->d_name);
2014                         if (!p) {
2015                                 r = -ENOMEM;
2016                                 goto finish;
2017                         }
2018
2019                         de->d_name[q-8-16-1-16-1] = 0;
2020                         if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2021                                 free(p);
2022                                 continue;
2023                         }
2024
2025                         if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2026                                 free(p);
2027                                 continue;
2028                         }
2029
2030                         have_seqnum = true;
2031
2032                 } else if (endswith(de->d_name, ".journal~")) {
2033                         unsigned long long tmp;
2034
2035                         /* Vacuum corrupted files */
2036
2037                         if (q < 1 + 16 + 1 + 16 + 8 + 1)
2038                                 continue;
2039
2040                         if (de->d_name[q-1-8-16-1] != '-' ||
2041                             de->d_name[q-1-8-16-1-16-1] != '@')
2042                                 continue;
2043
2044                         p = strdup(de->d_name);
2045                         if (!p) {
2046                                 r = -ENOMEM;
2047                                 goto finish;
2048                         }
2049
2050                         if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2051                                 free(p);
2052                                 continue;
2053                         }
2054
2055                         have_seqnum = false;
2056                 } else
2057                         continue;
2058
2059                 if (n_list >= n_allocated) {
2060                         struct vacuum_info *j;
2061
2062                         n_allocated = MAX(n_allocated * 2U, 8U);
2063                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2064                         if (!j) {
2065                                 free(p);
2066                                 r = -ENOMEM;
2067                                 goto finish;
2068                         }
2069
2070                         list = j;
2071                 }
2072
2073                 list[n_list].filename = p;
2074                 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2075                 list[n_list].seqnum = seqnum;
2076                 list[n_list].realtime = realtime;
2077                 list[n_list].seqnum_id = seqnum_id;
2078                 list[n_list].have_seqnum = have_seqnum;
2079
2080                 sum += list[n_list].usage;
2081
2082                 n_list ++;
2083         }
2084
2085         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2086
2087         for(i = 0; i < n_list; i++) {
2088                 struct statvfs ss;
2089
2090                 if (fstatvfs(dirfd(d), &ss) < 0) {
2091                         r = -errno;
2092                         goto finish;
2093                 }
2094
2095                 if (sum <= max_use &&
2096                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2097                         break;
2098
2099                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2100                         log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2101                         sum -= list[i].usage;
2102                 } else if (errno != ENOENT)
2103                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2104         }
2105
2106 finish:
2107         for (i = 0; i < n_list; i++)
2108                 free(list[i].filename);
2109
2110         free(list);
2111
2112         if (d)
2113                 closedir(d);
2114
2115         return r;
2116 }
2117
2118 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2119         uint64_t i, n;
2120         uint64_t q, xor_hash = 0;
2121         int r;
2122         EntryItem *items;
2123         dual_timestamp ts;
2124
2125         assert(from);
2126         assert(to);
2127         assert(o);
2128         assert(p);
2129
2130         if (!to->writable)
2131                 return -EPERM;
2132
2133         ts.monotonic = le64toh(o->entry.monotonic);
2134         ts.realtime = le64toh(o->entry.realtime);
2135
2136         if (to->tail_entry_monotonic_valid &&
2137             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2138                 return -EINVAL;
2139
2140         if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2141                 return -EINVAL;
2142
2143         n = journal_file_entry_n_items(o);
2144         items = alloca(sizeof(EntryItem) * n);
2145
2146         for (i = 0; i < n; i++) {
2147                 uint64_t l, h;
2148                 le64_t le_hash;
2149                 size_t t;
2150                 void *data;
2151                 Object *u;
2152
2153                 q = le64toh(o->entry.items[i].object_offset);
2154                 le_hash = o->entry.items[i].hash;
2155
2156                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2157                 if (r < 0)
2158                         return r;
2159
2160                 if (le_hash != o->data.hash)
2161                         return -EBADMSG;
2162
2163                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2164                 t = (size_t) l;
2165
2166                 /* We hit the limit on 32bit machines */
2167                 if ((uint64_t) t != l)
2168                         return -E2BIG;
2169
2170                 if (o->object.flags & OBJECT_COMPRESSED) {
2171 #ifdef HAVE_XZ
2172                         uint64_t rsize;
2173
2174                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2175                                 return -EBADMSG;
2176
2177                         data = from->compress_buffer;
2178                         l = rsize;
2179 #else
2180                         return -EPROTONOSUPPORT;
2181 #endif
2182                 } else
2183                         data = o->data.payload;
2184
2185                 r = journal_file_append_data(to, data, l, &u, &h);
2186                 if (r < 0)
2187                         return r;
2188
2189                 xor_hash ^= le64toh(u->data.hash);
2190                 items[i].object_offset = htole64(h);
2191                 items[i].hash = u->data.hash;
2192
2193                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2194                 if (r < 0)
2195                         return r;
2196         }
2197
2198         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2199 }
2200
2201 void journal_default_metrics(JournalMetrics *m, int fd) {
2202         uint64_t fs_size = 0;
2203         struct statvfs ss;
2204         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2205
2206         assert(m);
2207         assert(fd >= 0);
2208
2209         if (fstatvfs(fd, &ss) >= 0)
2210                 fs_size = ss.f_frsize * ss.f_blocks;
2211
2212         if (m->max_use == (uint64_t) -1) {
2213
2214                 if (fs_size > 0) {
2215                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2216
2217                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2218                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2219
2220                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2221                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2222                 } else
2223                         m->max_use = DEFAULT_MAX_USE_LOWER;
2224         } else {
2225                 m->max_use = PAGE_ALIGN(m->max_use);
2226
2227                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2228                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2229         }
2230
2231         if (m->max_size == (uint64_t) -1) {
2232                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2233
2234                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2235                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2236         } else
2237                 m->max_size = PAGE_ALIGN(m->max_size);
2238
2239         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2240                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2241
2242         if (m->max_size*2 > m->max_use)
2243                 m->max_use = m->max_size*2;
2244
2245         if (m->min_size == (uint64_t) -1)
2246                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2247         else {
2248                 m->min_size = PAGE_ALIGN(m->min_size);
2249
2250                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2251                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2252
2253                 if (m->min_size > m->max_size)
2254                         m->max_size = m->min_size;
2255         }
2256
2257         if (m->keep_free == (uint64_t) -1) {
2258
2259                 if (fs_size > 0) {
2260                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2261
2262                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2263                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2264
2265                 } else
2266                         m->keep_free = DEFAULT_KEEP_FREE;
2267         }
2268
2269         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2270                  format_bytes(a, sizeof(a), m->max_use),
2271                  format_bytes(b, sizeof(b), m->max_size),
2272                  format_bytes(c, sizeof(c), m->min_size),
2273                  format_bytes(d, sizeof(d), m->keep_free));
2274 }