chiark / gitweb /
1f82191d286641da2f7af5432fcae47543247005
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
/* Sizes, in bytes, of the hash tables created for new journal files */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL * 16ULL)
#define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL * 16ULL)

/* Size of a mapping window when the caller asked for less than this */
#define DEFAULT_WINDOW_SIZE (8ULL * 1024ULL * 1024ULL)         /* 8 MiB */

/* Payloads smaller than this are never handed to the compressor */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (64ULL * 1024ULL)                /* 64 KiB */

/* Lower and upper bounds applied when max_use is derived from the
 * size of the file system */
#define DEFAULT_MAX_USE_LOWER (1ULL * 1024ULL * 1024ULL)       /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL) /* 4 GiB */

/* Upper bound applied when max_size is derived from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL * 1024ULL * 1024ULL)    /* 128 MiB */

/* Upper bound applied when keep_free is derived from the size of the
 * file system */
#define DEFAULT_KEEP_FREE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL) /* 4 GiB */

/* keep_free value used when the file system size cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1024ULL * 1024ULL)                  /* 1 MiB */

/* The eight magic bytes written to and expected at the very start of
 * the file header (no trailing NUL) */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };

/* Rounds x up to the next multiple of 8 */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
64
65 void journal_file_close(JournalFile *f) {
66         int t;
67
68         assert(f);
69
70         if (f->header) {
71                 if (f->writable)
72                         f->header->state = STATE_OFFLINE;
73
74                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
75         }
76
77         for (t = 0; t < _WINDOW_MAX; t++)
78                 if (f->windows[t].ptr)
79                         munmap(f->windows[t].ptr, f->windows[t].size);
80
81         if (f->fd >= 0)
82                 close_nointr_nofail(f->fd);
83
84         free(f->path);
85
86 #ifdef HAVE_XZ
87         free(f->compress_buffer);
88 #endif
89
90         free(f);
91 }
92
93 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
94         Header h;
95         ssize_t k;
96         int r;
97
98         assert(f);
99
100         zero(h);
101         memcpy(h.signature, signature, 8);
102         h.arena_offset = htole64(ALIGN64(sizeof(h)));
103
104         r = sd_id128_randomize(&h.file_id);
105         if (r < 0)
106                 return r;
107
108         if (template) {
109                 h.seqnum_id = template->header->seqnum_id;
110                 h.seqnum = template->header->seqnum;
111         } else
112                 h.seqnum_id = h.file_id;
113
114         k = pwrite(f->fd, &h, sizeof(h), 0);
115         if (k < 0)
116                 return -errno;
117
118         if (k != sizeof(h))
119                 return -EIO;
120
121         return 0;
122 }
123
124 static int journal_file_refresh_header(JournalFile *f) {
125         int r;
126         sd_id128_t boot_id;
127
128         assert(f);
129
130         r = sd_id128_get_machine(&f->header->machine_id);
131         if (r < 0)
132                 return r;
133
134         r = sd_id128_get_boot(&boot_id);
135         if (r < 0)
136                 return r;
137
138         if (sd_id128_equal(boot_id, f->header->boot_id))
139                 f->tail_entry_monotonic_valid = true;
140
141         f->header->boot_id = boot_id;
142
143         f->header->state = STATE_ONLINE;
144
145         __sync_synchronize();
146
147         return 0;
148 }
149
150 static int journal_file_verify_header(JournalFile *f) {
151         assert(f);
152
153         if (memcmp(f->header, signature, 8))
154                 return -EBADMSG;
155
156 #ifdef HAVE_XZ
157         if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
158                 return -EPROTONOSUPPORT;
159 #else
160         if (f->header->incompatible_flags != 0)
161                 return -EPROTONOSUPPORT;
162 #endif
163
164         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
165                 return -ENODATA;
166
167         if (f->writable) {
168                 uint8_t state;
169                 sd_id128_t machine_id;
170                 int r;
171
172                 r = sd_id128_get_machine(&machine_id);
173                 if (r < 0)
174                         return r;
175
176                 if (!sd_id128_equal(machine_id, f->header->machine_id))
177                         return -EHOSTDOWN;
178
179                 state = f->header->state;
180
181                 if (state == STATE_ONLINE)
182                         log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
183                 else if (state == STATE_ARCHIVED)
184                         return -ESHUTDOWN;
185                 else if (state != STATE_OFFLINE)
186                         log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
187         }
188
189         return 0;
190 }
191
192 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
193         uint64_t old_size, new_size;
194         int r;
195
196         assert(f);
197
198         /* We assume that this file is not sparse, and we know that
199          * for sure, since we always call posix_fallocate()
200          * ourselves */
201
202         old_size =
203                 le64toh(f->header->arena_offset) +
204                 le64toh(f->header->arena_size);
205
206         new_size = PAGE_ALIGN(offset + size);
207         if (new_size < le64toh(f->header->arena_offset))
208                 new_size = le64toh(f->header->arena_offset);
209
210         if (new_size <= old_size)
211                 return 0;
212
213         if (f->metrics.max_size > 0 &&
214             new_size > f->metrics.max_size)
215                 return -E2BIG;
216
217         if (new_size > f->metrics.min_size &&
218             f->metrics.keep_free > 0) {
219                 struct statvfs svfs;
220
221                 if (fstatvfs(f->fd, &svfs) >= 0) {
222                         uint64_t available;
223
224                         available = svfs.f_bfree * svfs.f_bsize;
225
226                         if (available >= f->metrics.keep_free)
227                                 available -= f->metrics.keep_free;
228                         else
229                                 available = 0;
230
231                         if (new_size - old_size > available)
232                                 return -E2BIG;
233                 }
234         }
235
236         /* Note that the glibc fallocate() fallback is very
237            inefficient, hence we try to minimize the allocation area
238            as we can. */
239         r = posix_fallocate(f->fd, old_size, new_size - old_size);
240         if (r != 0)
241                 return -r;
242
243         if (fstat(f->fd, &f->last_stat) < 0)
244                 return -errno;
245
246         f->header->arena_size = htole64(new_size - le64toh(f->header->arena_offset));
247
248         return 0;
249 }
250
251 static int journal_file_map(
252                 JournalFile *f,
253                 uint64_t offset,
254                 uint64_t size,
255                 void **_window,
256                 uint64_t *_woffset,
257                 uint64_t *_wsize,
258                 void **ret) {
259
260         uint64_t woffset, wsize;
261         void *window;
262
263         assert(f);
264         assert(size > 0);
265         assert(ret);
266
267         woffset = offset & ~((uint64_t) page_size() - 1ULL);
268         wsize = size + (offset - woffset);
269         wsize = PAGE_ALIGN(wsize);
270
271         /* Avoid SIGBUS on invalid accesses */
272         if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
273                 return -EADDRNOTAVAIL;
274
275         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
276         if (window == MAP_FAILED)
277                 return -errno;
278
279         if (_window)
280                 *_window = window;
281
282         if (_woffset)
283                 *_woffset = woffset;
284
285         if (_wsize)
286                 *_wsize = wsize;
287
288         *ret = (uint8_t*) window + (offset - woffset);
289
290         return 0;
291 }
292
293 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
294         void *p = NULL;
295         uint64_t delta;
296         int r;
297         Window *w;
298
299         assert(f);
300         assert(ret);
301         assert(wt >= 0);
302         assert(wt < _WINDOW_MAX);
303
304         if (offset + size > (uint64_t) f->last_stat.st_size) {
305                 /* Hmm, out of range? Let's refresh the fstat() data
306                  * first, before we trust that check. */
307
308                 if (fstat(f->fd, &f->last_stat) < 0 ||
309                     offset + size > (uint64_t) f->last_stat.st_size)
310                         return -EADDRNOTAVAIL;
311         }
312
313         w = f->windows + wt;
314
315         if (_likely_(w->ptr &&
316                      w->offset <= offset &&
317                      w->offset + w->size >= offset + size)) {
318
319                 *ret = (uint8_t*) w->ptr + (offset - w->offset);
320                 return 0;
321         }
322
323         if (w->ptr) {
324                 if (munmap(w->ptr, w->size) < 0)
325                         return -errno;
326
327                 w->ptr = NULL;
328                 w->size = w->offset = 0;
329         }
330
331         if (size < DEFAULT_WINDOW_SIZE) {
332                 /* If the default window size is larger then what was
333                  * asked for extend the mapping a bit in the hope to
334                  * minimize needed remappings later on. We add half
335                  * the window space before and half behind the
336                  * requested mapping */
337
338                 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
339
340                 if (delta > offset)
341                         delta = offset;
342
343                 offset -= delta;
344                 size = DEFAULT_WINDOW_SIZE;
345         } else
346                 delta = 0;
347
348         if (offset + size > (uint64_t) f->last_stat.st_size)
349                 size = (uint64_t) f->last_stat.st_size - offset;
350
351         if (size <= 0)
352                 return -EADDRNOTAVAIL;
353
354         r = journal_file_map(f,
355                              offset, size,
356                              &w->ptr, &w->offset, &w->size,
357                              &p);
358
359         if (r < 0)
360                 return r;
361
362         *ret = (uint8_t*) p + delta;
363         return 0;
364 }
365
366 static bool verify_hash(Object *o) {
367         uint64_t h1, h2;
368
369         assert(o);
370
371         if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
372                 h1 = le64toh(o->data.hash);
373                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
374         } else if (o->object.type == OBJECT_FIELD) {
375                 h1 = le64toh(o->field.hash);
376                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
377         } else
378                 return true;
379
380         return h1 == h2;
381 }
382
383 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
384         int r;
385         void *t;
386         Object *o;
387         uint64_t s;
388
389         assert(f);
390         assert(ret);
391         assert(type < _OBJECT_TYPE_MAX);
392
393         r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
394         if (r < 0)
395                 return r;
396
397         o = (Object*) t;
398         s = le64toh(o->object.size);
399
400         if (s < sizeof(ObjectHeader))
401                 return -EBADMSG;
402
403         if (type >= 0 && o->object.type != type)
404                 return -EBADMSG;
405
406         if (s > sizeof(ObjectHeader)) {
407                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
408                 if (r < 0)
409                         return r;
410
411                 o = (Object*) t;
412         }
413
414         if (!verify_hash(o))
415                 return -EBADMSG;
416
417         *ret = o;
418         return 0;
419 }
420
421 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
422         uint64_t r;
423
424         assert(f);
425
426         r = le64toh(f->header->seqnum) + 1;
427
428         if (seqnum) {
429                 /* If an external seqnum counter was passed, we update
430                  * both the local and the external one, and set it to
431                  * the maximum of both */
432
433                 if (*seqnum + 1 > r)
434                         r = *seqnum + 1;
435
436                 *seqnum = r;
437         }
438
439         f->header->seqnum = htole64(r);
440
441         if (f->header->first_seqnum == 0)
442                 f->header->first_seqnum = htole64(r);
443
444         return r;
445 }
446
447 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
448         int r;
449         uint64_t p;
450         Object *tail, *o;
451         void *t;
452
453         assert(f);
454         assert(size >= sizeof(ObjectHeader));
455         assert(offset);
456         assert(ret);
457
458         p = le64toh(f->header->tail_object_offset);
459         if (p == 0)
460                 p = le64toh(f->header->arena_offset);
461         else {
462                 r = journal_file_move_to_object(f, -1, p, &tail);
463                 if (r < 0)
464                         return r;
465
466                 p += ALIGN64(le64toh(tail->object.size));
467         }
468
469         r = journal_file_allocate(f, p, size);
470         if (r < 0)
471                 return r;
472
473         r = journal_file_move_to(f, type, p, size, &t);
474         if (r < 0)
475                 return r;
476
477         o = (Object*) t;
478
479         zero(o->object);
480         o->object.type = type;
481         o->object.size = htole64(size);
482
483         f->header->tail_object_offset = htole64(p);
484         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
485
486         *ret = o;
487         *offset = p;
488
489         return 0;
490 }
491
492 static int journal_file_setup_data_hash_table(JournalFile *f) {
493         uint64_t s, p;
494         Object *o;
495         int r;
496
497         assert(f);
498
499         s = DEFAULT_DATA_HASH_TABLE_SIZE;
500         r = journal_file_append_object(f,
501                                        OBJECT_DATA_HASH_TABLE,
502                                        offsetof(Object, hash_table.items) + s,
503                                        &o, &p);
504         if (r < 0)
505                 return r;
506
507         memset(o->hash_table.items, 0, s);
508
509         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
510         f->header->data_hash_table_size = htole64(s);
511
512         return 0;
513 }
514
515 static int journal_file_setup_field_hash_table(JournalFile *f) {
516         uint64_t s, p;
517         Object *o;
518         int r;
519
520         assert(f);
521
522         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
523         r = journal_file_append_object(f,
524                                        OBJECT_FIELD_HASH_TABLE,
525                                        offsetof(Object, hash_table.items) + s,
526                                        &o, &p);
527         if (r < 0)
528                 return r;
529
530         memset(o->hash_table.items, 0, s);
531
532         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
533         f->header->field_hash_table_size = htole64(s);
534
535         return 0;
536 }
537
538 static int journal_file_map_data_hash_table(JournalFile *f) {
539         uint64_t s, p;
540         void *t;
541         int r;
542
543         assert(f);
544
545         p = le64toh(f->header->data_hash_table_offset);
546         s = le64toh(f->header->data_hash_table_size);
547
548         r = journal_file_move_to(f,
549                                  WINDOW_DATA_HASH_TABLE,
550                                  p, s,
551                                  &t);
552         if (r < 0)
553                 return r;
554
555         f->data_hash_table = t;
556         return 0;
557 }
558
559 static int journal_file_map_field_hash_table(JournalFile *f) {
560         uint64_t s, p;
561         void *t;
562         int r;
563
564         assert(f);
565
566         p = le64toh(f->header->field_hash_table_offset);
567         s = le64toh(f->header->field_hash_table_size);
568
569         r = journal_file_move_to(f,
570                                  WINDOW_FIELD_HASH_TABLE,
571                                  p, s,
572                                  &t);
573         if (r < 0)
574                 return r;
575
576         f->field_hash_table = t;
577         return 0;
578 }
579
580 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
581         uint64_t p, h;
582         int r;
583
584         assert(f);
585         assert(o);
586         assert(offset > 0);
587         assert(o->object.type == OBJECT_DATA);
588
589         /* This might alter the window we are looking at */
590
591         o->data.next_hash_offset = o->data.next_field_offset = 0;
592         o->data.entry_offset = o->data.entry_array_offset = 0;
593         o->data.n_entries = 0;
594
595         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
596         p = le64toh(f->data_hash_table[h].head_hash_offset);
597         if (p == 0) {
598                 /* Only entry in the hash table is easy */
599                 f->data_hash_table[h].head_hash_offset = htole64(offset);
600         } else {
601                 /* Move back to the previous data object, to patch in
602                  * pointer */
603
604                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
605                 if (r < 0)
606                         return r;
607
608                 o->data.next_hash_offset = htole64(offset);
609         }
610
611         f->data_hash_table[h].tail_hash_offset = htole64(offset);
612
613         return 0;
614 }
615
616 int journal_file_find_data_object_with_hash(
617                 JournalFile *f,
618                 const void *data, uint64_t size, uint64_t hash,
619                 Object **ret, uint64_t *offset) {
620
621         uint64_t p, osize, h;
622         int r;
623
624         assert(f);
625         assert(data || size == 0);
626
627         osize = offsetof(Object, data.payload) + size;
628
629         if (f->header->data_hash_table_size == 0)
630                 return -EBADMSG;
631
632         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
633         p = le64toh(f->data_hash_table[h].head_hash_offset);
634
635         while (p > 0) {
636                 Object *o;
637
638                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
639                 if (r < 0)
640                         return r;
641
642                 if (le64toh(o->data.hash) != hash)
643                         goto next;
644
645                 if (o->object.flags & OBJECT_COMPRESSED) {
646 #ifdef HAVE_XZ
647                         uint64_t l, rsize;
648
649                         l = le64toh(o->object.size);
650                         if (l <= offsetof(Object, data.payload))
651                                 return -EBADMSG;
652
653                         l -= offsetof(Object, data.payload);
654
655                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
656                                 return -EBADMSG;
657
658                         if (rsize == size &&
659                             memcmp(f->compress_buffer, data, size) == 0) {
660
661                                 if (ret)
662                                         *ret = o;
663
664                                 if (offset)
665                                         *offset = p;
666
667                                 return 1;
668                         }
669 #else
670                         return -EPROTONOSUPPORT;
671 #endif
672
673                 } else if (le64toh(o->object.size) == osize &&
674                            memcmp(o->data.payload, data, size) == 0) {
675
676                         if (ret)
677                                 *ret = o;
678
679                         if (offset)
680                                 *offset = p;
681
682                         return 1;
683                 }
684
685         next:
686                 p = le64toh(o->data.next_hash_offset);
687         }
688
689         return 0;
690 }
691
692 int journal_file_find_data_object(
693                 JournalFile *f,
694                 const void *data, uint64_t size,
695                 Object **ret, uint64_t *offset) {
696
697         uint64_t hash;
698
699         assert(f);
700         assert(data || size == 0);
701
702         hash = hash64(data, size);
703
704         return journal_file_find_data_object_with_hash(f,
705                                                        data, size, hash,
706                                                        ret, offset);
707 }
708
709 static int journal_file_append_data(
710                 JournalFile *f,
711                 const void *data, uint64_t size,
712                 Object **ret, uint64_t *offset) {
713
714         uint64_t hash, p;
715         uint64_t osize;
716         Object *o;
717         int r;
718         bool compressed = false;
719
720         assert(f);
721         assert(data || size == 0);
722
723         hash = hash64(data, size);
724
725         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
726         if (r < 0)
727                 return r;
728         else if (r > 0) {
729
730                 if (ret)
731                         *ret = o;
732
733                 if (offset)
734                         *offset = p;
735
736                 return 0;
737         }
738
739         osize = offsetof(Object, data.payload) + size;
740         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
741         if (r < 0)
742                 return r;
743
744         o->data.hash = htole64(hash);
745
746 #ifdef HAVE_XZ
747         if (f->compress &&
748             size >= COMPRESSION_SIZE_THRESHOLD) {
749                 uint64_t rsize;
750
751                 compressed = compress_blob(data, size, o->data.payload, &rsize);
752
753                 if (compressed) {
754                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
755                         o->object.flags |= OBJECT_COMPRESSED;
756
757                         f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
758
759                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
760                 }
761         }
762 #endif
763
764         if (!compressed)
765                 memcpy(o->data.payload, data, size);
766
767         r = journal_file_link_data(f, o, p, hash);
768         if (r < 0)
769                 return r;
770
771         /* The linking might have altered the window, so let's
772          * refresh our pointer */
773         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
774         if (r < 0)
775                 return r;
776
777         if (ret)
778                 *ret = o;
779
780         if (offset)
781                 *offset = p;
782
783         return 0;
784 }
785
786 uint64_t journal_file_entry_n_items(Object *o) {
787         assert(o);
788         assert(o->object.type == OBJECT_ENTRY);
789
790         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
791 }
792
793 static uint64_t journal_file_entry_array_n_items(Object *o) {
794         assert(o);
795         assert(o->object.type == OBJECT_ENTRY_ARRAY);
796
797         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
798 }
799
800 static int link_entry_into_array(JournalFile *f,
801                                  le64_t *first,
802                                  le64_t *idx,
803                                  uint64_t p) {
804         int r;
805         uint64_t n = 0, ap = 0, q, i, a, hidx;
806         Object *o;
807
808         assert(f);
809         assert(first);
810         assert(idx);
811         assert(p > 0);
812
813         a = le64toh(*first);
814         i = hidx = le64toh(*idx);
815         while (a > 0) {
816
817                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
818                 if (r < 0)
819                         return r;
820
821                 n = journal_file_entry_array_n_items(o);
822                 if (i < n) {
823                         o->entry_array.items[i] = htole64(p);
824                         *idx = htole64(hidx + 1);
825                         return 0;
826                 }
827
828                 i -= n;
829                 ap = a;
830                 a = le64toh(o->entry_array.next_entry_array_offset);
831         }
832
833         if (hidx > n)
834                 n = (hidx+1) * 2;
835         else
836                 n = n * 2;
837
838         if (n < 4)
839                 n = 4;
840
841         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
842                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
843                                        &o, &q);
844         if (r < 0)
845                 return r;
846
847         o->entry_array.items[i] = htole64(p);
848
849         if (ap == 0)
850                 *first = htole64(q);
851         else {
852                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
853                 if (r < 0)
854                         return r;
855
856                 o->entry_array.next_entry_array_offset = htole64(q);
857         }
858
859         *idx = htole64(hidx + 1);
860
861         return 0;
862 }
863
864 static int link_entry_into_array_plus_one(JournalFile *f,
865                                           le64_t *extra,
866                                           le64_t *first,
867                                           le64_t *idx,
868                                           uint64_t p) {
869
870         int r;
871
872         assert(f);
873         assert(extra);
874         assert(first);
875         assert(idx);
876         assert(p > 0);
877
878         if (*idx == 0)
879                 *extra = htole64(p);
880         else {
881                 le64_t i;
882
883                 i = htole64(le64toh(*idx) - 1);
884                 r = link_entry_into_array(f, first, &i, p);
885                 if (r < 0)
886                         return r;
887         }
888
889         *idx = htole64(le64toh(*idx) + 1);
890         return 0;
891 }
892
893 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
894         uint64_t p;
895         int r;
896         assert(f);
897         assert(o);
898         assert(offset > 0);
899
900         p = le64toh(o->entry.items[i].object_offset);
901         if (p == 0)
902                 return -EINVAL;
903
904         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
905         if (r < 0)
906                 return r;
907
908         return link_entry_into_array_plus_one(f,
909                                               &o->data.entry_offset,
910                                               &o->data.entry_array_offset,
911                                               &o->data.n_entries,
912                                               offset);
913 }
914
915 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
916         uint64_t n, i;
917         int r;
918
919         assert(f);
920         assert(o);
921         assert(offset > 0);
922         assert(o->object.type == OBJECT_ENTRY);
923
924         __sync_synchronize();
925
926         /* Link up the entry itself */
927         r = link_entry_into_array(f,
928                                   &f->header->entry_array_offset,
929                                   &f->header->n_entries,
930                                   offset);
931         if (r < 0)
932                 return r;
933
934         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
935
936         if (f->header->head_entry_realtime == 0)
937                 f->header->head_entry_realtime = o->entry.realtime;
938
939         f->header->tail_entry_realtime = o->entry.realtime;
940         f->header->tail_entry_monotonic = o->entry.monotonic;
941
942         f->tail_entry_monotonic_valid = true;
943
944         /* Link up the items */
945         n = journal_file_entry_n_items(o);
946         for (i = 0; i < n; i++) {
947                 r = journal_file_link_entry_item(f, o, offset, i);
948                 if (r < 0)
949                         return r;
950         }
951
952         return 0;
953 }
954
955 static int journal_file_append_entry_internal(
956                 JournalFile *f,
957                 const dual_timestamp *ts,
958                 uint64_t xor_hash,
959                 const EntryItem items[], unsigned n_items,
960                 uint64_t *seqnum,
961                 Object **ret, uint64_t *offset) {
962         uint64_t np;
963         uint64_t osize;
964         Object *o;
965         int r;
966
967         assert(f);
968         assert(items || n_items == 0);
969         assert(ts);
970
971         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
972
973         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
974         if (r < 0)
975                 return r;
976
977         o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
978         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
979         o->entry.realtime = htole64(ts->realtime);
980         o->entry.monotonic = htole64(ts->monotonic);
981         o->entry.xor_hash = htole64(xor_hash);
982         o->entry.boot_id = f->header->boot_id;
983
984         r = journal_file_link_entry(f, o, np);
985         if (r < 0)
986                 return r;
987
988         if (ret)
989                 *ret = o;
990
991         if (offset)
992                 *offset = np;
993
994         return 0;
995 }
996
997 void journal_file_post_change(JournalFile *f) {
998         assert(f);
999
1000         /* inotify() does not receive IN_MODIFY events from file
1001          * accesses done via mmap(). After each access we hence
1002          * trigger IN_MODIFY by truncating the journal file to its
1003          * current size which triggers IN_MODIFY. */
1004
1005         __sync_synchronize();
1006
1007         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1008                 log_error("Failed to to truncate file to its own size: %m");
1009 }
1010
1011 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1012         unsigned i;
1013         EntryItem *items;
1014         int r;
1015         uint64_t xor_hash = 0;
1016         struct dual_timestamp _ts;
1017
1018         assert(f);
1019         assert(iovec || n_iovec == 0);
1020
1021         if (!f->writable)
1022                 return -EPERM;
1023
1024         if (!ts) {
1025                 dual_timestamp_get(&_ts);
1026                 ts = &_ts;
1027         }
1028
1029         if (f->tail_entry_monotonic_valid &&
1030             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1031                 return -EINVAL;
1032
1033         items = alloca(sizeof(EntryItem) * n_iovec);
1034
1035         for (i = 0; i < n_iovec; i++) {
1036                 uint64_t p;
1037                 Object *o;
1038
1039                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1040                 if (r < 0)
1041                         return r;
1042
1043                 xor_hash ^= le64toh(o->data.hash);
1044                 items[i].object_offset = htole64(p);
1045                 items[i].hash = o->data.hash;
1046         }
1047
1048         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1049
1050         journal_file_post_change(f);
1051
1052         return r;
1053 }
1054
1055 static int generic_array_get(JournalFile *f,
1056                              uint64_t first,
1057                              uint64_t i,
1058                              Object **ret, uint64_t *offset) {
1059
1060         Object *o;
1061         uint64_t p = 0, a;
1062         int r;
1063
1064         assert(f);
1065
1066         a = first;
1067         while (a > 0) {
1068                 uint64_t n;
1069
1070                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1071                 if (r < 0)
1072                         return r;
1073
1074                 n = journal_file_entry_array_n_items(o);
1075                 if (i < n) {
1076                         p = le64toh(o->entry_array.items[i]);
1077                         break;
1078                 }
1079
1080                 i -= n;
1081                 a = le64toh(o->entry_array.next_entry_array_offset);
1082         }
1083
1084         if (a <= 0 || p <= 0)
1085                 return 0;
1086
1087         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1088         if (r < 0)
1089                 return r;
1090
1091         if (ret)
1092                 *ret = o;
1093
1094         if (offset)
1095                 *offset = p;
1096
1097         return 1;
1098 }
1099
1100 static int generic_array_get_plus_one(JournalFile *f,
1101                                       uint64_t extra,
1102                                       uint64_t first,
1103                                       uint64_t i,
1104                                       Object **ret, uint64_t *offset) {
1105
1106         Object *o;
1107
1108         assert(f);
1109
1110         if (i == 0) {
1111                 int r;
1112
1113                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1114                 if (r < 0)
1115                         return r;
1116
1117                 if (ret)
1118                         *ret = o;
1119
1120                 if (offset)
1121                         *offset = extra;
1122
1123                 return 1;
1124         }
1125
1126         return generic_array_get(f, first, i-1, ret, offset);
1127 }
1128
/* Verdicts returned by the test_object callbacks that drive the
 * bisection helpers below (negative values are errors). */
1129 enum {
1130         TEST_FOUND,     /* the tested entry matches the needle */
1131         TEST_LEFT,      /* the tested entry compares lower than the needle */
1132         TEST_RIGHT      /* the tested entry compares higher than the needle */
1133 };
1134
/* Searches a chain of entry array objects for 'needle'.
 *
 * first        offset of the first OBJECT_ENTRY_ARRAY of the chain
 * n            number of items to consider across the whole chain
 * needle       value handed to test_object for every probed item
 * test_object  callback invoked with an item's entry offset and the
 *              needle; returns TEST_FOUND, TEST_LEFT, TEST_RIGHT or a
 *              negative error
 * direction    a TEST_FOUND verdict is treated as TEST_RIGHT for
 *              DIRECTION_DOWN and as TEST_LEFT otherwise
 * ret, offset, idx  optional outputs: the entry object, its offset, and
 *              its index counted over the whole chain
 *
 * Each array is first probed at its last considered item; only if that
 * probe yields TEST_RIGHT the array is bisected, otherwise the search
 * moves on to the next array of the chain.
 *
 * Returns 1 if an entry was selected, 0 if not, negative on error. */
1135 static int generic_array_bisect(JournalFile *f,
1136                                 uint64_t first,
1137                                 uint64_t n,
1138                                 uint64_t needle,
1139                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1140                                 direction_t direction,
1141                                 Object **ret,
1142                                 uint64_t *offset,
1143                                 uint64_t *idx) {
1144
     /* a: offset of the current array, p: offset of the item probed last,
      * t: number of items in the arrays already passed, i: index within
      * the current array, last_p: last probed item of the previous array */
1145         uint64_t a, p, t = 0, i = 0, last_p = 0;
1146         bool subtract_one = false;
1147         Object *o, *array = NULL;
1148         int r;
1149
1150         assert(f);
1151         assert(test_object);
1152
1153         a = first;
1154         while (a > 0) {
1155                 uint64_t left, right, k, lp;
1156
1157                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1158                 if (r < 0)
1159                         return r;
1160
             /* Look at no more than the n items that are still left */
1161                 k = journal_file_entry_array_n_items(array);
1162                 right = MIN(k, n);
1163                 if (right <= 0)
1164                         return 0;
1165
             /* Probe the last considered item of this array first */
1166                 i = right - 1;
1167                 lp = p = le64toh(array->entry_array.items[i]);
1168                 if (p <= 0)
1169                         return -EBADMSG;
1170
1171                 r = test_object(f, p, needle);
1172                 if (r < 0)
1173                         return r;
1174
             /* An exact match is folded into one of the two other
              * verdicts, depending on the search direction */
1175                 if (r == TEST_FOUND)
1176                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1177
1178                 if (r == TEST_RIGHT) {
                     /* Bisect this array. The item at index 'right' has
                      * always tested TEST_RIGHT; 'left' is moved past
                      * every item that tested otherwise. */
1179                         left = 0;
1180                         right -= 1;
1181                         for (;;) {
1182                                 if (left == right) {
                                     /* Converged on the first item that
                                      * tests TEST_RIGHT. When searching
                                      * upwards the item before it is
                                      * selected instead. */
1183                                         if (direction == DIRECTION_UP)
1184                                                 subtract_one = true;
1185
1186                                         i = left;
1187                                         goto found;
1188                                 }
1189
1190                                 assert(left < right);
1191
1192                                 i = (left + right) / 2;
1193                                 p = le64toh(array->entry_array.items[i]);
1194                                 if (p <= 0)
1195                                         return -EBADMSG;
1196
1197                                 r = test_object(f, p, needle);
1198                                 if (r < 0)
1199                                         return r;
1200
1201                                 if (r == TEST_FOUND)
1202                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1203
1204                                 if (r == TEST_RIGHT)
1205                                         right = i;
1206                                 else
1207                                         left = i + 1;
1208                         }
1209                 }
1210
             /* The array has more slots than items were left to
              * consider, so the last considered item was just probed
              * and did not test TEST_RIGHT: give up */
1211                 if (k > n)
1212                         return 0;
1213
             /* Remember this array's last item in case the next array
              * needs to step back across the array boundary */
1214                 last_p = lp;
1215
1216                 n -= k;
1217                 t += k;
1218                 a = le64toh(array->entry_array.next_entry_array_offset);
1219         }
1220
1221         return 0;
1222
1223 found:
     /* Stepping back from the very first item: nothing precedes it */
1224         if (subtract_one && t == 0 && i == 0)
1225                 return 0;
1226
     /* Stepping back from the first item of a later array selects the
      * last item of the previous array */
1227         if (subtract_one && i == 0)
1228                 p = last_p;
1229         else if (subtract_one)
1230                 p = le64toh(array->entry_array.items[i-1]);
1231         else
1232                 p = le64toh(array->entry_array.items[i]);
1233
1234         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1235         if (r < 0)
1236                 return r;
1237
1238         if (ret)
1239                 *ret = o;
1240
1241         if (offset)
1242                 *offset = p;
1243
     /* Index over the whole chain: items passed plus index in this array */
1244         if (idx)
1245                 *idx = t + i - (subtract_one ? 1 : 0);
1246
1247         return 1;
1248 }
1249
1250 static int generic_array_bisect_plus_one(JournalFile *f,
1251                                          uint64_t extra,
1252                                          uint64_t first,
1253                                          uint64_t n,
1254                                          uint64_t needle,
1255                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1256                                          direction_t direction,
1257                                          Object **ret,
1258                                          uint64_t *offset,
1259                                          uint64_t *idx) {
1260
1261         int r;
1262
1263         assert(f);
1264         assert(test_object);
1265
1266         if (n <= 0)
1267                 return 0;
1268
1269         /* This bisects the array in object 'first', but first checks
1270          * an extra  */
1271         r = test_object(f, extra, needle);
1272         if (r < 0)
1273                 return r;
1274         else if (r == TEST_FOUND) {
1275                 Object *o;
1276
1277                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1278                 if (r < 0)
1279                         return r;
1280
1281                 if (ret)
1282                         *ret = o;
1283
1284                 if (offset)
1285                         *offset = extra;
1286
1287                 if (idx)
1288                         *idx = 0;
1289
1290                 return 1;
1291         } else if (r == TEST_RIGHT)
1292                 return 0;
1293
1294         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1295
1296         if (r > 0)
1297                 (*idx) ++;
1298
1299         return r;
1300 }
1301
1302 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1303         Object *o;
1304         int r;
1305
1306         assert(f);
1307         assert(p > 0);
1308
1309         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1310         if (r < 0)
1311                 return r;
1312
1313         if (le64toh(o->entry.seqnum) == needle)
1314                 return TEST_FOUND;
1315         else if (le64toh(o->entry.seqnum) < needle)
1316                 return TEST_LEFT;
1317         else
1318                 return TEST_RIGHT;
1319 }
1320
1321 int journal_file_move_to_entry_by_seqnum(
1322                 JournalFile *f,
1323                 uint64_t seqnum,
1324                 direction_t direction,
1325                 Object **ret,
1326                 uint64_t *offset) {
1327
1328         return generic_array_bisect(f,
1329                                     le64toh(f->header->entry_array_offset),
1330                                     le64toh(f->header->n_entries),
1331                                     seqnum,
1332                                     test_object_seqnum,
1333                                     direction,
1334                                     ret, offset, NULL);
1335 }
1336
1337 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1338         Object *o;
1339         int r;
1340
1341         assert(f);
1342         assert(p > 0);
1343
1344         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1345         if (r < 0)
1346                 return r;
1347
1348         if (le64toh(o->entry.realtime) == needle)
1349                 return TEST_FOUND;
1350         else if (le64toh(o->entry.realtime) < needle)
1351                 return TEST_LEFT;
1352         else
1353                 return TEST_RIGHT;
1354 }
1355
1356 int journal_file_move_to_entry_by_realtime(
1357                 JournalFile *f,
1358                 uint64_t realtime,
1359                 direction_t direction,
1360                 Object **ret,
1361                 uint64_t *offset) {
1362
1363         return generic_array_bisect(f,
1364                                     le64toh(f->header->entry_array_offset),
1365                                     le64toh(f->header->n_entries),
1366                                     realtime,
1367                                     test_object_realtime,
1368                                     direction,
1369                                     ret, offset, NULL);
1370 }
1371
1372 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1373         Object *o;
1374         int r;
1375
1376         assert(f);
1377         assert(p > 0);
1378
1379         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1380         if (r < 0)
1381                 return r;
1382
1383         if (le64toh(o->entry.monotonic) == needle)
1384                 return TEST_FOUND;
1385         else if (le64toh(o->entry.monotonic) < needle)
1386                 return TEST_LEFT;
1387         else
1388                 return TEST_RIGHT;
1389 }
1390
1391 int journal_file_move_to_entry_by_monotonic(
1392                 JournalFile *f,
1393                 sd_id128_t boot_id,
1394                 uint64_t monotonic,
1395                 direction_t direction,
1396                 Object **ret,
1397                 uint64_t *offset) {
1398
1399         char t[8+32+1] = "_BOOT_ID=";
1400         Object *o;
1401         int r;
1402
1403         sd_id128_to_string(boot_id, t + 8);
1404
1405         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1406         if (r < 0)
1407                 return r;
1408         else if (r == 0)
1409                 return -ENOENT;
1410
1411         return generic_array_bisect_plus_one(f,
1412                                              le64toh(o->data.entry_offset),
1413                                              le64toh(o->data.entry_array_offset),
1414                                              le64toh(o->data.n_entries),
1415                                              monotonic,
1416                                              test_object_monotonic,
1417                                              direction,
1418                                              ret, offset, NULL);
1419 }
1420
1421 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1422         assert(f);
1423         assert(p > 0);
1424
1425         if (p == needle)
1426                 return TEST_FOUND;
1427         else if (p < needle)
1428                 return TEST_LEFT;
1429         else
1430                 return TEST_RIGHT;
1431 }
1432
1433 int journal_file_next_entry(
1434                 JournalFile *f,
1435                 Object *o, uint64_t p,
1436                 direction_t direction,
1437                 Object **ret, uint64_t *offset) {
1438
1439         uint64_t i, n;
1440         int r;
1441
1442         assert(f);
1443         assert(p > 0 || !o);
1444
1445         n = le64toh(f->header->n_entries);
1446         if (n <= 0)
1447                 return 0;
1448
1449         if (!o)
1450                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1451         else {
1452                 if (o->object.type != OBJECT_ENTRY)
1453                         return -EINVAL;
1454
1455                 r = generic_array_bisect(f,
1456                                          le64toh(f->header->entry_array_offset),
1457                                          le64toh(f->header->n_entries),
1458                                          p,
1459                                          test_object_offset,
1460                                          DIRECTION_DOWN,
1461                                          NULL, NULL,
1462                                          &i);
1463                 if (r <= 0)
1464                         return r;
1465
1466                 if (direction == DIRECTION_DOWN) {
1467                         if (i >= n - 1)
1468                                 return 0;
1469
1470                         i++;
1471                 } else {
1472                         if (i <= 0)
1473                                 return 0;
1474
1475                         i--;
1476                 }
1477         }
1478
1479         /* And jump to it */
1480         return generic_array_get(f,
1481                                  le64toh(f->header->entry_array_offset),
1482                                  i,
1483                                  ret, offset);
1484 }
1485
1486 int journal_file_skip_entry(
1487                 JournalFile *f,
1488                 Object *o, uint64_t p,
1489                 int64_t skip,
1490                 Object **ret, uint64_t *offset) {
1491
1492         uint64_t i, n;
1493         int r;
1494
1495         assert(f);
1496         assert(o);
1497         assert(p > 0);
1498
1499         if (o->object.type != OBJECT_ENTRY)
1500                 return -EINVAL;
1501
1502         r = generic_array_bisect(f,
1503                                  le64toh(f->header->entry_array_offset),
1504                                  le64toh(f->header->n_entries),
1505                                  p,
1506                                  test_object_offset,
1507                                  DIRECTION_DOWN,
1508                                  NULL, NULL,
1509                                  &i);
1510         if (r <= 0)
1511                 return r;
1512
1513         /* Calculate new index */
1514         if (skip < 0) {
1515                 if ((uint64_t) -skip >= i)
1516                         i = 0;
1517                 else
1518                         i = i - (uint64_t) -skip;
1519         } else
1520                 i  += (uint64_t) skip;
1521
1522         n = le64toh(f->header->n_entries);
1523         if (n <= 0)
1524                 return -EBADMSG;
1525
1526         if (i >= n)
1527                 i = n-1;
1528
1529         return generic_array_get(f,
1530                                  le64toh(f->header->entry_array_offset),
1531                                  i,
1532                                  ret, offset);
1533 }
1534
1535 int journal_file_next_entry_for_data(
1536                 JournalFile *f,
1537                 Object *o, uint64_t p,
1538                 uint64_t data_offset,
1539                 direction_t direction,
1540                 Object **ret, uint64_t *offset) {
1541
1542         uint64_t n, i;
1543         int r;
1544         Object *d;
1545
1546         assert(f);
1547         assert(p > 0 || !o);
1548
1549         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1550         if (r < 0)
1551                 return r;
1552
1553         n = le64toh(d->data.n_entries);
1554         if (n <= 0)
1555                 return n;
1556
1557         if (!o)
1558                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1559         else {
1560                 if (o->object.type != OBJECT_ENTRY)
1561                         return -EINVAL;
1562
1563                 r = generic_array_bisect_plus_one(f,
1564                                                   le64toh(d->data.entry_offset),
1565                                                   le64toh(d->data.entry_array_offset),
1566                                                   le64toh(d->data.n_entries),
1567                                                   p,
1568                                                   test_object_offset,
1569                                                   DIRECTION_DOWN,
1570                                                   NULL, NULL,
1571                                                   &i);
1572
1573                 if (r <= 0)
1574                         return r;
1575
1576                 if (direction == DIRECTION_DOWN) {
1577                         if (i >= n - 1)
1578                                 return 0;
1579
1580                         i++;
1581                 } else {
1582                         if (i <= 0)
1583                                 return 0;
1584
1585                         i--;
1586                 }
1587
1588         }
1589
1590         return generic_array_get_plus_one(f,
1591                                           le64toh(d->data.entry_offset),
1592                                           le64toh(d->data.entry_array_offset),
1593                                           i,
1594                                           ret, offset);
1595 }
1596
1597 int journal_file_move_to_entry_by_seqnum_for_data(
1598                 JournalFile *f,
1599                 uint64_t data_offset,
1600                 uint64_t seqnum,
1601                 direction_t direction,
1602                 Object **ret, uint64_t *offset) {
1603
1604         Object *d;
1605         int r;
1606
1607         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1608         if (r <= 0)
1609                 return r;
1610
1611         return generic_array_bisect_plus_one(f,
1612                                              le64toh(d->data.entry_offset),
1613                                              le64toh(d->data.entry_array_offset),
1614                                              le64toh(d->data.n_entries),
1615                                              seqnum,
1616                                              test_object_seqnum,
1617                                              direction,
1618                                              ret, offset, NULL);
1619 }
1620
1621 int journal_file_move_to_entry_by_realtime_for_data(
1622                 JournalFile *f,
1623                 uint64_t data_offset,
1624                 uint64_t realtime,
1625                 direction_t direction,
1626                 Object **ret, uint64_t *offset) {
1627
1628         Object *d;
1629         int r;
1630
1631         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1632         if (r <= 0)
1633                 return r;
1634
1635         return generic_array_bisect_plus_one(f,
1636                                              le64toh(d->data.entry_offset),
1637                                              le64toh(d->data.entry_array_offset),
1638                                              le64toh(d->data.n_entries),
1639                                              realtime,
1640                                              test_object_realtime,
1641                                              direction,
1642                                              ret, offset, NULL);
1643 }
1644
1645 void journal_file_dump(JournalFile *f) {
1646         char a[33], b[33], c[33];
1647         Object *o;
1648         int r;
1649         uint64_t p;
1650
1651         assert(f);
1652
1653         printf("File Path: %s\n"
1654                "File ID: %s\n"
1655                "Machine ID: %s\n"
1656                "Boot ID: %s\n"
1657                "Arena size: %llu\n"
1658                "Objects: %lu\n"
1659                "Entries: %lu\n",
1660                f->path,
1661                sd_id128_to_string(f->header->file_id, a),
1662                sd_id128_to_string(f->header->machine_id, b),
1663                sd_id128_to_string(f->header->boot_id, c),
1664                (unsigned long long) le64toh(f->header->arena_size),
1665                (unsigned long) le64toh(f->header->n_objects),
1666                (unsigned long) le64toh(f->header->n_entries));
1667
1668         p = le64toh(f->header->arena_offset);
1669         while (p != 0) {
1670                 r = journal_file_move_to_object(f, -1, p, &o);
1671                 if (r < 0)
1672                         goto fail;
1673
1674                 switch (o->object.type) {
1675
1676                 case OBJECT_UNUSED:
1677                         printf("Type: OBJECT_UNUSED\n");
1678                         break;
1679
1680                 case OBJECT_DATA:
1681                         printf("Type: OBJECT_DATA\n");
1682                         break;
1683
1684                 case OBJECT_ENTRY:
1685                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1686                                (unsigned long long) le64toh(o->entry.seqnum),
1687                                (unsigned long long) le64toh(o->entry.monotonic),
1688                                (unsigned long long) le64toh(o->entry.realtime));
1689                         break;
1690
1691                 case OBJECT_FIELD_HASH_TABLE:
1692                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1693                         break;
1694
1695                 case OBJECT_DATA_HASH_TABLE:
1696                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1697                         break;
1698
1699                 case OBJECT_ENTRY_ARRAY:
1700                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1701                         break;
1702                 }
1703
1704                 if (o->object.flags & OBJECT_COMPRESSED)
1705                         printf("Flags: COMPRESSED\n");
1706
1707                 if (p == le64toh(f->header->tail_object_offset))
1708                         p = 0;
1709                 else
1710                         p = p + ALIGN64(le64toh(o->object.size));
1711         }
1712
1713         return;
1714 fail:
1715         log_error("File corrupt");
1716 }
1717
1718 int journal_file_open(
1719                 const char *fname,
1720                 int flags,
1721                 mode_t mode,
1722                 JournalFile *template,
1723                 JournalFile **ret) {
1724
1725         JournalFile *f;
1726         int r;
1727         bool newly_created = false;
1728
1729         assert(fname);
1730
1731         if ((flags & O_ACCMODE) != O_RDONLY &&
1732             (flags & O_ACCMODE) != O_RDWR)
1733                 return -EINVAL;
1734
1735         if (!endswith(fname, ".journal"))
1736                 return -EINVAL;
1737
1738         f = new0(JournalFile, 1);
1739         if (!f)
1740                 return -ENOMEM;
1741
1742         f->fd = -1;
1743         f->flags = flags;
1744         f->mode = mode;
1745         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1746         f->prot = prot_from_flags(flags);
1747
1748         if (template) {
1749                 f->metrics = template->metrics;
1750                 f->compress = template->compress;
1751         }
1752
1753         f->path = strdup(fname);
1754         if (!f->path) {
1755                 r = -ENOMEM;
1756                 goto fail;
1757         }
1758
1759         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1760         if (f->fd < 0) {
1761                 r = -errno;
1762                 goto fail;
1763         }
1764
1765         if (fstat(f->fd, &f->last_stat) < 0) {
1766                 r = -errno;
1767                 goto fail;
1768         }
1769
1770         if (f->last_stat.st_size == 0 && f->writable) {
1771                 newly_created = true;
1772
1773                 r = journal_file_init_header(f, template);
1774                 if (r < 0)
1775                         goto fail;
1776
1777                 if (fstat(f->fd, &f->last_stat) < 0) {
1778                         r = -errno;
1779                         goto fail;
1780                 }
1781         }
1782
1783         if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1784                 r = -EIO;
1785                 goto fail;
1786         }
1787
1788         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1789         if (f->header == MAP_FAILED) {
1790                 f->header = NULL;
1791                 r = -errno;
1792                 goto fail;
1793         }
1794
1795         if (!newly_created) {
1796                 r = journal_file_verify_header(f);
1797                 if (r < 0)
1798                         goto fail;
1799         }
1800
1801         if (f->writable) {
1802                 r = journal_file_refresh_header(f);
1803                 if (r < 0)
1804                         goto fail;
1805         }
1806
1807         if (newly_created) {
1808
1809                 r = journal_file_setup_field_hash_table(f);
1810                 if (r < 0)
1811                         goto fail;
1812
1813                 r = journal_file_setup_data_hash_table(f);
1814                 if (r < 0)
1815                         goto fail;
1816         }
1817
1818         r = journal_file_map_field_hash_table(f);
1819         if (r < 0)
1820                 goto fail;
1821
1822         r = journal_file_map_data_hash_table(f);
1823         if (r < 0)
1824                 goto fail;
1825
1826         if (ret)
1827                 *ret = f;
1828
1829         return 0;
1830
1831 fail:
1832         journal_file_close(f);
1833
1834         return r;
1835 }
1836
1837 int journal_file_rotate(JournalFile **f) {
1838         char *p;
1839         size_t l;
1840         JournalFile *old_file, *new_file = NULL;
1841         int r;
1842
1843         assert(f);
1844         assert(*f);
1845
1846         old_file = *f;
1847
1848         if (!old_file->writable)
1849                 return -EINVAL;
1850
1851         if (!endswith(old_file->path, ".journal"))
1852                 return -EINVAL;
1853
1854         l = strlen(old_file->path);
1855
1856         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
1857         if (!p)
1858                 return -ENOMEM;
1859
1860         memcpy(p, old_file->path, l - 8);
1861         p[l-8] = '@';
1862         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1863         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1864                  "-%016llx-%016llx.journal",
1865                  (unsigned long long) le64toh((*f)->header->seqnum),
1866                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1867
1868         r = rename(old_file->path, p);
1869         free(p);
1870
1871         if (r < 0)
1872                 return -errno;
1873
1874         old_file->header->state = STATE_ARCHIVED;
1875
1876         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1877         journal_file_close(old_file);
1878
1879         *f = new_file;
1880         return r;
1881 }
1882
1883 int journal_file_open_reliably(
1884                 const char *fname,
1885                 int flags,
1886                 mode_t mode,
1887                 JournalFile *template,
1888                 JournalFile **ret) {
1889
1890         int r;
1891         size_t l;
1892         char *p;
1893
1894         r = journal_file_open(fname, flags, mode, template, ret);
1895         if (r != -EBADMSG && /* corrupted */
1896             r != -ENODATA && /* truncated */
1897             r != -EHOSTDOWN && /* other machine */
1898             r != -EPROTONOSUPPORT) /* incompatible feature */
1899                 return r;
1900
1901         if ((flags & O_ACCMODE) == O_RDONLY)
1902                 return r;
1903
1904         if (!(flags & O_CREAT))
1905                 return r;
1906
1907         /* The file is corrupted. Rotate it away and try it again (but only once) */
1908
1909         l = strlen(fname);
1910         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
1911                      (int) (l-8), fname,
1912                      (unsigned long long) now(CLOCK_REALTIME),
1913                      random_ull()) < 0)
1914                 return -ENOMEM;
1915
1916         r = rename(fname, p);
1917         free(p);
1918         if (r < 0)
1919                 return -errno;
1920
1921         log_warning("File %s corrupted, renaming and replacing.", fname);
1922
1923         return journal_file_open(fname, flags, mode, template, ret);
1924 }
1925
1926 struct vacuum_info {
1927         off_t usage;
1928         char *filename;
1929
1930         uint64_t realtime;
1931         sd_id128_t seqnum_id;
1932         uint64_t seqnum;
1933
1934         bool have_seqnum;
1935 };
1936
1937 static int vacuum_compare(const void *_a, const void *_b) {
1938         const struct vacuum_info *a, *b;
1939
1940         a = _a;
1941         b = _b;
1942
1943         if (a->have_seqnum && b->have_seqnum &&
1944             sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1945                 if (a->seqnum < b->seqnum)
1946                         return -1;
1947                 else if (a->seqnum > b->seqnum)
1948                         return 1;
1949                 else
1950                         return 0;
1951         }
1952
1953         if (a->realtime < b->realtime)
1954                 return -1;
1955         else if (a->realtime > b->realtime)
1956                 return 1;
1957         else if (a->have_seqnum && b->have_seqnum)
1958                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1959         else
1960                 return strcmp(a->filename, b->filename);
1961 }
1962
1963 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1964         DIR *d;
1965         int r = 0;
1966         struct vacuum_info *list = NULL;
1967         unsigned n_list = 0, n_allocated = 0, i;
1968         uint64_t sum = 0;
1969
1970         assert(directory);
1971
1972         if (max_use <= 0)
1973                 return 0;
1974
1975         d = opendir(directory);
1976         if (!d)
1977                 return -errno;
1978
1979         for (;;) {
1980                 int k;
1981                 struct dirent buf, *de;
1982                 size_t q;
1983                 struct stat st;
1984                 char *p;
1985                 unsigned long long seqnum = 0, realtime;
1986                 sd_id128_t seqnum_id;
1987                 bool have_seqnum;
1988
1989                 k = readdir_r(d, &buf, &de);
1990                 if (k != 0) {
1991                         r = -k;
1992                         goto finish;
1993                 }
1994
1995                 if (!de)
1996                         break;
1997
1998                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
1999                         continue;
2000
2001                 if (!S_ISREG(st.st_mode))
2002                         continue;
2003
2004                 q = strlen(de->d_name);
2005
2006                 if (endswith(de->d_name, ".journal")) {
2007
2008                         /* Vacuum archived files */
2009
2010                         if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2011                                 continue;
2012
2013                         if (de->d_name[q-8-16-1] != '-' ||
2014                             de->d_name[q-8-16-1-16-1] != '-' ||
2015                             de->d_name[q-8-16-1-16-1-32-1] != '@')
2016                                 continue;
2017
2018                         p = strdup(de->d_name);
2019                         if (!p) {
2020                                 r = -ENOMEM;
2021                                 goto finish;
2022                         }
2023
2024                         de->d_name[q-8-16-1-16-1] = 0;
2025                         if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2026                                 free(p);
2027                                 continue;
2028                         }
2029
2030                         if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2031                                 free(p);
2032                                 continue;
2033                         }
2034
2035                         have_seqnum = true;
2036
2037                 } else if (endswith(de->d_name, ".journal~")) {
2038                         unsigned long long tmp;
2039
2040                         /* Vacuum corrupted files */
2041
2042                         if (q < 1 + 16 + 1 + 16 + 8 + 1)
2043                                 continue;
2044
2045                         if (de->d_name[q-1-8-16-1] != '-' ||
2046                             de->d_name[q-1-8-16-1-16-1] != '@')
2047                                 continue;
2048
2049                         p = strdup(de->d_name);
2050                         if (!p) {
2051                                 r = -ENOMEM;
2052                                 goto finish;
2053                         }
2054
2055                         if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2056                                 free(p);
2057                                 continue;
2058                         }
2059
2060                         have_seqnum = false;
2061                 } else
2062                         continue;
2063
2064                 if (n_list >= n_allocated) {
2065                         struct vacuum_info *j;
2066
2067                         n_allocated = MAX(n_allocated * 2U, 8U);
2068                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2069                         if (!j) {
2070                                 free(p);
2071                                 r = -ENOMEM;
2072                                 goto finish;
2073                         }
2074
2075                         list = j;
2076                 }
2077
2078                 list[n_list].filename = p;
2079                 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2080                 list[n_list].seqnum = seqnum;
2081                 list[n_list].realtime = realtime;
2082                 list[n_list].seqnum_id = seqnum_id;
2083                 list[n_list].have_seqnum = have_seqnum;
2084
2085                 sum += list[n_list].usage;
2086
2087                 n_list ++;
2088         }
2089
2090         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2091
2092         for(i = 0; i < n_list; i++) {
2093                 struct statvfs ss;
2094
2095                 if (fstatvfs(dirfd(d), &ss) < 0) {
2096                         r = -errno;
2097                         goto finish;
2098                 }
2099
2100                 if (sum <= max_use &&
2101                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2102                         break;
2103
2104                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2105                         log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2106                         sum -= list[i].usage;
2107                 } else if (errno != ENOENT)
2108                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2109         }
2110
2111 finish:
2112         for (i = 0; i < n_list; i++)
2113                 free(list[i].filename);
2114
2115         free(list);
2116
2117         if (d)
2118                 closedir(d);
2119
2120         return r;
2121 }
2122
2123 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2124         uint64_t i, n;
2125         uint64_t q, xor_hash = 0;
2126         int r;
2127         EntryItem *items;
2128         dual_timestamp ts;
2129
2130         assert(from);
2131         assert(to);
2132         assert(o);
2133         assert(p);
2134
2135         if (!to->writable)
2136                 return -EPERM;
2137
2138         ts.monotonic = le64toh(o->entry.monotonic);
2139         ts.realtime = le64toh(o->entry.realtime);
2140
2141         if (to->tail_entry_monotonic_valid &&
2142             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2143                 return -EINVAL;
2144
2145         if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2146                 return -EINVAL;
2147
2148         n = journal_file_entry_n_items(o);
2149         items = alloca(sizeof(EntryItem) * n);
2150
2151         for (i = 0; i < n; i++) {
2152                 uint64_t l, h;
2153                 le64_t le_hash;
2154                 size_t t;
2155                 void *data;
2156                 Object *u;
2157
2158                 q = le64toh(o->entry.items[i].object_offset);
2159                 le_hash = o->entry.items[i].hash;
2160
2161                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2162                 if (r < 0)
2163                         return r;
2164
2165                 if (le_hash != o->data.hash)
2166                         return -EBADMSG;
2167
2168                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2169                 t = (size_t) l;
2170
2171                 /* We hit the limit on 32bit machines */
2172                 if ((uint64_t) t != l)
2173                         return -E2BIG;
2174
2175                 if (o->object.flags & OBJECT_COMPRESSED) {
2176 #ifdef HAVE_XZ
2177                         uint64_t rsize;
2178
2179                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2180                                 return -EBADMSG;
2181
2182                         data = from->compress_buffer;
2183                         l = rsize;
2184 #else
2185                         return -EPROTONOSUPPORT;
2186 #endif
2187                 } else
2188                         data = o->data.payload;
2189
2190                 r = journal_file_append_data(to, data, l, &u, &h);
2191                 if (r < 0)
2192                         return r;
2193
2194                 xor_hash ^= le64toh(u->data.hash);
2195                 items[i].object_offset = htole64(h);
2196                 items[i].hash = u->data.hash;
2197
2198                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2199                 if (r < 0)
2200                         return r;
2201         }
2202
2203         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2204 }
2205
2206 void journal_default_metrics(JournalMetrics *m, int fd) {
2207         uint64_t fs_size = 0;
2208         struct statvfs ss;
2209         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2210
2211         assert(m);
2212         assert(fd >= 0);
2213
2214         if (fstatvfs(fd, &ss) >= 0)
2215                 fs_size = ss.f_frsize * ss.f_blocks;
2216
2217         if (m->max_use == (uint64_t) -1) {
2218
2219                 if (fs_size > 0) {
2220                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2221
2222                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2223                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2224
2225                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2226                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2227                 } else
2228                         m->max_use = DEFAULT_MAX_USE_LOWER;
2229         } else {
2230                 m->max_use = PAGE_ALIGN(m->max_use);
2231
2232                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2233                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2234         }
2235
2236         if (m->max_size == (uint64_t) -1) {
2237                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2238
2239                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2240                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2241         } else
2242                 m->max_size = PAGE_ALIGN(m->max_size);
2243
2244         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2245                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2246
2247         if (m->max_size*2 > m->max_use)
2248                 m->max_use = m->max_size*2;
2249
2250         if (m->min_size == (uint64_t) -1)
2251                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2252         else {
2253                 m->min_size = PAGE_ALIGN(m->min_size);
2254
2255                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2256                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2257
2258                 if (m->min_size > m->max_size)
2259                         m->max_size = m->min_size;
2260         }
2261
2262         if (m->keep_free == (uint64_t) -1) {
2263
2264                 if (fs_size > 0) {
2265                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2266
2267                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2268                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2269
2270                 } else
2271                         m->keep_free = DEFAULT_KEEP_FREE;
2272         }
2273
2274         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2275                  format_bytes(a, sizeof(a), m->max_use),
2276                  format_bytes(b, sizeof(b), m->max_size),
2277                  format_bytes(c, sizeof(c), m->min_size),
2278                  format_bytes(d, sizeof(d), m->keep_free));
2279 }