chiark / gitweb /
8f9b61bc2fe7c1ef32df36800675a728d94b9986
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33
/* Default sizes, in bytes, of the item arrays of newly created data and
 * field hash tables. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL * 16ULL)
#define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL * 16ULL)

/* Default size of a memory mapped window: 128 MiB. */
#define DEFAULT_WINDOW_SIZE (128ULL * 1024ULL * 1024ULL)

/* The eight bytes that are written to, and expected at, the start of the
 * file header. */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };

/* Rounds x up to the next multiple of 8. */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
42
43 void journal_file_close(JournalFile *f) {
44         int t;
45
46         assert(f);
47
48         if (f->header && f->writable)
49                 f->header->state = STATE_OFFLINE;
50
51
52         for (t = 0; t < _WINDOW_MAX; t++)
53                 if (f->windows[t].ptr)
54                         munmap(f->windows[t].ptr, f->windows[t].size);
55
56         if (f->fd >= 0)
57                 close_nointr_nofail(f->fd);
58
59         free(f->path);
60         free(f);
61 }
62
63 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
64         Header h;
65         ssize_t k;
66         int r;
67
68         assert(f);
69
70         zero(h);
71         memcpy(h.signature, signature, 8);
72         h.arena_offset = htole64(ALIGN64(sizeof(h)));
73
74         r = sd_id128_randomize(&h.file_id);
75         if (r < 0)
76                 return r;
77
78         if (template) {
79                 h.seqnum_id = template->header->seqnum_id;
80                 h.seqnum = template->header->seqnum;
81         } else
82                 h.seqnum_id = h.file_id;
83
84         k = pwrite(f->fd, &h, sizeof(h), 0);
85         if (k < 0)
86                 return -errno;
87
88         if (k != sizeof(h))
89                 return -EIO;
90
91         return 0;
92 }
93
94 static int journal_file_refresh_header(JournalFile *f) {
95         int r;
96         sd_id128_t boot_id;
97
98         assert(f);
99
100         r = sd_id128_get_machine(&f->header->machine_id);
101         if (r < 0)
102                 return r;
103
104         r = sd_id128_get_boot(&boot_id);
105         if (r < 0)
106                 return r;
107
108         if (sd_id128_equal(boot_id, f->header->boot_id))
109                 f->tail_entry_monotonic_valid = true;
110
111         f->header->boot_id = boot_id;
112
113         f->header->state = STATE_ONLINE;
114         return 0;
115 }
116
117 static int journal_file_verify_header(JournalFile *f) {
118         assert(f);
119
120         if (memcmp(f->header, signature, 8))
121                 return -EBADMSG;
122
123         if (f->header->incompatible_flags != 0)
124                 return -EPROTONOSUPPORT;
125
126         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
127                 return -ENODATA;
128
129         if (f->writable) {
130                 uint32_t state;
131                 sd_id128_t machine_id;
132                 int r;
133
134                 r = sd_id128_get_machine(&machine_id);
135                 if (r < 0)
136                         return r;
137
138                 if (!sd_id128_equal(machine_id, f->header->machine_id))
139                         return -EHOSTDOWN;
140
141                 state = f->header->state;
142
143                 if (state == STATE_ONLINE)
144                         log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
145                 else if (state == STATE_ARCHIVED)
146                         return -ESHUTDOWN;
147                 else if (state != STATE_OFFLINE)
148                         log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
149         }
150
151         return 0;
152 }
153
154 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
155         uint64_t old_size, new_size;
156
157         assert(f);
158
159         /* We assume that this file is not sparse, and we know that
160          * for sure, since we always call posix_fallocate()
161          * ourselves */
162
163         old_size =
164                 le64toh(f->header->arena_offset) +
165                 le64toh(f->header->arena_size);
166
167         new_size = PAGE_ALIGN(offset + size);
168         if (new_size < le64toh(f->header->arena_offset))
169                 new_size = le64toh(f->header->arena_offset);
170
171         if (new_size <= old_size)
172                 return 0;
173
174         if (f->metrics.max_size > 0 &&
175             new_size > f->metrics.max_size)
176                 return -E2BIG;
177
178         if (new_size > f->metrics.min_size &&
179             f->metrics.keep_free > 0) {
180                 struct statvfs svfs;
181
182                 if (fstatvfs(f->fd, &svfs) >= 0) {
183                         uint64_t available;
184
185                         available = svfs.f_bfree * svfs.f_bsize;
186
187                         if (available >= f->metrics.keep_free)
188                                 available -= f->metrics.keep_free;
189                         else
190                                 available = 0;
191
192                         if (new_size - old_size > available)
193                                 return -E2BIG;
194                 }
195         }
196
197         /* Note that the glibc fallocate() fallback is very
198            inefficient, hence we try to minimize the allocation area
199            as we can. */
200         if (posix_fallocate(f->fd, old_size, new_size - old_size) < 0)
201                 return -errno;
202
203         if (fstat(f->fd, &f->last_stat) < 0)
204                 return -errno;
205
206         f->header->arena_size = new_size - htole64(f->header->arena_offset);
207
208         return 0;
209 }
210
211 static int journal_file_map(
212                 JournalFile *f,
213                 uint64_t offset,
214                 uint64_t size,
215                 void **_window,
216                 uint64_t *_woffset,
217                 uint64_t *_wsize,
218                 void **ret) {
219
220         uint64_t woffset, wsize;
221         void *window;
222
223         assert(f);
224         assert(size > 0);
225         assert(ret);
226
227         woffset = offset & ~((uint64_t) page_size() - 1ULL);
228         wsize = size + (offset - woffset);
229         wsize = PAGE_ALIGN(wsize);
230
231         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
232         if (window == MAP_FAILED)
233                 return -errno;
234
235         if (_window)
236                 *_window = window;
237
238         if (_woffset)
239                 *_woffset = woffset;
240
241         if (_wsize)
242                 *_wsize = wsize;
243
244         *ret = (uint8_t*) window + (offset - woffset);
245
246         return 0;
247 }
248
249 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
250         void *p;
251         uint64_t delta;
252         int r;
253         Window *w;
254
255         assert(f);
256         assert(ret);
257         assert(wt >= 0);
258         assert(wt < _WINDOW_MAX);
259
260         w = f->windows + wt;
261
262         if (_likely_(w->ptr &&
263                      w->offset <= offset &&
264                      w->offset + w->size >= offset + size)) {
265
266                 *ret = (uint8_t*) w->ptr + (offset - w->offset);
267                 return 0;
268         }
269
270         if (w->ptr) {
271                 if (munmap(w->ptr, w->size) < 0)
272                         return -errno;
273
274                 w->ptr = NULL;
275                 w->size = w->offset = 0;
276         }
277
278         if (size < DEFAULT_WINDOW_SIZE) {
279                 /* If the default window size is larger then what was
280                  * asked for extend the mapping a bit in the hope to
281                  * minimize needed remappings later on. We add half
282                  * the window space before and half behind the
283                  * requested mapping */
284
285                 delta = PAGE_ALIGN((DEFAULT_WINDOW_SIZE - size) / 2);
286
287                 if (offset < delta)
288                         delta = offset;
289
290                 offset -= delta;
291                 size += (DEFAULT_WINDOW_SIZE - delta);
292         } else
293                 delta = 0;
294
295         r = journal_file_map(f,
296                              offset, size,
297                              &w->ptr, &w->offset, &w->size,
298                              &p);
299
300         if (r < 0)
301                 return r;
302
303         *ret = (uint8_t*) p + delta;
304         return 0;
305 }
306
307 static bool verify_hash(Object *o) {
308         uint64_t h1, h2;
309
310         assert(o);
311
312         if (o->object.type == OBJECT_DATA) {
313                 h1 = le64toh(o->data.hash);
314                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
315         } else if (o->object.type == OBJECT_FIELD) {
316                 h1 = le64toh(o->field.hash);
317                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
318         } else
319                 return true;
320
321         return h1 == h2;
322 }
323
324 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
325         int r;
326         void *t;
327         Object *o;
328         uint64_t s;
329
330         assert(f);
331         assert(ret);
332         assert(type < _OBJECT_TYPE_MAX);
333
334         r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
335         if (r < 0)
336                 return r;
337
338         o = (Object*) t;
339         s = le64toh(o->object.size);
340
341         if (s < sizeof(ObjectHeader))
342                 return -EBADMSG;
343
344         if (type >= 0 && o->object.type != type)
345                 return -EBADMSG;
346
347         if (s > sizeof(ObjectHeader)) {
348                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
349                 if (r < 0)
350                         return r;
351
352                 o = (Object*) t;
353         }
354
355         if (!verify_hash(o))
356                 return -EBADMSG;
357
358         *ret = o;
359         return 0;
360 }
361
362 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
363         uint64_t r;
364
365         assert(f);
366
367         r = le64toh(f->header->seqnum) + 1;
368
369         if (seqnum) {
370                 /* If an external seqnum counter was passed, we update
371                  * both the local and the external one, and set it to
372                  * the maximum of both */
373
374                 if (*seqnum + 1 > r)
375                         r = *seqnum + 1;
376
377                 *seqnum = r;
378         }
379
380         f->header->seqnum = htole64(r);
381
382         if (f->header->first_seqnum == 0)
383                 f->header->first_seqnum = htole64(r);
384
385         return r;
386 }
387
388 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
389         int r;
390         uint64_t p;
391         Object *tail, *o;
392         void *t;
393
394         assert(f);
395         assert(size >= sizeof(ObjectHeader));
396         assert(offset);
397         assert(ret);
398
399         p = le64toh(f->header->tail_object_offset);
400         if (p == 0)
401                 p = le64toh(f->header->arena_offset);
402         else {
403                 r = journal_file_move_to_object(f, -1, p, &tail);
404                 if (r < 0)
405                         return r;
406
407                 p += ALIGN64(le64toh(tail->object.size));
408         }
409
410         r = journal_file_allocate(f, p, size);
411         if (r < 0)
412                 return r;
413
414         r = journal_file_move_to(f, type, p, size, &t);
415         if (r < 0)
416                 return r;
417
418         o = (Object*) t;
419
420         zero(o->object);
421         o->object.type = type;
422         o->object.size = htole64(size);
423
424         f->header->tail_object_offset = htole64(p);
425         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
426
427         *ret = o;
428         *offset = p;
429
430         return 0;
431 }
432
433 static int journal_file_setup_data_hash_table(JournalFile *f) {
434         uint64_t s, p;
435         Object *o;
436         int r;
437
438         assert(f);
439
440         s = DEFAULT_DATA_HASH_TABLE_SIZE;
441         r = journal_file_append_object(f,
442                                        OBJECT_DATA_HASH_TABLE,
443                                        offsetof(Object, hash_table.items) + s,
444                                        &o, &p);
445         if (r < 0)
446                 return r;
447
448         memset(o->hash_table.items, 0, s);
449
450         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
451         f->header->data_hash_table_size = htole64(s);
452
453         return 0;
454 }
455
456 static int journal_file_setup_field_hash_table(JournalFile *f) {
457         uint64_t s, p;
458         Object *o;
459         int r;
460
461         assert(f);
462
463         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
464         r = journal_file_append_object(f,
465                                        OBJECT_FIELD_HASH_TABLE,
466                                        offsetof(Object, hash_table.items) + s,
467                                        &o, &p);
468         if (r < 0)
469                 return r;
470
471         memset(o->hash_table.items, 0, s);
472
473         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
474         f->header->field_hash_table_size = htole64(s);
475
476         return 0;
477 }
478
479 static int journal_file_map_data_hash_table(JournalFile *f) {
480         uint64_t s, p;
481         void *t;
482         int r;
483
484         assert(f);
485
486         p = le64toh(f->header->data_hash_table_offset);
487         s = le64toh(f->header->data_hash_table_size);
488
489         r = journal_file_move_to(f,
490                                  WINDOW_DATA_HASH_TABLE,
491                                  p, s,
492                                  &t);
493         if (r < 0)
494                 return r;
495
496         f->data_hash_table = t;
497         return 0;
498 }
499
500 static int journal_file_map_field_hash_table(JournalFile *f) {
501         uint64_t s, p;
502         void *t;
503         int r;
504
505         assert(f);
506
507         p = le64toh(f->header->field_hash_table_offset);
508         s = le64toh(f->header->field_hash_table_size);
509
510         r = journal_file_move_to(f,
511                                  WINDOW_FIELD_HASH_TABLE,
512                                  p, s,
513                                  &t);
514         if (r < 0)
515                 return r;
516
517         f->field_hash_table = t;
518         return 0;
519 }
520
521 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
522         uint64_t p, h;
523         int r;
524
525         assert(f);
526         assert(o);
527         assert(offset > 0);
528         assert(o->object.type == OBJECT_DATA);
529
530         o->data.next_hash_offset = o->data.next_field_offset = 0;
531         o->data.entry_offset = o->data.entry_array_offset = 0;
532         o->data.n_entries = 0;
533
534         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
535         p = le64toh(f->data_hash_table[h].head_hash_offset);
536         if (p == 0) {
537                 /* Only entry in the hash table is easy */
538                 f->data_hash_table[h].head_hash_offset = htole64(offset);
539         } else {
540                 /* Temporarily move back to the previous data object,
541                  * to patch in pointer */
542
543                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
544                 if (r < 0)
545                         return r;
546
547                 o->data.next_hash_offset = htole64(offset);
548
549                 r = journal_file_move_to_object(f, OBJECT_DATA, offset, &o);
550                 if (r < 0)
551                         return r;
552         }
553
554         f->data_hash_table[h].tail_hash_offset = htole64(offset);
555
556         return 0;
557 }
558
559 int journal_file_find_data_object_with_hash(
560                 JournalFile *f,
561                 const void *data, uint64_t size, uint64_t hash,
562                 Object **ret, uint64_t *offset) {
563         uint64_t p, osize, h;
564         int r;
565
566         assert(f);
567         assert(data || size == 0);
568
569         osize = offsetof(Object, data.payload) + size;
570
571         if (f->header->data_hash_table_size == 0)
572                 return -EBADMSG;
573
574         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
575         p = le64toh(f->data_hash_table[h].head_hash_offset);
576
577         while (p > 0) {
578                 Object *o;
579
580                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
581                 if (r < 0)
582                         return r;
583
584                 if (le64toh(o->object.size) == osize &&
585                     memcmp(o->data.payload, data, size) == 0) {
586
587                         if (le64toh(o->data.hash) != hash)
588                                 return -EBADMSG;
589
590                         if (ret)
591                                 *ret = o;
592
593                         if (offset)
594                                 *offset = p;
595
596                         return 1;
597                 }
598
599                 p = le64toh(o->data.next_hash_offset);
600         }
601
602         return 0;
603 }
604
605 int journal_file_find_data_object(
606                 JournalFile *f,
607                 const void *data, uint64_t size,
608                 Object **ret, uint64_t *offset) {
609
610         uint64_t hash;
611
612         assert(f);
613         assert(data || size == 0);
614
615         hash = hash64(data, size);
616
617         return journal_file_find_data_object_with_hash(f,
618                                                        data, size, hash,
619                                                        ret, offset);
620 }
621
622 static int journal_file_append_data(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
623         uint64_t hash, p;
624         uint64_t osize;
625         Object *o;
626         int r;
627
628         assert(f);
629         assert(data || size == 0);
630
631         hash = hash64(data, size);
632
633         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
634         if (r < 0)
635                 return r;
636         else if (r > 0) {
637
638                 if (ret)
639                         *ret = o;
640
641                 if (offset)
642                         *offset = p;
643
644                 return 0;
645         }
646
647         osize = offsetof(Object, data.payload) + size;
648         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
649         if (r < 0)
650                 return r;
651
652         o->data.hash = htole64(hash);
653         memcpy(o->data.payload, data, size);
654
655         r = journal_file_link_data(f, o, p, hash);
656         if (r < 0)
657                 return r;
658
659         if (ret)
660                 *ret = o;
661
662         if (offset)
663                 *offset = p;
664
665         return 0;
666 }
667
668 uint64_t journal_file_entry_n_items(Object *o) {
669         assert(o);
670         assert(o->object.type == htole64(OBJECT_ENTRY));
671
672         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
673 }
674
675 static uint64_t journal_file_entry_array_n_items(Object *o) {
676         assert(o);
677         assert(o->object.type == htole64(OBJECT_ENTRY_ARRAY));
678
679         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
680 }
681
682 static int link_entry_into_array(JournalFile *f,
683                                  uint64_t *first,
684                                  uint64_t *idx,
685                                  uint64_t p) {
686         int r;
687         uint64_t n = 0, ap = 0, q, i, a, hidx;
688         Object *o;
689
690         assert(f);
691         assert(first);
692         assert(idx);
693         assert(p > 0);
694
695         a = le64toh(*first);
696         i = hidx = le64toh(*idx);
697         while (a > 0) {
698
699                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
700                 if (r < 0)
701                         return r;
702
703                 n = journal_file_entry_array_n_items(o);
704                 if (i < n) {
705                         o->entry_array.items[i] = htole64(p);
706                         *idx = htole64(hidx + 1);
707                         return 0;
708                 }
709
710                 i -= n;
711                 ap = a;
712                 a = le64toh(o->entry_array.next_entry_array_offset);
713         }
714
715         if (hidx > n)
716                 n = (hidx+1) * 2;
717         else
718                 n = n * 2;
719
720         if (n < 4)
721                 n = 4;
722
723         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
724                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
725                                        &o, &q);
726         if (r < 0)
727                 return r;
728
729         o->entry_array.items[i] = htole64(p);
730
731         if (ap == 0)
732                 *first = q;
733         else {
734                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
735                 if (r < 0)
736                         return r;
737
738                 o->entry_array.next_entry_array_offset = htole64(q);
739         }
740
741         *idx = htole64(hidx + 1);
742
743         return 0;
744 }
745
746 static int link_entry_into_array_plus_one(JournalFile *f,
747                                           uint64_t *extra,
748                                           uint64_t *first,
749                                           uint64_t *idx,
750                                           uint64_t p) {
751
752         int r;
753
754         assert(f);
755         assert(extra);
756         assert(first);
757         assert(idx);
758         assert(p > 0);
759
760         if (*idx == 0)
761                 *extra = htole64(p);
762         else {
763                 uint64_t i;
764
765                 i = le64toh(*idx) - 1;
766                 r = link_entry_into_array(f, first, &i, p);
767                 if (r < 0)
768                         return r;
769         }
770
771         *idx = htole64(le64toh(*idx) + 1);
772         return 0;
773 }
774
775 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
776         uint64_t p;
777         int r;
778         assert(f);
779         assert(o);
780         assert(offset > 0);
781
782         p = le64toh(o->entry.items[i].object_offset);
783         if (p == 0)
784                 return -EINVAL;
785
786         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
787         if (r < 0)
788                 return r;
789
790         return link_entry_into_array_plus_one(f,
791                                               &o->data.entry_offset,
792                                               &o->data.entry_array_offset,
793                                               &o->data.n_entries,
794                                               offset);
795 }
796
797 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
798         uint64_t n, i;
799         int r;
800
801         assert(f);
802         assert(o);
803         assert(offset > 0);
804         assert(o->object.type == OBJECT_ENTRY);
805
806         /* Link up the entry itself */
807         r = link_entry_into_array(f,
808                                   &f->header->entry_array_offset,
809                                   &f->header->n_entries,
810                                   offset);
811         if (r < 0)
812                 return r;
813
814         log_error("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries);
815
816         if (f->header->head_entry_realtime == 0)
817                 f->header->head_entry_realtime = o->entry.realtime;
818
819         f->header->tail_entry_realtime = o->entry.realtime;
820         f->header->tail_entry_monotonic = o->entry.monotonic;
821
822         f->tail_entry_monotonic_valid = true;
823
824         /* Link up the items */
825         n = journal_file_entry_n_items(o);
826         for (i = 0; i < n; i++) {
827                 r = journal_file_link_entry_item(f, o, offset, i);
828                 if (r < 0)
829                         return r;
830         }
831
832         return 0;
833 }
834
835 static int journal_file_append_entry_internal(
836                 JournalFile *f,
837                 const dual_timestamp *ts,
838                 uint64_t xor_hash,
839                 const EntryItem items[], unsigned n_items,
840                 uint64_t *seqnum,
841                 Object **ret, uint64_t *offset) {
842         uint64_t np;
843         uint64_t osize;
844         Object *o;
845         int r;
846
847         assert(f);
848         assert(items || n_items == 0);
849         assert(ts);
850
851         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
852
853         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
854         if (r < 0)
855                 return r;
856
857         o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
858         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
859         o->entry.realtime = htole64(ts->realtime);
860         o->entry.monotonic = htole64(ts->monotonic);
861         o->entry.xor_hash = htole64(xor_hash);
862         o->entry.boot_id = f->header->boot_id;
863
864         r = journal_file_link_entry(f, o, np);
865         if (r < 0)
866                 return r;
867
868         if (ret)
869                 *ret = o;
870
871         if (offset)
872                 *offset = np;
873
874         return 0;
875 }
876
877 static void journal_file_post_change(JournalFile *f) {
878         assert(f);
879
880         /* inotify() does not receive IN_MODIFY events from file
881          * accesses done via mmap(). After each access we hence
882          * trigger IN_MODIFY by truncating the journal file to its
883          * current size which triggers IN_MODIFY. */
884
885         __sync_synchronize();
886
887         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
888                 log_error("Failed to to truncate file to its own size: %m");
889 }
890
891 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
892         unsigned i;
893         EntryItem *items;
894         int r;
895         uint64_t xor_hash = 0;
896         struct dual_timestamp _ts;
897
898         assert(f);
899         assert(iovec || n_iovec == 0);
900
901         if (!f->writable)
902                 return -EPERM;
903
904         if (!ts) {
905                 dual_timestamp_get(&_ts);
906                 ts = &_ts;
907         }
908
909         if (f->tail_entry_monotonic_valid &&
910             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
911                 return -EINVAL;
912
913         if (ts->realtime < le64toh(f->header->tail_entry_realtime))
914                 return -EINVAL;
915
916         items = new(EntryItem, n_iovec);
917         if (!items)
918                 return -ENOMEM;
919
920         for (i = 0; i < n_iovec; i++) {
921                 uint64_t p;
922                 Object *o;
923
924                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
925                 if (r < 0)
926                         goto finish;
927
928                 xor_hash ^= le64toh(o->data.hash);
929                 items[i].object_offset = htole64(p);
930                 items[i].hash = o->data.hash;
931         }
932
933         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
934
935         journal_file_post_change(f);
936
937 finish:
938         free(items);
939
940         return r;
941 }
942
943 static int generic_array_get(JournalFile *f,
944                              uint64_t first,
945                              uint64_t i,
946                              Object **ret, uint64_t *offset) {
947
948         Object *o;
949         uint64_t p, a;
950         int r;
951
952         assert(f);
953
954         a = first;
955         while (a > 0) {
956                 uint64_t n;
957
958                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
959                 if (r < 0)
960                         return r;
961
962                 n = journal_file_entry_array_n_items(o);
963                 if (i < n) {
964                         p = le64toh(o->entry_array.items[i]);
965                         break;
966                 }
967
968                 i -= n;
969                 a = le64toh(o->entry_array.next_entry_array_offset);
970         }
971
972         if (a <= 0 || p <= 0)
973                 return 0;
974
975         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
976         if (r < 0)
977                 return r;
978
979         if (ret)
980                 *ret = o;
981
982         if (offset)
983                 *offset = p;
984
985         return 1;
986 }
987
988 static int generic_array_get_plus_one(JournalFile *f,
989                                       uint64_t extra,
990                                       uint64_t first,
991                                       uint64_t i,
992                                       Object **ret, uint64_t *offset) {
993
994         Object *o;
995
996         assert(f);
997
998         if (i == 0) {
999                 int r;
1000
1001                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1002                 if (r < 0)
1003                         return r;
1004
1005                 if (ret)
1006                         *ret = o;
1007
1008                 if (offset)
1009                         *offset = extra;
1010
1011                 return 1;
1012         }
1013
1014         return generic_array_get(f, first, i-1, ret, offset);
1015 }
1016
/* Verdicts returned by the test_object callbacks that drive the
 * bisection helpers below. */
enum {
        TEST_FOUND = 0,  /* the tested entry matches the needle exactly */
        TEST_LEFT  = 1,  /* the tested entry sorts before the needle */
        TEST_RIGHT = 2   /* the tested entry sorts after the needle */
};
1022
1023 static int generic_array_bisect(JournalFile *f,
1024                                 uint64_t first,
1025                                 uint64_t n,
1026                                 uint64_t needle,
1027                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1028                                 direction_t direction,
1029                                 Object **ret,
1030                                 uint64_t *offset,
1031                                 uint64_t *idx) {
1032
1033         uint64_t a, p, t = 0, i = 0, last_p = 0;
1034         bool subtract_one = false;
1035         Object *o, *array = NULL;
1036         int r;
1037
1038         assert(f);
1039         assert(test_object);
1040
1041         a = first;
1042         while (a > 0) {
1043                 uint64_t left, right, k, lp;
1044
1045                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1046                 if (r < 0)
1047                         return r;
1048
1049                 k = journal_file_entry_array_n_items(array);
1050                 right = MIN(k, n);
1051                 if (right <= 0)
1052                         return 0;
1053
1054                 i = right - 1;
1055                 lp = p = le64toh(array->entry_array.items[i]);
1056                 if (p <= 0)
1057                         return -EBADMSG;
1058
1059                 r = test_object(f, p, needle);
1060                 if (r < 0)
1061                         return r;
1062
1063                 if (r == TEST_FOUND)
1064                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1065
1066                 if (r == TEST_RIGHT) {
1067                         left = 0;
1068                         right -= 1;
1069                         for (;;) {
1070                                 if (left == right) {
1071                                         if (direction == DIRECTION_UP)
1072                                                 subtract_one = true;
1073
1074                                         i = left;
1075                                         goto found;
1076                                 }
1077
1078                                 assert(left < right);
1079
1080                                 i = (left + right) / 2;
1081                                 p = le64toh(array->entry_array.items[i]);
1082                                 if (p <= 0)
1083                                         return -EBADMSG;
1084
1085                                 r = test_object(f, p, needle);
1086                                 if (r < 0)
1087                                         return r;
1088
1089                                 if (r == TEST_FOUND)
1090                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1091
1092                                 if (r == TEST_RIGHT)
1093                                         right = i;
1094                                 else
1095                                         left = i + 1;
1096                         }
1097                 }
1098
1099                 if (k > n)
1100                         return 0;
1101
1102                 last_p = lp;
1103
1104                 n -= k;
1105                 t += k;
1106                 a = le64toh(array->entry_array.next_entry_array_offset);
1107         }
1108
1109         return 0;
1110
1111 found:
1112         if (subtract_one && t == 0 && i == 0)
1113                 return 0;
1114
1115         if (subtract_one && i == 0)
1116                 p = last_p;
1117         else if (subtract_one)
1118                 p = le64toh(array->entry_array.items[i-1]);
1119         else
1120                 p = le64toh(array->entry_array.items[i]);
1121
1122         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1123         if (r < 0)
1124                 return r;
1125
1126         if (ret)
1127                 *ret = o;
1128
1129         if (offset)
1130                 *offset = p;
1131
1132         if (idx)
1133                 *idx = t + i - (subtract_one ? 1 : 0);
1134
1135         return 1;
1136 }
1137
1138 static int generic_array_bisect_plus_one(JournalFile *f,
1139                                          uint64_t extra,
1140                                          uint64_t first,
1141                                          uint64_t n,
1142                                          uint64_t needle,
1143                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1144                                          direction_t direction,
1145                                          Object **ret,
1146                                          uint64_t *offset,
1147                                          uint64_t *idx) {
1148
1149         int r;
1150
1151         assert(f);
1152         assert(test_object);
1153
1154         if (n <= 0)
1155                 return 0;
1156
1157         /* This bisects the array in object 'first', but first checks
1158          * an extra  */
1159
1160         r = test_object(f, extra, needle);
1161         if (r < 0)
1162                 return r;
1163         else if (r == TEST_FOUND) {
1164                 Object *o;
1165
1166                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1167                 if (r < 0)
1168                         return r;
1169
1170                 if (ret)
1171                         *ret = o;
1172
1173                 if (offset)
1174                         *offset = extra;
1175         } else if (r == TEST_RIGHT)
1176                 return 0;
1177
1178         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1179
1180         if (r > 0)
1181                 (*idx) ++;
1182
1183         return r;
1184 }
1185
1186 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1187         Object *o;
1188         int r;
1189
1190         assert(f);
1191         assert(p > 0);
1192
1193         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1194         if (r < 0)
1195                 return r;
1196
1197         if (le64toh(o->entry.seqnum) == needle)
1198                 return TEST_FOUND;
1199         else if (le64toh(o->entry.seqnum) < needle)
1200                 return TEST_LEFT;
1201         else
1202                 return TEST_RIGHT;
1203 }
1204
1205 int journal_file_move_to_entry_by_seqnum(
1206                 JournalFile *f,
1207                 uint64_t seqnum,
1208                 direction_t direction,
1209                 Object **ret,
1210                 uint64_t *offset) {
1211
1212         return generic_array_bisect(f,
1213                                     le64toh(f->header->entry_array_offset),
1214                                     le64toh(f->header->n_entries),
1215                                     seqnum,
1216                                     test_object_seqnum,
1217                                     direction,
1218                                     ret, offset, NULL);
1219 }
1220
1221 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1222         Object *o;
1223         int r;
1224
1225         assert(f);
1226         assert(p > 0);
1227
1228         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1229         if (r < 0)
1230                 return r;
1231
1232         if (le64toh(o->entry.realtime) == needle)
1233                 return TEST_FOUND;
1234         else if (le64toh(o->entry.realtime) < needle)
1235                 return TEST_LEFT;
1236         else
1237                 return TEST_RIGHT;
1238 }
1239
1240 int journal_file_move_to_entry_by_realtime(
1241                 JournalFile *f,
1242                 uint64_t realtime,
1243                 direction_t direction,
1244                 Object **ret,
1245                 uint64_t *offset) {
1246
1247         return generic_array_bisect(f,
1248                                     le64toh(f->header->entry_array_offset),
1249                                     le64toh(f->header->n_entries),
1250                                     realtime,
1251                                     test_object_realtime,
1252                                     direction,
1253                                     ret, offset, NULL);
1254 }
1255
1256 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1257         Object *o;
1258         int r;
1259
1260         assert(f);
1261         assert(p > 0);
1262
1263         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1264         if (r < 0)
1265                 return r;
1266
1267         if (le64toh(o->entry.monotonic) == needle)
1268                 return TEST_FOUND;
1269         else if (le64toh(o->entry.monotonic) < needle)
1270                 return TEST_LEFT;
1271         else
1272                 return TEST_RIGHT;
1273 }
1274
1275 int journal_file_move_to_entry_by_monotonic(
1276                 JournalFile *f,
1277                 sd_id128_t boot_id,
1278                 uint64_t monotonic,
1279                 direction_t direction,
1280                 Object **ret,
1281                 uint64_t *offset) {
1282
1283         char t[8+32+1] = "_BOOT_ID=";
1284         Object *o;
1285         int r;
1286
1287         sd_id128_to_string(boot_id, t + 8);
1288
1289         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1290         if (r < 0)
1291                 return r;
1292         else if (r == 0)
1293                 return -ENOENT;
1294
1295         return generic_array_bisect_plus_one(f,
1296                                              le64toh(o->data.entry_offset),
1297                                              le64toh(o->data.entry_array_offset),
1298                                              le64toh(o->data.n_entries),
1299                                              monotonic,
1300                                              test_object_monotonic,
1301                                              direction,
1302                                              ret, offset, NULL);
1303 }
1304
1305 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1306         assert(f);
1307         assert(p > 0);
1308
1309         if (p == needle)
1310                 return TEST_FOUND;
1311         else if (p < needle)
1312                 return TEST_LEFT;
1313         else
1314                 return TEST_RIGHT;
1315 }
1316
1317 int journal_file_next_entry(
1318                 JournalFile *f,
1319                 Object *o, uint64_t p,
1320                 direction_t direction,
1321                 Object **ret, uint64_t *offset) {
1322
1323         uint64_t i, n;
1324         int r;
1325
1326         assert(f);
1327         assert(p > 0 || !o);
1328
1329         n = le64toh(f->header->n_entries);
1330         if (n <= 0)
1331                 return 0;
1332
1333         if (!o)
1334                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1335         else {
1336                 if (o->object.type != OBJECT_ENTRY)
1337                         return -EINVAL;
1338
1339                 r = generic_array_bisect(f,
1340                                          le64toh(f->header->entry_array_offset),
1341                                          le64toh(f->header->n_entries),
1342                                          p,
1343                                          test_object_offset,
1344                                          DIRECTION_DOWN,
1345                                          NULL, NULL,
1346                                          &i);
1347                 if (r <= 0)
1348                         return r;
1349
1350                 if (direction == DIRECTION_DOWN) {
1351                         if (i >= n - 1)
1352                                 return 0;
1353
1354                         i++;
1355                 } else {
1356                         if (i <= 0)
1357                                 return 0;
1358
1359                         i--;
1360                 }
1361         }
1362
1363         /* And jump to it */
1364         return generic_array_get(f,
1365                                  le64toh(f->header->entry_array_offset),
1366                                  i,
1367                                  ret, offset);
1368 }
1369
1370 int journal_file_skip_entry(
1371                 JournalFile *f,
1372                 Object *o, uint64_t p,
1373                 int64_t skip,
1374                 Object **ret, uint64_t *offset) {
1375
1376         uint64_t i, n;
1377         int r;
1378
1379         assert(f);
1380         assert(o);
1381         assert(p > 0);
1382
1383         if (o->object.type != OBJECT_ENTRY)
1384                 return -EINVAL;
1385
1386         r = generic_array_bisect(f,
1387                                  le64toh(f->header->entry_array_offset),
1388                                  le64toh(f->header->n_entries),
1389                                  p,
1390                                  test_object_offset,
1391                                  DIRECTION_DOWN,
1392                                  NULL, NULL,
1393                                  &i);
1394         if (r <= 0)
1395                 return r;
1396
1397         /* Calculate new index */
1398         if (skip < 0) {
1399                 if ((uint64_t) -skip >= i)
1400                         i = 0;
1401                 else
1402                         i = i - (uint64_t) -skip;
1403         } else
1404                 i  += (uint64_t) skip;
1405
1406         n = le64toh(f->header->n_entries);
1407         if (n <= 0)
1408                 return -EBADMSG;
1409
1410         if (i >= n)
1411                 i = n-1;
1412
1413         return generic_array_get(f,
1414                                  le64toh(f->header->entry_array_offset),
1415                                  i,
1416                                  ret, offset);
1417 }
1418
1419 int journal_file_next_entry_for_data(
1420                 JournalFile *f,
1421                 Object *o, uint64_t p,
1422                 uint64_t data_offset,
1423                 direction_t direction,
1424                 Object **ret, uint64_t *offset) {
1425
1426         uint64_t n, i;
1427         int r;
1428         Object *d;
1429
1430         assert(f);
1431         assert(p > 0 || !o);
1432
1433         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1434         if (r < 0)
1435                 return r;
1436
1437         n = le64toh(d->data.n_entries);
1438         if (n <= 0)
1439                 return n;
1440
1441         if (!o)
1442                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1443         else {
1444                 if (o->object.type != OBJECT_ENTRY)
1445                         return -EINVAL;
1446
1447                 r = generic_array_bisect_plus_one(f,
1448                                                   le64toh(d->data.entry_offset),
1449                                                   le64toh(d->data.entry_array_offset),
1450                                                   le64toh(d->data.n_entries),
1451                                                   p,
1452                                                   test_object_offset,
1453                                                   DIRECTION_DOWN,
1454                                                   NULL, NULL,
1455                                                   &i);
1456
1457                 if (r <= 0)
1458                         return r;
1459
1460                 if (direction == DIRECTION_DOWN) {
1461                         if (i >= n - 1)
1462                                 return 0;
1463
1464                         i++;
1465                 } else {
1466                         if (i <= 0)
1467                                 return 0;
1468
1469                         i--;
1470                 }
1471
1472         }
1473
1474         return generic_array_get_plus_one(f,
1475                                           le64toh(d->data.entry_offset),
1476                                           le64toh(d->data.entry_array_offset),
1477                                           i,
1478                                           ret, offset);
1479 }
1480
1481 int journal_file_move_to_entry_by_seqnum_for_data(
1482                 JournalFile *f,
1483                 uint64_t data_offset,
1484                 uint64_t seqnum,
1485                 direction_t direction,
1486                 Object **ret, uint64_t *offset) {
1487
1488         Object *d;
1489         int r;
1490
1491         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1492         if (r <= 0)
1493                 return r;
1494
1495         return generic_array_bisect_plus_one(f,
1496                                              le64toh(d->data.entry_offset),
1497                                              le64toh(d->data.entry_array_offset),
1498                                              le64toh(d->data.n_entries),
1499                                              seqnum,
1500                                              test_object_seqnum,
1501                                              direction,
1502                                              ret, offset, NULL);
1503 }
1504
1505 int journal_file_move_to_entry_by_realtime_for_data(
1506                 JournalFile *f,
1507                 uint64_t data_offset,
1508                 uint64_t realtime,
1509                 direction_t direction,
1510                 Object **ret, uint64_t *offset) {
1511
1512         Object *d;
1513         int r;
1514
1515         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1516         if (r <= 0)
1517                 return r;
1518
1519         return generic_array_bisect_plus_one(f,
1520                                              le64toh(d->data.entry_offset),
1521                                              le64toh(d->data.entry_array_offset),
1522                                              le64toh(d->data.n_entries),
1523                                              realtime,
1524                                              test_object_realtime,
1525                                              direction,
1526                                              ret, offset, NULL);
1527 }
1528
1529 void journal_file_dump(JournalFile *f) {
1530         char a[33], b[33], c[33];
1531         Object *o;
1532         int r;
1533         uint64_t p;
1534
1535         assert(f);
1536
1537         printf("File Path: %s\n"
1538                "File ID: %s\n"
1539                "Machine ID: %s\n"
1540                "Boot ID: %s\n"
1541                "Arena size: %llu\n"
1542                "Objects: %lu\n"
1543                "Entries: %lu\n",
1544                f->path,
1545                sd_id128_to_string(f->header->file_id, a),
1546                sd_id128_to_string(f->header->machine_id, b),
1547                sd_id128_to_string(f->header->boot_id, c),
1548                (unsigned long long) le64toh(f->header->arena_size),
1549                (unsigned long) le64toh(f->header->n_objects),
1550                (unsigned long) le64toh(f->header->n_entries));
1551
1552         p = le64toh(f->header->arena_offset);
1553         while (p != 0) {
1554                 r = journal_file_move_to_object(f, -1, p, &o);
1555                 if (r < 0)
1556                         goto fail;
1557
1558                 switch (o->object.type) {
1559
1560                 case OBJECT_UNUSED:
1561                         printf("Type: OBJECT_UNUSED\n");
1562                         break;
1563
1564                 case OBJECT_DATA:
1565                         printf("Type: OBJECT_DATA\n");
1566                         break;
1567
1568                 case OBJECT_ENTRY:
1569                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1570                                (unsigned long long) le64toh(o->entry.seqnum),
1571                                (unsigned long long) le64toh(o->entry.monotonic),
1572                                (unsigned long long) le64toh(o->entry.realtime));
1573                         break;
1574
1575                 case OBJECT_FIELD_HASH_TABLE:
1576                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1577                         break;
1578
1579                 case OBJECT_DATA_HASH_TABLE:
1580                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1581                         break;
1582
1583                 case OBJECT_ENTRY_ARRAY:
1584                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1585                         break;
1586                 }
1587
1588                 if (p == le64toh(f->header->tail_object_offset))
1589                         p = 0;
1590                 else
1591                         p = p + ALIGN64(le64toh(o->object.size));
1592         }
1593
1594         return;
1595 fail:
1596         log_error("File corrupt");
1597 }
1598
1599 int journal_file_open(
1600                 const char *fname,
1601                 int flags,
1602                 mode_t mode,
1603                 JournalFile *template,
1604                 JournalFile **ret) {
1605
1606         JournalFile *f;
1607         int r;
1608         bool newly_created = false;
1609
1610         assert(fname);
1611
1612         if ((flags & O_ACCMODE) != O_RDONLY &&
1613             (flags & O_ACCMODE) != O_RDWR)
1614                 return -EINVAL;
1615
1616         f = new0(JournalFile, 1);
1617         if (!f)
1618                 return -ENOMEM;
1619
1620         f->fd = -1;
1621         f->flags = flags;
1622         f->mode = mode;
1623         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1624         f->prot = prot_from_flags(flags);
1625
1626         f->metrics.max_size = DEFAULT_MAX_SIZE;
1627         f->metrics.min_size = DEFAULT_MIN_SIZE;
1628         f->metrics.keep_free = DEFAULT_KEEP_FREE;
1629
1630         f->path = strdup(fname);
1631         if (!f->path) {
1632                 r = -ENOMEM;
1633                 goto fail;
1634         }
1635
1636         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1637         if (f->fd < 0) {
1638                 r = -errno;
1639                 goto fail;
1640         }
1641
1642         if (fstat(f->fd, &f->last_stat) < 0) {
1643                 r = -errno;
1644                 goto fail;
1645         }
1646
1647         if (f->last_stat.st_size == 0 && f->writable) {
1648                 newly_created = true;
1649
1650                 r = journal_file_init_header(f, template);
1651                 if (r < 0)
1652                         goto fail;
1653
1654                 if (fstat(f->fd, &f->last_stat) < 0) {
1655                         r = -errno;
1656                         goto fail;
1657                 }
1658         }
1659
1660         if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1661                 r = -EIO;
1662                 goto fail;
1663         }
1664
1665         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1666         if (f->header == MAP_FAILED) {
1667                 f->header = NULL;
1668                 r = -errno;
1669                 goto fail;
1670         }
1671
1672         if (!newly_created) {
1673                 r = journal_file_verify_header(f);
1674                 if (r < 0)
1675                         goto fail;
1676         }
1677
1678         if (f->writable) {
1679                 r = journal_file_refresh_header(f);
1680                 if (r < 0)
1681                         goto fail;
1682         }
1683
1684         if (newly_created) {
1685
1686                 r = journal_file_setup_field_hash_table(f);
1687                 if (r < 0)
1688                         goto fail;
1689
1690                 r = journal_file_setup_data_hash_table(f);
1691                 if (r < 0)
1692                         goto fail;
1693         }
1694
1695         r = journal_file_map_field_hash_table(f);
1696         if (r < 0)
1697                 goto fail;
1698
1699         r = journal_file_map_data_hash_table(f);
1700         if (r < 0)
1701                 goto fail;
1702
1703         if (ret)
1704                 *ret = f;
1705
1706         return 0;
1707
1708 fail:
1709         journal_file_close(f);
1710
1711         return r;
1712 }
1713
1714 int journal_file_rotate(JournalFile **f) {
1715         char *p;
1716         size_t l;
1717         JournalFile *old_file, *new_file = NULL;
1718         int r;
1719
1720         assert(f);
1721         assert(*f);
1722
1723         old_file = *f;
1724
1725         if (!old_file->writable)
1726                 return -EINVAL;
1727
1728         if (!endswith(old_file->path, ".journal"))
1729                 return -EINVAL;
1730
1731         l = strlen(old_file->path);
1732
1733         p = new(char, l + 1 + 16 + 1 + 32 + 1 + 16 + 1);
1734         if (!p)
1735                 return -ENOMEM;
1736
1737         memcpy(p, old_file->path, l - 8);
1738         p[l-8] = '@';
1739         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1740         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1741                  "-%016llx-%016llx.journal",
1742                  (unsigned long long) le64toh((*f)->header->seqnum),
1743                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1744
1745         r = rename(old_file->path, p);
1746         free(p);
1747
1748         if (r < 0)
1749                 return -errno;
1750
1751         old_file->header->state = le32toh(STATE_ARCHIVED);
1752
1753         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1754         journal_file_close(old_file);
1755
1756         *f = new_file;
1757         return r;
1758 }
1759
1760 struct vacuum_info {
1761         off_t usage;
1762         char *filename;
1763
1764         uint64_t realtime;
1765         sd_id128_t seqnum_id;
1766         uint64_t seqnum;
1767 };
1768
1769 static int vacuum_compare(const void *_a, const void *_b) {
1770         const struct vacuum_info *a, *b;
1771
1772         a = _a;
1773         b = _b;
1774
1775         if (sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1776                 if (a->seqnum < b->seqnum)
1777                         return -1;
1778                 else if (a->seqnum > b->seqnum)
1779                         return 1;
1780                 else
1781                         return 0;
1782         }
1783
1784         if (a->realtime < b->realtime)
1785                 return -1;
1786         else if (a->realtime > b->realtime)
1787                 return 1;
1788         else
1789                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1790 }
1791
1792 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1793         DIR *d;
1794         int r = 0;
1795         struct vacuum_info *list = NULL;
1796         unsigned n_list = 0, n_allocated = 0, i;
1797         uint64_t sum = 0;
1798
1799         assert(directory);
1800
1801         if (max_use <= 0)
1802                 max_use = DEFAULT_MAX_USE;
1803
1804         d = opendir(directory);
1805         if (!d)
1806                 return -errno;
1807
1808         for (;;) {
1809                 int k;
1810                 struct dirent buf, *de;
1811                 size_t q;
1812                 struct stat st;
1813                 char *p;
1814                 unsigned long long seqnum, realtime;
1815                 sd_id128_t seqnum_id;
1816
1817                 k = readdir_r(d, &buf, &de);
1818                 if (k != 0) {
1819                         r = -k;
1820                         goto finish;
1821                 }
1822
1823                 if (!de)
1824                         break;
1825
1826                 if (!dirent_is_file_with_suffix(de, ".journal"))
1827                         continue;
1828
1829                 q = strlen(de->d_name);
1830
1831                 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
1832                         continue;
1833
1834                 if (de->d_name[q-8-16-1] != '-' ||
1835                     de->d_name[q-8-16-1-16-1] != '-' ||
1836                     de->d_name[q-8-16-1-16-1-32-1] != '@')
1837                         continue;
1838
1839                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
1840                         continue;
1841
1842                 if (!S_ISREG(st.st_mode))
1843                         continue;
1844
1845                 p = strdup(de->d_name);
1846                 if (!p) {
1847                         r = -ENOMEM;
1848                         goto finish;
1849                 }
1850
1851                 de->d_name[q-8-16-1-16-1] = 0;
1852                 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
1853                         free(p);
1854                         continue;
1855                 }
1856
1857                 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
1858                         free(p);
1859                         continue;
1860                 }
1861
1862                 if (n_list >= n_allocated) {
1863                         struct vacuum_info *j;
1864
1865                         n_allocated = MAX(n_allocated * 2U, 8U);
1866                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
1867                         if (!j) {
1868                                 free(p);
1869                                 r = -ENOMEM;
1870                                 goto finish;
1871                         }
1872
1873                         list = j;
1874                 }
1875
1876                 list[n_list].filename = p;
1877                 list[n_list].usage = (uint64_t) st.st_blksize * (uint64_t) st.st_blocks;
1878                 list[n_list].seqnum = seqnum;
1879                 list[n_list].realtime = realtime;
1880                 list[n_list].seqnum_id = seqnum_id;
1881
1882                 sum += list[n_list].usage;
1883
1884                 n_list ++;
1885         }
1886
1887         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
1888
1889         for(i = 0; i < n_list; i++) {
1890                 struct statvfs ss;
1891
1892                 if (fstatvfs(dirfd(d), &ss) < 0) {
1893                         r = -errno;
1894                         goto finish;
1895                 }
1896
1897                 if (sum <= max_use &&
1898                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
1899                         break;
1900
1901                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
1902                         log_debug("Deleted archived journal %s/%s.", directory, list[i].filename);
1903                         sum -= list[i].usage;
1904                 } else if (errno != ENOENT)
1905                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
1906         }
1907
1908 finish:
1909         for (i = 0; i < n_list; i++)
1910                 free(list[i].filename);
1911
1912         free(list);
1913
1914         if (d)
1915                 closedir(d);
1916
1917         return r;
1918 }