chiark / gitweb /
5557028147eb66fdbe6084a77de88cfd2ad47208
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33
34 #define DEFAULT_ARENA_MAX_SIZE (16ULL*1024ULL*1024ULL*1024ULL)
35 #define DEFAULT_ARENA_MIN_SIZE (256ULL*1024ULL)
36 #define DEFAULT_ARENA_KEEP_FREE (1ULL*1024ULL*1024ULL)
37
38 #define DEFAULT_MAX_USE (16ULL*1024ULL*1024ULL*16ULL)
39
40 #define DEFAULT_HASH_TABLE_SIZE (2047ULL*16ULL)
41 #define DEFAULT_BISECT_TABLE_SIZE ((DEFAULT_ARENA_MAX_SIZE/(64ULL*1024ULL))*8ULL)
42
43 #define DEFAULT_WINDOW_SIZE (128ULL*1024ULL*1024ULL)
44
45 static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
46
47 #define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
48
49 void journal_file_close(JournalFile *f) {
50         assert(f);
51
52         if (f->header) {
53                 if (f->writable && f->header->state == htole32(STATE_ONLINE))
54                         f->header->state = htole32(STATE_OFFLINE);
55
56                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
57         }
58
59         if (f->hash_table_window)
60                 munmap(f->hash_table_window, f->hash_table_window_size);
61
62         if (f->bisect_table_window)
63                 munmap(f->bisect_table_window, f->bisect_table_window_size);
64
65         if (f->window)
66                 munmap(f->window, f->window_size);
67
68         if (f->fd >= 0)
69                 close_nointr_nofail(f->fd);
70
71         free(f->path);
72         free(f);
73 }
74
75 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
76         Header h;
77         ssize_t k;
78         int r;
79
80         assert(f);
81
82         zero(h);
83         memcpy(h.signature, signature, 8);
84         h.arena_offset = htole64(ALIGN64(sizeof(h)));
85         h.arena_max_size = htole64(DEFAULT_ARENA_MAX_SIZE);
86         h.arena_min_size = htole64(DEFAULT_ARENA_MIN_SIZE);
87         h.arena_keep_free = htole64(DEFAULT_ARENA_KEEP_FREE);
88
89         r = sd_id128_randomize(&h.file_id);
90         if (r < 0)
91                 return r;
92
93         if (template) {
94                 h.seqnum_id = template->header->seqnum_id;
95                 h.seqnum = template->header->seqnum;
96         } else
97                 h.seqnum_id = h.file_id;
98
99         k = pwrite(f->fd, &h, sizeof(h), 0);
100         if (k < 0)
101                 return -errno;
102
103         if (k != sizeof(h))
104                 return -EIO;
105
106         return 0;
107 }
108
109 static int journal_file_refresh_header(JournalFile *f) {
110         int r;
111
112         assert(f);
113
114         r = sd_id128_get_machine(&f->header->machine_id);
115         if (r < 0)
116                 return r;
117
118         r = sd_id128_get_boot(&f->header->boot_id);
119         if (r < 0)
120                 return r;
121
122         f->header->state = htole32(STATE_ONLINE);
123         return 0;
124 }
125
126 static int journal_file_verify_header(JournalFile *f) {
127         assert(f);
128
129         if (memcmp(f->header, signature, 8))
130                 return -EBADMSG;
131
132         if (f->header->incompatible_flags != 0)
133                 return -EPROTONOSUPPORT;
134
135         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
136                 return -ENODATA;
137
138         if (f->writable) {
139                 uint32_t state;
140                 sd_id128_t machine_id;
141                 int r;
142
143                 r = sd_id128_get_machine(&machine_id);
144                 if (r < 0)
145                         return r;
146
147                 if (!sd_id128_equal(machine_id, f->header->machine_id))
148                         return -EHOSTDOWN;
149
150                 state = le32toh(f->header->state);
151
152                 if (state == STATE_ONLINE)
153                         log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
154                 else if (state == STATE_ARCHIVED)
155                         return -ESHUTDOWN;
156                 else if (state != STATE_OFFLINE)
157                         log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
158         }
159
160         return 0;
161 }
162
163 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
164         uint64_t asize;
165         uint64_t old_size, new_size;
166
167         assert(f);
168
169         if (offset < le64toh(f->header->arena_offset))
170                 return -EINVAL;
171
172         new_size = PAGE_ALIGN(offset + size);
173
174         /* We assume that this file is not sparse, and we know that
175          * for sure, since we always call posix_fallocate()
176          * ourselves */
177
178         old_size =
179                 le64toh(f->header->arena_offset) +
180                 le64toh(f->header->arena_size);
181
182         if (old_size >= new_size)
183                 return 0;
184
185         asize = new_size - le64toh(f->header->arena_offset);
186
187         if (asize > le64toh(f->header->arena_min_size)) {
188                 struct statvfs svfs;
189
190                 if (fstatvfs(f->fd, &svfs) >= 0) {
191                         uint64_t available;
192
193                         available = svfs.f_bfree * svfs.f_bsize;
194
195                         if (available >= f->header->arena_keep_free)
196                                 available -= f->header->arena_keep_free;
197                         else
198                                 available = 0;
199
200                         if (new_size - old_size > available)
201                                 return -E2BIG;
202                 }
203         }
204
205         if (asize > le64toh(f->header->arena_max_size))
206                 return -E2BIG;
207
208         if (posix_fallocate(f->fd, old_size, new_size - old_size) < 0)
209                 return -errno;
210
211         if (fstat(f->fd, &f->last_stat) < 0)
212                 return -errno;
213
214         f->header->arena_size = htole64(asize);
215
216         return 0;
217 }
218
219 static int journal_file_map(
220                 JournalFile *f,
221                 uint64_t offset,
222                 uint64_t size,
223                 void **_window,
224                 uint64_t *_woffset,
225                 uint64_t *_wsize,
226                 void **ret) {
227
228         uint64_t woffset, wsize;
229         void *window;
230
231         assert(f);
232         assert(size > 0);
233         assert(ret);
234
235         woffset = offset & ~((uint64_t) page_size() - 1ULL);
236         wsize = size + (offset - woffset);
237         wsize = PAGE_ALIGN(wsize);
238
239         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
240         if (window == MAP_FAILED)
241                 return -errno;
242
243         if (_window)
244                 *_window = window;
245
246         if (_woffset)
247                 *_woffset = woffset;
248
249         if (_wsize)
250                 *_wsize = wsize;
251
252         *ret = (uint8_t*) window + (offset - woffset);
253
254         return 0;
255 }
256
257 static int journal_file_move_to(JournalFile *f, uint64_t offset, uint64_t size, void **ret) {
258         void *p;
259         uint64_t delta;
260         int r;
261
262         assert(f);
263         assert(ret);
264
265         if (_likely_(f->window &&
266                      f->window_offset <= offset &&
267                      f->window_offset+f->window_size >= offset + size)) {
268
269                 *ret = (uint8_t*) f->window + (offset - f->window_offset);
270                 return 0;
271         }
272
273         if (f->window) {
274                 if (munmap(f->window, f->window_size) < 0)
275                         return -errno;
276
277                 f->window = NULL;
278                 f->window_size = f->window_offset = 0;
279         }
280
281         if (size < DEFAULT_WINDOW_SIZE) {
282                 /* If the default window size is larger then what was
283                  * asked for extend the mapping a bit in the hope to
284                  * minimize needed remappings later on. We add half
285                  * the window space before and half behind the
286                  * requested mapping */
287
288                 delta = PAGE_ALIGN((DEFAULT_WINDOW_SIZE - size) / 2);
289
290                 if (offset < delta)
291                         delta = offset;
292
293                 offset -= delta;
294                 size += (DEFAULT_WINDOW_SIZE - delta);
295         } else
296                 delta = 0;
297
298         r = journal_file_map(f,
299                              offset, size,
300                              &f->window, &f->window_offset, &f->window_size,
301                              & p);
302
303         if (r < 0)
304                 return r;
305
306         *ret = (uint8_t*) p + delta;
307         return 0;
308 }
309
310 static bool verify_hash(Object *o) {
311         uint64_t t;
312
313         assert(o);
314
315         t = le64toh(o->object.type);
316         if (t == OBJECT_DATA) {
317                 uint64_t s, h1, h2;
318
319                 s = le64toh(o->object.size);
320
321                 h1 = le64toh(o->data.hash);
322                 h2 = hash64(o->data.payload, s - offsetof(Object, data.payload));
323
324                 return h1 == h2;
325         }
326
327         return true;
328 }
329
330 int journal_file_move_to_object(JournalFile *f, uint64_t offset, int type, Object **ret) {
331         int r;
332         void *t;
333         Object *o;
334         uint64_t s;
335
336         assert(f);
337         assert(ret);
338
339         r = journal_file_move_to(f, offset, sizeof(ObjectHeader), &t);
340         if (r < 0)
341                 return r;
342
343         o = (Object*) t;
344         s = le64toh(o->object.size);
345
346         if (s < sizeof(ObjectHeader))
347                 return -EBADMSG;
348
349         if (type >= 0 && le64toh(o->object.type) != type)
350                 return -EBADMSG;
351
352         if (s > sizeof(ObjectHeader)) {
353                 r = journal_file_move_to(f, offset, s, &t);
354                 if (r < 0)
355                         return r;
356
357                 o = (Object*) t;
358         }
359
360         if (!verify_hash(o))
361                 return -EBADMSG;
362
363         *ret = o;
364         return 0;
365 }
366
367 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
368         uint64_t r;
369
370         assert(f);
371
372         r = le64toh(f->header->seqnum) + 1;
373
374         if (seqnum) {
375                 /* If an external seqno counter was passed, we update
376                  * both the local and the external one, and set it to
377                  * the maximum of both */
378
379                 if (*seqnum + 1 > r)
380                         r = *seqnum + 1;
381
382                 *seqnum = r;
383         }
384
385         f->header->seqnum = htole64(r);
386
387         return r;
388 }
389
390 static int journal_file_append_object(JournalFile *f, uint64_t size, Object **ret, uint64_t *offset) {
391         int r;
392         uint64_t p;
393         Object *tail, *o;
394         void *t;
395
396         assert(f);
397         assert(size >= sizeof(ObjectHeader));
398         assert(offset);
399         assert(ret);
400
401         p = le64toh(f->header->tail_object_offset);
402
403         if (p == 0)
404                 p = le64toh(f->header->arena_offset);
405         else {
406                 r = journal_file_move_to_object(f, p, -1, &tail);
407                 if (r < 0)
408                         return r;
409
410                 p += ALIGN64(le64toh(tail->object.size));
411         }
412
413         r = journal_file_allocate(f, p, size);
414         if (r < 0)
415                 return r;
416
417         r = journal_file_move_to(f, p, size, &t);
418         if (r < 0)
419                 return r;
420
421         o = (Object*) t;
422
423         zero(o->object);
424         o->object.type = htole64(OBJECT_UNUSED);
425         zero(o->object.reserved);
426         o->object.size = htole64(size);
427
428         f->header->tail_object_offset = htole64(p);
429         if (f->header->head_object_offset == 0)
430                 f->header->head_object_offset = htole64(p);
431
432         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
433
434         *ret = o;
435         *offset = p;
436
437         return 0;
438 }
439
440 static int journal_file_setup_hash_table(JournalFile *f) {
441         uint64_t s, p;
442         Object *o;
443         int r;
444
445         assert(f);
446
447         s = DEFAULT_HASH_TABLE_SIZE;
448         r = journal_file_append_object(f, offsetof(Object, hash_table.table) + s, &o, &p);
449         if (r < 0)
450                 return r;
451
452         o->object.type = htole64(OBJECT_HASH_TABLE);
453         memset(o->hash_table.table, 0, s);
454
455         f->header->hash_table_offset = htole64(p + offsetof(Object, hash_table.table));
456         f->header->hash_table_size = htole64(s);
457
458         return 0;
459 }
460
461 static int journal_file_setup_bisect_table(JournalFile *f) {
462         uint64_t s, p;
463         Object *o;
464         int r;
465
466         assert(f);
467
468         s = DEFAULT_BISECT_TABLE_SIZE;
469         r = journal_file_append_object(f, offsetof(Object, bisect_table.table) + s, &o, &p);
470         if (r < 0)
471                 return r;
472
473         o->object.type = htole64(OBJECT_BISECT_TABLE);
474         memset(o->bisect_table.table, 0, s);
475
476         f->header->bisect_table_offset = htole64(p + offsetof(Object, bisect_table.table));
477         f->header->bisect_table_size = htole64(s);
478
479         return 0;
480 }
481
482 static int journal_file_map_hash_table(JournalFile *f) {
483         uint64_t s, p;
484         void *t;
485         int r;
486
487         assert(f);
488
489         p = le64toh(f->header->hash_table_offset);
490         s = le64toh(f->header->hash_table_size);
491
492         r = journal_file_map(f,
493                              p, s,
494                              &f->hash_table_window, NULL, &f->hash_table_window_size,
495                              &t);
496         if (r < 0)
497                 return r;
498
499         f->hash_table = t;
500         return 0;
501 }
502
503 static int journal_file_map_bisect_table(JournalFile *f) {
504         uint64_t s, p;
505         void *t;
506         int r;
507
508         assert(f);
509
510         p = le64toh(f->header->bisect_table_offset);
511         s = le64toh(f->header->bisect_table_size);
512
513         r = journal_file_map(f,
514                              p, s,
515                              &f->bisect_table_window, NULL, &f->bisect_table_window_size,
516                              &t);
517
518         if (r < 0)
519                 return r;
520
521         f->bisect_table = t;
522         return 0;
523 }
524
525 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash_index) {
526         uint64_t p;
527         int r;
528
529         assert(f);
530         assert(o);
531         assert(offset > 0);
532         assert(o->object.type == htole64(OBJECT_DATA));
533
534         o->data.head_entry_offset = o->data.tail_entry_offset = 0;
535         o->data.next_hash_offset = 0;
536
537         p = le64toh(f->hash_table[hash_index].tail_hash_offset);
538         if (p == 0) {
539                 /* Only entry in the hash table is easy */
540
541                 o->data.prev_hash_offset = 0;
542                 f->hash_table[hash_index].head_hash_offset = htole64(offset);
543         } else {
544                 o->data.prev_hash_offset = htole64(p);
545
546                 /* Temporarily move back to the previous data object,
547                  * to patch in pointer */
548
549                 r = journal_file_move_to_object(f, p, OBJECT_DATA, &o);
550                 if (r < 0)
551                         return r;
552
553                 o->data.next_hash_offset = offset;
554
555                 r = journal_file_move_to_object(f, offset, OBJECT_DATA, &o);
556                 if (r < 0)
557                         return r;
558         }
559
560         f->hash_table[hash_index].tail_hash_offset = htole64(offset);
561
562         return 0;
563 }
564
565 static int journal_file_append_data(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
566         uint64_t hash, h, p, np;
567         uint64_t osize;
568         Object *o;
569         int r;
570
571         assert(f);
572         assert(data || size == 0);
573
574         osize = offsetof(Object, data.payload) + size;
575
576         hash = hash64(data, size);
577         h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
578         p = le64toh(f->hash_table[h].head_hash_offset);
579
580         while (p != 0) {
581                 /* Look for this data object in the hash table */
582
583                 r = journal_file_move_to_object(f, p, OBJECT_DATA, &o);
584                 if (r < 0)
585                         return r;
586
587                 if (le64toh(o->object.size) == osize &&
588                     memcmp(o->data.payload, data, size) == 0) {
589
590                         if (le64toh(o->data.hash) != hash)
591                                 return -EBADMSG;
592
593                         if (ret)
594                                 *ret = o;
595
596                         if (offset)
597                                 *offset = p;
598
599                         return 0;
600                 }
601
602                 p = le64toh(o->data.next_hash_offset);
603         }
604
605         r = journal_file_append_object(f, osize, &o, &np);
606         if (r < 0)
607                 return r;
608
609         o->object.type = htole64(OBJECT_DATA);
610         o->data.hash = htole64(hash);
611         memcpy(o->data.payload, data, size);
612
613         r = journal_file_link_data(f, o, np, h);
614         if (r < 0)
615                 return r;
616
617         if (ret)
618                 *ret = o;
619
620         if (offset)
621                 *offset = np;
622
623         return 0;
624 }
625
626 uint64_t journal_file_entry_n_items(Object *o) {
627         assert(o);
628         assert(o->object.type == htole64(OBJECT_ENTRY));
629
630         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
631 }
632
633 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
634         uint64_t p, q;
635         int r;
636         assert(f);
637         assert(o);
638         assert(offset > 0);
639
640         p = le64toh(o->entry.items[i].object_offset);
641         if (p == 0)
642                 return -EINVAL;
643
644         o->entry.items[i].next_entry_offset = 0;
645
646         /* Move to the data object */
647         r = journal_file_move_to_object(f, p, OBJECT_DATA, &o);
648         if (r < 0)
649                 return r;
650
651         q = le64toh(o->data.tail_entry_offset);
652         o->data.tail_entry_offset = htole64(offset);
653
654         if (q == 0)
655                 o->data.head_entry_offset = htole64(offset);
656         else {
657                 uint64_t n, j;
658
659                 /* Move to previous entry */
660                 r = journal_file_move_to_object(f, q, OBJECT_ENTRY, &o);
661                 if (r < 0)
662                         return r;
663
664                 n = journal_file_entry_n_items(o);
665                 for (j = 0; j < n; j++)
666                         if (le64toh(o->entry.items[j].object_offset) == p)
667                                 break;
668
669                 if (j >= n)
670                         return -EBADMSG;
671
672                 o->entry.items[j].next_entry_offset = offset;
673         }
674
675         /* Move back to original entry */
676         r = journal_file_move_to_object(f, offset, OBJECT_ENTRY, &o);
677         if (r < 0)
678                 return r;
679
680         o->entry.items[i].prev_entry_offset = q;
681         return 0;
682 }
683
684 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
685         uint64_t p, i, n, k, a, b;
686         int r;
687
688         assert(f);
689         assert(o);
690         assert(offset > 0);
691         assert(o->object.type == htole64(OBJECT_ENTRY));
692
693         /* Link up the entry itself */
694         p = le64toh(f->header->tail_entry_offset);
695
696         o->entry.prev_entry_offset = f->header->tail_entry_offset;
697         o->entry.next_entry_offset = 0;
698
699         if (p == 0) {
700                 f->header->head_entry_offset = htole64(offset);
701                 f->header->head_entry_realtime = o->entry.realtime;
702         } else {
703                 /* Temporarily move back to the previous entry, to
704                  * patch in pointer */
705
706                 r = journal_file_move_to_object(f, p, OBJECT_ENTRY, &o);
707                 if (r < 0)
708                         return r;
709
710                 o->entry.next_entry_offset = htole64(offset);
711
712                 r = journal_file_move_to_object(f, offset, OBJECT_ENTRY, &o);
713                 if (r < 0)
714                         return r;
715         }
716
717         f->header->tail_entry_offset = htole64(offset);
718         f->header->tail_entry_realtime = o->entry.realtime;
719
720         /* Link up the items */
721         n = journal_file_entry_n_items(o);
722         for (i = 0; i < n; i++) {
723                 r = journal_file_link_entry_item(f, o, offset, i);
724                 if (r < 0)
725                         return r;
726         }
727
728         /* Link up the entry in the bisect table */
729         n = le64toh(f->header->bisect_table_size) / sizeof(uint64_t);
730         k = le64toh(f->header->arena_max_size) / n;
731
732         a = (le64toh(f->header->last_bisect_offset) + k - 1) / k;
733         b = offset / k;
734
735         for (; a <= b; a++)
736                 f->bisect_table[a] = htole64(offset);
737
738         f->header->last_bisect_offset = htole64(offset + le64toh(o->object.size));
739
740         return 0;
741 }
742
743 static int journal_file_append_entry_internal(
744                 JournalFile *f,
745                 const dual_timestamp *ts,
746                 uint64_t xor_hash,
747                 const EntryItem items[], unsigned n_items,
748                 uint64_t *seqno,
749                 Object **ret, uint64_t *offset) {
750         uint64_t np;
751         uint64_t osize;
752         Object *o;
753         int r;
754
755         assert(f);
756         assert(items || n_items == 0);
757
758         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
759
760         r = journal_file_append_object(f, osize, &o, &np);
761         if (r < 0)
762                 return r;
763
764         o->object.type = htole64(OBJECT_ENTRY);
765         o->entry.seqnum = htole64(journal_file_seqnum(f, seqno));
766         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
767         o->entry.realtime = htole64(ts ? ts->realtime : now(CLOCK_REALTIME));
768         o->entry.monotonic = htole64(ts ? ts->monotonic : now(CLOCK_MONOTONIC));
769         o->entry.xor_hash = htole64(xor_hash);
770         o->entry.boot_id = f->header->boot_id;
771
772         r = journal_file_link_entry(f, o, np);
773         if (r < 0)
774                 return r;
775
776         if (ret)
777                 *ret = o;
778
779         if (offset)
780                 *offset = np;
781
782         return 0;
783 }
784
785 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqno, Object **ret, uint64_t *offset) {
786         unsigned i;
787         EntryItem *items;
788         int r;
789         uint64_t xor_hash = 0;
790
791         assert(f);
792         assert(iovec || n_iovec == 0);
793
794         items = new(EntryItem, n_iovec);
795         if (!items)
796                 return -ENOMEM;
797
798         for (i = 0; i < n_iovec; i++) {
799                 uint64_t p;
800                 Object *o;
801
802                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
803                 if (r < 0)
804                         goto finish;
805
806                 xor_hash ^= le64toh(o->data.hash);
807                 items[i].object_offset = htole64(p);
808                 items[i].hash = o->data.hash;
809         }
810
811         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqno, ret, offset);
812
813 finish:
814         free(items);
815
816         return r;
817 }
818
819 int journal_file_move_to_entry(JournalFile *f, uint64_t seqnum, Object **ret, uint64_t *offset) {
820         Object *o;
821         uint64_t lower, upper, p, n, k;
822         int r;
823
824         assert(f);
825
826         n = le64toh(f->header->bisect_table_size) / sizeof(uint64_t);
827         k = le64toh(f->header->arena_max_size) / n;
828
829         lower = 0;
830         upper = le64toh(f->header->last_bisect_offset)/k+1;
831
832         while (lower < upper) {
833                 k = (upper + lower) / 2;
834                 p = le64toh(f->bisect_table[k]);
835
836                 if (p == 0) {
837                         upper = k;
838                         continue;
839                 }
840
841                 r = journal_file_move_to_object(f, p, OBJECT_ENTRY, &o);
842                 if (r < 0)
843                         return r;
844
845                 if (o->entry.seqnum == seqnum) {
846                         if (ret)
847                                 *ret = o;
848
849                         if (offset)
850                                 *offset = p;
851
852                         return 1;
853                 } else if (seqnum < o->entry.seqnum)
854                         upper = k;
855                 else if (seqnum > o->entry.seqnum)
856                         lower = k+1;
857         }
858
859         assert(lower == upper);
860
861         if (lower <= 0)
862                 return 0;
863
864         /* The object we are looking for is between
865          * bisect_table[lower-1] and bisect_table[lower] */
866
867         p = le64toh(f->bisect_table[lower-1]);
868
869         for (;;) {
870                 r = journal_file_move_to_object(f, p, OBJECT_ENTRY, &o);
871                 if (r < 0)
872                         return r;
873
874                 if (o->entry.seqnum == seqnum) {
875                         if (ret)
876                                 *ret = o;
877
878                         if (offset)
879                                 *offset = p;
880
881                         return 1;
882
883                 } if (seqnum < o->entry.seqnum)
884                         return 0;
885
886                 if (o->entry.next_entry_offset == 0)
887                         return 0;
888
889                 p = le64toh(o->entry.next_entry_offset);
890         }
891
892         return 0;
893 }
894
895 int journal_file_next_entry(JournalFile *f, Object *o, Object **ret, uint64_t *offset) {
896         uint64_t np;
897         int r;
898
899         assert(f);
900
901         if (!o)
902                 np = le64toh(f->header->head_entry_offset);
903         else {
904                 if (le64toh(o->object.type) != OBJECT_ENTRY)
905                         return -EINVAL;
906
907                 np = le64toh(o->entry.next_entry_offset);
908         }
909
910         if (np == 0)
911                 return 0;
912
913         r = journal_file_move_to_object(f, np, OBJECT_ENTRY, &o);
914         if (r < 0)
915                 return r;
916
917         if (ret)
918                 *ret = o;
919
920         if (offset)
921                 *offset = np;
922
923         return 1;
924 }
925
926 int journal_file_prev_entry(JournalFile *f, Object *o, Object **ret, uint64_t *offset) {
927         uint64_t np;
928         int r;
929
930         assert(f);
931
932         if (!o)
933                 np = le64toh(f->header->tail_entry_offset);
934         else {
935                 if (le64toh(o->object.type) != OBJECT_ENTRY)
936                         return -EINVAL;
937
938                 np = le64toh(o->entry.prev_entry_offset);
939         }
940
941         if (np == 0)
942                 return 0;
943
944         r = journal_file_move_to_object(f, np, OBJECT_ENTRY, &o);
945         if (r < 0)
946                 return r;
947
948         if (ret)
949                 *ret = o;
950
951         if (offset)
952                 *offset = np;
953
954         return 1;
955 }
956
957 int journal_file_find_first_entry(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
958         uint64_t p, osize, hash, h;
959         int r;
960
961         assert(f);
962         assert(data || size == 0);
963
964         osize = offsetof(Object, data.payload) + size;
965
966         hash = hash64(data, size);
967         h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
968         p = le64toh(f->hash_table[h].head_hash_offset);
969
970         while (p != 0) {
971                 Object *o;
972
973                 r = journal_file_move_to_object(f, p, OBJECT_DATA, &o);
974                 if (r < 0)
975                         return r;
976
977                 if (le64toh(o->object.size) == osize &&
978                     memcmp(o->data.payload, data, size) == 0) {
979
980                         if (le64toh(o->data.hash) != hash)
981                                 return -EBADMSG;
982
983                         if (o->data.head_entry_offset == 0)
984                                 return 0;
985
986                         p = le64toh(o->data.head_entry_offset);
987                         r = journal_file_move_to_object(f, p, OBJECT_ENTRY, &o);
988                         if (r < 0)
989                                 return r;
990
991                         if (ret)
992                                 *ret = o;
993
994                         if (offset)
995                                 *offset = p;
996
997                         return 1;
998                 }
999
1000                 p = le64toh(o->data.next_hash_offset);
1001         }
1002
1003         return 0;
1004 }
1005
1006 int journal_file_find_last_entry(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
1007         uint64_t p, osize, hash, h;
1008         int r;
1009
1010         assert(f);
1011         assert(data || size == 0);
1012
1013         osize = offsetof(Object, data.payload) + size;
1014
1015         hash = hash64(data, size);
1016         h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
1017         p = le64toh(f->hash_table[h].tail_hash_offset);
1018
1019         while (p != 0) {
1020                 Object *o;
1021
1022                 r = journal_file_move_to_object(f, p, OBJECT_DATA, &o);
1023                 if (r < 0)
1024                         return r;
1025
1026                 if (le64toh(o->object.size) == osize &&
1027                     memcmp(o->data.payload, data, size) == 0) {
1028
1029                         if (le64toh(o->data.hash) != hash)
1030                                 return -EBADMSG;
1031
1032                         if (o->data.tail_entry_offset == 0)
1033                                 return 0;
1034
1035                         p = le64toh(o->data.tail_entry_offset);
1036                         r = journal_file_move_to_object(f, p, OBJECT_ENTRY, &o);
1037                         if (r < 0)
1038                                 return r;
1039
1040                         if (ret)
1041                                 *ret = o;
1042
1043                         if (offset)
1044                                 *offset = p;
1045
1046                         return 1;
1047                 }
1048
1049                 p = le64toh(o->data.prev_hash_offset);
1050         }
1051
1052         return 0;
1053 }
1054
1055 void journal_file_dump(JournalFile *f) {
1056         char a[33], b[33], c[33];
1057         Object *o;
1058         int r;
1059         uint64_t p;
1060
1061         assert(f);
1062
1063         printf("File ID: %s\n"
1064                "Machine ID: %s\n"
1065                "Boot ID: %s\n"
1066                "Arena size: %llu\n",
1067                sd_id128_to_string(f->header->file_id, a),
1068                sd_id128_to_string(f->header->machine_id, b),
1069                sd_id128_to_string(f->header->boot_id, c),
1070                (unsigned long long) le64toh(f->header->arena_size));
1071
1072         p = le64toh(f->header->head_object_offset);
1073         while (p != 0) {
1074                 r = journal_file_move_to_object(f, p, -1, &o);
1075                 if (r < 0)
1076                         goto fail;
1077
1078                 switch (o->object.type) {
1079
1080                 case OBJECT_UNUSED:
1081                         printf("Type: OBJECT_UNUSED\n");
1082                         break;
1083
1084                 case OBJECT_DATA:
1085                         printf("Type: OBJECT_DATA\n");
1086                         break;
1087
1088                 case OBJECT_ENTRY:
1089                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1090                                (unsigned long long) le64toh(o->entry.seqnum),
1091                                (unsigned long long) le64toh(o->entry.monotonic),
1092                                (unsigned long long) le64toh(o->entry.realtime));
1093                         break;
1094
1095                 case OBJECT_HASH_TABLE:
1096                         printf("Type: OBJECT_HASH_TABLE\n");
1097                         break;
1098
1099                 case OBJECT_BISECT_TABLE:
1100                         printf("Type: OBJECT_BISECT_TABLE\n");
1101                         break;
1102                 }
1103
1104                 if (p == le64toh(f->header->tail_object_offset))
1105                         p = 0;
1106                 else
1107                         p = p + ALIGN64(le64toh(o->object.size));
1108         }
1109
1110         return;
1111 fail:
1112         log_error("File corrupt");
1113 }
1114
1115 int journal_file_open(
1116                 const char *fname,
1117                 int flags,
1118                 mode_t mode,
1119                 JournalFile *template,
1120                 JournalFile **ret) {
1121
1122         JournalFile *f;
1123         int r;
1124         bool newly_created = false;
1125
1126         assert(fname);
1127
1128         if ((flags & O_ACCMODE) != O_RDONLY &&
1129             (flags & O_ACCMODE) != O_RDWR)
1130                 return -EINVAL;
1131
1132         f = new0(JournalFile, 1);
1133         if (!f)
1134                 return -ENOMEM;
1135
1136         f->fd = -1;
1137         f->flags = flags;
1138         f->mode = mode;
1139         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1140         f->prot = prot_from_flags(flags);
1141
1142         f->path = strdup(fname);
1143         if (!f->path) {
1144                 r = -ENOMEM;
1145                 goto fail;
1146         }
1147
1148         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1149         if (f->fd < 0) {
1150                 r = -errno;
1151                 goto fail;
1152         }
1153
1154         if (fstat(f->fd, &f->last_stat) < 0) {
1155                 r = -errno;
1156                 goto fail;
1157         }
1158
1159         if (f->last_stat.st_size == 0 && f->writable) {
1160                 newly_created = true;
1161
1162                 r = journal_file_init_header(f, template);
1163                 if (r < 0)
1164                         goto fail;
1165
1166                 if (fstat(f->fd, &f->last_stat) < 0) {
1167                         r = -errno;
1168                         goto fail;
1169                 }
1170         }
1171
1172         if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1173                 r = -EIO;
1174                 goto fail;
1175         }
1176
1177         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1178         if (f->header == MAP_FAILED) {
1179                 f->header = NULL;
1180                 r = -errno;
1181                 goto fail;
1182         }
1183
1184         if (!newly_created) {
1185                 r = journal_file_verify_header(f);
1186                 if (r < 0)
1187                         goto fail;
1188         }
1189
1190         if (f->writable) {
1191                 r = journal_file_refresh_header(f);
1192                 if (r < 0)
1193                         goto fail;
1194         }
1195
1196         if (newly_created) {
1197
1198                 r = journal_file_setup_hash_table(f);
1199                 if (r < 0)
1200                         goto fail;
1201
1202                 r = journal_file_setup_bisect_table(f);
1203                 if (r < 0)
1204                         goto fail;
1205         }
1206
1207         r = journal_file_map_hash_table(f);
1208         if (r < 0)
1209                 goto fail;
1210
1211         r = journal_file_map_bisect_table(f);
1212         if (r < 0)
1213                 goto fail;
1214
1215         if (ret)
1216                 *ret = f;
1217
1218         return 0;
1219
1220 fail:
1221         journal_file_close(f);
1222
1223         return r;
1224 }
1225
1226 int journal_file_rotate(JournalFile **f) {
1227         char *p;
1228         size_t l;
1229         JournalFile *old_file, *new_file = NULL;
1230         int r;
1231
1232         assert(f);
1233         assert(*f);
1234
1235         old_file = *f;
1236
1237         if (!old_file->writable)
1238                 return -EINVAL;
1239
1240         if (!endswith(old_file->path, ".journal"))
1241                 return -EINVAL;
1242
1243         l = strlen(old_file->path);
1244
1245         p = new(char, l + 1 + 16 + 1 + 32 + 1 + 16 + 1);
1246         if (!p)
1247                 return -ENOMEM;
1248
1249         memcpy(p, old_file->path, l - 8);
1250         p[l-8] = '@';
1251         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1252         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1253                  "-%016llx-%016llx.journal",
1254                  (unsigned long long) le64toh((*f)->header->seqnum),
1255                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1256
1257         r = rename(old_file->path, p);
1258         free(p);
1259
1260         if (r < 0)
1261                 return -errno;
1262
1263         old_file->header->state = le32toh(STATE_ARCHIVED);
1264
1265         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1266         journal_file_close(old_file);
1267
1268         *f = new_file;
1269         return r;
1270 }
1271
1272 struct vacuum_info {
1273         off_t usage;
1274         char *filename;
1275
1276         uint64_t realtime;
1277         sd_id128_t seqnum_id;
1278         uint64_t seqnum;
1279 };
1280
1281 static int vacuum_compare(const void *_a, const void *_b) {
1282         const struct vacuum_info *a, *b;
1283
1284         a = _a;
1285         b = _b;
1286
1287         if (sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1288                 if (a->seqnum < b->seqnum)
1289                         return -1;
1290                 else if (a->seqnum > b->seqnum)
1291                         return 1;
1292                 else
1293                         return 0;
1294         }
1295
1296         if (a->realtime < b->realtime)
1297                 return -1;
1298         else if (a->realtime > b->realtime)
1299                 return 1;
1300         else
1301                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1302 }
1303
1304 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1305         DIR *d;
1306         int r = 0;
1307         struct vacuum_info *list = NULL;
1308         unsigned n_list = 0, n_allocated = 0, i;
1309         uint64_t sum = 0;
1310
1311         assert(directory);
1312
1313         if (max_use <= 0)
1314                 max_use = DEFAULT_MAX_USE;
1315
1316         d = opendir(directory);
1317         if (!d)
1318                 return -errno;
1319
1320         for (;;) {
1321                 int k;
1322                 struct dirent buf, *de;
1323                 size_t q;
1324                 struct stat st;
1325                 char *p;
1326                 unsigned long long seqnum, realtime;
1327                 sd_id128_t seqnum_id;
1328
1329                 k = readdir_r(d, &buf, &de);
1330                 if (k != 0) {
1331                         r = -k;
1332                         goto finish;
1333                 }
1334
1335                 if (!de)
1336                         break;
1337
1338                 if (!dirent_is_file_with_suffix(de, ".journal"))
1339                         continue;
1340
1341                 q = strlen(de->d_name);
1342
1343                 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
1344                         continue;
1345
1346                 if (de->d_name[q-8-16-1] != '-' ||
1347                     de->d_name[q-8-16-1-16-1] != '-' ||
1348                     de->d_name[q-8-16-1-16-1-32-1] != '@')
1349                         continue;
1350
1351                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
1352                         continue;
1353
1354                 if (!S_ISREG(st.st_mode))
1355                         continue;
1356
1357                 p = strdup(de->d_name);
1358                 if (!p) {
1359                         r = -ENOMEM;
1360                         goto finish;
1361                 }
1362
1363                 de->d_name[q-8-16-1-16-1] = 0;
1364                 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
1365                         free(p);
1366                         continue;
1367                 }
1368
1369                 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
1370                         free(p);
1371                         continue;
1372                 }
1373
1374                 if (n_list >= n_allocated) {
1375                         struct vacuum_info *j;
1376
1377                         n_allocated = MAX(n_allocated * 2U, 8U);
1378                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
1379                         if (!j) {
1380                                 free(p);
1381                                 r = -ENOMEM;
1382                                 goto finish;
1383                         }
1384
1385                         list = j;
1386                 }
1387
1388                 list[n_list].filename = p;
1389                 list[n_list].usage = (uint64_t) st.st_blksize * (uint64_t) st.st_blocks;
1390                 list[n_list].seqnum = seqnum;
1391                 list[n_list].realtime = realtime;
1392                 list[n_list].seqnum_id = seqnum_id;
1393
1394                 sum += list[n_list].usage;
1395
1396                 n_list ++;
1397         }
1398
1399         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
1400
1401         for(i = 0; i < n_list; i++) {
1402                 struct statvfs ss;
1403
1404                 if (fstatvfs(dirfd(d), &ss) < 0) {
1405                         r = -errno;
1406                         goto finish;
1407                 }
1408
1409                 if (sum <= max_use &&
1410                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
1411                         break;
1412
1413                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
1414                         log_debug("Deleted archived journal %s/%s.", directory, list[i].filename);
1415                         sum -= list[i].usage;
1416                 } else if (errno != ENOENT)
1417                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
1418         }
1419
1420 finish:
1421         for (i = 0; i < n_list; i++)
1422                 free(list[i].filename);
1423
1424         free(list);
1425
1426         return r;
1427 }