chiark / gitweb /
journal: synchronize seqnum across files
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33
/* Arena limits written into the header of a newly initialized file */
#define DEFAULT_ARENA_MAX_SIZE (16ULL << 30)    /* 16 GiB */
#define DEFAULT_ARENA_MIN_SIZE (256ULL << 10)   /* 256 KiB */
#define DEFAULT_ARENA_KEEP_FREE (1ULL << 20)    /* 1 MiB */

#define DEFAULT_MAX_USE (256ULL << 20)          /* 256 MiB */

/* Sizes, in bytes, of the payload of the hash and bisect table objects */
#define DEFAULT_HASH_TABLE_SIZE (2047ULL * 16ULL)
#define DEFAULT_BISECT_TABLE_SIZE ((DEFAULT_ARENA_MAX_SIZE / (64ULL << 10)) * 8ULL)

/* Size the sliding mapping window is extended to for small requests */
#define DEFAULT_WINDOW_SIZE (128ULL << 20)      /* 128 MiB */

/* Eight magic bytes found at the very start of the file header */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };

/* Rounds x up to the next multiple of 8 */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
48
49 void journal_file_close(JournalFile *f) {
50         assert(f);
51
52         if (f->header) {
53                 if (f->writable && f->header->state == htole32(STATE_ONLINE))
54                         f->header->state = htole32(STATE_OFFLINE);
55
56                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
57         }
58
59         if (f->hash_table_window)
60                 munmap(f->hash_table_window, f->hash_table_window_size);
61
62         if (f->bisect_table_window)
63                 munmap(f->bisect_table_window, f->bisect_table_window_size);
64
65         if (f->window)
66                 munmap(f->window, f->window_size);
67
68         if (f->fd >= 0)
69                 close_nointr_nofail(f->fd);
70
71         free(f->path);
72         free(f);
73 }
74
75 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
76         Header h;
77         ssize_t k;
78         int r;
79
80         assert(f);
81
82         zero(h);
83         memcpy(h.signature, signature, 8);
84         h.arena_offset = htole64(ALIGN64(sizeof(h)));
85         h.arena_max_size = htole64(DEFAULT_ARENA_MAX_SIZE);
86         h.arena_min_size = htole64(DEFAULT_ARENA_MIN_SIZE);
87         h.arena_keep_free = htole64(DEFAULT_ARENA_KEEP_FREE);
88
89         r = sd_id128_randomize(&h.file_id);
90         if (r < 0)
91                 return r;
92
93         if (template) {
94                 h.seqnum_id = template->header->seqnum_id;
95                 h.seqnum = template->header->seqnum;
96         } else
97                 h.seqnum_id = h.file_id;
98
99         k = pwrite(f->fd, &h, sizeof(h), 0);
100         if (k < 0)
101                 return -errno;
102
103         if (k != sizeof(h))
104                 return -EIO;
105
106         return 0;
107 }
108
109 static int journal_file_refresh_header(JournalFile *f) {
110         int r;
111
112         assert(f);
113
114         r = sd_id128_get_machine(&f->header->machine_id);
115         if (r < 0)
116                 return r;
117
118         r = sd_id128_get_boot(&f->header->boot_id);
119         if (r < 0)
120                 return r;
121
122         f->header->state = htole32(STATE_ONLINE);
123         return 0;
124 }
125
126 static int journal_file_verify_header(JournalFile *f) {
127         assert(f);
128
129         if (memcmp(f->header, signature, 8))
130                 return -EBADMSG;
131
132         if (f->header->incompatible_flags != 0)
133                 return -EPROTONOSUPPORT;
134
135         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
136                 return -ENODATA;
137
138         if (f->writable) {
139                 uint32_t state;
140                 sd_id128_t machine_id;
141                 int r;
142
143                 r = sd_id128_get_machine(&machine_id);
144                 if (r < 0)
145                         return r;
146
147                 if (!sd_id128_equal(machine_id, f->header->machine_id))
148                         return -EHOSTDOWN;
149
150                 state = le32toh(f->header->state);
151
152                 if (state == STATE_ONLINE)
153                         log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
154                 else if (state == STATE_ARCHIVED)
155                         return -ESHUTDOWN;
156                 else if (state != STATE_OFFLINE)
157                         log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
158         }
159
160         return 0;
161 }
162
163 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
164         uint64_t asize;
165         uint64_t old_size, new_size;
166
167         assert(f);
168
169         if (offset < le64toh(f->header->arena_offset))
170                 return -EINVAL;
171
172         new_size = PAGE_ALIGN(offset + size);
173
174         /* We assume that this file is not sparse, and we know that
175          * for sure, since we always call posix_fallocate()
176          * ourselves */
177
178         old_size =
179                 le64toh(f->header->arena_offset) +
180                 le64toh(f->header->arena_size);
181
182         if (old_size >= new_size)
183                 return 0;
184
185         asize = new_size - le64toh(f->header->arena_offset);
186
187         if (asize > le64toh(f->header->arena_min_size)) {
188                 struct statvfs svfs;
189
190                 if (fstatvfs(f->fd, &svfs) >= 0) {
191                         uint64_t available;
192
193                         available = svfs.f_bfree * svfs.f_bsize;
194
195                         if (available >= f->header->arena_keep_free)
196                                 available -= f->header->arena_keep_free;
197                         else
198                                 available = 0;
199
200                         if (new_size - old_size > available)
201                                 return -E2BIG;
202                 }
203         }
204
205         if (asize > le64toh(f->header->arena_max_size))
206                 return -E2BIG;
207
208         if (posix_fallocate(f->fd, old_size, new_size - old_size) < 0)
209                 return -errno;
210
211         if (fstat(f->fd, &f->last_stat) < 0)
212                 return -errno;
213
214         f->header->arena_size = htole64(asize);
215
216         return 0;
217 }
218
219 static int journal_file_map(
220                 JournalFile *f,
221                 uint64_t offset,
222                 uint64_t size,
223                 void **_window,
224                 uint64_t *_woffset,
225                 uint64_t *_wsize,
226                 void **ret) {
227
228         uint64_t woffset, wsize;
229         void *window;
230
231         assert(f);
232         assert(size > 0);
233         assert(ret);
234
235         woffset = offset & ~((uint64_t) page_size() - 1ULL);
236         wsize = size + (offset - woffset);
237         wsize = PAGE_ALIGN(wsize);
238
239         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
240         if (window == MAP_FAILED)
241                 return -errno;
242
243         if (_window)
244                 *_window = window;
245
246         if (_woffset)
247                 *_woffset = woffset;
248
249         if (_wsize)
250                 *_wsize = wsize;
251
252         *ret = (uint8_t*) window + (offset - woffset);
253
254         return 0;
255 }
256
257 static int journal_file_move_to(JournalFile *f, uint64_t offset, uint64_t size, void **ret) {
258         void *p;
259         uint64_t delta;
260         int r;
261
262         assert(f);
263         assert(ret);
264
265         if (_likely_(f->window &&
266                      f->window_offset <= offset &&
267                      f->window_offset+f->window_size >= offset + size)) {
268
269                 *ret = (uint8_t*) f->window + (offset - f->window_offset);
270                 return 0;
271         }
272
273         if (f->window) {
274                 if (munmap(f->window, f->window_size) < 0)
275                         return -errno;
276
277                 f->window = NULL;
278                 f->window_size = f->window_offset = 0;
279         }
280
281         if (size < DEFAULT_WINDOW_SIZE) {
282                 /* If the default window size is larger then what was
283                  * asked for extend the mapping a bit in the hope to
284                  * minimize needed remappings later on. We add half
285                  * the window space before and half behind the
286                  * requested mapping */
287
288                 delta = PAGE_ALIGN((DEFAULT_WINDOW_SIZE - size) / 2);
289
290                 if (offset < delta)
291                         delta = offset;
292
293                 offset -= delta;
294                 size += (DEFAULT_WINDOW_SIZE - delta);
295         } else
296                 delta = 0;
297
298         r = journal_file_map(f,
299                              offset, size,
300                              &f->window, &f->window_offset, &f->window_size,
301                              & p);
302
303         if (r < 0)
304                 return r;
305
306         *ret = (uint8_t*) p + delta;
307         return 0;
308 }
309
310 static bool verify_hash(Object *o) {
311         uint64_t t;
312
313         assert(o);
314
315         t = le64toh(o->object.type);
316         if (t == OBJECT_DATA) {
317                 uint64_t s, h1, h2;
318
319                 s = le64toh(o->object.size);
320
321                 h1 = le64toh(o->data.hash);
322                 h2 = hash64(o->data.payload, s - offsetof(Object, data.payload));
323
324                 return h1 == h2;
325         }
326
327         return true;
328 }
329
330 int journal_file_move_to_object(JournalFile *f, uint64_t offset, int type, Object **ret) {
331         int r;
332         void *t;
333         Object *o;
334         uint64_t s;
335
336         assert(f);
337         assert(ret);
338
339         r = journal_file_move_to(f, offset, sizeof(ObjectHeader), &t);
340         if (r < 0)
341                 return r;
342
343         o = (Object*) t;
344         s = le64toh(o->object.size);
345
346         if (s < sizeof(ObjectHeader))
347                 return -EBADMSG;
348
349         if (type >= 0 && le64toh(o->object.type) != type)
350                 return -EBADMSG;
351
352         if (s > sizeof(ObjectHeader)) {
353                 r = journal_file_move_to(f, offset, s, &t);
354                 if (r < 0)
355                         return r;
356
357                 o = (Object*) t;
358         }
359
360         if (!verify_hash(o))
361                 return -EBADMSG;
362
363         *ret = o;
364         return 0;
365 }
366
367 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
368         uint64_t r;
369
370         assert(f);
371
372         r = le64toh(f->header->seqnum) + 1;
373
374         if (seqnum) {
375                 /* If an external seqno counter was passed, we update
376                  * both the local and the external one, and set it to
377                  * the maximum of both */
378
379                 if (*seqnum + 1 > r)
380                         r = *seqnum + 1;
381
382                 *seqnum = r;
383         }
384
385         f->header->seqnum = htole64(r);
386
387         return r;
388 }
389
390 static int journal_file_append_object(JournalFile *f, uint64_t size, Object **ret, uint64_t *offset) {
391         int r;
392         uint64_t p;
393         Object *tail, *o;
394         void *t;
395
396         assert(f);
397         assert(size >= sizeof(ObjectHeader));
398         assert(offset);
399         assert(ret);
400
401         p = le64toh(f->header->tail_object_offset);
402
403         if (p == 0)
404                 p = le64toh(f->header->arena_offset);
405         else {
406                 r = journal_file_move_to_object(f, p, -1, &tail);
407                 if (r < 0)
408                         return r;
409
410                 p += ALIGN64(le64toh(tail->object.size));
411         }
412
413         r = journal_file_allocate(f, p, size);
414         if (r < 0)
415                 return r;
416
417         r = journal_file_move_to(f, p, size, &t);
418         if (r < 0)
419                 return r;
420
421         o = (Object*) t;
422
423         zero(o->object);
424         o->object.type = htole64(OBJECT_UNUSED);
425         zero(o->object.reserved);
426         o->object.size = htole64(size);
427
428         f->header->tail_object_offset = htole64(p);
429         if (f->header->head_object_offset == 0)
430                 f->header->head_object_offset = htole64(p);
431
432         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
433
434         *ret = o;
435         *offset = p;
436
437         return 0;
438 }
439
440 static int journal_file_setup_hash_table(JournalFile *f) {
441         uint64_t s, p;
442         Object *o;
443         int r;
444
445         assert(f);
446
447         s = DEFAULT_HASH_TABLE_SIZE;
448         r = journal_file_append_object(f, offsetof(Object, hash_table.table) + s, &o, &p);
449         if (r < 0)
450                 return r;
451
452         o->object.type = htole64(OBJECT_HASH_TABLE);
453         memset(o->hash_table.table, 0, s);
454
455         f->header->hash_table_offset = htole64(p + offsetof(Object, hash_table.table));
456         f->header->hash_table_size = htole64(s);
457
458         return 0;
459 }
460
461 static int journal_file_setup_bisect_table(JournalFile *f) {
462         uint64_t s, p;
463         Object *o;
464         int r;
465
466         assert(f);
467
468         s = DEFAULT_BISECT_TABLE_SIZE;
469         r = journal_file_append_object(f, offsetof(Object, bisect_table.table) + s, &o, &p);
470         if (r < 0)
471                 return r;
472
473         o->object.type = htole64(OBJECT_BISECT_TABLE);
474         memset(o->bisect_table.table, 0, s);
475
476         f->header->bisect_table_offset = htole64(p + offsetof(Object, bisect_table.table));
477         f->header->bisect_table_size = htole64(s);
478
479         return 0;
480 }
481
482 static int journal_file_map_hash_table(JournalFile *f) {
483         uint64_t s, p;
484         void *t;
485         int r;
486
487         assert(f);
488
489         p = le64toh(f->header->hash_table_offset);
490         s = le64toh(f->header->hash_table_size);
491
492         r = journal_file_map(f,
493                              p, s,
494                              &f->hash_table_window, NULL, &f->hash_table_window_size,
495                              &t);
496         if (r < 0)
497                 return r;
498
499         f->hash_table = t;
500         return 0;
501 }
502
503 static int journal_file_map_bisect_table(JournalFile *f) {
504         uint64_t s, p;
505         void *t;
506         int r;
507
508         assert(f);
509
510         p = le64toh(f->header->bisect_table_offset);
511         s = le64toh(f->header->bisect_table_size);
512
513         r = journal_file_map(f,
514                              p, s,
515                              &f->bisect_table_window, NULL, &f->bisect_table_window_size,
516                              &t);
517
518         if (r < 0)
519                 return r;
520
521         f->bisect_table = t;
522         return 0;
523 }
524
525 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash_index) {
526         uint64_t p;
527         int r;
528
529         assert(f);
530         assert(o);
531         assert(offset > 0);
532         assert(o->object.type == htole64(OBJECT_DATA));
533
534         o->data.head_entry_offset = o->data.tail_entry_offset = 0;
535         o->data.next_hash_offset = 0;
536
537         p = le64toh(f->hash_table[hash_index].tail_hash_offset);
538         if (p == 0) {
539                 /* Only entry in the hash table is easy */
540
541                 o->data.prev_hash_offset = 0;
542                 f->hash_table[hash_index].head_hash_offset = htole64(offset);
543         } else {
544                 o->data.prev_hash_offset = htole64(p);
545
546                 /* Temporarily move back to the previous data object,
547                  * to patch in pointer */
548
549                 r = journal_file_move_to_object(f, p, OBJECT_DATA, &o);
550                 if (r < 0)
551                         return r;
552
553                 o->data.next_hash_offset = offset;
554
555                 r = journal_file_move_to_object(f, offset, OBJECT_DATA, &o);
556                 if (r < 0)
557                         return r;
558         }
559
560         f->hash_table[hash_index].tail_hash_offset = htole64(offset);
561
562         return 0;
563 }
564
565 static int journal_file_append_data(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
566         uint64_t hash, h, p, np;
567         uint64_t osize;
568         Object *o;
569         int r;
570
571         assert(f);
572         assert(data || size == 0);
573
574         osize = offsetof(Object, data.payload) + size;
575
576         hash = hash64(data, size);
577         h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
578         p = le64toh(f->hash_table[h].head_hash_offset);
579
580         while (p != 0) {
581                 /* Look for this data object in the hash table */
582
583                 r = journal_file_move_to_object(f, p, OBJECT_DATA, &o);
584                 if (r < 0)
585                         return r;
586
587                 if (le64toh(o->object.size) == osize &&
588                     memcmp(o->data.payload, data, size) == 0) {
589
590                         if (le64toh(o->data.hash) != hash)
591                                 return -EBADMSG;
592
593                         if (ret)
594                                 *ret = o;
595
596                         if (offset)
597                                 *offset = p;
598
599                         return 0;
600                 }
601
602                 p = le64toh(o->data.next_hash_offset);
603         }
604
605         r = journal_file_append_object(f, osize, &o, &np);
606         if (r < 0)
607                 return r;
608
609         o->object.type = htole64(OBJECT_DATA);
610         o->data.hash = htole64(hash);
611         memcpy(o->data.payload, data, size);
612
613         r = journal_file_link_data(f, o, np, h);
614         if (r < 0)
615                 return r;
616
617         if (ret)
618                 *ret = o;
619
620         if (offset)
621                 *offset = np;
622
623         return 0;
624 }
625
626 uint64_t journal_file_entry_n_items(Object *o) {
627         assert(o);
628         assert(o->object.type == htole64(OBJECT_ENTRY));
629
630         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
631 }
632
633 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
634         uint64_t p, q;
635         int r;
636         assert(f);
637         assert(o);
638         assert(offset > 0);
639
640         p = le64toh(o->entry.items[i].object_offset);
641         if (p == 0)
642                 return -EINVAL;
643
644         o->entry.items[i].next_entry_offset = 0;
645
646         /* Move to the data object */
647         r = journal_file_move_to_object(f, p, OBJECT_DATA, &o);
648         if (r < 0)
649                 return r;
650
651         q = le64toh(o->data.tail_entry_offset);
652         o->data.tail_entry_offset = htole64(offset);
653
654         if (q == 0)
655                 o->data.head_entry_offset = htole64(offset);
656         else {
657                 uint64_t n, j;
658
659                 /* Move to previous entry */
660                 r = journal_file_move_to_object(f, q, OBJECT_ENTRY, &o);
661                 if (r < 0)
662                         return r;
663
664                 n = journal_file_entry_n_items(o);
665                 for (j = 0; j < n; j++)
666                         if (le64toh(o->entry.items[j].object_offset) == p)
667                                 break;
668
669                 if (j >= n)
670                         return -EBADMSG;
671
672                 o->entry.items[j].next_entry_offset = offset;
673         }
674
675         /* Move back to original entry */
676         r = journal_file_move_to_object(f, offset, OBJECT_ENTRY, &o);
677         if (r < 0)
678                 return r;
679
680         o->entry.items[i].prev_entry_offset = q;
681         return 0;
682 }
683
684 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
685         uint64_t p, i, n, k, a, b;
686         int r;
687
688         assert(f);
689         assert(o);
690         assert(offset > 0);
691         assert(o->object.type == htole64(OBJECT_ENTRY));
692
693         /* Link up the entry itself */
694         p = le64toh(f->header->tail_entry_offset);
695
696         o->entry.prev_entry_offset = f->header->tail_entry_offset;
697         o->entry.next_entry_offset = 0;
698
699         if (p == 0) {
700                 f->header->head_entry_offset = htole64(offset);
701                 f->header->head_entry_realtime = o->entry.realtime;
702         } else {
703                 /* Temporarily move back to the previous entry, to
704                  * patch in pointer */
705
706                 r = journal_file_move_to_object(f, p, OBJECT_ENTRY, &o);
707                 if (r < 0)
708                         return r;
709
710                 o->entry.next_entry_offset = htole64(offset);
711
712                 r = journal_file_move_to_object(f, offset, OBJECT_ENTRY, &o);
713                 if (r < 0)
714                         return r;
715         }
716
717         f->header->tail_entry_offset = htole64(offset);
718         f->header->tail_entry_realtime = o->entry.realtime;
719
720         /* Link up the items */
721         n = journal_file_entry_n_items(o);
722         for (i = 0; i < n; i++) {
723                 r = journal_file_link_entry_item(f, o, offset, i);
724                 if (r < 0)
725                         return r;
726         }
727
728         /* Link up the entry in the bisect table */
729         n = le64toh(f->header->bisect_table_size) / sizeof(uint64_t);
730         k = le64toh(f->header->arena_max_size) / n;
731
732         a = (le64toh(f->header->last_bisect_offset) + k - 1) / k;
733         b = offset / k;
734
735         for (; a <= b; a++)
736                 f->bisect_table[a] = htole64(offset);
737
738         f->header->last_bisect_offset = htole64(offset + le64toh(o->object.size));
739
740         return 0;
741 }
742
743 static int journal_file_append_entry_internal(
744                 JournalFile *f,
745                 const dual_timestamp *ts,
746                 uint64_t xor_hash,
747                 const EntryItem items[], unsigned n_items,
748                 uint64_t *seqno,
749                 Object **ret, uint64_t *offset) {
750         uint64_t np;
751         uint64_t osize;
752         Object *o;
753         int r;
754
755         assert(f);
756         assert(items || n_items == 0);
757
758         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
759
760         r = journal_file_append_object(f, osize, &o, &np);
761         if (r < 0)
762                 return r;
763
764         o->object.type = htole64(OBJECT_ENTRY);
765         o->entry.seqnum = htole64(journal_file_seqnum(f, seqno));
766         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
767         o->entry.realtime = htole64(ts ? ts->realtime : now(CLOCK_REALTIME));
768         o->entry.monotonic = htole64(ts ? ts->monotonic : now(CLOCK_MONOTONIC));
769         o->entry.xor_hash = htole64(xor_hash);
770         o->entry.boot_id = f->header->boot_id;
771
772         r = journal_file_link_entry(f, o, np);
773         if (r < 0)
774                 return r;
775
776         if (ret)
777                 *ret = o;
778
779         if (offset)
780                 *offset = np;
781
782         return 0;
783 }
784
785 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqno, Object **ret, uint64_t *offset) {
786         unsigned i;
787         EntryItem *items;
788         int r;
789         uint64_t xor_hash = 0;
790
791         assert(f);
792         assert(iovec || n_iovec == 0);
793
794         items = new(EntryItem, n_iovec);
795         if (!items)
796                 return -ENOMEM;
797
798         for (i = 0; i < n_iovec; i++) {
799                 uint64_t p;
800                 Object *o;
801
802                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
803                 if (r < 0)
804                         goto finish;
805
806                 xor_hash ^= le64toh(o->data.hash);
807                 items[i].object_offset = htole64(p);
808         }
809
810         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqno, ret, offset);
811
812 finish:
813         free(items);
814
815         return r;
816 }
817
818 int journal_file_move_to_entry(JournalFile *f, uint64_t seqnum, Object **ret, uint64_t *offset) {
819         Object *o;
820         uint64_t lower, upper, p, n, k;
821         int r;
822
823         assert(f);
824
825         n = le64toh(f->header->bisect_table_size) / sizeof(uint64_t);
826         k = le64toh(f->header->arena_max_size) / n;
827
828         lower = 0;
829         upper = le64toh(f->header->last_bisect_offset)/k+1;
830
831         while (lower < upper) {
832                 k = (upper + lower) / 2;
833                 p = le64toh(f->bisect_table[k]);
834
835                 if (p == 0) {
836                         upper = k;
837                         continue;
838                 }
839
840                 r = journal_file_move_to_object(f, p, OBJECT_ENTRY, &o);
841                 if (r < 0)
842                         return r;
843
844                 if (o->entry.seqnum == seqnum) {
845                         if (ret)
846                                 *ret = o;
847
848                         if (offset)
849                                 *offset = p;
850
851                         return 1;
852                 } else if (seqnum < o->entry.seqnum)
853                         upper = k;
854                 else if (seqnum > o->entry.seqnum)
855                         lower = k+1;
856         }
857
858         assert(lower == upper);
859
860         if (lower <= 0)
861                 return 0;
862
863         /* The object we are looking for is between
864          * bisect_table[lower-1] and bisect_table[lower] */
865
866         p = le64toh(f->bisect_table[lower-1]);
867
868         for (;;) {
869                 r = journal_file_move_to_object(f, p, OBJECT_ENTRY, &o);
870                 if (r < 0)
871                         return r;
872
873                 if (o->entry.seqnum == seqnum) {
874                         if (ret)
875                                 *ret = o;
876
877                         if (offset)
878                                 *offset = p;
879
880                         return 1;
881
882                 } if (seqnum < o->entry.seqnum)
883                         return 0;
884
885                 if (o->entry.next_entry_offset == 0)
886                         return 0;
887
888                 p = le64toh(o->entry.next_entry_offset);
889         }
890
891         return 0;
892 }
893
894 int journal_file_next_entry(JournalFile *f, Object *o, Object **ret, uint64_t *offset) {
895         uint64_t np;
896         int r;
897
898         assert(f);
899
900         if (!o)
901                 np = le64toh(f->header->head_entry_offset);
902         else {
903                 if (le64toh(o->object.type) != OBJECT_ENTRY)
904                         return -EINVAL;
905
906                 np = le64toh(o->entry.next_entry_offset);
907         }
908
909         if (np == 0)
910                 return 0;
911
912         r = journal_file_move_to_object(f, np, OBJECT_ENTRY, &o);
913         if (r < 0)
914                 return r;
915
916         if (ret)
917                 *ret = o;
918
919         if (offset)
920                 *offset = np;
921
922         return 1;
923 }
924
925 int journal_file_prev_entry(JournalFile *f, Object *o, Object **ret, uint64_t *offset) {
926         uint64_t np;
927         int r;
928
929         assert(f);
930
931         if (!o)
932                 np = le64toh(f->header->tail_entry_offset);
933         else {
934                 if (le64toh(o->object.type) != OBJECT_ENTRY)
935                         return -EINVAL;
936
937                 np = le64toh(o->entry.prev_entry_offset);
938         }
939
940         if (np == 0)
941                 return 0;
942
943         r = journal_file_move_to_object(f, np, OBJECT_ENTRY, &o);
944         if (r < 0)
945                 return r;
946
947         if (ret)
948                 *ret = o;
949
950         if (offset)
951                 *offset = np;
952
953         return 1;
954 }
955
956 int journal_file_find_first_entry(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
957         uint64_t p, osize, hash, h;
958         int r;
959
960         assert(f);
961         assert(data || size == 0);
962
963         osize = offsetof(Object, data.payload) + size;
964
965         hash = hash64(data, size);
966         h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
967         p = le64toh(f->hash_table[h].head_hash_offset);
968
969         while (p != 0) {
970                 Object *o;
971
972                 r = journal_file_move_to_object(f, p, OBJECT_DATA, &o);
973                 if (r < 0)
974                         return r;
975
976                 if (le64toh(o->object.size) == osize &&
977                     memcmp(o->data.payload, data, size) == 0) {
978
979                         if (le64toh(o->data.hash) != hash)
980                                 return -EBADMSG;
981
982                         if (o->data.head_entry_offset == 0)
983                                 return 0;
984
985                         p = le64toh(o->data.head_entry_offset);
986                         r = journal_file_move_to_object(f, p, OBJECT_ENTRY, &o);
987                         if (r < 0)
988                                 return r;
989
990                         if (ret)
991                                 *ret = o;
992
993                         if (offset)
994                                 *offset = p;
995
996                         return 1;
997                 }
998
999                 p = le64toh(o->data.next_hash_offset);
1000         }
1001
1002         return 0;
1003 }
1004
1005 int journal_file_find_last_entry(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
1006         uint64_t p, osize, hash, h;
1007         int r;
1008
1009         assert(f);
1010         assert(data || size == 0);
1011
1012         osize = offsetof(Object, data.payload) + size;
1013
1014         hash = hash64(data, size);
1015         h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
1016         p = le64toh(f->hash_table[h].tail_hash_offset);
1017
1018         while (p != 0) {
1019                 Object *o;
1020
1021                 r = journal_file_move_to_object(f, p, OBJECT_DATA, &o);
1022                 if (r < 0)
1023                         return r;
1024
1025                 if (le64toh(o->object.size) == osize &&
1026                     memcmp(o->data.payload, data, size) == 0) {
1027
1028                         if (le64toh(o->data.hash) != hash)
1029                                 return -EBADMSG;
1030
1031                         if (o->data.tail_entry_offset == 0)
1032                                 return 0;
1033
1034                         p = le64toh(o->data.tail_entry_offset);
1035                         r = journal_file_move_to_object(f, p, OBJECT_ENTRY, &o);
1036                         if (r < 0)
1037                                 return r;
1038
1039                         if (ret)
1040                                 *ret = o;
1041
1042                         if (offset)
1043                                 *offset = p;
1044
1045                         return 1;
1046                 }
1047
1048                 p = le64toh(o->data.prev_hash_offset);
1049         }
1050
1051         return 0;
1052 }
1053
1054 void journal_file_dump(JournalFile *f) {
1055         char a[33], b[33], c[33];
1056         Object *o;
1057         int r;
1058         uint64_t p;
1059
1060         assert(f);
1061
1062         printf("File ID: %s\n"
1063                "Machine ID: %s\n"
1064                "Boot ID: %s\n"
1065                "Arena size: %llu\n",
1066                sd_id128_to_string(f->header->file_id, a),
1067                sd_id128_to_string(f->header->machine_id, b),
1068                sd_id128_to_string(f->header->boot_id, c),
1069                (unsigned long long) le64toh(f->header->arena_size));
1070
1071         p = le64toh(f->header->head_object_offset);
1072         while (p != 0) {
1073                 r = journal_file_move_to_object(f, p, -1, &o);
1074                 if (r < 0)
1075                         goto fail;
1076
1077                 switch (o->object.type) {
1078
1079                 case OBJECT_UNUSED:
1080                         printf("Type: OBJECT_UNUSED\n");
1081                         break;
1082
1083                 case OBJECT_DATA:
1084                         printf("Type: OBJECT_DATA\n");
1085                         break;
1086
1087                 case OBJECT_ENTRY:
1088                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1089                                (unsigned long long) le64toh(o->entry.seqnum),
1090                                (unsigned long long) le64toh(o->entry.monotonic),
1091                                (unsigned long long) le64toh(o->entry.realtime));
1092                         break;
1093
1094                 case OBJECT_HASH_TABLE:
1095                         printf("Type: OBJECT_HASH_TABLE\n");
1096                         break;
1097
1098                 case OBJECT_BISECT_TABLE:
1099                         printf("Type: OBJECT_BISECT_TABLE\n");
1100                         break;
1101                 }
1102
1103                 if (p == le64toh(f->header->tail_object_offset))
1104                         p = 0;
1105                 else
1106                         p = p + ALIGN64(le64toh(o->object.size));
1107         }
1108
1109         return;
1110 fail:
1111         log_error("File corrupt");
1112 }
1113
1114 int journal_file_open(
1115                 const char *fname,
1116                 int flags,
1117                 mode_t mode,
1118                 JournalFile *template,
1119                 JournalFile **ret) {
1120
1121         JournalFile *f;
1122         int r;
1123         bool newly_created = false;
1124
1125         assert(fname);
1126
1127         if ((flags & O_ACCMODE) != O_RDONLY &&
1128             (flags & O_ACCMODE) != O_RDWR)
1129                 return -EINVAL;
1130
1131         f = new0(JournalFile, 1);
1132         if (!f)
1133                 return -ENOMEM;
1134
1135         f->fd = -1;
1136         f->flags = flags;
1137         f->mode = mode;
1138         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1139         f->prot = prot_from_flags(flags);
1140
1141         f->path = strdup(fname);
1142         if (!f->path) {
1143                 r = -ENOMEM;
1144                 goto fail;
1145         }
1146
1147         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1148         if (f->fd < 0) {
1149                 r = -errno;
1150                 goto fail;
1151         }
1152
1153         if (fstat(f->fd, &f->last_stat) < 0) {
1154                 r = -errno;
1155                 goto fail;
1156         }
1157
1158         if (f->last_stat.st_size == 0 && f->writable) {
1159                 newly_created = true;
1160
1161                 r = journal_file_init_header(f, template);
1162                 if (r < 0)
1163                         goto fail;
1164
1165                 if (fstat(f->fd, &f->last_stat) < 0) {
1166                         r = -errno;
1167                         goto fail;
1168                 }
1169         }
1170
1171         if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1172                 r = -EIO;
1173                 goto fail;
1174         }
1175
1176         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1177         if (f->header == MAP_FAILED) {
1178                 f->header = NULL;
1179                 r = -errno;
1180                 goto fail;
1181         }
1182
1183         if (!newly_created) {
1184                 r = journal_file_verify_header(f);
1185                 if (r < 0)
1186                         goto fail;
1187         }
1188
1189         if (f->writable) {
1190                 r = journal_file_refresh_header(f);
1191                 if (r < 0)
1192                         goto fail;
1193         }
1194
1195         if (newly_created) {
1196
1197                 r = journal_file_setup_hash_table(f);
1198                 if (r < 0)
1199                         goto fail;
1200
1201                 r = journal_file_setup_bisect_table(f);
1202                 if (r < 0)
1203                         goto fail;
1204         }
1205
1206         r = journal_file_map_hash_table(f);
1207         if (r < 0)
1208                 goto fail;
1209
1210         r = journal_file_map_bisect_table(f);
1211         if (r < 0)
1212                 goto fail;
1213
1214         if (ret)
1215                 *ret = f;
1216
1217         return 0;
1218
1219 fail:
1220         journal_file_close(f);
1221
1222         return r;
1223 }
1224
1225 int journal_file_rotate(JournalFile **f) {
1226         char *p;
1227         size_t l;
1228         JournalFile *old_file, *new_file = NULL;
1229         int r;
1230
1231         assert(f);
1232         assert(*f);
1233
1234         old_file = *f;
1235
1236         if (!old_file->writable)
1237                 return -EINVAL;
1238
1239         if (!endswith(old_file->path, ".journal"))
1240                 return -EINVAL;
1241
1242         l = strlen(old_file->path);
1243
1244         p = new(char, l + 1 + 16 + 1 + 32 + 1 + 16 + 1);
1245         if (!p)
1246                 return -ENOMEM;
1247
1248         memcpy(p, old_file->path, l - 8);
1249         p[l-8] = '@';
1250         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1251         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1252                  "-%016llx-%016llx.journal",
1253                  (unsigned long long) le64toh((*f)->header->seqnum),
1254                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1255
1256         r = rename(old_file->path, p);
1257         free(p);
1258
1259         if (r < 0)
1260                 return -errno;
1261
1262         old_file->header->state = le32toh(STATE_ARCHIVED);
1263
1264         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1265         journal_file_close(old_file);
1266
1267         *f = new_file;
1268         return r;
1269 }
1270
1271 struct vacuum_info {
1272         off_t usage;
1273         char *filename;
1274
1275         uint64_t realtime;
1276         sd_id128_t seqnum_id;
1277         uint64_t seqnum;
1278 };
1279
1280 static int vacuum_compare(const void *_a, const void *_b) {
1281         const struct vacuum_info *a, *b;
1282
1283         a = _a;
1284         b = _b;
1285
1286         if (sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1287                 if (a->seqnum < b->seqnum)
1288                         return -1;
1289                 else if (a->seqnum > b->seqnum)
1290                         return 1;
1291                 else
1292                         return 0;
1293         }
1294
1295         if (a->realtime < b->realtime)
1296                 return -1;
1297         else if (a->realtime > b->realtime)
1298                 return 1;
1299         else
1300                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1301 }
1302
1303 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1304         DIR *d;
1305         int r = 0;
1306         struct vacuum_info *list = NULL;
1307         unsigned n_list = 0, n_allocated = 0, i;
1308         uint64_t sum = 0;
1309
1310         assert(directory);
1311
1312         if (max_use <= 0)
1313                 max_use = DEFAULT_MAX_USE;
1314
1315         d = opendir(directory);
1316         if (!d)
1317                 return -errno;
1318
1319         for (;;) {
1320                 int k;
1321                 struct dirent buf, *de;
1322                 size_t q;
1323                 struct stat st;
1324                 char *p;
1325                 unsigned long long seqnum, realtime;
1326                 sd_id128_t seqnum_id;
1327
1328                 k = readdir_r(d, &buf, &de);
1329                 if (k != 0) {
1330                         r = -k;
1331                         goto finish;
1332                 }
1333
1334                 if (!de)
1335                         break;
1336
1337                 if (!dirent_is_file_with_suffix(de, ".journal"))
1338                         continue;
1339
1340                 q = strlen(de->d_name);
1341
1342                 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
1343                         continue;
1344
1345                 if (de->d_name[q-8-16-1] != '-' ||
1346                     de->d_name[q-8-16-1-16-1] != '-' ||
1347                     de->d_name[q-8-16-1-16-1-32-1] != '@')
1348                         continue;
1349
1350                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
1351                         continue;
1352
1353                 if (!S_ISREG(st.st_mode))
1354                         continue;
1355
1356                 p = strdup(de->d_name);
1357                 if (!p) {
1358                         r = -ENOMEM;
1359                         goto finish;
1360                 }
1361
1362                 de->d_name[q-8-16-1-16-1] = 0;
1363                 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
1364                         free(p);
1365                         continue;
1366                 }
1367
1368                 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
1369                         free(p);
1370                         continue;
1371                 }
1372
1373                 if (n_list >= n_allocated) {
1374                         struct vacuum_info *j;
1375
1376                         n_allocated = MAX(n_allocated * 2U, 8U);
1377                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
1378                         if (!j) {
1379                                 free(p);
1380                                 r = -ENOMEM;
1381                                 goto finish;
1382                         }
1383
1384                         list = j;
1385                 }
1386
1387                 list[n_list].filename = p;
1388                 list[n_list].usage = (uint64_t) st.st_blksize * (uint64_t) st.st_blocks;
1389                 list[n_list].seqnum = seqnum;
1390                 list[n_list].realtime = realtime;
1391                 list[n_list].seqnum_id = seqnum_id;
1392
1393                 sum += list[n_list].usage;
1394
1395                 n_list ++;
1396         }
1397
1398         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
1399
1400         for(i = 0; i < n_list; i++) {
1401                 struct statvfs ss;
1402
1403                 if (fstatvfs(dirfd(d), &ss) < 0) {
1404                         r = -errno;
1405                         goto finish;
1406                 }
1407
1408                 if (sum <= max_use &&
1409                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
1410                         break;
1411
1412                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
1413                         log_debug("Deleted archived journal %s/%s.", directory, list[i].filename);
1414                         sum -= list[i].usage;
1415                 } else if (errno != ENOENT)
1416                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
1417         }
1418
1419 finish:
1420         for (i = 0; i < n_list; i++)
1421                 free(list[i].filename);
1422
1423         free(list);
1424
1425         return r;
1426 }