chiark / gitweb /
journal: implementation rotation
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33
/* Arena sizing defaults written into freshly created headers (all in bytes) */
#define DEFAULT_ARENA_MAX_SIZE (16ULL << 30)    /* 16 GiB */
#define DEFAULT_ARENA_MIN_SIZE (256ULL << 10)   /* 256 KiB */
#define DEFAULT_ARENA_KEEP_FREE (1ULL << 20)    /* 1 MiB */

#define DEFAULT_MAX_USE (256ULL << 20)          /* 256 MiB */

/* Size in bytes of the hash table payload: 2047 slots of 16 bytes each */
#define DEFAULT_HASH_TABLE_SIZE (2047ULL * 16ULL)

/* Size in bytes of the bisect table payload: one 8 byte slot per 64 KiB of
 * the largest possible arena */
#define DEFAULT_BISECT_TABLE_SIZE ((DEFAULT_ARENA_MAX_SIZE / (64ULL << 10)) * 8ULL)

/* Requests smaller than this get their mapping window enlarged */
#define DEFAULT_WINDOW_SIZE (128ULL << 20)      /* 128 MiB */

/* Magic bytes at the very start of every journal file (no trailing NUL) */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };

/* Rounds x up to the next multiple of 8 */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
48
49 void journal_file_close(JournalFile *f) {
50         assert(f);
51
52         if (f->header) {
53                 if (f->writable && f->header->state == htole32(STATE_ONLINE))
54                         f->header->state = htole32(STATE_OFFLINE);
55
56                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
57         }
58
59         if (f->hash_table_window)
60                 munmap(f->hash_table_window, f->hash_table_window_size);
61
62         if (f->bisect_table_window)
63                 munmap(f->bisect_table_window, f->bisect_table_window_size);
64
65         if (f->window)
66                 munmap(f->window, f->window_size);
67
68         if (f->fd >= 0)
69                 close_nointr_nofail(f->fd);
70
71         free(f->path);
72         free(f);
73 }
74
75 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
76         Header h;
77         ssize_t k;
78         int r;
79
80         assert(f);
81
82         zero(h);
83         memcpy(h.signature, signature, 8);
84         h.arena_offset = htole64(ALIGN64(sizeof(h)));
85         h.arena_max_size = htole64(DEFAULT_ARENA_MAX_SIZE);
86         h.arena_min_size = htole64(DEFAULT_ARENA_MIN_SIZE);
87         h.arena_keep_free = htole64(DEFAULT_ARENA_KEEP_FREE);
88
89         r = sd_id128_randomize(&h.file_id);
90         if (r < 0)
91                 return r;
92
93         if (template) {
94                 h.seqnum_id = template->header->seqnum_id;
95                 h.seqnum = template->header->seqnum;
96         } else
97                 h.seqnum_id = h.file_id;
98
99         k = pwrite(f->fd, &h, sizeof(h), 0);
100         if (k < 0)
101                 return -errno;
102
103         if (k != sizeof(h))
104                 return -EIO;
105
106         return 0;
107 }
108
109 static int journal_file_refresh_header(JournalFile *f) {
110         int r;
111
112         assert(f);
113
114         r = sd_id128_get_machine(&f->header->machine_id);
115         if (r < 0)
116                 return r;
117
118         r = sd_id128_get_boot(&f->header->boot_id);
119         if (r < 0)
120                 return r;
121
122         f->header->state = htole32(STATE_ONLINE);
123         return 0;
124 }
125
126 static int journal_file_verify_header(JournalFile *f) {
127         assert(f);
128
129         if (memcmp(f->header, signature, 8))
130                 return -EBADMSG;
131
132         if (f->header->incompatible_flags != 0)
133                 return -EPROTONOSUPPORT;
134
135         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
136                 return -ENODATA;
137
138         if (f->writable) {
139                 uint32_t state;
140                 sd_id128_t machine_id;
141                 int r;
142
143                 r = sd_id128_get_machine(&machine_id);
144                 if (r < 0)
145                         return r;
146
147                 if (!sd_id128_equal(machine_id, f->header->machine_id))
148                         return -EHOSTDOWN;
149
150                 state = le32toh(f->header->state);
151
152                 if (state == STATE_ONLINE)
153                         log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
154                 else if (state == STATE_ARCHIVED)
155                         return -ESHUTDOWN;
156                 else if (state != STATE_OFFLINE)
157                         log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
158         }
159
160         return 0;
161 }
162
163 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
164         uint64_t asize;
165         uint64_t old_size, new_size;
166
167         assert(f);
168
169         if (offset < le64toh(f->header->arena_offset))
170                 return -EINVAL;
171
172         new_size = PAGE_ALIGN(offset + size);
173
174         /* We assume that this file is not sparse, and we know that
175          * for sure, since we always call posix_fallocate()
176          * ourselves */
177
178         old_size =
179                 le64toh(f->header->arena_offset) +
180                 le64toh(f->header->arena_size);
181
182         if (old_size >= new_size)
183                 return 0;
184
185         asize = new_size - le64toh(f->header->arena_offset);
186
187         if (asize > le64toh(f->header->arena_min_size)) {
188                 struct statvfs svfs;
189
190                 if (fstatvfs(f->fd, &svfs) >= 0) {
191                         uint64_t available;
192
193                         available = svfs.f_bfree * svfs.f_bsize;
194
195                         if (available >= f->header->arena_keep_free)
196                                 available -= f->header->arena_keep_free;
197                         else
198                                 available = 0;
199
200                         if (new_size - old_size > available)
201                                 return -E2BIG;
202                 }
203         }
204
205         if (asize > le64toh(f->header->arena_max_size))
206                 return -E2BIG;
207
208         if (posix_fallocate(f->fd, old_size, new_size - old_size) < 0)
209                 return -errno;
210
211         if (fstat(f->fd, &f->last_stat) < 0)
212                 return -errno;
213
214         f->header->arena_size = htole64(asize);
215
216         return 0;
217 }
218
219 static int journal_file_map(
220                 JournalFile *f,
221                 uint64_t offset,
222                 uint64_t size,
223                 void **_window,
224                 uint64_t *_woffset,
225                 uint64_t *_wsize,
226                 void **ret) {
227
228         uint64_t woffset, wsize;
229         void *window;
230
231         assert(f);
232         assert(size > 0);
233         assert(ret);
234
235         woffset = offset & ~((uint64_t) page_size() - 1ULL);
236         wsize = size + (offset - woffset);
237         wsize = PAGE_ALIGN(wsize);
238
239         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
240         if (window == MAP_FAILED)
241                 return -errno;
242
243         if (_window)
244                 *_window = window;
245
246         if (_woffset)
247                 *_woffset = woffset;
248
249         if (_wsize)
250                 *_wsize = wsize;
251
252         *ret = (uint8_t*) window + (offset - woffset);
253
254         return 0;
255 }
256
257 static int journal_file_move_to(JournalFile *f, uint64_t offset, uint64_t size, void **ret) {
258         void *p;
259         uint64_t delta;
260         int r;
261
262         assert(f);
263         assert(ret);
264
265         if (_likely_(f->window &&
266                      f->window_offset <= offset &&
267                      f->window_offset+f->window_size >= offset + size)) {
268
269                 *ret = (uint8_t*) f->window + (offset - f->window_offset);
270                 return 0;
271         }
272
273         if (f->window) {
274                 if (munmap(f->window, f->window_size) < 0)
275                         return -errno;
276
277                 f->window = NULL;
278                 f->window_size = f->window_offset = 0;
279         }
280
281         if (size < DEFAULT_WINDOW_SIZE) {
282                 /* If the default window size is larger then what was
283                  * asked for extend the mapping a bit in the hope to
284                  * minimize needed remappings later on. We add half
285                  * the window space before and half behind the
286                  * requested mapping */
287
288                 delta = PAGE_ALIGN((DEFAULT_WINDOW_SIZE - size) / 2);
289
290                 if (offset < delta)
291                         delta = offset;
292
293                 offset -= delta;
294                 size += (DEFAULT_WINDOW_SIZE - delta);
295         } else
296                 delta = 0;
297
298         r = journal_file_map(f,
299                              offset, size,
300                              &f->window, &f->window_offset, &f->window_size,
301                              & p);
302
303         if (r < 0)
304                 return r;
305
306         *ret = (uint8_t*) p + delta;
307         return 0;
308 }
309
310 static bool verify_hash(Object *o) {
311         uint64_t t;
312
313         assert(o);
314
315         t = le64toh(o->object.type);
316         if (t == OBJECT_DATA) {
317                 uint64_t s, h1, h2;
318
319                 s = le64toh(o->object.size);
320
321                 h1 = le64toh(o->data.hash);
322                 h2 = hash64(o->data.payload, s - offsetof(Object, data.payload));
323
324                 return h1 == h2;
325         }
326
327         return true;
328 }
329
330 int journal_file_move_to_object(JournalFile *f, uint64_t offset, int type, Object **ret) {
331         int r;
332         void *t;
333         Object *o;
334         uint64_t s;
335
336         assert(f);
337         assert(ret);
338
339         r = journal_file_move_to(f, offset, sizeof(ObjectHeader), &t);
340         if (r < 0)
341                 return r;
342
343         o = (Object*) t;
344         s = le64toh(o->object.size);
345
346         if (s < sizeof(ObjectHeader))
347                 return -EBADMSG;
348
349         if (type >= 0 && le64toh(o->object.type) != type)
350                 return -EBADMSG;
351
352         if (s > sizeof(ObjectHeader)) {
353                 r = journal_file_move_to(f, offset, s, &t);
354                 if (r < 0)
355                         return r;
356
357                 o = (Object*) t;
358         }
359
360         if (!verify_hash(o))
361                 return -EBADMSG;
362
363         *ret = o;
364         return 0;
365 }
366
367 static uint64_t journal_file_seqnum(JournalFile *f) {
368         uint64_t r;
369
370         assert(f);
371
372         r = le64toh(f->header->seqnum) + 1;
373         f->header->seqnum = htole64(r);
374
375         return r;
376 }
377
378 static int journal_file_append_object(JournalFile *f, uint64_t size, Object **ret, uint64_t *offset) {
379         int r;
380         uint64_t p;
381         Object *tail, *o;
382         void *t;
383
384         assert(f);
385         assert(size >= sizeof(ObjectHeader));
386         assert(offset);
387         assert(ret);
388
389         p = le64toh(f->header->tail_object_offset);
390
391         if (p == 0)
392                 p = le64toh(f->header->arena_offset);
393         else {
394                 r = journal_file_move_to_object(f, p, -1, &tail);
395                 if (r < 0)
396                         return r;
397
398                 p += ALIGN64(le64toh(tail->object.size));
399         }
400
401         r = journal_file_allocate(f, p, size);
402         if (r < 0)
403                 return r;
404
405         r = journal_file_move_to(f, p, size, &t);
406         if (r < 0)
407                 return r;
408
409         o = (Object*) t;
410
411         zero(o->object);
412         o->object.type = htole64(OBJECT_UNUSED);
413         zero(o->object.reserved);
414         o->object.size = htole64(size);
415
416         f->header->tail_object_offset = htole64(p);
417         if (f->header->head_object_offset == 0)
418                 f->header->head_object_offset = htole64(p);
419
420         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
421
422         *ret = o;
423         *offset = p;
424
425         return 0;
426 }
427
428 static int journal_file_setup_hash_table(JournalFile *f) {
429         uint64_t s, p;
430         Object *o;
431         int r;
432
433         assert(f);
434
435         s = DEFAULT_HASH_TABLE_SIZE;
436         r = journal_file_append_object(f, offsetof(Object, hash_table.table) + s, &o, &p);
437         if (r < 0)
438                 return r;
439
440         o->object.type = htole64(OBJECT_HASH_TABLE);
441         memset(o->hash_table.table, 0, s);
442
443         f->header->hash_table_offset = htole64(p + offsetof(Object, hash_table.table));
444         f->header->hash_table_size = htole64(s);
445
446         return 0;
447 }
448
449 static int journal_file_setup_bisect_table(JournalFile *f) {
450         uint64_t s, p;
451         Object *o;
452         int r;
453
454         assert(f);
455
456         s = DEFAULT_BISECT_TABLE_SIZE;
457         r = journal_file_append_object(f, offsetof(Object, bisect_table.table) + s, &o, &p);
458         if (r < 0)
459                 return r;
460
461         o->object.type = htole64(OBJECT_BISECT_TABLE);
462         memset(o->bisect_table.table, 0, s);
463
464         f->header->bisect_table_offset = htole64(p + offsetof(Object, bisect_table.table));
465         f->header->bisect_table_size = htole64(s);
466
467         return 0;
468 }
469
470 static int journal_file_map_hash_table(JournalFile *f) {
471         uint64_t s, p;
472         void *t;
473         int r;
474
475         assert(f);
476
477         p = le64toh(f->header->hash_table_offset);
478         s = le64toh(f->header->hash_table_size);
479
480         r = journal_file_map(f,
481                              p, s,
482                              &f->hash_table_window, NULL, &f->hash_table_window_size,
483                              &t);
484         if (r < 0)
485                 return r;
486
487         f->hash_table = t;
488         return 0;
489 }
490
491 static int journal_file_map_bisect_table(JournalFile *f) {
492         uint64_t s, p;
493         void *t;
494         int r;
495
496         assert(f);
497
498         p = le64toh(f->header->bisect_table_offset);
499         s = le64toh(f->header->bisect_table_size);
500
501         r = journal_file_map(f,
502                              p, s,
503                              &f->bisect_table_window, NULL, &f->bisect_table_window_size,
504                              &t);
505
506         if (r < 0)
507                 return r;
508
509         f->bisect_table = t;
510         return 0;
511 }
512
513 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash_index) {
514         uint64_t p;
515         int r;
516
517         assert(f);
518         assert(o);
519         assert(offset > 0);
520         assert(o->object.type == htole64(OBJECT_DATA));
521
522         o->data.head_entry_offset = o->data.tail_entry_offset = 0;
523         o->data.next_hash_offset = 0;
524
525         p = le64toh(f->hash_table[hash_index].tail_hash_offset);
526         if (p == 0) {
527                 /* Only entry in the hash table is easy */
528
529                 o->data.prev_hash_offset = 0;
530                 f->hash_table[hash_index].head_hash_offset = htole64(offset);
531         } else {
532                 o->data.prev_hash_offset = htole64(p);
533
534                 /* Temporarily move back to the previous data object,
535                  * to patch in pointer */
536
537                 r = journal_file_move_to_object(f, p, OBJECT_DATA, &o);
538                 if (r < 0)
539                         return r;
540
541                 o->data.next_hash_offset = offset;
542
543                 r = journal_file_move_to_object(f, offset, OBJECT_DATA, &o);
544                 if (r < 0)
545                         return r;
546         }
547
548         f->hash_table[hash_index].tail_hash_offset = htole64(offset);
549
550         return 0;
551 }
552
553 static int journal_file_append_data(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
554         uint64_t hash, h, p, np;
555         uint64_t osize;
556         Object *o;
557         int r;
558
559         assert(f);
560         assert(data || size == 0);
561
562         osize = offsetof(Object, data.payload) + size;
563
564         hash = hash64(data, size);
565         h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
566         p = le64toh(f->hash_table[h].head_hash_offset);
567
568         while (p != 0) {
569                 /* Look for this data object in the hash table */
570
571                 r = journal_file_move_to_object(f, p, OBJECT_DATA, &o);
572                 if (r < 0)
573                         return r;
574
575                 if (le64toh(o->object.size) == osize &&
576                     memcmp(o->data.payload, data, size) == 0) {
577
578                         if (le64toh(o->data.hash) != hash)
579                                 return -EBADMSG;
580
581                         if (ret)
582                                 *ret = o;
583
584                         if (offset)
585                                 *offset = p;
586
587                         return 0;
588                 }
589
590                 p = le64toh(o->data.next_hash_offset);
591         }
592
593         r = journal_file_append_object(f, osize, &o, &np);
594         if (r < 0)
595                 return r;
596
597         o->object.type = htole64(OBJECT_DATA);
598         o->data.hash = htole64(hash);
599         memcpy(o->data.payload, data, size);
600
601         r = journal_file_link_data(f, o, np, h);
602         if (r < 0)
603                 return r;
604
605         if (ret)
606                 *ret = o;
607
608         if (offset)
609                 *offset = np;
610
611         return 0;
612 }
613
614 uint64_t journal_file_entry_n_items(Object *o) {
615         assert(o);
616         assert(o->object.type == htole64(OBJECT_ENTRY));
617
618         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
619 }
620
621 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
622         uint64_t p, q;
623         int r;
624         assert(f);
625         assert(o);
626         assert(offset > 0);
627
628         p = le64toh(o->entry.items[i].object_offset);
629         if (p == 0)
630                 return -EINVAL;
631
632         o->entry.items[i].next_entry_offset = 0;
633
634         /* Move to the data object */
635         r = journal_file_move_to_object(f, p, OBJECT_DATA, &o);
636         if (r < 0)
637                 return r;
638
639         q = le64toh(o->data.tail_entry_offset);
640         o->data.tail_entry_offset = htole64(offset);
641
642         if (q == 0)
643                 o->data.head_entry_offset = htole64(offset);
644         else {
645                 uint64_t n, j;
646
647                 /* Move to previous entry */
648                 r = journal_file_move_to_object(f, q, OBJECT_ENTRY, &o);
649                 if (r < 0)
650                         return r;
651
652                 n = journal_file_entry_n_items(o);
653                 for (j = 0; j < n; j++)
654                         if (le64toh(o->entry.items[j].object_offset) == p)
655                                 break;
656
657                 if (j >= n)
658                         return -EBADMSG;
659
660                 o->entry.items[j].next_entry_offset = offset;
661         }
662
663         /* Move back to original entry */
664         r = journal_file_move_to_object(f, offset, OBJECT_ENTRY, &o);
665         if (r < 0)
666                 return r;
667
668         o->entry.items[i].prev_entry_offset = q;
669         return 0;
670 }
671
672 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
673         uint64_t p, i, n, k, a, b;
674         int r;
675
676         assert(f);
677         assert(o);
678         assert(offset > 0);
679         assert(o->object.type == htole64(OBJECT_ENTRY));
680
681         /* Link up the entry itself */
682         p = le64toh(f->header->tail_entry_offset);
683
684         o->entry.prev_entry_offset = f->header->tail_entry_offset;
685         o->entry.next_entry_offset = 0;
686
687         if (p == 0) {
688                 f->header->head_entry_offset = htole64(offset);
689                 f->header->head_entry_realtime = o->entry.realtime;
690         } else {
691                 /* Temporarily move back to the previous entry, to
692                  * patch in pointer */
693
694                 r = journal_file_move_to_object(f, p, OBJECT_ENTRY, &o);
695                 if (r < 0)
696                         return r;
697
698                 o->entry.next_entry_offset = htole64(offset);
699
700                 r = journal_file_move_to_object(f, offset, OBJECT_ENTRY, &o);
701                 if (r < 0)
702                         return r;
703         }
704
705         f->header->tail_entry_offset = htole64(offset);
706         f->header->tail_entry_realtime = o->entry.realtime;
707
708         /* Link up the items */
709         n = journal_file_entry_n_items(o);
710         for (i = 0; i < n; i++) {
711                 r = journal_file_link_entry_item(f, o, offset, i);
712                 if (r < 0)
713                         return r;
714         }
715
716         /* Link up the entry in the bisect table */
717         n = le64toh(f->header->bisect_table_size) / sizeof(uint64_t);
718         k = le64toh(f->header->arena_max_size) / n;
719
720         a = (le64toh(f->header->last_bisect_offset) + k - 1) / k;
721         b = offset / k;
722
723         for (; a <= b; a++)
724                 f->bisect_table[a] = htole64(offset);
725
726         f->header->last_bisect_offset = htole64(offset + le64toh(o->object.size));
727
728         return 0;
729 }
730
731 static int journal_file_append_entry_internal(
732                 JournalFile *f,
733                 const dual_timestamp *ts,
734                 uint64_t xor_hash,
735                 const EntryItem items[], unsigned n_items,
736                 Object **ret, uint64_t *offset) {
737         uint64_t np;
738         uint64_t osize;
739         Object *o;
740         int r;
741
742         assert(f);
743         assert(items || n_items == 0);
744
745         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
746
747         r = journal_file_append_object(f, osize, &o, &np);
748         if (r < 0)
749                 return r;
750
751         o->object.type = htole64(OBJECT_ENTRY);
752         o->entry.seqnum = htole64(journal_file_seqnum(f));
753         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
754         o->entry.realtime = ts ? htole64(ts->realtime) : 0;
755         o->entry.monotonic = ts ? htole64(ts->monotonic) : 0;
756         o->entry.xor_hash = htole64(xor_hash);
757         o->entry.boot_id = f->header->boot_id;
758
759         r = journal_file_link_entry(f, o, np);
760         if (r < 0)
761                 return r;
762
763         if (ret)
764                 *ret = o;
765
766         if (offset)
767                 *offset = np;
768
769         return 0;
770 }
771
772 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, Object **ret, uint64_t *offset) {
773         unsigned i;
774         EntryItem *items;
775         int r;
776         uint64_t xor_hash = 0;
777
778         assert(f);
779         assert(iovec || n_iovec == 0);
780
781         items = new(EntryItem, n_iovec);
782         if (!items)
783                 return -ENOMEM;
784
785         for (i = 0; i < n_iovec; i++) {
786                 uint64_t p;
787                 Object *o;
788
789                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
790                 if (r < 0)
791                         goto finish;
792
793                 xor_hash ^= le64toh(o->data.hash);
794                 items[i].object_offset = htole64(p);
795         }
796
797         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, ret, offset);
798
799 finish:
800         free(items);
801
802         return r;
803 }
804
805 int journal_file_move_to_entry(JournalFile *f, uint64_t seqnum, Object **ret, uint64_t *offset) {
806         Object *o;
807         uint64_t lower, upper, p, n, k;
808         int r;
809
810         assert(f);
811
812         n = le64toh(f->header->bisect_table_size) / sizeof(uint64_t);
813         k = le64toh(f->header->arena_max_size) / n;
814
815         lower = 0;
816         upper = le64toh(f->header->last_bisect_offset)/k+1;
817
818         while (lower < upper) {
819                 k = (upper + lower) / 2;
820                 p = le64toh(f->bisect_table[k]);
821
822                 if (p == 0) {
823                         upper = k;
824                         continue;
825                 }
826
827                 r = journal_file_move_to_object(f, p, OBJECT_ENTRY, &o);
828                 if (r < 0)
829                         return r;
830
831                 if (o->entry.seqnum == seqnum) {
832                         if (ret)
833                                 *ret = o;
834
835                         if (offset)
836                                 *offset = p;
837
838                         return 1;
839                 } else if (seqnum < o->entry.seqnum)
840                         upper = k;
841                 else if (seqnum > o->entry.seqnum)
842                         lower = k+1;
843         }
844
845         assert(lower == upper);
846
847         if (lower <= 0)
848                 return 0;
849
850         /* The object we are looking for is between
851          * bisect_table[lower-1] and bisect_table[lower] */
852
853         p = le64toh(f->bisect_table[lower-1]);
854
855         for (;;) {
856                 r = journal_file_move_to_object(f, p, OBJECT_ENTRY, &o);
857                 if (r < 0)
858                         return r;
859
860                 if (o->entry.seqnum == seqnum) {
861                         if (ret)
862                                 *ret = o;
863
864                         if (offset)
865                                 *offset = p;
866
867                         return 1;
868
869                 } if (seqnum < o->entry.seqnum)
870                         return 0;
871
872                 if (o->entry.next_entry_offset == 0)
873                         return 0;
874
875                 p = le64toh(o->entry.next_entry_offset);
876         }
877
878         return 0;
879 }
880
881 int journal_file_next_entry(JournalFile *f, Object *o, Object **ret, uint64_t *offset) {
882         uint64_t np;
883         int r;
884
885         assert(f);
886
887         if (!o)
888                 np = le64toh(f->header->head_entry_offset);
889         else {
890                 if (le64toh(o->object.type) != OBJECT_ENTRY)
891                         return -EINVAL;
892
893                 np = le64toh(o->entry.next_entry_offset);
894         }
895
896         if (np == 0)
897                 return 0;
898
899         r = journal_file_move_to_object(f, np, OBJECT_ENTRY, &o);
900         if (r < 0)
901                 return r;
902
903         if (ret)
904                 *ret = o;
905
906         if (offset)
907                 *offset = np;
908
909         return 1;
910 }
911
912 int journal_file_prev_entry(JournalFile *f, Object *o, Object **ret, uint64_t *offset) {
913         uint64_t np;
914         int r;
915
916         assert(f);
917
918         if (!o)
919                 np = le64toh(f->header->tail_entry_offset);
920         else {
921                 if (le64toh(o->object.type) != OBJECT_ENTRY)
922                         return -EINVAL;
923
924                 np = le64toh(o->entry.prev_entry_offset);
925         }
926
927         if (np == 0)
928                 return 0;
929
930         r = journal_file_move_to_object(f, np, OBJECT_ENTRY, &o);
931         if (r < 0)
932                 return r;
933
934         if (ret)
935                 *ret = o;
936
937         if (offset)
938                 *offset = np;
939
940         return 1;
941 }
942
943 int journal_file_find_first_entry(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
944         uint64_t p, osize, hash, h;
945         int r;
946
947         assert(f);
948         assert(data || size == 0);
949
950         osize = offsetof(Object, data.payload) + size;
951
952         hash = hash64(data, size);
953         h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
954         p = le64toh(f->hash_table[h].head_hash_offset);
955
956         while (p != 0) {
957                 Object *o;
958
959                 r = journal_file_move_to_object(f, p, OBJECT_DATA, &o);
960                 if (r < 0)
961                         return r;
962
963                 if (le64toh(o->object.size) == osize &&
964                     memcmp(o->data.payload, data, size) == 0) {
965
966                         if (le64toh(o->data.hash) != hash)
967                                 return -EBADMSG;
968
969                         if (o->data.head_entry_offset == 0)
970                                 return 0;
971
972                         p = le64toh(o->data.head_entry_offset);
973                         r = journal_file_move_to_object(f, p, OBJECT_ENTRY, &o);
974                         if (r < 0)
975                                 return r;
976
977                         if (ret)
978                                 *ret = o;
979
980                         if (offset)
981                                 *offset = p;
982
983                         return 1;
984                 }
985
986                 p = le64toh(o->data.next_hash_offset);
987         }
988
989         return 0;
990 }
991
992 int journal_file_find_last_entry(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
993         uint64_t p, osize, hash, h;
994         int r;
995
996         assert(f);
997         assert(data || size == 0);
998
999         osize = offsetof(Object, data.payload) + size;
1000
1001         hash = hash64(data, size);
1002         h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
1003         p = le64toh(f->hash_table[h].tail_hash_offset);
1004
1005         while (p != 0) {
1006                 Object *o;
1007
1008                 r = journal_file_move_to_object(f, p, OBJECT_DATA, &o);
1009                 if (r < 0)
1010                         return r;
1011
1012                 if (le64toh(o->object.size) == osize &&
1013                     memcmp(o->data.payload, data, size) == 0) {
1014
1015                         if (le64toh(o->data.hash) != hash)
1016                                 return -EBADMSG;
1017
1018                         if (o->data.tail_entry_offset == 0)
1019                                 return 0;
1020
1021                         p = le64toh(o->data.tail_entry_offset);
1022                         r = journal_file_move_to_object(f, p, OBJECT_ENTRY, &o);
1023                         if (r < 0)
1024                                 return r;
1025
1026                         if (ret)
1027                                 *ret = o;
1028
1029                         if (offset)
1030                                 *offset = p;
1031
1032                         return 1;
1033                 }
1034
1035                 p = le64toh(o->data.prev_hash_offset);
1036         }
1037
1038         return 0;
1039 }
1040
1041 void journal_file_dump(JournalFile *f) {
1042         char a[33], b[33], c[33];
1043         Object *o;
1044         int r;
1045         uint64_t p;
1046
1047         assert(f);
1048
1049         printf("File ID: %s\n"
1050                "Machine ID: %s\n"
1051                "Boot ID: %s\n"
1052                "Arena size: %llu\n",
1053                sd_id128_to_string(f->header->file_id, a),
1054                sd_id128_to_string(f->header->machine_id, b),
1055                sd_id128_to_string(f->header->boot_id, c),
1056                (unsigned long long) le64toh(f->header->arena_size));
1057
1058         p = le64toh(f->header->head_object_offset);
1059         while (p != 0) {
1060                 r = journal_file_move_to_object(f, p, -1, &o);
1061                 if (r < 0)
1062                         goto fail;
1063
1064                 switch (o->object.type) {
1065
1066                 case OBJECT_UNUSED:
1067                         printf("Type: OBJECT_UNUSED\n");
1068                         break;
1069
1070                 case OBJECT_DATA:
1071                         printf("Type: OBJECT_DATA\n");
1072                         break;
1073
1074                 case OBJECT_ENTRY:
1075                         printf("Type: OBJECT_ENTRY %llu\n", (unsigned long long) le64toh(o->entry.seqnum));
1076                         break;
1077
1078                 case OBJECT_HASH_TABLE:
1079                         printf("Type: OBJECT_HASH_TABLE\n");
1080                         break;
1081
1082                 case OBJECT_BISECT_TABLE:
1083                         printf("Type: OBJECT_BISECT_TABLE\n");
1084                         break;
1085                 }
1086
1087                 if (p == le64toh(f->header->tail_object_offset))
1088                         p = 0;
1089                 else
1090                         p = p + ALIGN64(le64toh(o->object.size));
1091         }
1092
1093         return;
1094 fail:
1095         log_error("File corrupt");
1096 }
1097
1098 int journal_file_open(
1099                 const char *fname,
1100                 int flags,
1101                 mode_t mode,
1102                 JournalFile *template,
1103                 JournalFile **ret) {
1104
1105         JournalFile *f;
1106         int r;
1107         bool newly_created = false;
1108
1109         assert(fname);
1110
1111         if ((flags & O_ACCMODE) != O_RDONLY &&
1112             (flags & O_ACCMODE) != O_RDWR)
1113                 return -EINVAL;
1114
1115         f = new0(JournalFile, 1);
1116         if (!f)
1117                 return -ENOMEM;
1118
1119         f->fd = -1;
1120         f->flags = flags;
1121         f->mode = mode;
1122         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1123         f->prot = prot_from_flags(flags);
1124
1125         f->path = strdup(fname);
1126         if (!f->path) {
1127                 r = -ENOMEM;
1128                 goto fail;
1129         }
1130
1131         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1132         if (f->fd < 0) {
1133                 r = -errno;
1134                 goto fail;
1135         }
1136
1137         if (fstat(f->fd, &f->last_stat) < 0) {
1138                 r = -errno;
1139                 goto fail;
1140         }
1141
1142         if (f->last_stat.st_size == 0 && f->writable) {
1143                 newly_created = true;
1144
1145                 r = journal_file_init_header(f, template);
1146                 if (r < 0)
1147                         goto fail;
1148
1149                 if (fstat(f->fd, &f->last_stat) < 0) {
1150                         r = -errno;
1151                         goto fail;
1152                 }
1153         }
1154
1155         if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1156                 r = -EIO;
1157                 goto fail;
1158         }
1159
1160         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1161         if (f->header == MAP_FAILED) {
1162                 f->header = NULL;
1163                 r = -errno;
1164                 goto fail;
1165         }
1166
1167         if (!newly_created) {
1168                 r = journal_file_verify_header(f);
1169                 if (r < 0)
1170                         goto fail;
1171         }
1172
1173         if (f->writable) {
1174                 r = journal_file_refresh_header(f);
1175                 if (r < 0)
1176                         goto fail;
1177         }
1178
1179         if (newly_created) {
1180
1181                 r = journal_file_setup_hash_table(f);
1182                 if (r < 0)
1183                         goto fail;
1184
1185                 r = journal_file_setup_bisect_table(f);
1186                 if (r < 0)
1187                         goto fail;
1188         }
1189
1190         r = journal_file_map_hash_table(f);
1191         if (r < 0)
1192                 goto fail;
1193
1194         r = journal_file_map_bisect_table(f);
1195         if (r < 0)
1196                 goto fail;
1197
1198         if (ret)
1199                 *ret = f;
1200
1201         return 0;
1202
1203 fail:
1204         journal_file_close(f);
1205
1206         return r;
1207 }
1208
1209 int journal_file_rotate(JournalFile **f) {
1210         char *p;
1211         size_t l;
1212         JournalFile *old_file, *new_file = NULL;
1213         int r;
1214
1215         assert(f);
1216         assert(*f);
1217
1218         old_file = *f;
1219
1220         if (!old_file->writable)
1221                 return -EINVAL;
1222
1223         if (!endswith(old_file->path, ".journal"))
1224                 return -EINVAL;
1225
1226         l = strlen(old_file->path);
1227
1228         p = new(char, l + 1 + 16 + 1 + 32 + 1 + 16 + 1);
1229         if (!p)
1230                 return -ENOMEM;
1231
1232         memcpy(p, old_file->path, l - 8);
1233         p[l-8] = '@';
1234         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1235         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1236                  "-%016llx-%016llx.journal",
1237                  (unsigned long long) le64toh((*f)->header->seqnum),
1238                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1239
1240         r = rename(old_file->path, p);
1241         free(p);
1242
1243         if (r < 0)
1244                 return -errno;
1245
1246         old_file->header->state = le32toh(STATE_ARCHIVED);
1247
1248         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1249         journal_file_close(old_file);
1250
1251         *f = new_file;
1252         return r;
1253 }
1254
1255 struct vacuum_info {
1256         off_t usage;
1257         char *filename;
1258
1259         uint64_t realtime;
1260         sd_id128_t seqnum_id;
1261         uint64_t seqnum;
1262 };
1263
1264 static int vacuum_compare(const void *_a, const void *_b) {
1265         const struct vacuum_info *a, *b;
1266
1267         a = _a;
1268         b = _b;
1269
1270         if (sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1271                 if (a->seqnum < b->seqnum)
1272                         return -1;
1273                 else if (a->seqnum > b->seqnum)
1274                         return 1;
1275                 else
1276                         return 0;
1277         }
1278
1279         if (a->realtime < b->realtime)
1280                 return -1;
1281         else if (a->realtime > b->realtime)
1282                 return 1;
1283         else
1284                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1285 }
1286
1287 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1288         DIR *d;
1289         int r = 0;
1290         struct vacuum_info *list = NULL;
1291         unsigned n_list = 0, n_allocated = 0, i;
1292         uint64_t sum = 0;
1293
1294         assert(directory);
1295
1296         if (max_use <= 0)
1297                 max_use = DEFAULT_MAX_USE;
1298
1299         d = opendir(directory);
1300         if (!d)
1301                 return -errno;
1302
1303         for (;;) {
1304                 int k;
1305                 struct dirent buf, *de;
1306                 size_t q;
1307                 struct stat st;
1308                 char *p;
1309                 unsigned long long seqnum, realtime;
1310                 sd_id128_t seqnum_id;
1311
1312                 k = readdir_r(d, &buf, &de);
1313                 if (k != 0) {
1314                         r = -k;
1315                         goto finish;
1316                 }
1317
1318                 if (!de)
1319                         break;
1320
1321                 if (!dirent_is_file_with_suffix(de, ".journal"))
1322                         continue;
1323
1324                 q = strlen(de->d_name);
1325
1326                 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
1327                         continue;
1328
1329                 if (de->d_name[q-8-16-1] != '-' ||
1330                     de->d_name[q-8-16-1-16-1] != '-' ||
1331                     de->d_name[q-8-16-1-16-1-32-1] != '@')
1332                         continue;
1333
1334                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
1335                         continue;
1336
1337                 if (!S_ISREG(st.st_mode))
1338                         continue;
1339
1340                 p = strdup(de->d_name);
1341                 if (!p) {
1342                         r = -ENOMEM;
1343                         goto finish;
1344                 }
1345
1346                 de->d_name[q-8-16-1-16-1] = 0;
1347                 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
1348                         free(p);
1349                         continue;
1350                 }
1351
1352                 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
1353                         free(p);
1354                         continue;
1355                 }
1356
1357                 if (n_list >= n_allocated) {
1358                         struct vacuum_info *j;
1359
1360                         n_allocated = MAX(n_allocated * 2U, 8U);
1361                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
1362                         if (!j) {
1363                                 free(p);
1364                                 r = -ENOMEM;
1365                                 goto finish;
1366                         }
1367
1368                         list = j;
1369                 }
1370
1371                 list[n_list].filename = p;
1372                 list[n_list].usage = (uint64_t) st.st_blksize * (uint64_t) st.st_blocks;
1373                 list[n_list].seqnum = seqnum;
1374                 list[n_list].realtime = realtime;
1375                 list[n_list].seqnum_id = seqnum_id;
1376
1377                 sum += list[n_list].usage;
1378
1379                 n_list ++;
1380         }
1381
1382         qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
1383
1384         for(i = 0; i < n_list; i++) {
1385                 struct statvfs ss;
1386
1387                 if (fstatvfs(dirfd(d), &ss) < 0) {
1388                         r = -errno;
1389                         goto finish;
1390                 }
1391
1392                 if (sum <= max_use &&
1393                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
1394                         break;
1395
1396                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
1397                         log_debug("Deleted archived journal %s/%s.", directory, list[i].filename);
1398                         sum -= list[i].usage;
1399                 } else if (errno != ENOENT)
1400                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
1401         }
1402
1403 finish:
1404         for (i = 0; i < n_list; i++)
1405                 free(list[i].filename);
1406
1407         free(list);
1408
1409         return r;
1410 }